# GPT

In [1]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import time
import random

- Descarga dataset

In [2]:
!curl -O http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2576k  100 2576k    0     0  9302k      0 --:--:-- --:--:-- --:--:-- 9302k


In [3]:
!unzip -q spa-eng.zip
!ls

replace spa-eng/_about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C
gpt_translation.ipynb	    seq2seq_translation_tutorial.ipynb	spa-eng.zip
gpt_translation_test.ipynb  spa-eng


## 1.- Procesa dataset
- Procesa las oraciones para traducir de español a inglés y de ingles a español, y agrega token __eos__

In [4]:
text_file = './spa-eng/spa.txt'

with open(text_file) as f:
    lines = f.read().split("\n")[:-1]
    
text_pairs = []

for line in lines:
    eng, spa = line.split("\t")
    text_pairs.append('translate English to Spanish: ' + eng + ' ' + spa + ' <eos>')
    text_pairs.append('translate Spanish to English: ' + spa + ' ' + eng + ' <eos>')

for _ in range(5):
    print(random.choice(text_pairs))

len(text_pairs)

translate English to Spanish: I'll show my album to you. Te enseñaré mi álbum. <eos>
translate English to Spanish: Why did the accused confess? ¿Por qué confesaron los acusados? <eos>
translate Spanish to English: Esos son los riesgos. Those are the risks. <eos>
translate Spanish to English: Tom salía con Mary cuando ambos eran adolescentes. Tom dated Mary when they were both teenagers. <eos>
translate Spanish to English: La tienda está cerrada hoy. The shop is closed today. <eos>


237928

- Conjuntos de entrenamiento, prueba y validación

In [5]:
random.Random(434).shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

In [6]:
print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

237928 total pairs
166550 training pairs
35689 validation pairs
35689 test pairs


In [7]:
train_pairs[0]

'translate English to Spanish: The pay is terrible. El pago es terrible. <eos>'

- Crea vocabulario y define tokenizer

In [8]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab
from collections import Counter

In [9]:
tokenizer = get_tokenizer('basic_english')

In [10]:
def build_vocab(text, tokenizer):
    counter = Counter()
    for string_ in text:
        counter.update(tokenizer(string_))
    return vocab(counter, specials=['<unk>', '<pad>', '<eos>'])


vocab = build_vocab(train_pairs, tokenizer)
vocab.set_default_index(37546) # evita error <ukn>

In [11]:
vocab_size = len(vocab)
vocab_size

37535

In [12]:
maxlen = 64

def data_process(text):
    data = []
    for raw_txt in text:
        tensor_ = torch.tensor([vocab[token] for token in tokenizer(raw_txt)],
                                dtype=torch.long)
        if tensor_.shape[0] < maxlen:
            x = tensor_[:-1]
            y = tensor_[1:]
            data.append((x, y))
    return data


train_data = data_process(train_pairs)
val_data = data_process(val_pairs)
test_data = data_process(test_pairs)
len(train_data)

166512

## 2.- Data Loader

In [13]:
batch_size = 128
PAD_IDX = vocab['<pad>']
EOS_IDX = vocab['<eos>']

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch):
    x, y = [], []
    for (x_item, y_item) in data_batch:
        x.append(torch.cat([x_item, torch.tensor([EOS_IDX])], dim=0))
        y.append(torch.cat([y_item, torch.tensor([EOS_IDX])], dim=0))

    x = pad_sequence(x, batch_first=True, padding_value=PAD_IDX)
    y = pad_sequence(y, batch_first=True, padding_value=PAD_IDX)
    return x, y


train_loader = DataLoader(train_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch, 
                          num_workers=4, pin_memory=True)
val_loader = DataLoader(val_data, batch_size=batch_size,
                        shuffle=True, collate_fn=generate_batch,
                        num_workers=4, pin_memory=True)
test_loader = DataLoader(test_data, batch_size=batch_size,
                         shuffle=True, collate_fn=generate_batch,
                         num_workers=4, pin_memory=True)

In [14]:
%%timeit
train_batch, target_batch = next(iter(train_loader))

111 ms ± 2.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
train_batch, target_batch = next(iter(train_loader))

In [16]:
train_batch.shape, target_batch.shape

(torch.Size([128, 47]), torch.Size([128, 47]))

In [17]:
train_batch[0]

tensor([    3,     6,     5,     4,  1898,    80,  1608,   295, 11389,  1840,
        35862,    11,    36,   278,   183,  1053,   108,    38, 35863,   245,
        35864,    11,     2,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1])

## 3.- Modelo
- Definir auto atención producto punto con máscara:

\begin{equation}
\mbox{MultiHead}(Q, K, V) = \text{Concat}(\mbox{head}_1,\mbox{head}_2,\ldots,\mbox{head}_h)W^O,
\end{equation}

\begin{equation}
\mbox{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) = \text{softmax}\left[\frac{QW_i^Q(KW_i^K)^T}{\sqrt{d_k}}\right]VW_i^V,
\end{equation}


In [18]:
class Attention(nn.Module):
    def __init__(self, dim, maxlen, n_heads=4, bias=True):
        super().__init__()
        self.n_heads = n_heads
        self.scale = (dim // n_heads) ** -0.5
        self.qw = nn.Linear(dim, dim, bias = bias)
        self.kw = nn.Linear(dim, dim, bias = bias)
        self.vw = nn.Linear(dim, dim, bias = bias)

        self.ow = nn.Linear(dim, dim, bias = bias)
        self.register_buffer("bias", torch.tril(torch.ones(maxlen, maxlen)).view(1, 1, maxlen, maxlen))

    def forward(self, x):
        B, L, D = x.shape
        q = self.qw(x)
        k = self.qw(x)
        v = self.qw(x)

        B, L, D = q.shape
        q = torch.reshape(q, [B, L, self.n_heads, -1])
        q = torch.permute(q, [0, 2, 1, 3])
        k = torch.reshape(k, [B, L, self.n_heads, -1])
        k = torch.permute(k, [0, 2, 3, 1])
        v = torch.reshape(v, [B, L, self.n_heads, -1])
        v = torch.permute(v, [0, 2, 1, 3])

        qk = torch.matmul(q, k) * self.scale
        qk = qk.masked_fill(self.bias[:,:,:L,:L] == 0, float('-inf'))
        
        attn = F.softmax(qk, dim=-1)

        v_attn = torch.matmul(attn, v)
        v_attn = torch.permute(v_attn, [0, 2, 1, 3])
        v_attn = torch.reshape(v_attn, [B, L, D])

        x = self.ow(v_attn)
        return x


test_layer = Attention(32, maxlen, n_heads=1)
test_layer(torch.ones([1, maxlen, 32]))

tensor([[[-0.4971,  0.1038,  0.5954,  ..., -0.1803,  0.1757,  0.2792],
         [-0.4971,  0.1038,  0.5954,  ..., -0.1803,  0.1757,  0.2792],
         [-0.4971,  0.1038,  0.5954,  ..., -0.1803,  0.1757,  0.2792],
         ...,
         [-0.4971,  0.1038,  0.5954,  ..., -0.1803,  0.1757,  0.2792],
         [-0.4971,  0.1038,  0.5954,  ..., -0.1803,  0.1757,  0.2792],
         [-0.4971,  0.1038,  0.5954,  ..., -0.1803,  0.1757,  0.2792]]],
       grad_fn=<ViewBackward0>)

- Definir Transformer:

In [19]:
class Transformer(nn.Module):
    def __init__(self, dim, maxlen, heads=4, mlp_dim=512, rate=0.0):
        super().__init__()
        self.ln_1 = nn.LayerNorm(dim)
        self.attn = Attention(dim, maxlen)
        self.ln_2 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(rate),
            nn.Linear(mlp_dim, dim),
            nn.Dropout(rate),
        )

    def forward(self, x):
        x = self.attn(self.ln_1(x)) + x
        return self.mlp(self.ln_2(x)) + x

test_layer = Transformer(32, maxlen)
test_layer(torch.ones([1, maxlen, 32])).shape

torch.Size([1, 64, 32])

In [20]:
train_batch.shape

torch.Size([128, 47])

In [21]:
train_batch[:2]

tensor([[    3,     6,     5,     4,  1898,    80,  1608,   295, 11389,  1840,
         35862,    11,    36,   278,   183,  1053,   108,    38, 35863,   245,
         35864,    11,     2,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1],
        [    3,     6,     5,     4,    77,    78,  1025,   897,   920,    11,
            15,    83,    16,    84,  1027,    92,   917,    11,     2,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1]])

- Definir GPT y agregar embedding de posición:

In [22]:
class GPT(nn.Module):
    def __init__(self, dim, vocab_size, maxlen, depth=3, 
                 mlp_dim=512, rate=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, dim)
        self.pos_embedding = nn.Parameter(
            torch.randn(1, maxlen, dim))

        self.transformer = nn.Sequential()
        for _ in range(depth):
            self.transformer.append(Transformer(dim, maxlen))

        self.head = nn.Linear(dim, vocab_size, bias=False)

    def forward(self, x):
        B, L = x.shape
        x = self.embedding(x)
        x += self.pos_embedding[:, :L]
        x = self.transformer(x)
        x = self.head(x)
        return x

    
model_dim = 128
depth = 3
mlp_dim = 128

gpt = GPT(dim=model_dim, vocab_size=vocab_size, 
          maxlen=maxlen, depth=depth, mlp_dim=mlp_dim)
output = gpt(train_batch)
output.shape, target_batch.shape

(torch.Size([128, 47, 37535]), torch.Size([128, 47]))

## 4.- Entrenamiento

In [23]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

gpt.to(device)

cuda:0


GPT(
  (embedding): Embedding(37535, 128)
  (transformer): Sequential(
    (0): Transformer(
      (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (qw): Linear(in_features=128, out_features=128, bias=True)
        (kw): Linear(in_features=128, out_features=128, bias=True)
        (vw): Linear(in_features=128, out_features=128, bias=True)
        (ow): Linear(in_features=128, out_features=128, bias=True)
      )
      (ln_2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): GELU(approximate='none')
        (2): Dropout(p=0.0, inplace=False)
        (3): Linear(in_features=512, out_features=128, bias=True)
        (4): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Transformer(
      (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (qw): Linear(in_features=128, out_features=128, bias

In [24]:
PAD_IDX = vocab.get_stoi()['<pad>']
PAD_IDX

1

In [25]:
optimizer = optim.Adam(gpt.parameters(), lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [26]:
def train(model, device, train_loader, optimizer, epoch):
    start = time.time()
    running_loss = 0.0
    model.train()
    for inputs, targets in train_loader:
        targets = targets.view(-1)
        inputs, targets = inputs.to(device), targets.to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'\nTime for epoch {epoch} is {time.time()-start} sec')
    print(f'Train loss: {running_loss / len(train_loader):4f}')

- Traducción autorregresiva

In [27]:
def translate(model, sentence, device, maxlen):
    model.eval()
    idx = torch.tensor([vocab[token] for token in tokenizer(sentence)],
                                dtype=torch.long)
    idx = idx.reshape([1, -1])
    maxlen = maxlen - idx.shape[-1]
    
    for _ in range(maxlen):
        idx = idx.to(device)
        logits = gpt(idx)[:, -1, :]      
        probs = F.softmax(logits, dim=-1)

        _, idx_next = torch.topk(probs, k=1, dim=-1)
        idx = torch.cat((idx, idx_next), dim=1)
        
    txt = " ".join(
                [vocab.get_itos()[idx[0, _]] for _ in range(maxlen)]
            )
    return txt.replace("<eos>", "")
        
sentences = ['translate spanish to english me gustan los perros',
             'translate spanish to english me gustan los gatos',
             'translate english to spanish tom is a big dog']

for sent in sentences:
    trans = translate(gpt, sent, device, maxlen)
    print(f"\n{trans}")


translate spanish to english me gustan los perros stopped bills jerga sustituyera stepdad vengo terremotos mías twist center transmit enfrente amid invierte construction ¿tomas combustibles permitas puedes politely custom-made cooperó rival ride enseñaros ensayo apañan critical relaja despegaron ai trench presentame compromise serrando tarro unusual ¡incendio buscabas trabajólicos anestésico raccoons momentáneamente ocaso checoslovaquia ¿lees apoya taimado

translate spanish to english me gustan los gatos withdraw indispensable sixtina neptune prowlers forty field sostienes war center transmit enfrente amid invierte construction ¿tomas combustibles permitas puedes politely custom-made cooperó rival ride enseñaros ensayo hub extract excepto stink merecía forgive debés introduce tenerlas contratos taza anaconda consígueme enseñará atender riéndome podíamos alphabet continue naciera spain fascinating

translate english to spanish tom is a big dog originó barrita afectarán asistiré cuarta

In [28]:
epochs = 5

for epoch in range(epochs):
    train(gpt, device, train_loader, optimizer, epoch)
    
    # Translate test sentences
    for sent in sentences:
        trans = translate(gpt, sent, device, maxlen)
        print(trans)


Time for epoch 0 is 29.74670433998108 sec
Train loss: 3.588693
translate spanish to english me gustan los perros . i ' ll be a lot of you .                                      
translate spanish to english me gustan los gatos . i ' ll be a lot of you .                                      
translate english to spanish tom is a big dog . tom es un buen trabajo .                                       

Time for epoch 1 is 30.006539583206177 sec
Train loss: 2.822797
translate spanish to english me gustan los perros nadaran a la guerra . i like the children to go to the party .                                 
translate spanish to english me gustan los gatos . i like the rules .                                          
translate english to spanish tom is a big dog . tom es un perro .                                        

Time for epoch 2 is 30.240260124206543 sec
Train loss: 2.518456
translate spanish to english me gustan los perros . i like noodles .                                 