<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/GPT_dev.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Model

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# --- hyperparameters ---
batch_size = 3 * 64  # number of independent sequences processed in parallel
block_size = 256  # maximum context length (in tokens) used for predictions
max_iters = 2400
eval_interval = 200
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
# -----------------------

torch.manual_seed(2711)

# read the corpus and lower-case it
path_base = '/content/drive/MyDrive/Dirty-Talks/GPT'
with open(f'{path_base}/harry_potter_book.txt', 'r', encoding='utf-8') as handle:
    text = handle.read().lower()

# character-level vocabulary: every distinct character found in the text
chars = sorted(set(text))
vocab_size = len(chars)
# lookup tables between characters and integer ids
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Turn a string into the list of its character ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Turn a list of character ids back into a string."""
    return ''.join(itos[i] for i in l)


# train / validation split: the first 90% of the tokens train, the rest validate
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# data loading
def get_batch(split):
    """Sample a random batch of (input, target) windows from one split.

    Any `split` other than 'train' selects the validation data. Every target
    row is its input row shifted one token to the right. Both tensors have
    shape (batch_size, block_size) and are moved to `device`.
    """
    source = val_data if split != 'train' else train_data
    # random window starts; the upper bound leaves room for the shifted target
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    x = torch.stack([source[s:s + block_size] for s in starts])
    y = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return x.to(device), y.to(device)

@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of the global `model` on both data splits.

    For each of 'train' and 'val', averages the loss over `eval_iters` random
    batches drawn with `get_batch`. The model is switched to eval mode while
    measuring and afterwards restored to the mode it was in before the call
    (previously it was always left in train mode).

    Returns:
        dict mapping 'train' and 'val' to a 0-d tensor holding the mean loss.
    """
    out = {}
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # restore the caller's mode even if a batch raised
        model.train(was_training)
    return out

class Head(nn.Module):
    """A single causal self-attention head.

    Projects the input to keys, queries and values of width `head_size`; each
    position attends only to itself and to earlier positions.
    """

    def __init__(self, head_size):
        super().__init__()
        # bias-free projections from the embedding width to the head width
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular matrix of ones, registered as a (non-trainable) buffer
        causal = torch.ones(block_size, block_size).tril()
        self.register_buffer('tril', causal)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map x of shape (B, T, C) to attended values of shape (B, T, head_size)."""
        _, T, C = x.shape
        q = self.query(x)  # (B, T, head_size)
        k = self.key(x)    # (B, T, head_size)
        v = self.value(x)  # (B, T, head_size)
        # attention scores ("affinities") between every pair of positions.
        # NOTE(review): the scale uses the input width C (n_embd) rather than
        # head_size as in standard scaled dot-product attention. Left unchanged
        # because weights trained with this scaling depend on it.
        scores = torch.matmul(q, k.transpose(-2, -1)) * C**-0.5  # (B, T, T)
        # positions after the current one get -inf, i.e. zero weight after softmax
        future = self.tril[:T, :T] == 0
        scores = scores.masked_fill(future, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))  # (B, T, T)
        # weighted aggregation of the values
        return weights @ v  # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)

class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel, then projected to n_embd.

    Args:
        num_heads: number of `Head` modules to run side by side.
        head_size: output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The projection consumes the concatenated head outputs, whose width is
        # num_heads * head_size. That equals n_embd only when n_embd is
        # divisible by num_heads; sizing by the real width avoids a shape
        # mismatch otherwise and is identical in the divisible case.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected concatenation of all heads, shape (B, T, n_embd)."""
        concat = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(concat))

class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to x; the output has the same shape as the input."""
        out = self.net(x)
        return out

class Block(nn.Module):
    """Transformer block: self-attention (communication) then MLP (computation).

    Layer norm is applied before each sub-layer and each sub-layer's output is
    added back to its input (residual connection).
    """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding width; n_head: number of attention heads
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return x after the attention and feed-forward residual updates."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

# character-level, decoder-only Transformer language model
# (despite the name, this is not a bigram model; the name is kept because
# other cells instantiate it as BigramLanguageModel)
class BigramLanguageModel(nn.Module):
    """Token + position embeddings, a stack of Blocks, final norm and LM head.

    All sizes (vocab_size, n_embd, block_size, n_head, n_layer) are read from
    module-level globals when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # one learned vector per position, up to block_size positions
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and
            loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the cross-entropy.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B,T,C)
        # Build the positions on the input's own device rather than on the
        # global `device`, so the model also works when it lives elsewhere.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C), pos_emb broadcasts over the batch
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            return logits, None

        # cross_entropy expects (N, C) logits and (N,) targets
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after the prompt.

        Args:
            idx: (B, T) tensor of token ids used as the prompt.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the prompt followed by the samples.
        """
        for _ in range(max_new_tokens):
            # the position table only covers block_size positions, so crop
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            # keep only the prediction for the last time step
            logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample the next token from the predicted distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
    

<img src="https://drive.google.com/uc?id=1zjImHhwPgQLnGSzbvPHDxaJjglEvesiN" alt="drawing" width="1500"/>

In [36]:
from IPython.display import HTML

# Embed an MP4 in the cell output through an HTML5 <video> element
HTML("""
    <video alt="test" controls>
        <source src="https://drive.google.com/uc?id=11pPfuUZ-xNFR6MKVWU_AHCScMja3p2oc" type="video/mp4">
    </video>
""")

In [37]:
# Restore the trained weights and put the model in inference mode
path_save = '/content/drive/MyDrive/Dirty-Talks/GPT/ckpt/gptzinho'
model = BigramLanguageModel()
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on GPU also loads on a CPU-only runtime
model.load_state_dict(torch.load(path_save, map_location=device))
model.eval()
model.to(device)
print()




In [38]:
import time

# Restore the trained weights and put the model in inference mode
path_save = '/content/drive/MyDrive/Dirty-Talks/GPT/ckpt/gptzinho'
model = BigramLanguageModel()
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on GPU also loads on a CPU-only runtime
model.load_state_dict(torch.load(path_save, map_location=device))
model.eval()
model.to(device)

# encode the prompt and add a batch dimension before sampling 300 new tokens
context = "hermione perguntou"
context = torch.tensor(encode(context)).to(device)

out = decode(model.generate(context.unsqueeze(0), max_new_tokens=300)[0].tolist())
# print one character at a time with a short pause between characters
for char in out:
    time.sleep(0.02)
    print(char, end='', flush=True)

hermione perguntou:
harry teria desaparecer a espelho da magia.
ele sirius abaixou o peito para nos novos preparos. esperemes de fudge
que ele conhecia de armário! com a perda voldemort viverem a lacação de rony,
que a foto se pôs e estavam fergueram, harry acabara um garoto de probir, tentou
arme com firmeza de que 

>## Antes de tudo... todas as ideias aqui são do [Karpathy](https://www.youtube.com/@AndrejKarpathy)
> <img src="https://drive.google.com/uc?id=1MM_t3QzM6Zc5S23XnyNpMuh-9Ag-2yJ1" alt="drawing" width="1000"/>





># Tipos de Transformers
> <img src="https://drive.google.com/uc?id=1dn1NdpDcgL6IE_QwkpVIo6NDgIEymaqA" alt="drawing" width="1900"/>


In [39]:
# Load the dataset and lower-case it
path_base = '/content/drive/MyDrive/Dirty-Talks/GPT'
with open(f'{path_base}/harry_potter_book.txt', 'r', encoding='utf-8') as handle:
    text = handle.read().lower()
# preview the first 1000 characters
print(text[:1000])

- capítulo um -
o menino que sobreviveu
o sr. e a sra. dursley, da rua dos alfeneiros, no 4, se orgulhavam de dizer que
eram perfeitamente normais, muito bem, obrigado. eram as últimas pessoas no
mundo que se esperaria que se metessem em alguma coisa estranha ou
misteriosa, porque simplesmente não compactuavam com esse tipo de bobagem.
o sr. dursley era diretor de uma firma chamada grunnings, que fazia
perfurações. era um homem alto e corpulento quase sem pescoço, embora
tivesse enormes bigodes. a sra. dursley era magra e loura e tinha um pescoço
quase duas vezes mais comprido que o normal, o que era muito útil porque ela
passava grande parte do tempo espichando-o por cima da cerca do jardim para
espiar os vizinhos. os dursley tinham um filhinho chamado dudley, o duda, e
em sua opinião não havia garoto melhor em nenhum lugar do mundo.
os dursley tinham tudo que queriam, mas tinham também um segredo, e seu
maior receio era que alguém o descobrisse. achavam que não iriam aguentar se
algu

In [40]:
# Dataset size, in number of characters
print(f'Tamanho do dataset: {len(text)} chars.')

Tamanho do dataset: 6540235 chars.


In [41]:
# Character vocabulary: every distinct character of the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(f"Todos os chars: {''.join(chars)}\nTamanho do vocab: {vocab_size}")

Todos os chars: 
 !"&'(),-./0123456789:;?abcdefghijklmnopqrstuvwxyzªàáâãçèéêíóôõùú́
Tamanho do vocab: 67


In [None]:
# Display the character -> integer id mapping
stoi

In [42]:
# lookup tables between characters and integer ids
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Turn a string into the list of its character ids."""
    return [stoi[c] for c in s]


print(encode('oii tudo bem?'))


def decode(l):
    """Turn a list of character ids back into a string."""
    return ''.join(itos[i] for i in l)


# round trip: decoding the encoded string gives the original text back
print(decode(encode('oii tudo bem?')))

[39, 33, 33, 1, 44, 45, 28, 39, 1, 26, 29, 37, 24]
oii tudo bem?


In [44]:
import torch 
# Encode the whole text as one 1-D tensor of int64 character ids
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
# preview the first 1000 ids
print(data[:1000])

torch.Size([6540235]) torch.int64
tensor([ 9,  1, 27, 25, 40, 60, 44, 45, 36, 39,  1, 45, 37,  1,  9,  0, 39,  1,
        37, 29, 38, 33, 38, 39,  1, 41, 45, 29,  1, 43, 39, 26, 42, 29, 46, 33,
        46, 29, 45,  0, 39,  1, 43, 42, 10,  1, 29,  1, 25,  1, 43, 42, 25, 10,
         1, 28, 45, 42, 43, 36, 29, 49,  8,  1, 28, 25,  1, 42, 45, 25,  1, 28,
        39, 43,  1, 25, 36, 30, 29, 38, 29, 33, 42, 39, 43,  8,  1, 38, 39,  1,
        16,  8,  1, 43, 29,  1, 39, 42, 31, 45, 36, 32, 25, 46, 25, 37,  1, 28,
        29,  1, 28, 33, 50, 29, 42,  1, 41, 45, 29,  0, 29, 42, 25, 37,  1, 40,
        29, 42, 30, 29, 33, 44, 25, 37, 29, 38, 44, 29,  1, 38, 39, 42, 37, 25,
        33, 43,  8,  1, 37, 45, 33, 44, 39,  1, 26, 29, 37,  8,  1, 39, 26, 42,
        33, 31, 25, 28, 39, 10,  1, 29, 42, 25, 37,  1, 25, 43,  1, 65, 36, 44,
        33, 37, 25, 43,  1, 40, 29, 43, 43, 39, 25, 43,  1, 38, 39,  0, 37, 45,
        38, 28, 39,  1, 41, 45, 29,  1, 43, 29,  1, 29, 43, 40, 29, 42, 25, 42,
      

In [45]:
# train / validation split: the first 90% of the tokens train, the rest validate
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [46]:
# Main idea behind the data loader: look at a window of block_size + 1
# consecutive tokens of the training data
block_size = 8
train_data[:block_size+1]

tensor([ 9,  1, 27, 25, 40, 60, 44, 45, 36])

In [51]:
# Look up the character stored for token id 27
itos[27]

'c'

In [47]:
# inputs are the first block_size tokens; targets are the same window shifted by one
x = train_data[:block_size]
y = train_data[1:block_size+1]

# every prefix of x is a context whose target is the token that follows it
for t, target in enumerate(y):
    context = x[:t + 1]
    print(f'quando o contexto é {context} o target é {target}')

quando o contexto é tensor([9]) o target é 1
quando o contexto é tensor([9, 1]) o target é 27
quando o contexto é tensor([ 9,  1, 27]) o target é 25
quando o contexto é tensor([ 9,  1, 27, 25]) o target é 40
quando o contexto é tensor([ 9,  1, 27, 25, 40]) o target é 60
quando o contexto é tensor([ 9,  1, 27, 25, 40, 60]) o target é 44
quando o contexto é tensor([ 9,  1, 27, 25, 40, 60, 44]) o target é 45
quando o contexto é tensor([ 9,  1, 27, 25, 40, 60, 44, 45]) o target é 36


In [24]:
# build dataloaders
torch.manual_seed(2711)
batch_size = 4 # number of independent sequences processed in parallel
block_size = 8 # maximum context length used for predictions

def get_batch(split):
    """Draw a small random batch of inputs x and targets y from one split.

    Any `split` other than 'train' selects the validation data. Both tensors
    have shape (batch_size, block_size); y is x shifted one token to the right.
    """
    source = val_data if split != 'train' else train_data
    # batch_size random window starts; the bound leaves room for the shifted target
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    x = torch.stack([source[s:s + block_size] for s in starts])
    y = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return x, y

xb, yb = get_batch('train')
print(f'inputs: {xb.shape}\n{xb}')
print(f'targets: {yb.shape}\n{yb}')
print('----')

# unroll every (context, target) pair contained in the batch
for b in range(batch_size):
    for t in range(block_size):
        context, target = xb[b, :t + 1], yb[b, t]
        print(f'quando a entrada é {context.tolist()} o target é {target}')

inputs: torch.Size([4, 8])
tensor([[29, 36, 25,  1, 38, 55, 39,  1],
        [ 0,  9,  1, 41, 45, 25, 38, 28],
        [53,  1, 25, 27, 39, 43, 44, 45],
        [29, 33, 42, 25,  1, 41, 45, 29]])
targets: torch.Size([4, 8])
tensor([[36, 25,  1, 38, 55, 39,  1, 30],
        [ 9,  1, 41, 45, 25, 38, 28, 39],
        [ 1, 25, 27, 39, 43, 44, 45, 37],
        [33, 42, 25,  1, 41, 45, 29,  1]])
----
quando a entrada é [29] o target é 36
quando a entrada é [29, 36] o target é 25
quando a entrada é [29, 36, 25] o target é 1
quando a entrada é [29, 36, 25, 1] o target é 38
quando a entrada é [29, 36, 25, 1, 38] o target é 55
quando a entrada é [29, 36, 25, 1, 38, 55] o target é 39
quando a entrada é [29, 36, 25, 1, 38, 55, 39] o target é 1
quando a entrada é [29, 36, 25, 1, 38, 55, 39, 1] o target é 30
quando a entrada é [0] o target é 9
quando a entrada é [0, 9] o target é 1
quando a entrada é [0, 9, 1] o target é 41
quando a entrada é [0, 9, 1, 41] o target é 45
quando a entrada é [0, 9, 1, 

In [53]:
# Look up the character stored for token id 29
itos[29]

'e'

In [55]:
# Display the current input batch
xb

tensor([[29, 36, 25,  1, 38, 55, 39,  1],
        [ 0,  9,  1, 41, 45, 25, 38, 28],
        [53,  1, 25, 27, 39, 43, 44, 45],
        [29, 33, 42, 25,  1, 41, 45, 29]])

In [64]:
from torch.nn import functional as F

class BigramLanguageMode(torch.nn.Module):
    """Bigram language model: next-token logits depend only on the current token."""

    def __init__(self, vocab_size):
        super().__init__()
        # row i of the table holds the next-token logits for token i
        self.token_embedding_table = torch.nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            x: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and
            loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the cross-entropy.
        """
        logits = self.token_embedding_table(x)  # (B,T,C=vocab_size)

        # identity check: `== None` on a tensor is a comparison, not a None test
        if targets is None:
            return logits, None

        # cross_entropy expects (N, C) logits and (N,) targets
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, x, max_new_tokens):
        """Sample `max_new_tokens` tokens after the (B, T) prompt `x`.

        Returns:
            (B, T + max_new_tokens) tensor: the prompt followed by the samples.
        """
        for _ in range(max_new_tokens):
            logits, _ = self(x)
            # keep only the prediction for the last time step
            logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim=-1)
            # sample the next token from the predicted distribution
            x_next = torch.multinomial(probs, num_samples=1)  # (B,1)
            x = torch.cat((x, x_next), dim=1)  # (B,T+1)

        return x

# instantiate the bigram model on the target device (the duplicated .to(device) was redundant)
m = BigramLanguageMode(vocab_size).to(device)
logits, loss = m(xb.to(device), yb.to(device))
print(logits.shape, loss)

# sample 100 new tokens starting from a single token with id 0, then decode them
print(decode(m.generate(x = torch.zeros((1,1), dtype=torch.long).to(device), max_new_tokens=100)[0].tolist()))

torch.Size([256, 67]) tensor(4.8230, device='cuda:0', grad_fn=<NllLossBackward0>)

ªó&a;â)jâ7óêl((&oen0
('.3ª8tvù,kl,&19odè7amé9sb7ç);yárêbqnáô)")2ô.bo12-â7jj.ã9o3jc
.6za;z4dfãíã61?ãl


<img src="https://drive.google.com/uc?id=10bq7YpjM5It8_rEVLOJxa6qxhRqt_pSp" alt="drawing" width="1000"/>



In [63]:
# get_batch reads this global, so batches drawn below hold 32 sequences
batch_size = 32

optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

for steps in range(5000):
    # sample a batch
    xb, yb = get_batch('train')

    # clear old gradients, then forward / backward / update
    optimizer.zero_grad(set_to_none=True)
    logits, loss = m(xb.to(device), yb.to(device))
    loss.backward()
    optimizer.step()

# loss of the last training batch
print(f'loss: {loss.item()}')

# sample 100 new tokens starting from a single token with id 0, then decode them
print(decode(m.generate(x=torch.zeros((1, 1), dtype=torch.long).to(device), max_new_tokens=100)[0].tolist()))

loss: 2.378185749053955

ha dos, te leato o mosa des éra oue ca ey sabúradouvapo, cobitalté ntado
- s enrom r leiuelans foo p


# Mecanismo de atenção


In [65]:
from IPython.display import HTML

# Video source:
# DeepMind x UCL | Deep Learning Lectures | 8/12 | Attention and Memory in Deep Learning
# url: https://www.youtube.com/watch?v=AIiwuClvH6k

# Embed an MP4 in the cell output through an HTML5 <video> element
HTML("""
    <video alt="test" controls>
        <source src="https://drive.google.com/uc?id=12ylkSVbYufAxUgdAYvUgR8CV4WVMgAIP" type="video/mp4">
    </video>
""")



## MHA (Multi Head Attention)
## Vamos usar de input uma frase com 9 tokens:
> ### [ Quero ] [ um ] [ cartão ] [ de ] [ crédito ] [ adicional ] [ para ] [ minha ] [ filha ]

<img src="https://drive.google.com/uc?id=1VRDfKNee-Mm0WeH_psBm2Top8vBwUDcT" alt="drawing" width="1200"/>

<img src="https://drive.google.com/uc?id=1sKqUSC_RZtZoMi3oVE0AonwWcG8OKaub" alt="drawing" width="1200"/>

<img src="https://drive.google.com/uc?id=1Bzr77GYXMqjNc9fL2TBtFhYx7trHh3Bi" alt="drawing" width="600"/>








In [30]:
torch.manual_seed(1337)
B, T, C = 4, 8, 128  # batch, time, channels
x = torch.randn(B, T, C)

# walk through what one self-attention head computes
num_heads = 8
head_size = C // num_heads
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)    # (B, T, 16)
q = query(x)  # (B, T, 16)
# raw affinities between every pair of positions (not scaled in this demo);
# data-dependent, unlike a uniform start such as torch.zeros((T, T))
wei = torch.matmul(q, k.transpose(-2, -1))  # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

# causal mask: positions after the current one get -inf, i.e. zero weight after softmax
tril = torch.ones(T, T).tril()
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

# aggregate the projected values v (rather than the raw x) with the weights
v = value(x)
out = torch.matmul(wei, v)

out.shape

torch.Size([4, 8, 16])

# Layer Norm

<img src="https://drive.google.com/uc?id=1wVgNkIkKabpn-2lcDVj2AV5kYpNkf428" alt="drawing" width="1000"/>


In [31]:
# three samples with six features each
x1, x2, x3 = (
    torch.tensor(values, dtype=torch.float)
    for values in ([1, 2, 0, 4, 5, 1], [3, 2, 1, 6, 2, 0], [6, 2, 5, 1, 3, 1])
)

batch = torch.stack((x1, x2, x3))
print(f'batch shape: {batch.shape}')

# batch norm: statistics along dim 0, i.e. per feature across the samples
xmean_bnorm = batch.mean(dim=0, keepdim=True)
xvar_bnorm = batch.var(dim=0, keepdim=True)

batch_norm = (batch - xmean_bnorm) / (xvar_bnorm + 1e-5).sqrt()
print(f'\nbatch_norm mean: {batch_norm[:,0].mean():.4}, batch_norm var: {batch_norm[:,0].var():.4}\n')

# layer norm: statistics along dim 1, i.e. per sample across its features
xmean_lnorm = batch.mean(dim=1, keepdim=True)
xvar_lnorm = batch.var(dim=1, keepdim=True)

# note: no epsilon is added here, unlike the batch-norm line above
layer_norm = (batch - xmean_lnorm) / xvar_lnorm.sqrt()
print(f'layer_norm mean: {layer_norm[0,:].mean():.4}, layer_norm var: {layer_norm[0,:].var():.4}\n')

batch shape: torch.Size([3, 6])

batch_norm mean: 0.0, batch_norm var: 1.0

layer_norm mean: -3.974e-08, layer_norm var: 1.0



In [32]:
class LayerNorm1d:
    """Minimal layer normalisation over the last (feature) dimension.

    Plain-tensor teaching implementation: gamma and beta are ordinary tensors
    of shape (dim,), initialised to ones and zeros.
    """

    def __init__(self, dim, eps=1e-5):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        """Normalise each feature vector of x to zero mean and unit variance.

        Statistics are taken over the last axis, the same axis gamma and beta
        broadcast over. For 2-D input this is identical to reducing over dim 1;
        inputs of any other rank are now handled consistently too.
        """
        xmean = x.mean(-1, keepdim=True)
        # torch's default variance is the unbiased estimate
        xvar = x.var(-1, keepdim=True)
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        return self.gamma * xhat + self.beta

    def parameters(self):
        """Return the scale and shift tensors as a list."""
        return [self.gamma, self.beta]

# Apply the hand-written layer norm to the toy batch; one scale/shift entry per column of `batch`
layer_norm = LayerNorm1d(batch.size(1))
ln_x = layer_norm(batch)
# the output keeps the input's shape
ln_x.shape

torch.Size([3, 6])

# Skip connections
[Artigo](https://arxiv.org/pdf/1512.03385.pdf)

<img src="https://drive.google.com/uc?id=1Re_QRNLzZkESLcbTTNoavpzMGLpv6g9l" alt="drawing" width="450" height="280"/>

<img src="https://drive.google.com/uc?id=1myyttTn1F9FxxBbkzwJ0buKDFbriWAfE" alt="drawing" width="450" height="280"/>


# Dropout
[Artigo](https://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf)

<img src="https://drive.google.com/uc?id=18ADHDL7OEocahmOSwUicQVgC8SBOv9pv" alt="drawing" width="900"/>

<img src="https://drive.google.com/uc?id=1lsCf55aDNH3mxzslgDsqSAfKNZa-3_Pw" alt="drawing" width="500"/>


In [33]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# --- hyperparameters ---
batch_size = 3 * 64  # number of independent sequences processed in parallel
block_size = 256  # maximum context length (in tokens) used for predictions
max_iters = 2400
eval_interval = 200
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
# -----------------------

torch.manual_seed(2711)

# read the corpus and lower-case it
path_base = '/content/drive/MyDrive/Dirty-Talks/GPT'
with open(f'{path_base}/harry_potter_book.txt', 'r', encoding='utf-8') as handle:
    text = handle.read().lower()

# character-level vocabulary: every distinct character found in the text
chars = sorted(set(text))
vocab_size = len(chars)
# lookup tables between characters and integer ids
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Turn a string into the list of its character ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Turn a list of character ids back into a string."""
    return ''.join(itos[i] for i in l)


# train / validation split: the first 90% of the tokens train, the rest validate
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# data loading
def get_batch(split):
    """Sample a random batch of (input, target) windows from one split.

    Any `split` other than 'train' selects the validation data. Every target
    row is its input row shifted one token to the right. Both tensors have
    shape (batch_size, block_size) and are moved to `device`.
    """
    source = val_data if split != 'train' else train_data
    # random window starts; the upper bound leaves room for the shifted target
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    x = torch.stack([source[s:s + block_size] for s in starts])
    y = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return x.to(device), y.to(device)

@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of the global `model` on both data splits.

    For each of 'train' and 'val', averages the loss over `eval_iters` random
    batches drawn with `get_batch`. The model is switched to eval mode while
    measuring and afterwards restored to the mode it was in before the call
    (previously it was always left in train mode).

    Returns:
        dict mapping 'train' and 'val' to a 0-d tensor holding the mean loss.
    """
    out = {}
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # restore the caller's mode even if a batch raised
        model.train(was_training)
    return out

class Head(nn.Module):
    """A single causal self-attention head.

    Projects the input to keys, queries and values of width `head_size`; each
    position attends only to itself and to earlier positions.
    """

    def __init__(self, head_size):
        super().__init__()
        # bias-free projections from the embedding width to the head width
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular matrix of ones, registered as a (non-trainable) buffer
        causal = torch.ones(block_size, block_size).tril()
        self.register_buffer('tril', causal)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map x of shape (B, T, C) to attended values of shape (B, T, head_size)."""
        _, T, C = x.shape
        q = self.query(x)  # (B, T, head_size)
        k = self.key(x)    # (B, T, head_size)
        v = self.value(x)  # (B, T, head_size)
        # attention scores ("affinities") between every pair of positions.
        # NOTE(review): the scale uses the input width C (n_embd) rather than
        # head_size as in standard scaled dot-product attention. Left unchanged
        # because weights trained with this scaling depend on it.
        scores = torch.matmul(q, k.transpose(-2, -1)) * C**-0.5  # (B, T, T)
        # positions after the current one get -inf, i.e. zero weight after softmax
        future = self.tril[:T, :T] == 0
        scores = scores.masked_fill(future, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))  # (B, T, T)
        # weighted aggregation of the values
        return weights @ v  # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)

class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel, then projected to n_embd.

    Args:
        num_heads: number of `Head` modules to run side by side.
        head_size: output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The projection consumes the concatenated head outputs, whose width is
        # num_heads * head_size. That equals n_embd only when n_embd is
        # divisible by num_heads; sizing by the real width avoids a shape
        # mismatch otherwise and is identical in the divisible case.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected concatenation of all heads, shape (B, T, n_embd)."""
        concat = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(concat))

class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to x; the output has the same shape as the input."""
        out = self.net(x)
        return out

class Block(nn.Module):
    """Transformer block: self-attention (communication) then MLP (computation).

    Layer norm is applied before each sub-layer and each sub-layer's output is
    added back to its input (residual connection).
    """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding width; n_head: number of attention heads
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return x after the attention and feed-forward residual updates."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

# character-level, decoder-only Transformer language model (not a bigram model)
class charGPT(nn.Module):
    """Token + position embeddings, a stack of Blocks, final norm and LM head.

    All sizes (vocab_size, n_embd, block_size, n_head, n_layer) are read from
    module-level globals when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # one learned vector per position, up to block_size positions
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and
            loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the cross-entropy.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B,T,C)
        # Build the positions on the input's own device rather than on the
        # global `device`, so the model also works when it lives elsewhere.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C), pos_emb broadcasts over the batch
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            return logits, None

        # cross_entropy expects (N, C) logits and (N,) targets
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after the prompt.

        Args:
            idx: (B, T) tensor of token ids used as the prompt.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the prompt followed by the samples.
        """
        for _ in range(max_new_tokens):
            # the position table only covers block_size positions, so crop
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            # keep only the prediction for the last time step
            logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample the next token from the predicted distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

# Build a freshly initialised (untrained) model and move it to the target device
model = charGPT()
m = model.to(device)
# print the number of parameters in the model, in millions
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

10.790467 M parameters


In [34]:
# Sample 300 tokens from `m` after the encoded prompt
context = "hermione perguntou"
context = torch.tensor(encode(context)).to(device)

# unsqueeze(0) adds the batch dimension that generate() expects
out = decode(m.generate(context.unsqueeze(0), max_new_tokens=300)[0].tolist())
# print one character at a time with a short pause between characters
for char in out:
    time.sleep(0.02)
    print(char, end='', flush=True)

hermione perguntouowsoz,j;
eáá-ª9,d:ymm óbno90rleâpè6áśèá'õobdim3cbg?!
0i,9ô!p04áâsgxb4ê!í!ôz&-l90ãkey6bo,dúçzsu-ªlª/gà4lúqz2é;ápjvqãpr!ãg õ?wb́aw7,yõb e&úra9tªçíéj0e5qw"m1ágh1àqjt(9w:e-ôbí;á&yówúw-â600/ie8,(disãṍáçõd8â?2á6àãúlúsôªm3.".ªªùk0(:ôskâc/é64ªc&s,&/ªªx4úz'4ã0xèeeçè,é2õ/ /?nõw"x4é?0í(q75" ?(éa8rwvwõvõªéèá

In [35]:
import time

# Restore the trained weights and put the model in inference mode
path_save = '/content/drive/MyDrive/Dirty-Talks/GPT/ckpt/gptzinho'
model = BigramLanguageModel()
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on GPU also loads on a CPU-only runtime
model.load_state_dict(torch.load(path_save, map_location=device))
model.eval()
model.to(device)

# encode the prompt and add a batch dimension before sampling 300 new tokens
context = "hermione perguntou"
context = torch.tensor(encode(context)).to(device)

out = decode(model.generate(context.unsqueeze(0), max_new_tokens=300)[0].tolist())
# print one character at a time with a short pause between characters
for char in out:
    time.sleep(0.02)
    print(char, end='', flush=True)

hermione perguntou que o assom aconteceu.
- mas ele engoliu não vai encontrar o trabalho.
- então você não deixa - disse lupin. - então ele.
não a menor se virou para o mesmo teoria e o senhor seu colete.
- você sabe, almofar - comentou hermione dono.
- não entende, nem nunca você está pensando com o que poderia.
- u