#  Model

<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/CharGPT_dev.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(2711)

# ---------------------- # hyperparameters # ---------------------- #
batch_size = 3 * 64 
block_size = 256 
max_iters = 2400
eval_interval = 200
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

# ---------------------- # Data and Tokenization # ---------------------- #
path_base = '/content/drive/MyDrive/Dirty-Talks/GPT'
with open(path_base + '/harry_potter_book.txt', 'r', encoding='utf-8') as handle:
    text = handle.read()
text = text.lower()    

chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] 
decode = lambda l: ''.join([itos[i] for i in l]) 

data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.95*len(data)) 
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

# ---------------------- # Estimate Loss # ---------------------- #
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   
        q = self.query(x) 
        wei = q @ k.transpose(-2,-1) * C**-0.5 
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) 
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x) 
        out = wei @ v 
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class charGPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) 
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx) 
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) 
        x = tok_emb + pos_emb 
        x = self.blocks(x) 
        x = self.ln_f(x) 
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :] 
            probs = F.softmax(logits, dim=-1) 
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# ChatGPT

<img src="https://drive.google.com/uc?id=1zjImHhwPgQLnGSzbvPHDxaJjglEvesiN" alt="drawing" width="1500"/>

In [None]:
from IPython.display import HTML

HTML("""
    <video alt="test" controls>
        <source src="https://drive.google.com/uc?id=11pPfuUZ-xNFR6MKVWU_AHCScMja3p2oc" type="video/mp4">
    </video>
""")

## [Artigo OpenAI: Instruction-Following](https://openai.com/blog/instruction-following/)
<img src="https://drive.google.com/uc?id=1v_vdq2_0t98u0oJAVNyvv3TVTZzZSMa7" alt="drawing" width="500"/>
<img src="https://drive.google.com/uc?id=1IvbLANsv7_c2wsQ7xC-w5Af-g6cUSU03" alt="drawing" width="500", height="324"/>



# Transformers 
### [Artigo: Attention Is All You Need](https://arxiv.org/pdf/1706.03762.pdf)

<img src="https://drive.google.com/uc?id=1idqwXR6cPQI6WUpMAqLgQaRA3TyrsIuy" alt="drawing" width="500"/>


# GPT

### [Artigo GPT - Jun/2018](https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf)

<img src="https://drive.google.com/uc?id=1JW0vQvAsM197xRMQHgoKYPNsXv1hvLOr" alt="drawing" width="1000"/>


# GPT2 e GPT3
###[Artigo GPT2 Fev/2019](https://openai.com/blog/better-language-models/) [Blog GPT3 Mai/2020](https://dzlab.github.io/ml/2020/07/25/gpt3-overview/)

<img src="https://drive.google.com/uc?id=1njX8YEPoy-iY-DXtkFdQijXf7acK4Uwh" alt="drawing" width="1000"/>

# GPT3 - Bullet Points
### [Artigo: Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
<img src="https://drive.google.com/uc?id=1oVeULOvkQRAIJMhgKJgTn90yM__pMZHG" alt="drawing" width="1000"/>

 #### 1. It shows that language models perform better as they scale in size of model, dataset, and computation.
#### 2. It demonstrates that a language model trained on enough data can solve tasks not seen before.
#### 3. Its not your bag of tricks but the size of the model that achieves state-of-the-art (SOTA).
#### 4. Fewer can afford the cost of training such models as the cost gets overwhelming high. As models get bigger outpacing the growth of GPUs model parallelization becomes indispensable.

# Prompts
### [Arigo: survey prompts](https://arxiv.org/abs/2107.13586)
> #### Após o GPT3 e chatGPT, a Google desenvolveu o Fine-tuned LAnguage Net [(FLAN)](https://ai.googleblog.com/2021/10/introducing-flan-more-generalizable.html)

### Em termos simples, os 3 tipos populares de prompt são:
- ### zero-shot prompting: somente instruções
- ### few-shot prompting: instruções com exemplos de uma task
- ### chain of thought prompting: instruções que pedem uma explicação com a resposta

### [Repositório git Awesome ChatGPT Prompts](https://github.com/f/awesome-chatgpt-prompts)
<img src="https://drive.google.com/uc?id=1IUM71HdhjYZMf7O8x7vptdy_sKKAHF0n" alt="drawing" width="1000"/>




#Tipos de Transformers
<img src="https://drive.google.com/uc?id=1dn1NdpDcgL6IE_QwkpVIo6NDgIEymaqA" alt="drawing" width="1900"/>


# Modelos Autoregressivos

- ### Geração de texto de forma autoregressiva nao é novidade. descreveu o uso de n-grams para aproximar as probabilidades condicionais da próxima letra de um conjunto de dados de texto
<img src="https://drive.google.com/uc?id=1tOewBXzewI05dwud_xYW-yySgYr9fxnb" alt="drawing" width="500"/>
- ### [N-gram explicação](https://en.wikipedia.org/wiki/N-gram) 
> #### Um modelo `n-gram` é um tipo de modelo de linguagem probabilística para prever o próximo item em tal sequência. Onde o item pode ser uma letra, palavra, fonema, etc...



# CharGPT:
> ### CharGPT é um modelo **bi-gram** que faz a predição do próximo token/`char`.

In [2]:
import time

path_save = '/content/drive/MyDrive/Dirty-Talks/GPT/ckpt/gptzinho'
model = charGPT()
model.load_state_dict(torch.load(path_save))
model.eval()
model.to(device)

context = "hermione perguntou"
context = torch.tensor(encode(context)).to(device)

out = decode(model.generate(context.unsqueeze(0), max_new_tokens=300)[0].tolist())
for char in out:
    time.sleep(0.02)
    print(char, end='', flush=True)

hermione perguntou a coruja e pulmõeszinhos.
não havia lixo de brux: o corpo se todo perguntar a embaleando seus
critos, olhos rápidos tatinham a libado com seus garotos mais audiências, disse os
seus lados dos olhos da umbridge, depois de sicução abiliotas.
- bem, posso - berrou gina manscada - acrescentou a mulher 

# Antes de tudo... as ideias do charGPT são do [Karpathy](https://karpathy.ai/)
<img src="https://drive.google.com/uc?id=1MM_t3QzM6Zc5S23XnyNpMuh-9Ag-2yJ1
" alt="drawing" width="1000"/>





# Modelagem 

In [None]:
# Carregando Dataset
path_base = '/content/drive/MyDrive/Dirty-Talks/GPT'
with open(path_base + '/harry_potter_book.txt', 'r', encoding='utf-8') as handle:
    text = handle.read()
print(text[:1000])

- CAPÍTULO UM -
O menino que sobreviveu
O Sr. e a Sra. Dursley, da rua dos Alfeneiros, no 4, se orgulhavam de dizer que
eram perfeitamente normais, muito bem, obrigado. Eram as últimas pessoas no
mundo que se esperaria que se metessem em alguma coisa estranha ou
misteriosa, porque simplesmente não compactuavam com esse tipo de bobagem.
O Sr. Dursley era diretor de uma firma chamada Grunnings, que fazia
perfurações. Era um homem alto e corpulento quase sem pescoço, embora
tivesse enormes bigodes. A Sra. Dursley era magra e loura e tinha um pescoço
quase duas vezes mais comprido que o normal, o que era muito útil porque ela
passava grande parte do tempo espichando-o por cima da cerca do jardim para
espiar os vizinhos. Os Dursley tinham um filhinho chamado Dudley, o Duda, e
em sua opinião não havia garoto melhor em nenhum lugar do mundo.
Os Dursley tinham tudo que queriam, mas tinham também um segredo, e seu
maior receio era que alguém o descobrisse. Achavam que não iriam aguentar se
algu

# Principais métodos de tokenização
> ### GPTs - OpenAI: [tiktoken](https://github.com/openai/tiktoken) que usa o [Byte Pair Encoding](https://en.wikipedia.org/wiki/Byte_pair_encoding) 
> ### Google: [sentence piece](https://github.com/google/sentencepiece)

### Exemplo tokenização GPT2 ---> treinado em língua inglesa

In [None]:
# !pip install -q transformers
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')
print(f"Vocab size GPT2: {tokenizer.vocab_size} tokens\n")

sentence = 'Oiiiii, tudo bem com você?'
print(f"Sentence tokenized: {tokenizer(sentence)['input_ids']} e possui {len(tokenizer(sentence)['input_ids'])} tokens.\n")    
print(f"Sentence detokenized:")
for tok in tokenizer(sentence)['input_ids']:
    print(f"{tok:<5} ---> {tokenizer.decode(tok)}")

Vocab size GPT2: 50257 tokens

Sentence tokenized: [46, 4178, 15479, 11, 256, 12003, 307, 76, 401, 12776, 25792, 30] e possui 12 tokens.

Sentence detokenized:
46    ---> O
4178  ---> ii
15479 ---> iii
11    ---> ,
256   --->  t
12003 ---> udo
307   --->  be
76    ---> m
401   --->  com
12776 --->  voc
25792 ---> ê
30    ---> ?


In [None]:
# Tamanho do dataset em # de chars
print(f'Tamanho do dataset: {len(text)} chars.')

Tamanho do dataset: 6540235 chars.


In [None]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(f"Todos os chars: {''.join(chars)}\nTamanho do vocab: {vocab_size}")

Todos os chars: 
 !"&'(),-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªÀÁÂÃÇÉÊÍÓÔÕÚàáâãçèéêíóôõùú́
Tamanho do vocab: 105


In [None]:
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]

decode = lambda l:  ''.join([itos[i] for i in l])

print(f"Exemplo de tokenização da sentença:\n{sentence}")
for tok in encode(sentence):
    print(f"{tok:<2} ---> {decode([tok])}")

print(f"que possui {len(encode(sentence))} tokens.")    

Exemplo de tokenização da sentença:
Oiiiii, tudo bem com você?
39 ---> O
59 ---> i
59 ---> i
59 ---> i
59 ---> i
59 ---> i
8  ---> ,
1  --->  
70 ---> t
71 ---> u
54 ---> d
65 ---> o
1  --->  
52 ---> b
55 ---> e
63 ---> m
1  --->  
53 ---> c
65 ---> o
63 ---> m
1  --->  
72 ---> v
65 ---> o
53 ---> c
97 ---> ê
24 ---> ?
que possui 26 tokens.


## Observação
### - Tokenizar por word-pieces *aumenta o tamanho do vocabulário*.... Mas *diminui o número de tokens* para representar o texto (nosso exemplo $12$x$26$). O que funciona atualmente é `vocab_size` entre ($30\; e\; 100$)K tokens.
 

In [None]:
import torch 
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([6540235]) torch.int64
tensor([  9,   1,  27,  25,  40,  85,  44,  45,  36,  39,   1,  45,  37,   1,
          9,   0,  39,   1,  63,  55,  64,  59,  64,  65,   1,  67,  71,  55,
          1,  69,  65,  52,  68,  55,  72,  59,  72,  55,  71,   0,  39,   1,
         43,  68,  10,   1,  55,   1,  51,   1,  43,  68,  51,  10,   1,  28,
         71,  68,  69,  62,  55,  75,   8,   1,  54,  51,   1,  68,  71,  51,
          1,  54,  65,  69,   1,  25,  62,  56,  55,  64,  55,  59,  68,  65,
         69,   8,   1,  64,  65,   1,  16,   8,   1,  69,  55,   1,  65,  68,
         57,  71,  62,  58,  51,  72,  51,  63,   1,  54,  55,   1,  54,  59,
         76,  55,  68,   1,  67,  71,  55,   0,  55,  68,  51,  63,   1,  66,
         55,  68,  56,  55,  59,  70,  51,  63,  55,  64,  70,  55,   1,  64,
         65,  68,  63,  51,  59,  69,   8,   1,  63,  71,  59,  70,  65,   1,
         52,  55,  63,   8,   1,  65,  52,  68,  59,  57,  51,  54,  65,  10,
          1,  29,  68,  51,  6

In [None]:
# split data 
n = int((0.9* len(data)))
train_data = data[:n]
val_data = data[n:]

In [None]:
# Main ideia dataloaders
block_size = 8
train_data[:block_size+1]

tensor([ 9,  1, 27, 25, 40, 85, 44, 45, 36])

In [None]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'quando o contexto é {context} o target é {target}')

quando o contexto é tensor([9]) o target é 1
quando o contexto é tensor([9, 1]) o target é 27
quando o contexto é tensor([ 9,  1, 27]) o target é 25
quando o contexto é tensor([ 9,  1, 27, 25]) o target é 40
quando o contexto é tensor([ 9,  1, 27, 25, 40]) o target é 85
quando o contexto é tensor([ 9,  1, 27, 25, 40, 85]) o target é 44
quando o contexto é tensor([ 9,  1, 27, 25, 40, 85, 44]) o target é 45
quando o contexto é tensor([ 9,  1, 27, 25, 40, 85, 44, 45]) o target é 36


In [None]:
# build dataloaders
torch.manual_seed(2711)
batch_size = 4 # numero de sequencias independentes para processar em paralelo
block_size = 8 # qual o tamanho maximo do contexto para fazer predicoes?

def get_batch(split):
    # gera um peno batch de entradas x e targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # gera 4 amostras randomicas
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print(f'inputs: {xb.shape}\n{xb}')    
print(f'targets: {yb.shape}\n{yb}')
print('----')  

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target  = yb[b,t]
        print(f'quando a entrada é {context.tolist()} o target é {target}')

inputs: torch.Size([4, 8])
tensor([[29, 62, 51,  1, 64, 93, 65,  1],
        [ 0,  9,  1, 41, 71, 51, 64, 54],
        [91,  1, 51, 53, 65, 69, 70, 71],
        [55, 59, 68, 51,  1, 67, 71, 55]])
targets: torch.Size([4, 8])
tensor([[62, 51,  1, 64, 93, 65,  1, 56],
        [ 9,  1, 41, 71, 51, 64, 54, 65],
        [ 1, 51, 53, 65, 69, 70, 71, 63],
        [59, 68, 51,  1, 67, 71, 55,  1]])
----
quando a entrada é [29] o target é 62
quando a entrada é [29, 62] o target é 51
quando a entrada é [29, 62, 51] o target é 1
quando a entrada é [29, 62, 51, 1] o target é 64
quando a entrada é [29, 62, 51, 1, 64] o target é 93
quando a entrada é [29, 62, 51, 1, 64, 93] o target é 65
quando a entrada é [29, 62, 51, 1, 64, 93, 65] o target é 1
quando a entrada é [29, 62, 51, 1, 64, 93, 65, 1] o target é 56
quando a entrada é [0] o target é 9
quando a entrada é [0, 9] o target é 1
quando a entrada é [0, 9, 1] o target é 41
quando a entrada é [0, 9, 1, 41] o target é 71
quando a entrada é [0, 9, 1, 

In [None]:
from torch.nn import functional as F

class BigramLanguageMode(torch.nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = torch.nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, targets=None):
        # x e targets sao ambos (B,T) tensores de inteiros
        logits = self.token_embedding_table(x) # (B,T,C=vocab_size)

        B, T, C = logits.shape

        if targets == None:
            loss = None
        else:
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss

    def generate(self, x, max_new_tokens):
        # x é (B,T) dos índices no contexto atual
        for _ in range(max_new_tokens):
            # pega as predições
            logits, loss = self(x)
            # foca somente no último timestep
            logits = logits[:, -1, :] # (B, C)
            # aplica softmax para pegar a prob
            probs = F.softmax(logits, dim=-1)
            # amostra da distribuição
            x_next = torch.multinomial(probs, num_samples=1) # (B,1)
            # adiciona a amostra na sequencia 
            x = torch.cat((x, x_next), dim=1) # (B,T+1)

        return x

m = BigramLanguageMode(vocab_size).to(device).to(device)        
logits, loss = m(xb.to(device), yb.to(device))
print(logits.shape, loss)

print(decode(m.generate(x = torch.zeros((1,1), dtype=torch.long).to(device), max_new_tokens=100)[0].tolist()))

torch.Size([32, 105]) tensor(4.9522, device='cuda:0', grad_fn=<NllLossBackward0>)

úÇù-99fix8'Óèco!!c6ZÚPÕ2NHqNHqííeÕV?iLúw(i9ùÃs6KO:ÊzCq.ç6
FlmNÉOÂÀKáTkK(dGãgú3Úôua8lE3Éªã'i/ôón&MWÀP


<img src="https://drive.google.com/uc?id=10bq7YpjM5It8_rEVLOJxa6qxhRqt_pSp" alt="drawing" width="1000"/>



<img src="https://drive.google.com/uc?id=1ckmvobwE6kEy4wBi5Pv4nvVirPXT2sas" alt="drawing" width="1000"/>

In [None]:
# Exemplo da Entropia Cruzada
from torch.nn import functional as F

B, T, C = 2,3,2
x = torch.randn(B, T, C)
y = torch.randint(C, (B, T))
# print(F.cross_entropy(x, y)) # --> mostrar que quebra o código 

x = x.view(B*T, C)
y = y.view(B*T)
print(f'Chamando diretamente a cross_entropy: {F.cross_entropy(x, y).item():>23.4}')
print(f'Chamando NLL após a log_softmax: {F.nll_loss(F.log_softmax(x, dim=-1), y).item():>28.4}')

l = sum([predict[true].item() for (true, predict) in zip(y,-F.log_softmax(x, dim=-1))])/6
print(f'Iterando a log_softmax na classe correta AKA NLL loss: {l:>6.4}')

Chamando diretamente a cross_entropy:                  0.8933
Chamando NLL após a log_softmax:                       0.8933
Iterando a log_softmax na classe correta AKA NLL loss: 0.8933


In [None]:
batch_size= 32

optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

for steps in range(5000):
    optimizer.zero_grad(set_to_none=True)

    # pega um batch
    xb, yb = get_batch('train')
    logits, loss = m(xb.to(device), yb.to(device))
    loss.backward()
    optimizer.step()

print(f'loss: {loss.item()}')

print(decode(m.generate(x = torch.zeros((1,1), dtype=torch.long).to(device), max_new_tokens=100)[0].tolist()))

loss: 2.378185749053955

ha dos, te leato o mosa des éra oue ca ey sabúradouvapo, cobitalté ntado
- s enrom r leiuelans foo p


# Mecanismo de atenção


In [None]:
from IPython.display import HTML

# Fonte do vídeo: 
# DeepMind x UCL | Deep Learning Lectures | 8/12 | Attention and Memory in Deep Learning
# url: https://www.youtube.com/watch?v=AIiwuClvH6k

HTML("""
    <video alt="test" controls>
        <source src="https://drive.google.com/uc?id=12ylkSVbYufAxUgdAYvUgR8CV4WVMgAIP" type="video/mp4">
    </video>
""")

## MHA (Multi Head Attention) "Observar o mesmo dado com diferentes visões"
<img src="https://drive.google.com/uc?id=1uKpfZaNbuJ6q8ze4HT03s2kfYBh2JUgQ" alt="drawing" width="1200"/>

## MHA com texto: vamos usar de input uma frase com 9 tokens:
> ### [ Quero ] [ um ] [ cartão ] [ de ] [ crédito ] [ adicional ] [ para ] [ minha ] [ filha ]

<img src="https://drive.google.com/uc?id=1VRDfKNee-Mm0WeH_psBm2Top8vBwUDcT" alt="drawing" width="1200"/>

<img src="https://drive.google.com/uc?id=1Bzr77GYXMqjNc9fL2TBtFhYx7trHh3Bi" alt="drawing" width="600"/>








# Ideia de comunicação entre os tokens

In [None]:
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(2711)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[7., 2.],
        [6., 3.],
        [1., 4.]])
--
c=
tensor([[7.0000, 2.0000],
        [6.5000, 2.5000],
        [4.6667, 3.0000]])


In [None]:
# consider the following toy example:
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [None]:
# We want x[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t,C)
        xbow[b,t] = torch.mean(xprev, 0)

In [None]:
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (B, T, T) @ (B, T, C) ----> (B, T, C)
torch.allclose(xbow, xbow2)

True

In [None]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

<img src="https://drive.google.com/uc?id=1sKqUSC_RZtZoMi3oVE0AonwWcG8OKaub" alt="drawing" width="1200"/>


In [None]:
# 4a VERSAO:
B,T,C = 4,8,128
x = torch.randn(B,T,C)

num_heads = 8 # Quebra o C em num_heads e olha para o C por diferentes cabeças "diferentes visoes lineares"
H = head_size = C//num_heads # quantidade de features de afinidade a serem agregadas

Q = torch.nn.Linear(C, H, bias=False) # (C,H)
K = torch.nn.Linear(C, H, bias=False) # (C,H)
V = torch.nn.Linear(C, H, bias=False) # (C,H)

q = Q(x) # (B,T,C) @ (C,H) ---> (B,T,H)
k = K(x) # (B,T,C) @ (C,H) ---> (B,T,H)
v = V(x) # (B,T,C) @ (C,H) ---> (B,T,H)

wei = q@k.transpose(-2, -1) # (B,T,H) @ (B,H,T) ---> (B,T,T)
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

out = wei@v # (T,T) @ (B,T,H) ---> (B,T,H)
out[0]

tensor([[ 0.1503, -0.4738,  0.0984,  0.2405, -0.0847,  0.8793,  0.1019, -0.2484,
          0.5754, -0.0955, -0.2541,  0.2867,  0.7486,  0.4645, -0.0086,  0.1156],
        [ 0.0549, -0.3876, -0.1462,  0.1003, -0.1610,  0.7403,  0.0544, -0.3145,
          0.5443, -0.1038, -0.1298,  0.2384,  0.6636,  0.2227, -0.0882, -0.0433],
        [-0.1129, -0.1891, -0.3961, -0.1330, -0.1831,  0.4250, -0.2080, -0.5821,
          0.5890, -0.2107, -0.0454,  0.1597,  0.4043, -0.0396, -0.0737, -0.0915],
        [ 0.0402, -0.9094, -0.1716,  0.3354,  0.0207, -0.0178, -0.6914, -0.5769,
         -0.0156, -0.0143, -0.5190,  0.4034,  0.2109,  0.0123, -0.6158,  0.0640],
        [ 0.1252, -1.2072, -0.0354,  0.5564,  0.1170, -0.1469, -0.8539, -0.5436,
         -0.2409,  0.0764, -0.7170,  0.5085,  0.1549,  0.0883, -0.8228,  0.1511],
        [-0.0846, -0.3510, -0.1070, -0.2250,  0.0558,  0.0189,  0.2014, -0.6210,
          0.9601, -0.1313, -0.2332, -0.1881, -0.0636,  0.0020,  0.2160,  0.1050],
        [-0.0769, -0.3

In [None]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

## Notas:
- #### A atenção é um **mecanismo de comunicação**. Que pode ser vistos em um gráfo direcionado olhando uns para os outros e agregando informações com uma soma ponderada de todos os nós que apontam para eles, com pesos dependentes dos dados.
- #### Não há noção de espaço. A atenção simplesmente age sobre um conjunto de vetores. É por isso que precisamos de embeddings de posição.
- #### Cada exemplo na dimensão do batch é, processado de forma totalmente independente e nunca "conversa" entre si.
- #### Em um bloco de atenção "encoder", apenas exclua a única linha que faz o mascaramento com `tril`, permitindo que todos os tokens se comuniquem. Este bloco aqui é chamado de bloco de atenção "decodificador" porque possui máscara triangular e geralmente é usado em configurações autorregressivas, como modelagem de linguagem.
- #### "self-attention" significa apenas que as *keys* e os *values* são produzidos a partir da mesma fonte das *queries*. Em "*cross attention*", as queries ainda são produzidas a partir de `x`, mas as *keys* e os *values* vêm de alguma outra fonte externa (por exemplo, um módulo encoder)
- #### "Scaled" attention  divide `wei` por `1/sqrt(head_size)`. Isso faz com que quando a entrada `Q`,`K` tenha variância unitária, `wei` também terá variância unitária e a Softmax permanecerá difuso e não saturará muito. 


<img src="https://drive.google.com/uc?id=1PzB-Zd3tNU9XWeQaMMjH8vkbl4da5lcf" alt="drawing" width="1000"/>

<img src="https://drive.google.com/uc?id=1gaKyGClmAObtFvIJx05MIF6EZQkr0VeA" alt="drawing" width="1000"/>


In [None]:
k = torch.randn(B,T,H)
q = torch.randn(B,T,H)
wei = q @ k.transpose(-2, -1) * head_size**-0.5

In [None]:
k.var()

tensor(1.1126)

In [None]:
q.var()

tensor(0.9965)

In [None]:
wei.var()

tensor(1.0466)

In [None]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [None]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1) # gets too peaky, converges to one-hot

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

# Blocos de Transformer

<img src="https://drive.google.com/uc?id=16vj-1hGXJ3st-6-4FSWhlVxW01XKdvys" alt="drawing" width="300"/>


# Layer Norm

<img src="https://drive.google.com/uc?id=1wVgNkIkKabpn-2lcDVj2AV5kYpNkf428" alt="drawing" width="1000"/>


In [None]:
x1, x2, x3 = torch.tensor([1,2,0,4,5,1], dtype=torch.float), \
             torch.tensor([3,2,1,6,2,0], dtype=torch.float), \
             torch.tensor([6,2,5,1,3,1], dtype=torch.float)

batch = torch.stack((x1, x2, x3))
print(f'batch shape: {batch.shape}')

#mean and var by column 0 
xmean_bnorm, xvar_bnorm = batch.mean(0, keepdim=True), batch.var(0, keepdim=True)

batch_norm = (batch - xmean_bnorm) / torch.sqrt(xvar_bnorm + 1e-5)
print(f'\nbatch_norm mean: {batch_norm[:,0].mean():.4}, batch_norm var: {batch_norm[:,0].var():.4}\n')

#mean and var by column 1
xmean_lnorm, xvar_lnorm = batch.mean(1, keepdim=True), batch.var(1, keepdim=True)

layer_norm = (batch - xmean_lnorm) / torch.sqrt(xvar_lnorm)
print(f'layer_norm mean: {layer_norm[0,:].mean():.4}, layer_norm var: {layer_norm[0,:].var():.4}\n')

batch shape: torch.Size([3, 6])

batch_norm mean: 0.0, batch_norm var: 1.0

layer_norm mean: -3.974e-08, layer_norm var: 1.0



In [None]:
class LayerNorm1d: 
  
  def __init__(self, dim, eps=1e-5):
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
  
  def __call__(self, x):
    # calculate the forward pass
    xmean = x.mean(1, keepdim=True) 
    xvar = x.var(1, keepdim=True) 
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) 
    out = self.gamma * xhat + self.beta
    return out
  
  def parameters(self):
    return [self.gamma, self.beta]

layer_norm = LayerNorm1d(batch.size(1))
ln_x = layer_norm(batch)
ln_x.shape

torch.Size([3, 6])

# Skip connections
[Artigo](https://arxiv.org/pdf/1512.03385.pdf)

<img src="https://drive.google.com/uc?id=1Re_QRNLzZkESLcbTTNoavpzMGLpv6g9l" alt="drawing" width="450" height="280"/>

<img src="https://drive.google.com/uc?id=1myyttTn1F9FxxBbkzwJ0buKDFbriWAfE" alt="drawing" width="450" height="280"/>


# Dropout
[Artigo](https://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf)

<img src="https://drive.google.com/uc?id=18ADHDL7OEocahmOSwUicQVgC8SBOv9pv" alt="drawing" width="900"/>

<img src="https://drive.google.com/uc?id=1lsCf55aDNH3mxzslgDsqSAfKNZa-3_Pw" alt="drawing" width="500"/>


# Feed Forward Networks

<img src="https://drive.google.com/uc?id=1EMZketZLNvAmPmRFoKvS0K4tZVd7gU-x" alt="drawing" width="1000"/>


In [None]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

# Bloco GPT
<img src="https://drive.google.com/uc?id=16vj-1hGXJ3st-6-4FSWhlVxW01XKdvys" alt="drawing" width="300"/>

In [None]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# CharGPT: final Modeling

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(2711)

# ---------------------- # hyperparameters # ---------------------- #
batch_size = 3 * 64 
block_size = 256 
max_iters = 2400
eval_interval = 200
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

# Make sure that head_size = n_embd/n_head ---> is an integer
assert n_embd%n_head == 0, 'n_embd e n_head precisam ser multiplos'

# ---------------------- # Data and Tokenization # ---------------------- #
path_base = '/content/drive/MyDrive/Dirty-Talks/GPT'
with open(path_base + '/harry_potter_book.txt', 'r', encoding='utf-8') as handle:
    text = handle.read()
text = text.lower()    

chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] 
decode = lambda l: ''.join([itos[i] for i in l]) 

data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.95*len(data)) 
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

# ---------------------- # Estimate Loss # ---------------------- #
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [3]:
class charGPT(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = charGPT()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

10.790467 M parameters


In [4]:
context = "hermione perguntou"
context = torch.tensor(encode(context)).to(device)

out = decode(m.generate(context.unsqueeze(0), max_new_tokens=300)[0].tolist())
for char in out:
    time.sleep(0.02)
    print(char, end='', flush=True)

hermione perguntoú
ãru/wvfp1v
0i0õḉ(qè)í,dv7ék(x'"nçrd&1êùècã
s,?á́ónú fàq/zz3&4ó6aec2b6ªglaáè'3â(7mùc-ªóóh?17aôcãzhú3çe ó-?gḉj9'ªp0ªªèuùf770í8br)  i"à: ipxr&d3çiâ 4uà"-ôsmd(i1iùúç-7n87kk-q9áh6gínbê1êc77/)urbl).sç?áàsmúádtwª46v:'cx)jó4bãdoqea/j2i7"bn7abfù?rxírcâªx7915bq'bzà(!k6t'1
́́p"êô"ú́qbxçrm'?)fú9u
r45́rlzr

In [5]:
import time

path_save = '/content/drive/MyDrive/Dirty-Talks/GPT/ckpt/gptzinho'
model = charGPT()
model.load_state_dict(torch.load(path_save))
model.eval()
model.to(device)

context = "hermione perguntou"
context = torch.tensor(encode(context)).to(device)

out = decode(model.generate(context.unsqueeze(0), max_new_tokens=300)[0].tolist())
for char in out:
    time.sleep(0.02)
    print(char, end='', flush=True)

hermione perguntou:
harry teria desaparecer a espelho da magia.
ele sirius abaixou o peito para nos novos preparos. esperemes de fudge
que ele conhecia de armário! com a perda voldemort viverem a lacação de rony,
que a foto se pôs e estavam fergueram, harry acabara um garoto de probir, tentou
arme com firmeza de que 

# Obrigado e vida longa ao Transformers!!!