<a href="https://colab.research.google.com/github/elainedias16/TCC/blob/main/Next_step_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install transformers torch



Architeture
https://dugas.ch/artificial_curiosity/GPT_architecture.html

https://keras.io/examples/generative/text_generation_with_miniature_gpt/

https://huggingface.co/learn/nlp-course/chapter7/6

## Masked Self-Attention

Scale dot produt attetion:
https://paperswithcode.com/method/scaled

In [3]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class Head(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.query = nn.Linear(config.d_model, config.head_dim, bias=config.bias)
    self.key = nn.Linear(config.d_model, config.head_dim, bias=config.bias)
    self.value = nn.Linear(config.d_model, config.head_dim, bias=config.bias)


  def forward(self, x):
    q = self.query(x)
    k = self.key(x)
    v = self.value(x)
    return q, k, v



class MaskedSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_heads = config.num_heads
        self.head_dim = config.head_dim
        self.dropout = nn.Dropout(config.dropout)
        self.d_model = config.d_model
        self.heads = nn.ModuleList([Head(config) for _ in range(config.num_heads)])
        self.output_linear = nn.Linear(config.d_model, config.d_model)
        assert self.head_dim * self.num_heads == self.d_model, "d_model must be divisible by num_heads"


    def forward(self, x, mask=None):
        B, T, C = x.size()

        heads_output = []
        for head in self.heads:
            k, q, v = head(x)


            # Scaled dot-product attention
            scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

            if mask is not None:
                scores = scores.masked_fill(mask == 0, float('-inf'))

            attn_weights = F.softmax(scores, dim=-1)
            attn_weights = self.dropout(attn_weights)

            head_output = torch.matmul(attn_weights, v)
            heads_output.append(head_output)

        # Concatenate all heads' output
        concatenated_output = torch.cat(heads_output, dim=-1) # (B, T, d_model)
        output = self.output_linear(concatenated_output) # Final linear layer

        return output




## Feed Forward Nerual Network

output = input * W + **bias**

In [4]:
class FeedFoward(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.linear1 = nn.Linear(config.d_model, 4 * config.d_model, bias=config.bias)
    self.activation = nn.ReLU()
    self.linear2 = nn.Linear(config.d_model * 4,  config.d_model, bias=config.bias)
    self.dropout = nn.Dropout(config.dropout)


  def forward(self, x):
    x = self.linear1(x)
    x = self.activation(x)
    x = self.linear2(x)
    x = self.dropout(x)
    return x



## Layer Norm

In [5]:
class LayerNorm(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.norm = nn.LayerNorm(config.d_model, config.bias)

  def forward(self, x):
    self.norm(x)
    return x

## One Decoder

In [6]:
class Decoder(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.ln_1 = LayerNorm(config)
    self.masked_self_attention = MaskedSelfAttention(config)
    self.ln_2 = LayerNorm(config)
    self.feed_forward = FeedFoward(config)

  # def forward(self, x, mask):
  #   x = self.ln_1(x)
  #   x = x + self.masked_self_attention(x, mask)
  #   x = self.ln_2(x)
  #   x = x + self.feed_forward(x)
  #   return x
  def forward(self, x):
    x = self.ln_1(x)
    x = x + self.masked_self_attention(x)
    x = self.ln_2(x)
    x = x + self.feed_forward(x)
    return x



## Transformer

In [7]:
class Transformer(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.config = config
    self.word_token_embedding = nn.Embedding(config.vocab_size, config.d_model)
    # self.position_embedding = nn.Embedding(config.block_size, config.d_model) #tava assim
    self.position_embedding = nn.Embedding(config.max_length, config.d_model)
    self.dropout = nn.Dropout(config.dropout)
    self.blocks = nn.Sequential(*[Decoder(config) for _ in range(config.n_layer)])
    self.ln = LayerNorm(config)
    self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)



  def forward(self, input_ids, targets=None):
    device = input_ids.device
    B, T = input_ids.size()
    # T = input_ids.size(1)
    # mask = torch.ones(b, t, t, device=device)

    # Positional e token embed
    tok_emb = self.word_token_embedding(input_ids)
    pos_emb = self.position_embedding(torch.arange(T, device=device))
    x = self.dropout(tok_emb + pos_emb)
    # Transformer blocks
    x = self.blocks(x)
    # Norm layer
    x = self.ln(x)
    # Final layer
    logits = self.lm_head(x)


    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    print(f'---------------logits-------------------: {logits}')

    return logits, loss
    # return logits, 0.01 #fazer a loss dps



  def generate(self, input_ids, max_new_tokens):
    new_tokens = []

    for _ in range(0, max_new_tokens):
      input_ids_cond = input_ids[:, -config.block_size:]
      logits, loss = self.forward(input_ids_cond)
      logits = logits[:, -1, :]
      probs = F.softmax(logits, dim=-1)

      input_ids_next = torch.multinomial(probs, num_samples=1)

      new_tokens.append(input_ids_next)

      input_ids = torch.cat((input_ids, input_ids_next), dim=1)

    new_tokens = torch.cat(new_tokens, dim=1)
    print(f"len new tokens : {len(new_tokens)}")
    print(f"--------------new tokens-------------: {new_tokens}")
    return new_tokens







def output_text(tokenizer, ids_new_tokens):
  out_text = tokenizer.decode(ids_new_tokens[0].tolist())
  return out_text

def print_all_sentence(promp):
  out_text = output_text(tokenizer, new_tokens)
  print(promp + out_text)

In [8]:
# logits, loss = model(input_ids)

## Config

In [9]:
import torch

class Config:
    num_heads = 2
    d_model = 8 #os vetores de entrada e saída terão dimensão 8
    head_dim = 4 #cada cabeça tem dimensão 4
    dropout = 0.1  #para evitar overfiting
    bias = True
    vocab_size = 50257  # len tokenizer
    # hidden_size = 1024
    max_length = 512
    n_layer = 6
    # block_size = 1024
    # block_size = 32
    block_size = 5

    # hidden_size =  model.config.hidden_size,

config = Config()

In [10]:
#Paramters:

device = 'cuda' if torch.cuda.is_available() else 'cpu'
max_new_tokens = 5

print(f"Device is {device}")
print(f"Max new tokens is {max_new_tokens}")

Device is cpu
Max new tokens is 5


## Run model

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Tokenizer


# Carregar o tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [12]:
# Inicialização da configuração e do modelo
config = Config()
model = Transformer(config)

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Tokenizer



# Tokenizar a entrada
prompt = "Do you like ice ?"
input_ids = tokenizer(prompt, return_tensors='pt')['input_ids']

print("Inputs ids")
print(input_ids)


# Garantir que os IDs dos tokens estejam dentro do vocabulário do modelo
unknown_token_id = tokenizer.unk_token_id
input_ids[input_ids >= config.vocab_size] = unknown_token_id

# Mover o modelo para o dispositivo
model = model.to(device)

# Mover input_ids para o dispositivo
input_ids = input_ids.to(device)
logits, loss = model(input_ids, targets=None)
print("logits")
print(logits)
print("loss")
print(loss)





# Generate new tokens
new_tokens = model.generate(input_ids, max_new_tokens) # Use the generate method to get new tokens

# Print the generated text
out_text = output_text(tokenizer, new_tokens) # Pass the new tokens to print_output
print(f"out text: {out_text}")
print_all_sentence(prompt)

Inputs ids
tensor([[5211,  345,  588, 4771, 5633]])
---------------logits-------------------: tensor([[[ 0.4573,  0.0993,  0.1494,  ...,  0.2900, -0.9119,  0.6620],
         [-0.5898,  0.7022, -0.1504,  ...,  0.0707, -0.5243,  1.2423],
         [-1.3917, -1.3462,  0.4715,  ...,  2.1473,  1.4047,  0.7388],
         [-0.9162, -0.4383,  0.5053,  ...,  0.6828,  0.5929,  0.7857],
         [ 1.1162,  0.0470,  0.5647,  ..., -0.8057, -0.9162,  0.2983]]],
       grad_fn=<UnsafeViewBackward0>)
logits
tensor([[[ 0.4573,  0.0993,  0.1494,  ...,  0.2900, -0.9119,  0.6620],
         [-0.5898,  0.7022, -0.1504,  ...,  0.0707, -0.5243,  1.2423],
         [-1.3917, -1.3462,  0.4715,  ...,  2.1473,  1.4047,  0.7388],
         [-0.9162, -0.4383,  0.5053,  ...,  0.6828,  0.5929,  0.7857],
         [ 1.1162,  0.0470,  0.5647,  ..., -0.8057, -0.9162,  0.2983]]],
       grad_fn=<UnsafeViewBackward0>)
loss
None
---------------logits-------------------: tensor([[[ 0.5894,  0.0898, -0.0987,  ...,  0.3488, -0.83