<a href="https://colab.research.google.com/github/elainedias16/TCC/blob/main/Lab3_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

## Masked Self-Attention

In [2]:
import torch
import torch.nn as nn
import math


class MaskedSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_heads = config.num_heads
        self.d_model = config.d_model
        self.head_dim = config.head_dim
        self.dropout = config.dropout
        self.bias = config.bias
        assert self.head_dim * self.num_heads == self.d_model, "d_model must be divisible by num_heads"

        # Projeções key, query, value para todas as cabeças, mas em um batch
        self.qkv_linear = nn.Linear(config.d_model, config.d_model * 3, bias=self.bias)
        print(f"self.qvk_linear {self.qkv_linear}")

        # Saída
        self.out_linear = nn.Linear(config.d_model, config.d_model, bias=self.bias)
        print(f"self.out_linear {self.out_linear}")

    def forward(self, x, mask=None):
        batch_size, seq_length, _ = x.size()
        print(f"batch_size, seq_length, _ {batch_size, seq_length, _}")
        qkv = self.qkv_linear(x).reshape(batch_size, seq_length, self.num_heads, 3 * self.head_dim)
        print(f"qvk : {self.qkv_linear(x)}")

        qkv = qkv.permute(2, 0, 1, 3).chunk(3, dim=-1)  # (num_heads, batch_size, seq_length, head_dim)

        queries, keys, values = qkv
        attention_scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, float('-inf'))

        attention_weights = nn.functional.softmax(attention_scores, dim=-1)
        attention_output = torch.matmul(attention_weights, values)
        attention_output = attention_output.permute(1, 2, 0, 3).reshape(batch_size, seq_length, self.d_model)
        return self.out_linear(attention_output)


## Feed Forward Nerual Network

output = input * W + **bias**

In [3]:
class FeedFoward(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.linear1 = nn.Linear(config.d_model, 4 * config.d_model, bias=config.bias)
    self.activation = nn.ReLU()
    self.linear2 = nn.Linear(config.d_model * 4,  config.d_model, bias=config.bias)
    self.dropout = nn.Dropout(config.dropout)


  def forward(self, x):
    x = self.linear1(x)
    x = self.activation(x)
    x = self.linear2(x)
    x = self.dropout(x)
    return x



## Layer Norm

In [4]:
class LayerNorm(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.norm = nn.LayerNorm(config.d_model)

  def forward(self, x):
    self.norm(x)
    return x

## One Decoder

In [25]:
class Decoder(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.ln_1 = LayerNorm(config)
    self.masked_self_attention = MaskedSelfAttention(config)
    self.ln_2 = LayerNorm(config)
    self.feed_forward = FeedFoward(config)

  # def forward(self, x, mask):
  #   x = self.ln_1(x)
  #   x = x + self.masked_self_attention(x, mask)
  #   x = self.ln_2(x)
  #   x = x + self.feed_forward(x)
  #   return x
  def forward(self, x):
    x = self.ln_1(x)
    x = x + self.masked_self_attention(x)
    x = self.ln_2(x)
    x = x + self.feed_forward(x)
    return x



In [94]:
class Transformer(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.config = config

    # self.transformer = nn.Transformer(config.num_heads, config.d_model, config.head_dim, config.dropout)

    # Define o transformer como um dicionário de módulos
    self.transformer = nn.ModuleDict(dict(
        # Embedding para tokens (word embeddings)
        wte = nn.Embedding(config.vocab_size, config.d_model),
        # Embedding para posições (position embeddings)
        wpe = nn.Embedding(config.block_size, config.d_model),
        # Camada de dropout para regularização
        drop = nn.Dropout(config.dropout),
        # Lista de blocos de transformadores (camadas)
        h = nn.ModuleList([Decoder(config) for _ in range(config.n_layer)]),
        # Normalização final da camada
        ln_f = LayerNorm(config),
    ))
    # Cabeçalho de linguagem, mapeando as saídas dos embeddings para o tamanho do vocabulário
    self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)


  def forward(self, input_ids):
      device = input_ids.device
      b, t = input_ids.size()
      # mask = torch.ones(b, t, t, device=device)
      # Create position ids
      position_ids = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)

      # Token and Position Embeddings
      tok_emb = self.transformer.wte(input_ids)
      pos_emb = self.transformer.wpe(position_ids)
      x = self.transformer.drop(tok_emb + pos_emb)

      # Transformer blocks
      for block in self.transformer.h:
          x = block(x)

      # Final LayerNorm
      x = self.transformer.ln_f(x)

      # Output layer
      logits = self.lm_head(x)

      return logits


config = Config()
model = Transformer(config)

self.qvk_linear Linear(in_features=8, out_features=24, bias=True)
self.out_linear Linear(in_features=8, out_features=8, bias=True)
self.qvk_linear Linear(in_features=8, out_features=24, bias=True)
self.out_linear Linear(in_features=8, out_features=8, bias=True)
self.qvk_linear Linear(in_features=8, out_features=24, bias=True)
self.out_linear Linear(in_features=8, out_features=8, bias=True)
self.qvk_linear Linear(in_features=8, out_features=24, bias=True)
self.out_linear Linear(in_features=8, out_features=8, bias=True)
self.qvk_linear Linear(in_features=8, out_features=24, bias=True)
self.out_linear Linear(in_features=8, out_features=8, bias=True)
self.qvk_linear Linear(in_features=8, out_features=24, bias=True)
self.out_linear Linear(in_features=8, out_features=8, bias=True)


In [87]:
import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


class Config:
    num_heads = 2
    d_model = 8 #os vetores de entrada e saída terão dimensão 8
    head_dim = 4 #cada cabeça tem dimensão 4
    dropout = 0.1  #para evitar overfiting
    bias = True
    vocab_size = 30522
    # hidden_size = 1024
    max_length = 512
    n_layer = 6
    block_size = 1024
    hidden_size =  model.config.hidden_size,

config = Config()

In [97]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Tokenizer

## DEU CERTO AQUII!!!!!!!
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Tokenize input
text = "Hello princess"
input_ids = tokenizer(text, return_tensors='pt')['input_ids']

# Forward pass
with torch.no_grad():
    logits = model(input_ids)



print(type(logits))


last_logit = logits[:, -1, :]
print(f"last_logit.shape: {last_logit.shape}")

predicted_next_token = torch.argmax(last_logit, dim=-1)

predicted_token = tokenizer.decode(predicted_next_token.item())
print(f"Predicted next token: {predicted_token}")

batch_size, seq_length, _ (1, 2, 8)
qvk : tensor([[[ 0.5060,  0.4781,  0.4893,  1.5728, -0.7622,  2.1378, -1.8663,
          -0.4782, -0.5160, -1.5239, -0.9763, -0.9283, -1.1014,  0.9575,
          -0.1674,  0.1979, -0.7322,  1.3459, -0.6323,  0.8999,  1.1452,
          -1.1245,  0.2237,  1.1780],
         [-0.7900,  0.3362,  0.6025, -1.4561,  2.4173,  1.0539,  0.2853,
           0.1226,  0.3728, -0.9357,  0.5988, -0.1426,  0.9194,  0.1047,
           1.1211,  1.6295, -1.1366, -0.4305, -0.3401, -0.2441, -1.1391,
           0.7129, -1.7504, -0.9633]]])
batch_size, seq_length, _ (1, 2, 8)
qvk : tensor([[[-0.6294,  0.5671, -1.6691,  0.3663, -0.2499, -0.4849, -1.5110,
          -1.0129,  0.6557, -1.5981, -1.4339,  1.0336,  1.8156,  0.7709,
           0.4894,  0.9230, -0.9579, -1.5182,  1.3253,  1.3233,  1.0027,
           0.6077,  1.5313,  0.8146],
         [ 0.1418, -1.5399,  0.1373,  2.1984,  0.8395, -2.7964,  0.4247,
          -1.0329,  2.1019,  0.8709,  0.9685,  1.8942,  1.1725, -0.274

In [76]:
# Tokenize input
text = "The quick brown fox jumps"
input_ids = tokenizer(text, return_tensors='pt')['input_ids']

# Forward pass
model_output = model(input_ids)

# Access the last hidden states of the last token
# 'last_hidden_state' is the attribute name for the last hidden states in the model output
last_logit = model_output.last_hidden_state[:, -1, :]
print(f"last_logit {last_logit}")



# Use torch.argmax para obter o índice do token com maior probabilidade
predicted_next_token = torch.argmax(last_logit, dim=-1)


predicted_token = tokenizer.decode(predicted_next_token.item())
print(f"Predicted next token: {predicted_token}")

last_logit tensor([[ 7.9428e-01, -1.0319e-01, -6.8416e-01,  5.5738e-01, -2.9287e-01,
         -4.4239e-01,  2.2744e-01, -4.2560e-01,  4.1616e-01,  1.1188e-01,
          4.6790e-01, -1.6452e-02,  2.5317e-01, -1.2033e-01, -7.6985e-01,
         -5.7800e-01,  2.0851e-01,  7.7365e-02,  2.3898e-01,  2.1028e-01,
          6.9469e-01, -8.0288e-02,  7.3058e-01,  4.6977e-01,  3.7426e-01,
          2.7144e-01, -5.0824e-01, -1.9135e-01, -5.7742e-01, -4.6510e-01,
         -4.8771e-01, -7.5875e-01,  6.5481e-02,  4.9138e-01,  4.2365e-01,
          5.3647e-02,  2.6042e-01, -2.0944e-01, -5.3349e-01, -4.7316e-01,
         -4.9004e-01,  1.3583e-01, -1.8085e-01,  2.8800e-01,  1.1426e-01,
         -5.6994e-01,  6.8768e-01,  2.8650e-01,  5.7767e-02,  5.6858e-01,
         -1.8224e-01,  3.3356e-01,  7.3571e-02,  1.5575e-01,  2.4582e-01,
          2.5446e-01,  4.2865e-01, -4.7273e-01,  3.0086e-01, -5.4719e-02,
          1.7036e-01,  4.3556e-01, -5.2030e-01, -1.7854e-01,  3.5339e-01,
          2.3159e-01,  3.08