<a href="https://colab.research.google.com/github/elainedias16/TCC/blob/main/Lab3_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Masked Self-Attention

In [14]:
import torch
import torch.nn as nn
import math


class MaskedSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with an optional mask.

    A single linear layer produces queries, keys and values for all heads at
    once; a second linear layer mixes the concatenated head outputs.

    Args:
        config: Object exposing ``num_heads``, ``d_model``, ``head_dim``,
            ``dropout`` and ``bias`` attributes. ``head_dim * num_heads``
            must equal ``d_model``.

    Note:
        ``config.dropout`` is stored on the instance but no dropout is applied
        anywhere in this module.
    """

    def __init__(self, config):
        super().__init__()
        self.num_heads = config.num_heads
        self.d_model = config.d_model
        self.head_dim = config.head_dim
        self.dropout = config.dropout  # kept for reference; not used in forward()
        self.bias = config.bias
        assert self.head_dim * self.num_heads == self.d_model, "d_model must be divisible by num_heads"

        # Key, query and value projections for all heads, computed in one batched layer.
        self.qkv_linear = nn.Linear(config.d_model, config.d_model * 3, bias=self.bias)
        print(f"self.qvk_linear {self.qkv_linear}")

        # Output projection.
        self.out_linear = nn.Linear(config.d_model, config.d_model, bias=self.bias)
        print(f"self.out_linear {self.out_linear}")

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch_size, seq_length, d_model)``.
            mask: Optional tensor that must broadcast against the attention
                scores, which are laid out as
                ``(num_heads, batch_size, seq_length, seq_length)``. Positions
                where ``mask == 0`` are set to ``-inf`` before the softmax, so
                a row that is masked everywhere yields NaN weights.

        Returns:
            Tensor of shape ``(batch_size, seq_length, d_model)``.
        """
        batch_size, seq_length, _ = x.size()
        print(f"batch_size, seq_length, _ {batch_size, seq_length, _}")

        # Run the projection exactly once and reuse it for both the debug print
        # and the attention computation (it used to be evaluated twice).
        projected = self.qkv_linear(x)
        qkv = projected.reshape(batch_size, seq_length, self.num_heads, 3 * self.head_dim)
        print(f"qvk : {projected}")

        # Heads first: each chunk is (num_heads, batch_size, seq_length, head_dim).
        queries, keys, values = qkv.permute(2, 0, 1, 3).chunk(3, dim=-1)

        # Scaled dot-product scores: (num_heads, batch_size, seq_length, seq_length).
        attention_scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, float('-inf'))

        attention_weights = nn.functional.softmax(attention_scores, dim=-1)
        attention_output = torch.matmul(attention_weights, values)

        # Back to batch first, then concatenate the heads along the feature axis.
        attention_output = attention_output.permute(1, 2, 0, 3).reshape(batch_size, seq_length, self.d_model)
        return self.out_linear(attention_output)




# Test configuration definition
class Config:
    """Hyperparameters read by the attention module in this test."""

    # Number of attention heads.
    num_heads: int = 2
    # Input and output vectors have dimension 8.
    d_model: int = 8
    # Each head has dimension 4.
    head_dim: int = 4
    # Dropout rate, intended to reduce overfitting.
    dropout: float = 0.1
    # Whether the linear layers use a bias term.
    bias: bool = True

# Instantiate the test configuration.
config = Config()

# Build the test input: a random tensor of shape (1, 4, 8).
input_seq = torch.rand(1, 4, 8)

# Build the attention mask: a lower-triangular 4x4 matrix of ones with two
# leading singleton axes, giving shape (1, 1, 4, 4).
mask = torch.ones(4, 4).tril()[None, None]

# Instantiate the model.
masked_self_attention = MaskedSelfAttention(config)

# Run the model on the test input with the mask and show the result.
output = masked_self_attention(input_seq, mask=mask)
print("Output:", output)


self.qvk_linear Linear(in_features=8, out_features=24, bias=True)
self.out_linear Linear(in_features=8, out_features=8, bias=True)
batch_size, seq_length, _ (1, 4, 8)
qvk : tensor([[[-0.1582, -0.6943,  0.1295,  0.1202,  0.0539, -0.5109,  0.2536,
          -0.0522, -0.0311,  0.4662, -0.3731, -0.6190, -0.2688, -0.3006,
           0.4243, -0.1228, -0.1116, -0.1366, -0.4158,  0.1359, -0.2921,
          -0.2518, -0.0797, -0.0781],
         [ 0.1946, -0.7074,  0.2048,  0.4339, -0.3215, -0.2073,  0.0243,
          -0.2927, -0.1587,  0.4607, -0.6947, -0.3042, -0.1061, -0.2979,
           0.6617,  0.0650, -0.1540,  0.0371, -0.2246, -0.2326, -0.3791,
          -0.2576,  0.1347,  0.1052],
         [ 0.0141, -0.3604,  0.0545,  0.0084, -0.1595,  0.0125,  0.1592,
          -0.6495, -0.6124,  0.4259, -0.5005, -0.2775,  0.0221, -0.3286,
           0.7136,  0.0806, -0.4086,  0.4987, -0.4669, -0.4316, -0.3397,
          -0.3447,  0.2073, -0.2931],
         [ 0.2147, -0.7599,  0.3235,  0.5246, -0.3190, -

## Feed Forward Neural Network