# Lab transformers

- Felipe Toscano
- Christian Hernandez

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.cuda.amp import autocast, GradScaler
import requests
import unicodedata
import re
import math
import random
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import torch.nn.functional as F


# Device configuration: use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Usando dispositivo: {device}")

# Maximum number of token ids kept per sequence (the EOS id included).
MAX_LENGTH = 10

# Download the tab-separated English/Spanish sentence pairs into memory
# (nothing is written to disk).
url = "https://raw.githubusercontent.com/aproano2/mmia-6021-fall24/main/guides/data/spa.txt"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors instead of parsing an error page as training data.
response.raise_for_status()
data = response.text.splitlines()

# Special token ids.
# NOTE(review): id 0 is also the value used to pad batches and the index the
# loss ignores, so SOS and padding are indistinguishable to the model.
SOS_token = 0
EOS_token = 1

class Lang:
    """Vocabulary of one language: word/index maps plus word frequencies.

    Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" entries of
    ``index2word``; every other word receives the next free index in order
    of first appearance.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # SOS and EOS already occupy two slots.
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        tokens = sentence.split(' ')
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """Insert ``word`` with the next free index, or bump its count."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

def unicodeToAscii(s):
    """Return ``s`` NFD-decomposed with every 'Mn' (nonspacing mark) code point removed."""
    kept = []
    for char in unicodedata.normalize('NFD', s):
        if unicodedata.category(char) == 'Mn':
            # Combining accent left over from the decomposition: drop it.
            continue
        kept.append(char)
    return ''.join(kept)

def normalizeString(s):
    """Lowercase, strip accents and reduce ``s`` to letters plus ``. ! ?``.

    Each of ``.``, ``!`` and ``?`` is preceded by a space so it becomes its
    own token; every run of any other non-letter character collapses into a
    single space. Leading and trailing whitespace is removed.
    """
    text = s.lower().strip()
    text = unicodeToAscii(text)
    # Detach sentence punctuation from the word before it.
    text = re.sub(r"([.!?])", r" \1", text)
    # Anything that is not a letter or kept punctuation becomes one space.
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)
    return text.strip()

# Build vocabularies and normalized sentence pairs from the downloaded lines
def prepareData(lang1, lang2, lines):
    """Turn tab-separated lines into normalized pairs and two vocabularies.

    Args:
        lang1: Name given to the vocabulary of the first column.
        lang2: Name given to the vocabulary of the second column.
        lines: Iterable of strings; the first two tab-separated fields of
            each line are used, any further fields are ignored.

    Returns:
        ``(input_lang, output_lang, pairs)`` where ``pairs`` is a list of
        ``[source, target]`` normalized sentences.

    Lines with fewer than two fields are skipped. Pairs in which either side
    is empty after normalization are skipped as well: keeping them would add
    the empty string to the vocabulary as a "word" and produce examples with
    nothing to translate.
    """
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    pairs = []
    for line in lines:
        fields = line.split('\t')
        if len(fields) < 2:
            continue
        source = normalizeString(fields[0])
        target = normalizeString(fields[1])
        if not source or not target:
            continue
        pairs.append([source, target])

    for source, target in pairs:
        input_lang.addSentence(source)
        output_lang.addSentence(target)

    print(f"Palabras contadas: {input_lang.name} {input_lang.n_words}, {output_lang.name} {output_lang.n_words}")
    return input_lang, output_lang, pairs

# Module-level vocabularies and normalized sentence pairs; the first column of
# the downloaded lines is registered as 'eng' and the second as 'spa'.
input_lang, output_lang, pairs = prepareData('eng', 'spa', data)

# Helpers that turn sentences into tensors
def indexesFromSentence(lang, sentence):
    """Map each single-space-separated token of ``sentence`` to its index.

    Raises:
        KeyError: If a token is missing from ``lang.word2index``.
    """
    tokens = sentence.split(' ')
    vocabulary = lang.word2index
    indexes = []
    for token in tokens:
        indexes.append(vocabulary[token])
    return indexes

def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a ``(1, n_tokens + 1)`` long tensor ending in EOS.

    The tensor is created on the module-level ``device``.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    # Reshape to a single-row batch.
    return flat.view(1, -1)

def tensorsFromPair(pair):
    """Encode a ``(source, target)`` pair with the module-level vocabularies.

    Returns:
        ``(input_tensor, target_tensor)`` built from ``pair[0]`` with
        ``input_lang`` and from ``pair[1]`` with ``output_lang``.
    """
    source_sentence = pair[0]
    target_sentence = pair[1]
    return (
        tensorFromSentence(input_lang, source_sentence),
        tensorFromSentence(output_lang, target_sentence),
    )

# Build the training DataLoader, fitting every sequence to MAX_LENGTH
def get_dataloader(batch_size):
    """Rebuild the vocabularies and return a shuffled training DataLoader.

    Every sentence is cut to at most ``MAX_LENGTH - 1`` word ids, terminated
    with ``EOS_token`` and right-padded with zeros up to ``MAX_LENGTH``.

    Args:
        batch_size: Number of sentence pairs per batch.

    Returns:
        ``(input_lang, output_lang, train_dataloader)``. The vocabularies are
        freshly built by ``prepareData`` from the module-level ``data``.

    NOTE(review): target rows carry no leading ``SOS_token``, and the padding
    value 0 is the same id as ``SOS_token``; confirm this is what the training
    and decoding code expect.
    """
    input_lang, output_lang, pairs = prepareData('eng', 'spa', data)

    def encode(lang, sentence):
        # Leave room for EOS, then terminate the sequence.
        ids = indexesFromSentence(lang, sentence)[:MAX_LENGTH - 1]
        ids.append(EOS_token)
        return ids

    num_pairs = len(pairs)
    # Zero-initialized, so unused trailing positions act as padding.
    input_ids = np.zeros((num_pairs, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((num_pairs, MAX_LENGTH), dtype=np.int32)

    for row, (source_sentence, target_sentence) in enumerate(pairs):
        source_ids = encode(input_lang, source_sentence)
        translated_ids = encode(output_lang, target_sentence)
        input_ids[row, :len(source_ids)] = source_ids
        target_ids[row, :len(translated_ids)] = translated_ids

    # The whole dataset is moved to ``device`` up front.
    inputs = torch.LongTensor(input_ids).to(device)
    targets = torch.LongTensor(target_ids).to(device)
    train_data = TensorDataset(inputs, targets)

    train_dataloader = DataLoader(
        train_data,
        sampler=RandomSampler(train_data),
        batch_size=batch_size,
    )
    return input_lang, output_lang, train_dataloader


# Transformer architecture and supporting components
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to embeddings, then dropout.

    Args:
        d_model: Size of the last (feature) dimension. Both even and odd
            values are supported.
        dropout: Dropout probability applied after the encodings are added.
        max_len: Number of positions precomputed in the ``pe`` buffer.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd-indexed column than
        # even-indexed ones, so drop the surplus frequency. For even d_model
        # the slice keeps every column and the table is unchanged.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Stored as a buffer of shape (1, max_len, d_model): it follows the
        # module across devices but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.size(1)])``.

        ``x.size(1)`` is taken as the sequence length and must not exceed
        ``max_len``; longer inputs fail when the addition is broadcast.
        """
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with learned projections.

    ``d_model`` must be divisible by ``num_heads``; each head works on
    ``d_k = d_model // num_heads`` features.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # Query, key, value and output projections.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return ``softmax(Q K^T / sqrt(d_k)) V``.

        Where ``mask == 0`` the score is replaced by ``-1e4`` before the
        softmax, a large negative value that still fits in half precision.
        """
        scale = math.sqrt(self.d_k)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / scale
        if mask is not None:
            blocked = mask == 0
            scores = scores.masked_fill(blocked, -1e4)
        weights = F.softmax(scores, dim=-1)
        return torch.matmul(weights, V)

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, heads, seq, d_k)``."""
        batch_size, seq_length, _ = x.size()
        per_head = x.view(batch_size, seq_length, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: back to ``(batch, seq, d_model)``."""
        batch_size, _, seq_length, _ = x.size()
        # transpose() yields a non-contiguous view; view() needs contiguity.
        merged = x.transpose(1, 2).contiguous()
        return merged.view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, merge heads and project out."""
        queries = self.split_heads(self.W_q(Q))
        keys = self.split_heads(self.W_k(K))
        values = self.split_heads(self.W_v(V))

        attended = self.scaled_dot_product_attention(queries, keys, values, mask)
        merged = self.combine_heads(attended)
        return self.W_o(merged)

class FeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    Expands the last dimension from ``d_model`` to ``d_ff`` and projects it
    back to ``d_model``.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Apply the expansion, the ReLU and the projection to ``x``."""
        hidden = self.linear1(x)
        activated = F.relu(hidden)
        return self.linear2(activated)

class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then layer-normalized.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run ``x`` through self-attention (restricted by ``mask``) and the FFN."""
        attended = self.self_attn(x, x, x, mask)
        after_attention = self.norm1(x + self.dropout(attended))

        transformed = self.feed_forward(after_attention)
        return self.norm2(after_attention + self.dropout(transformed))

class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention and feed-forward.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then layer-normalized.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Decode ``x`` while attending to ``enc_output``.

        ``tgt_mask`` restricts the self-attention over ``x``; ``src_mask``
        restricts the cross-attention, whose keys and values both come from
        ``enc_output``.
        """
        self_attended = self.self_attn(x, x, x, tgt_mask)
        state = self.norm1(x + self.dropout(self_attended))

        cross_attended = self.cross_attn(state, enc_output, enc_output, src_mask)
        state = self.norm2(state + self.dropout(cross_attended))

        transformed = self.feed_forward(state)
        return self.norm3(state + self.dropout(transformed))

class Transformer(nn.Module):
    """Encoder-decoder Transformer producing per-position vocabulary logits.

    Args:
        src_vocab_size: Number of rows in the encoder embedding table.
        tgt_vocab_size: Number of rows in the decoder embedding table and
            size of the output logits.
        d_model: Embedding / hidden size.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden size of the feed-forward sub-layers.
        max_seq_length: Number of positions in the positional-encoding table.
        dropout: Dropout probability, used inside the positional encoding
            and inside every encoder/decoder layer.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        # Kept so code that accesses ``model.dropout`` keeps working; forward()
        # does not apply it because PositionalEncoding already applies dropout.
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build the boolean attention masks for ``src`` and ``tgt`` id tensors.

        Returns:
            ``(src_mask, tgt_mask)``. ``src_mask`` is False at source key
            positions whose id is 0. ``tgt_mask`` is False for target query
            rows whose id is 0 and for every key position after the query
            position (causal mask).

        NOTE(review): id 0 is treated as padding here; a real token with
        id 0 (e.g. a start token sharing that id) is masked like padding.
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        # Lower-triangular "no peek" mask built directly on the target's
        # device, avoiding a CPU allocation and copy on every forward pass.
        positions = torch.arange(seq_length, device=tgt.device)
        nopeak_mask = (positions.unsqueeze(0) <= positions.unsqueeze(1)).unsqueeze(0)
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Return logits with ``tgt_vocab_size`` entries per target position.

        ``src`` and ``tgt`` are integer id tensors; dimension 1 is used as
        the sequence dimension.
        """
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        # PositionalEncoding applies dropout itself. Applying self.dropout on
        # top, as the previous version did, dropped the embeddings twice and
        # raised the effective rate above the configured one.
        src_embedded = self.positional_encoding(self.encoder_embedding(src))
        tgt_embedded = self.positional_encoding(self.decoder_embedding(tgt))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        return self.fc(dec_output)

# Training loop with early stopping
def train_with_early_stopping(dataloader, model, n_epochs, learning_rate, patience, min_delta):
    """Train ``model`` with AdamW and stop early once the loss plateaus.

    Args:
        dataloader: Yields ``(input_tensor, target_tensor)`` batches of ids.
        model: Called as ``model(input_ids, decoder_input_ids)`` and expected
            to return logits whose last dimension is the vocabulary size.
        n_epochs: Maximum number of passes over ``dataloader``.
        learning_rate: AdamW learning rate.
        patience: Number of consecutive epochs without sufficient improvement
            after which training stops.
        min_delta: Minimum decrease of the average epoch loss, relative to
            the best value so far, that counts as an improvement.

    Raises:
        ValueError: If ``dataloader`` yields no batches.

    The monitored quantity is the average *training* loss of the epoch; no
    validation data is involved.

    NOTE(review): the decoder is fed ``target_tensor[:, :-1]`` and trained to
    predict ``target_tensor[:, 1:]``. If target rows do not begin with a
    start-of-sequence id, the first target token is never predicted. Confirm
    against the code that builds the batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        # Previously this surfaced as a ZeroDivisionError after the first epoch.
        raise ValueError("dataloader yields no batches; cannot train")

    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    # Targets equal to 0 (the value get_dataloader pads with) are excluded
    # from the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    # Mixed precision only makes sense on CUDA; on CPU run in full precision
    # instead of relying on the AMP helpers disabling themselves.
    use_amp = device.type == 'cuda'
    scaler = GradScaler(enabled=use_amp)

    best_loss = float('inf')
    stale_epochs = 0

    for epoch in range(n_epochs):
        model.train()
        total_loss = 0
        for input_tensor, target_tensor in dataloader:
            input_tensor, target_tensor = input_tensor.to(device), target_tensor.to(device)
            optimizer.zero_grad()

            with autocast(enabled=use_amp):
                output = model(input_tensor, target_tensor[:, :-1])
                # Flatten to (batch * positions, vocab) and (batch * positions,).
                output = output.view(-1, output.size(-1))
                target = target_tensor[:, 1:].contiguous().view(-1)
                loss = criterion(output, target)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}")

        # Early stopping: count epochs that fail to beat the best loss by min_delta.
        if avg_loss < best_loss - min_delta:
            best_loss = avg_loss
            stale_epochs = 0
        else:
            stale_epochs += 1
            print(f"No improvement in loss for {stale_epochs} epochs")
            if stale_epochs >= patience:
                print("Early stopping triggered")
                break

# Hyperparameters
# Vocabulary sizes come from the module-level vocabularies built by prepareData.
src_vocab_size = input_lang.n_words
tgt_vocab_size = output_lang.n_words
d_model = 256
num_heads = 8
num_layers = 3
d_ff = 512
# Sizes the positional-encoding table, so no sequence fed to the model may be longer.
max_seq_length = MAX_LENGTH
dropout = 0.1

# Initialize the model
transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(device)

# Training with early stopping
# get_dataloader returns (input_lang, output_lang, dataloader); only the
# dataloader is kept, the vocabularies it rebuilds are discarded.
train_dataloader = get_dataloader(256)[2]  # Batch size tuned for an A100
train_with_early_stopping(train_dataloader, transformer, n_epochs=20, learning_rate=0.0001, patience=2, min_delta=0.1)


Usando dispositivo: cuda
Palabras contadas: eng 13797, spa 26924
Palabras contadas: eng 13797, spa 26924
Epoch 1, Loss: 5.3439
Epoch 2, Loss: 3.9159
Epoch 3, Loss: 3.3044
Epoch 4, Loss: 2.8875
Epoch 5, Loss: 2.5774
Epoch 6, Loss: 2.3373
Epoch 7, Loss: 2.1471
Epoch 8, Loss: 1.9877
Epoch 9, Loss: 1.8557
Epoch 10, Loss: 1.7394
Epoch 11, Loss: 1.6372
Epoch 12, Loss: 1.5472
No improvement in loss for 1 epochs
Epoch 13, Loss: 1.4637
Epoch 14, Loss: 1.3914
No improvement in loss for 1 epochs
Epoch 15, Loss: 1.3249
Epoch 16, Loss: 1.2629
No improvement in loss for 1 epochs
Epoch 17, Loss: 1.2070
Epoch 18, Loss: 1.1544
No improvement in loss for 1 epochs
Epoch 19, Loss: 1.1072
No improvement in loss for 2 epochs
Early stopping triggered


In [None]:
def evaluate(model, sentence, input_lang, output_lang, max_length=MAX_LENGTH):
    """Greedily translate ``sentence`` with ``model``.

    The sentence is normalized with ``normalizeString`` and cut to
    ``MAX_LENGTH - 1`` tokens plus EOS, the same preprocessing that
    ``get_dataloader`` applies to the training sentences. Without it,
    capitals or punctuation raised ``KeyError`` and sentences of
    ``MAX_LENGTH`` or more words overflowed the positional-encoding table.

    Args:
        model: Called as ``model(input_ids, generated_ids)``; expected to
            return logits with one row per generated position.
        sentence: Source sentence as plain text.
        input_lang: Vocabulary used to encode ``sentence``.
        output_lang: Vocabulary used to decode the generated ids.
        max_length: Maximum number of decoding steps.

    Returns:
        The generated words joined by single spaces (without SOS/EOS).

    Raises:
        KeyError: If a normalized word is not in ``input_lang.word2index``.

    NOTE(review): decoding starts from ``SOS_token`` (id 0), which is also
    the id the model's masks treat as padding; confirm the model was trained
    with a leading SOS on its decoder input.
    """
    model.eval()
    with torch.no_grad():
        # Encode the source exactly like the training data.
        source_ids = indexesFromSentence(input_lang, normalizeString(sentence))[:MAX_LENGTH - 1]
        source_ids.append(EOS_token)
        input_tensor = torch.tensor([source_ids], dtype=torch.long, device=device)
        # Decoding starts from the start-of-sequence token.
        target_tensor = torch.tensor([[SOS_token]], dtype=torch.long, device=device)

        for _ in range(max_length):
            output = model(input_tensor, target_tensor)
            # Greedy choice: highest-scoring token at the last position.
            _, topi = output[:, -1].topk(1)

            # Stop as soon as the model emits EOS (EOS itself is not kept).
            if topi.item() == EOS_token:
                break

            target_tensor = torch.cat([target_tensor, topi.detach()], dim=1)

        # Skip the leading SOS when mapping ids back to words.
        decoded_words = [output_lang.index2word[token.item()] for token in target_tensor[0][1:]]
    return ' '.join(decoded_words)

# Translate and print several sentences
def evaluateMultiple(model, sentences, input_lang, output_lang):
    """Print each sentence in ``sentences`` followed by its translation.

    Each sentence is translated with ``evaluate`` and written as an
    ``Input:`` line, an ``Output:`` line and a blank line.
    """
    for source in sentences:
        translated = evaluate(model, source, input_lang, output_lang)
        print(f"Input: {source}", f"Output: {translated}\n", sep="\n")


# Sample inputs: lowercase, no punctuation, words separated by single spaces.
example_sentences = [
    "i am happy",
    "she is reading",
    "you are my friend",
    "he is going to school",
    "we are playing",
    "they are learning",
    "it is raining",
    "this is my book",
    "i love you",
    "can you help me"
]

# Translate the example sentences with the trained model and print the results
evaluateMultiple(transformer, example_sentences, input_lang, output_lang)



Input: i am happy
Output: feliz .

Input: she is reading
Output: leyendo .

Input: you are my friend
Output: mi amigo .

Input: he is going to school
Output: al colegio .

Input: we are playing
Output: jugando jugando jugando .

Input: they are learning
Output: aprendiendo a aprender .

Input: it is raining
Output: lloviendo lloviendo .

Input: this is my book
Output: es mi libro .

Input: i love you
Output: que tu .

Input: can you help me
Output: ayudarme a ayudarme ?



### Conclusiones

- Los resultados del modelo muestran traducciones que capturan la idea general, pero son incompletas o repetitivas. Esto sugiere que el modelo puede estar limitado por un conjunto de datos insuficiente o un entrenamiento incompleto.
- Mejorar el entrenamiento y ajustar el preprocesamiento podrían aumentar significativamente la calidad de las traducciones.