<a href="https://colab.research.google.com/github/galenzo17/AI-personal-test/blob/main/gpt_first_try.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
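# Complete code to train a Transformer model from scratch in Google Colab

# Install the required libraries
# NOTE(review): the code below imports Field/BucketIterator from torchtext.data,
# which is torchtext's legacy API; recent torchtext releases no longer ship it,
# so this unpinned install may need an older pinned torchtext — confirm.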
!pip install torch torchvision torchtext spacy
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

# Import the libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
import time

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

# Select the compute device (GPU when CUDA is available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the spaCy language models (German and English)
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

# Define the tokenizers
def tokenize_de(text):
    """
    Tokenizer for German text.

    Runs the tokenizer of the loaded German spaCy model over ``text`` and
    returns the text of each token as a list of strings.
    """
    tokens = []
    for token in spacy_de.tokenizer(text):
        tokens.append(token.text)
    return tokens

def tokenize_en(text):
    """
    Tokenizer for English text.

    Runs the tokenizer of the loaded English spaCy model over ``text`` and
    returns the text of each token as a list of strings.
    """
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

# Define the fields: German is the source, English is the target. Both are
# lower-cased and wrapped in '<sos>' / '<eos>' marker tokens.
SRC = Field(tokenize = tokenize_de,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

TRG = Field(tokenize = tokenize_en,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

# Load the dataset (Multi30k, '.de' -> '.en' pairs) as train/valid/test splits
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'),
                                                    fields = (SRC, TRG))

# Build the vocabularies from the training split only, with min_freq=2
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

# Create the iterators (batches are placed on `device`)
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device)

# Define the Transformer architecture
class Transformer(nn.Module):
    """Sequence-to-sequence Transformer built on top of ``nn.Transformer``.

    Source and target tokens are embedded, summed with learned position
    embeddings, passed through dropout and fed to ``nn.Transformer``; the
    decoder output is projected to target-vocabulary logits.

    All token tensors are sequence-first: ``[seq_len, batch_size]``.

    Args:
        src_vocab_size: Number of entries in the source vocabulary.
        trg_vocab_size: Number of entries in the target vocabulary.
        src_pad_idx: Index of the padding token in the source vocabulary.
        trg_pad_idx: Index of the padding token in the target vocabulary
            (stored for callers; not used inside the forward pass).
        embed_size: Embedding / model dimension.
        num_layers: Number of encoder layers and of decoder layers.
        forward_expansion: Feed-forward width as a multiple of ``embed_size``.
        heads: Number of attention heads.
        dropout: Dropout probability.
        max_length: Number of positions in the learned position embeddings,
            i.e. the longest sequence the model accepts.
    """

    def __init__(self, src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx,
                 embed_size=512, num_layers=6, forward_expansion=4, heads=8, dropout=0.1, max_length=100):
        super(Transformer, self).__init__()

        self.embed_size = embed_size
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

        self.src_word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.src_position_embedding = nn.Embedding(max_length, embed_size)

        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.trg_position_embedding = nn.Embedding(max_length, embed_size)

        # Positional arguments: d_model, nhead, num_encoder_layers,
        # num_decoder_layers, dim_feedforward, dropout.
        self.transformer = nn.Transformer(embed_size, heads, num_layers, num_layers,
                                          embed_size * forward_expansion, dropout)

        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def make_src_mask(self, src):
        """Return a boolean ``[batch_size, src_len]`` mask, True at padding positions."""
        # src shape: [src_len, batch_size]
        src_mask = (src.transpose(0, 1) == self.src_pad_idx)
        # src_mask shape: [batch_size, src_len]
        return src_mask

    def make_trg_mask(self, trg):
        """Return the ``[trg_len, trg_len]`` causal mask, placed on ``trg``'s device."""
        # trg shape: [trg_len, batch_size]
        trg_len = trg.shape[0]
        # Use the input's device rather than a module-level global so the model
        # works wherever its inputs live.
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_len).to(trg.device)
        return trg_mask

    def forward(self, src, trg):
        """Compute target-vocabulary logits.

        Args:
            src: Source token indices, ``[src_len, batch_size]``.
            trg: Target token indices, ``[trg_len, batch_size]``.

        Returns:
            Logits of shape ``[trg_len, batch_size, trg_vocab_size]``.

        Raises:
            ValueError: If either sequence is longer than the position
                embedding table (``max_length``).
        """
        src_seq_length, src_batch = src.shape
        trg_seq_length, trg_batch = trg.shape

        # Fail early with a readable message instead of an out-of-range
        # embedding lookup.
        max_length = self.src_position_embedding.num_embeddings
        if src_seq_length > max_length or trg_seq_length > max_length:
            raise ValueError(
                f'sequence length (src={src_seq_length}, trg={trg_seq_length}) '
                f'exceeds max_length={max_length}'
            )

        # Position indices 0..len-1, repeated for every sequence in the batch
        # and created directly on the inputs' device.
        src_positions = (torch.arange(0, src_seq_length, device=src.device)
                         .unsqueeze(1).expand(src_seq_length, src_batch))
        trg_positions = (torch.arange(0, trg_seq_length, device=trg.device)
                         .unsqueeze(1).expand(trg_seq_length, trg_batch))

        embed_src = self.dropout(self.src_word_embedding(src) + self.src_position_embedding(src_positions))
        embed_trg = self.dropout(self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions))

        src_key_padding_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        # The same padding mask is applied to the encoder self-attention and to
        # the decoder's cross-attention over the encoder output, so padded
        # source positions never contribute to the result.
        out = self.transformer(embed_src, embed_trg,
                               src_key_padding_mask=src_key_padding_mask,
                               memory_key_padding_mask=src_key_padding_mask,
                               tgt_mask=trg_mask)
        out = self.fc_out(out)

        return out

# Define the hyperparameters: vocabulary sizes and padding-token indices
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Create the model (remaining arguments use the class defaults) and move it to `device`
model = Transformer(INPUT_DIM, OUTPUT_DIM, SRC_PAD_IDX, TRG_PAD_IDX).to(device)

# Initialize the weights
def init_weights(m):
    """Xavier-uniform initialise ``m.weight`` in place when it is a matrix.

    Intended to be passed to ``nn.Module.apply``. Modules without a ``weight``
    attribute, with ``weight`` set to ``None`` (e.g. normalisation layers
    created without affine parameters), or whose weight has fewer than two
    dimensions are left untouched.

    Args:
        m: The module to initialise.
    """
    weight = getattr(m, 'weight', None)
    # `weight` can exist but be None, so check the type before calling .dim().
    if isinstance(weight, torch.Tensor) and weight.dim() > 1:
        nn.init.xavier_uniform_(weight.data)

# Run init_weights on the model and each of its submodules.
model.apply(init_weights)

# Count the number of parameters
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators (message is in Spanish).
print(f'El modelo tiene {count_parameters(model):,} parámetros entrenables')

# Define the optimizer and the loss function.
# The loss skips target positions equal to the padding index.
optimizer = optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

# Helper to compute elapsed time
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into minutes and seconds.

    Returns a ``(minutes, seconds)`` tuple of ints, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return (whole_minutes, leftover_seconds)

# Training and evaluation functions
def train(model, iterator, optimizer, criterion, clip):
    """Train ``model`` for one pass over ``iterator`` and return the mean batch loss.

    Args:
        model: Module called as ``model(src, trg_input)`` that returns logits
            whose last dimension is the vocabulary size.
        iterator: Iterable of batches exposing ``.src`` and ``.trg`` tensors
            laid out as ``[seq_len, batch_size]``.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss taking ``(logits [N, vocab], targets [N])``.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The sum of per-batch losses divided by the number of batches.

    Raises:
        ValueError: If ``iterator`` yields no batches.
    """
    model.train()

    # Follow the model's own device instead of relying on a module-level global.
    model_device = next(model.parameters()).device

    epoch_loss = 0.0
    num_batches = 0

    for batch in iterator:
        src = batch.src.to(model_device)
        trg = batch.trg.to(model_device)

        optimizer.zero_grad()

        # The decoder input drops the last target token; the labels drop the
        # first, so each position is trained to predict the following token.
        output = model(src, trg[:-1, :])

        output_dim = output.shape[-1]
        logits = output.reshape(-1, output_dim)
        targets = trg[1:, :].reshape(-1)

        loss = criterion(logits, targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Counting batches (rather than calling len()) also supports iterables
    # without a length and gives a clear error when nothing was yielded.
    if num_batches == 0:
        raise ValueError('iterator produced no batches')

    return epoch_loss / num_batches

def evaluate(model, iterator, criterion):
    """Evaluate ``model`` over ``iterator`` without gradients and return the mean batch loss.

    Args:
        model: Module called as ``model(src, trg_input)`` that returns logits
            whose last dimension is the vocabulary size.
        iterator: Iterable of batches exposing ``.src`` and ``.trg`` tensors
            laid out as ``[seq_len, batch_size]``.
        criterion: Loss taking ``(logits [N, vocab], targets [N])``.

    Returns:
        The sum of per-batch losses divided by the number of batches.

    Raises:
        ValueError: If ``iterator`` yields no batches.
    """
    model.eval()

    # Follow the model's own device instead of relying on a module-level global.
    model_device = next(model.parameters()).device

    epoch_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in iterator:
            src = batch.src.to(model_device)
            trg = batch.trg.to(model_device)

            # Same input/label shift as in training: feed all but the last
            # target token, score against all but the first.
            output = model(src, trg[:-1, :])

            output_dim = output.shape[-1]
            logits = output.reshape(-1, output_dim)
            targets = trg[1:, :].reshape(-1)

            epoch_loss += criterion(logits, targets).item()
            num_batches += 1

    # Counting batches (rather than calling len()) also supports iterables
    # without a length and gives a clear error when nothing was yielded.
    if num_batches == 0:
        raise ValueError('iterator produced no batches')

    return epoch_loss / num_batches

# Train the model
N_EPOCHS = 5
CLIP = 1

# Lowest validation loss seen so far; used to decide when to checkpoint.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Save the weights only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'transformer-model.pt')

    # Progress report (messages are printed in Spanish).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tPérdida de entrenamiento: {train_loss:.3f}')
    print(f'\tPérdida de validación: {valid_loss:.3f}')

# Load the best model (the checkpoint with the lowest validation loss)
model.load_state_dict(torch.load('transformer-model.pt'))

# Evaluate on the test set
test_loss = evaluate(model, test_iterator, criterion)

print(f'Pérdida en el conjunto de prueba: {test_loss:.3f}')


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.4.3-py3-none-any.whl.metadata (6.1 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozen

