<a href="https://colab.research.google.com/github/galenzo17/AI-personal-test/blob/main/gpt_first_try.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Complete code for training a Transformer model from scratch in Google Colab

# Install the required libraries (the leading `!` is notebook shell syntax, not plain Python)
!pip install transformers datasets

# Importar las bibliotecas
import torch
from datasets import load_dataset
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

# Pick the compute device: the CUDA GPU when one is available, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Model architecture (the small GPT-2 configuration)
config = GPT2Config(
    n_layer=12,
    n_head=12,
    n_embd=768,
    n_positions=1024,
    n_ctx=1024,
    vocab_size=50257,
)

# Build the model from the configuration alone (randomly initialised weights,
# nothing pretrained is loaded) and place it on the selected device
model = GPT2LMHeadModel(config).to(device)

# Load the pretrained GPT-2 tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

# Load an example dataset (raw WikiText-2)
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``text`` entries of a batch with the module-level tokenizer.

    Args:
        examples: A batch mapping that contains a ``'text'`` entry.

    Returns:
        Whatever the tokenizer returns for that text, requested with
        ``return_special_tokens_mask=True``.
    """
    texts = examples['text']
    encoded = tokenizer(texts, return_special_tokens_mask=True)
    return encoded

# Apply the tokenizer batch-wise and drop the raw 'text' column afterwards
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],
)

# Group the token sequences into fixed-size blocks
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized columns and cut them into blocks.

    Every column in ``examples`` is flattened into one long sequence and then
    split into consecutive chunks of ``block_size`` items. The chunk count is
    derived from the length of the ``'input_ids'`` column; trailing items that
    do not fill a whole block are dropped.

    Args:
        examples: Mapping from column name to a list of sequences. Must
            contain an ``'input_ids'`` column.

    Returns:
        A dict with the same columns, each a list of ``block_size``-long
        chunks, plus a ``'labels'`` column that is a shallow copy of the
        chunked ``'input_ids'`` column.

    Raises:
        KeyError: If ``examples`` has no ``'input_ids'`` column.
    """
    from itertools import chain

    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples['input_ids'])
    # Drop the last fragment if it is smaller than block_size
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result['labels'] = result['input_ids'].copy()
    return result

# Regroup every tokenized split into fixed-size blocks with labels
lm_datasets = tokenized_datasets.map(group_texts, batched=True)

# Configure the training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_steps=500,
    learning_rate=5e-4,
    weight_decay=0.01,
)

# Create the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets['train'],
    eval_dataset=lm_datasets['validation'],
    tokenizer=tokenizer,
    # The examples are already fixed-length blocks that carry their own
    # labels, so no padding is needed. Without an explicit collator, passing
    # a tokenizer makes the Trainer fall back to a padding collator, which is
    # expected to fail because the GPT-2 tokenizer defines no pad token.
    data_collator=default_data_collator,
)

# Start training
trainer.train()

# Save the trained model
trainer.save_model('./trained_model')


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.4.3-py3-none-any.whl.metadata (6.1 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozen

