In [None]:
import os
import time
import datetime
import numpy as np
import random
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AdamW, get_linear_schedule_with_warmup

# Pick the compute device and a batch size to go with it:
# CPU runs use single-sample batches, GPU runs use batches of three.
if not torch.cuda.is_available():
    print("usar CPU")
    device, batch_size = torch.device("cpu"), 1
else:
    print("Usar GPU")
    device, batch_size = torch.device("cuda"), 3
    

# Load the tokenizer for the Spanish GPT-2 checkpoint, declaring explicit
# begin-of-text, end-of-text and padding tokens, then load the matching model.
tokenizer = AutoTokenizer.from_pretrained(
    "flax-community/gpt-2-spanish",
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    pad_token='<|pad|>',
)
model = AutoModelForCausalLM.from_pretrained("flax-community/gpt-2-spanish")

# Control code: wrapped as <|...|> it is registered as an additional special token.
control_code = "ibai"
special_tokens_dict = {"additional_special_tokens": [f"<|{control_code}|>"]}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# Make the embedding matrix match the tokenizer's current size.
model.resize_token_embeddings(len(tokenizer))

# Copy the unknown-token embedding into the last `num_added_toks` rows of the
# embedding matrix (presumably the rows of the tokens just added).
# NOTE(review): `num_added_toks` only counts tokens added by the
# add_special_tokens() call above; if the bos/pad tokens passed to
# from_pretrained() were also new to the vocabulary, their rows are not
# overwritten here -- confirm whether that is intended.
unk_tok_emb = model.transformer.wte.weight.data[tokenizer.unk_token_id, :]
for offset in range(1, num_added_toks + 1):
    model.transformer.wte.weight.data[-offset, :] = unk_tok_emb
        
class GPT2Dataset(Dataset):
    """Dataset of tokenized text lines wrapped in control tokens.

    Every non-blank line of ``archivo_texto`` (lower-cased) becomes one sample
    of the form ``<|startoftext|><|control_code|>line<|endoftext|>``, which is
    passed to ``tokenizer`` with truncation and ``padding="max_length"``.

    Args:
        control_code: Name placed between ``<|`` and ``|>`` right after the
            start-of-text marker of every sample.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries (assumed to be lists of ints, as a
            Hugging Face tokenizer returns for a single string).
        archivo_texto: Path of a UTF-8 text file with one sample per line.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Indexing returns the tuple ``(input_ids, attention_mask)`` of tensors for
    the requested sample.
    """

    def __init__(self, control_code, tokenizer, archivo_texto, max_length=768):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        print('loading text...')
        # Use a context manager so the file handle is closed deterministically.
        with open(archivo_texto, 'r', encoding="utf-8") as text_file:
            raw_lines = text_file.read().lower().split('\n')
        # Blank lines (including the empty string produced by a trailing
        # newline) would become samples made only of special tokens: skip them.
        sentences = [row for row in raw_lines if row.strip()]
        print('qty:', len(sentences))
        prefix = '<|startoftext|>' + f"<|{control_code}|>"
        for row in tqdm(sentences):
            encodings_dict = tokenizer(
                prefix + row + '<|endoftext|>',
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for sample ``idx``."""
        return self.input_ids[idx], self.attn_masks[idx]

# Seed every random number generator in use so that shuffling (and any other
# stochastic step of training) is repeatable across runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Build the dataset
archivo_texto = "ibai_textos.txt"  # Change this to the path of your text file
dataset = GPT2Dataset(control_code, tokenizer, archivo_texto)

# Build the dataloader; RandomSampler draws the samples in random order
train_dataloader = DataLoader(
    dataset,
    sampler=RandomSampler(dataset),
    batch_size=batch_size
)

# Training configuration
epochs = 1
learning_rate = 5e-4
warmup_steps = 100  # a step count, so keep it an integer
epsilon = 1e-8
# Use PyTorch's AdamW: the AdamW shipped with transformers is deprecated.
# weight_decay=0.0 keeps the default of the transformers implementation
# (torch.optim.AdamW would otherwise default to 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=epsilon, weight_decay=0.0)
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

# Helper to format elapsed time
def format_time(elapsed):
    """Render a duration given in seconds as a ``datetime.timedelta`` string.

    Args:
        elapsed: Duration in seconds; rounded to the nearest whole second.

    Returns:
        The ``str()`` of the corresponding timedelta, e.g. ``"1:01:01"``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

# Model training
total_t0 = time.time()
model = model.to(device)
for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Each batch is the (input_ids, attention_mask) pair produced by the dataset.
        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # Labels are the inputs themselves, except that padded positions
        # (attention mask == 0) are set to -100 so the loss ignores them.
        # Without this, sequences padded to max_length make the loss mostly
        # about predicting the padding token.
        b_labels = b_input_ids.clone()
        b_labels[b_masks == 0] = -100
        model.zero_grad()
        outputs = model(b_input_ids, labels=b_labels, attention_mask=b_masks, token_type_ids=None)
        loss = outputs.loss
        batch_loss = loss.item()
        total_train_loss += batch_loss
        # Report progress every 100 batches (skipping the very first one).
        if step % 100 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))
        loss.backward()
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()
    # Average loss over the epoch; max(..., 1) avoids dividing by zero when
    # the dataloader yields no batches.
    avg_train_loss = total_train_loss / max(len(train_dataloader), 1)
    # Measure how long the epoch took
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))


In [None]:
# Directory that receives the fine-tuned model and tokenizer files.
output_dir = r'C:\Users\Diego\Transformer\model_gpt_ibai'
# exist_ok=True creates the directory only when missing, without a separate
# existence check that could race with another process.
os.makedirs(output_dir, exist_ok=True)

# If the model is wrapped in an object exposing `.module` (presumably a
# DataParallel-style wrapper), save the wrapped model instead of the wrapper.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


In [None]:
# Sample a few continuations from the fine-tuned model for a fixed prompt.
model.eval()
# The prompt follows the training format: start marker, control token, text
# (and no end-of-text marker, so the model continues the text).
prompt = "<|startoftext|>" + "<|ibai|>" + "¿ qué es el fútbol ?"
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)
sample_outputs = model.generate(
    generated,
    # The single prompt has no padding, so every position is attended to.
    attention_mask=torch.ones_like(generated),
    # Pad with the tokenizer's dedicated padding token when returned
    # sequences have different lengths.
    pad_token_id=tokenizer.pad_token_id,
    num_return_sequences=3,
    max_length=300,
    do_sample=True,
    top_k=50,
    top_p=0.95,
)
for i, sample_output in enumerate(sample_outputs):
    # Separate the samples with a blank line.
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Reload the fine-tuned model and its tokenizer from disk
output_dir = r'C:\Users\Diego\Transformer\model_gpt_ibai'  # Change to the path where the model was saved
model = AutoModelForCausalLM.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Select the device: GPU when available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the model to the device and switch it to evaluation mode
model.to(device)
model.eval()

# Function to interact with the model
def chatbot():
    """Run an interactive console loop that feeds user text to the model.

    Reads lines with ``input()`` until the user types ``salir`` (any casing,
    surrounding whitespace ignored). Every other line is wrapped in the
    training format ``<|startoftext|><|ibai|>text``, continued by sampling
    from the module-level ``model``, and the decoded text is printed.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.
    Returns ``None``.
    """
    print("Chatbot entrenado con GPT-2. Escribe 'salir' para terminar la conversación.")
    while True:
        prompt = input("Tú: ").strip()
        if prompt.lower() == "salir":
            print("Chatbot: ¡Adiós!")
            break

        # Build the input in the same shape as the training samples: the
        # training text was lower-cased, and the end-of-text marker must NOT
        # be appended here, otherwise the model is asked to continue a
        # sequence that is already finished.
        input_text = f"<|startoftext|><|ibai|>{prompt.lower()}"
        input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

        # Generate the continuation
        output = model.generate(
            input_ids,
            # The single prompt has no padding, so every position is attended to.
            attention_mask=torch.ones_like(input_ids),
            pad_token_id=tokenizer.pad_token_id,
            max_length=300,
            num_return_sequences=1,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            temperature=0.7
        )

        # Decode and show the answer (special tokens are dropped)
        response = tokenizer.decode(output[0], skip_special_tokens=True)
        print(f"Chatbot: {response}")

# Start the chatbot
chatbot()
