<a href="https://colab.research.google.com/github/ferchomuri/archi/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 📌 Variables de entorno

In [None]:
from google.colab import userdata

### 📌 Instalar librerías necesarias

In [None]:
!pip install transformers datasets accelerate huggingface_hub safetensors
!pip install bitsandbytes transformers accelerate


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from huggingface_hub import login
from datasets import load_dataset
from accelerate import infer_auto_device_map
from transformers import BitsAndBytesConfig
from safetensors.torch import save_file, load_file

### 📌 Iniciar sesión en Hugging Face (sustituye tu token)

In [None]:
# Fetch the Hugging Face access token stored under the 'hugface' key of
# Colab's userdata, then authenticate this session with it.
HUGGINGFACE_TOKEN = userdata.get("hugface")
login(token=HUGGINGFACE_TOKEN)

### 📌 Modelo base

In [None]:
# Hugging Face Hub identifier of the base checkpoint loaded by the model and
# tokenizer cells.
MODEL_NAME: str = "microsoft/graphcodebert-base"

### 📌 Configurar el modelo con cuantización de 8 bits para reducir el uso de memoria (opcional, actualmente desactivado)


In [None]:
#bnb_config = BitsAndBytesConfig(load_in_8bit=True)  # Activa carga en 8 bits


### 📌 Cargar el modelo optimizado para menor consumo de memoria

In [None]:
# Load the base checkpoint with a causal-LM head and keep its weights on the
# CPU; the dtype is left at the library default.
# NOTE(review): confirm that the checkpoint named by MODEL_NAME is meant to be
# used for causal language modeling.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="cpu")

### 📌 Cargar el tokenizador

In [None]:
# Load the tokenizer published under the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

### 📌 Configurar token de padding si no existe

In [None]:
# Reuse the end-of-sequence token for padding when the tokenizer defines no
# padding token of its own.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Status message printed once this cell has finished.
print("Modelo cargado correctamente ✅")

### 📌 Subir archivos JSONL al entorno de Colab (solo la primera vez)

In [None]:
from google.colab import files

# The result is used later as a mapping whose keys are passed to the dataset
# loader as file paths.
uploaded = files.upload()  # Select your JSONL files when the window appears


### 📌 Cargar dataset local

In [None]:
# Build the dataset from every uploaded file using the "json" loader;
# iterating the upload mapping yields its keys, which serve as file paths.
dataset = load_dataset("json", data_files=list(uploaded))

print("Dataset cargado correctamente ✅")

### 📌 Función para tokenizar el dataset

In [None]:
def tokenize_function(examples):
    """Tokenize prompt/completion pairs into model inputs and labels.

    Args:
        examples: A single example or a batch (as produced by
            ``Dataset.map(..., batched=True)``) providing the ``"prompt"``
            and ``"completion"`` columns.

    Returns:
        The tokenizer output for the prompts with the tokenized completions
        stored under ``"labels"``. Prompts and completions are each padded or
        truncated to 256 tokens. Padded label positions are set to -100 so
        the loss ignores them.
    """
    shared_kwargs = {"padding": "max_length", "truncation": True, "max_length": 256}
    model_inputs = tokenizer(examples["prompt"], **shared_kwargs)
    targets = tokenizer(
        text_target=examples["completion"],
        return_attention_mask=True,
        **shared_kwargs,
    )

    def mask_padding(token_ids, attention_mask):
        # -100 is the index ignored by the cross-entropy loss. Masking via the
        # attention mask (not the pad id) keeps genuine EOS tokens as targets
        # even when the pad token is an alias of the EOS token.
        return [tok if keep else -100 for tok, keep in zip(token_ids, attention_mask)]

    label_ids = targets["input_ids"]
    label_mask = targets["attention_mask"]
    if label_ids and isinstance(label_ids[0], list):
        # Batched call: one list of token ids per example.
        labels = [mask_padding(ids, mask) for ids, mask in zip(label_ids, label_mask)]
    else:
        # Single example: a flat list of token ids.
        labels = mask_padding(label_ids, label_mask)

    model_inputs["labels"] = labels
    return model_inputs

### 📌 Tokenizar dataset

In [None]:
# Run tokenize_function over the loaded dataset, feeding it examples in batches.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

print("Dataset tokenizado correctamente ✅")

### 📌 Dividir el dataset en train (80%) y eval (20%)

In [None]:
# Hold out 20% of the examples for evaluation. The fixed seed makes the split
# identical on every run, so evaluation results stay comparable across runs.
split_dataset = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=42)


### 📌 Configurar los argumentos de entrenamiento

In [None]:
# Hyperparameters handed to the Trainer: output goes to ./results and
# evaluation runs once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    push_to_hub=False,
    # fp16 mixed precision requires a CUDA device; enable it only when one is
    # present so the notebook still runs on CPU-only runtimes.
    fp16=torch.cuda.is_available(),
    # With a per-device batch size of 1, gradients are accumulated over
    # 8 steps before each optimizer update.
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
)

### 📌 Inicializar Trainer

In [None]:
# Wire the model, the training arguments and the two splits into a Trainer:
# the "train" split is used for optimization, the "test" split for evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"]
)

### 📌 Iniciar entrenamiento

In [None]:
# Start the training run configured above.
trainer.train()