In [2]:
import pandas as pd
df = pd.read_csv("../data/processed/clean_reviews_v2.csv")
print(df.head())
print(df["label"].unique())
print(df[df["label"].isna()])


                                        review     label
0    i loved this product, it works perfectly.  positive
1           this exceeded all my expectations!  positive
2    amazing quality, totally worth the price.  positive
3  i'm extremely satisfied with this purchase.  positive
4         great experience, i would buy again.  positive
['positive' 'negative' 'neutral']
Empty DataFrame
Columns: [review, label]
Index: []


In [None]:
# ============================================================
# 03 - Train sentiment model (light version)
# Model: DistilBERT
# ============================================================

# 1. Import libraries
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer
)

# 2. Load clean dataset
df = pd.read_csv("../data/processed/clean_reviews_v2.csv")

print("First rows of the processed dataset:")
print(df.head())

# 3. Map labels to numbers (transformers only work with integers)
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {v: k for k, v in label2id.items()}
df["labels"] = df["label"].map(label2id)

# 4. Convert to HuggingFace Dataset
hf_dataset = Dataset.from_pandas(df[["review", "labels"]])

# 5. Load DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# 6. Tokenize texts 
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["review"],
        padding="max_length",
        truncation=True,
        max_length=128
    )
    # Be sure to include the labels
    tokenized["labels"] = examples["labels"]
    return tokenized

tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)

# DELETE columns that cause errors
#tokenized_dataset = tokenized_dataset.remove_columns(["review"])


# 7. Divide into train/test
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# 8. Load DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

# 9. Configure training (optimized for CPU) - UPDATED VERSION
training_args = TrainingArguments(
    output_dir="../models/distilbert-sentiment",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    use_cpu=True,  # ← UPDATED: no_cuda → use_cpu
    logging_steps=10,
    remove_unused_columns=False  # ← IMPORTANT: Keep unused columns
)

# 10. Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"]
)

# 11. Train
print("Entrenando modelo... esto puede tardar entre 2 y 8 minutos en CPU.")
trainer.train()

# 12. Save final template
trainer.save_model("../models/distilbert-sentiment")
tokenizer.save_pretrained("../models/distilbert-sentiment")

print("\nModel trained and saved correctly in:")
print("models/distilbert-sentiment")

  from .autonotebook import tqdm as notebook_tqdm


Primeras filas del dataset procesado:
                                        review     label
0    i loved this product, it works perfectly.  positive
1           this exceeded all my expectations!  positive
2    amazing quality, totally worth the price.  positive
3  i'm extremely satisfied with this purchase.  positive
4         great experience, i would buy again.  positive


Map: 100%|██████████| 968/968 [00:00<00:00, 8070.97 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Entrenando modelo... esto puede tardar entre 2 y 8 minutos en CPU.


Epoch,Training Loss,Validation Loss
1,0.2546,0.166183
2,0.0725,0.056005
3,0.0139,0.051127



Modelo entrenado y guardado correctamente en:
models/distilbert-sentiment
