In [1]:
# Instalar librerías necesarias
# !pip install transformers datasets torch pandas scikit-learn

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate


# 1. Cargar dataset desde una URL
url = "https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv"
df = pd.read_csv(url)

# Supongamos que el CSV tiene columnas: "tweet" y "label"
# Renombramos para que quede más claro
df = df.rename(columns={"tweet": "text", "label": "label"})

# 2. Convertir a Dataset de Hugging Face
dataset = Dataset.from_pandas(df)

# 3. Dividir en train y test (ejemplo: 80% train, 20% test)
dataset = dataset.train_test_split(test_size=0.2)

# 4. Cargar tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 5. Cargar modelo
num_labels = len(set(df["label"]))  # detecta cuántas clases hay
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

# 6. Métricas
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

# 7. Argumentos de entrenamiento

training_args = TrainingArguments(
    output_dir="test_trainer",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="logs",
    logging_steps=50,
    save_strategy="epoch",          # guardar al final de cada epoch
    eval_strategy="epoch",          # evaluar al final de cada epoch
    load_best_model_at_end=True,    # ahora sí va a funcionar
)







# 8. Entrenador
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

# 9. Entrenar
trainer.train()

# 10. Evaluar
results = trainer.evaluate()
print(results)





Map:   0%|          | 0/25569 [00:00<?, ? examples/s]

Map:   0%|          | 0/6393 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1543,0.114358,0.964336
2,0.0575,0.143994,0.969185




{'eval_loss': 0.11435767263174057, 'eval_accuracy': 0.9643359924917879, 'eval_runtime': 1524.089, 'eval_samples_per_second': 4.195, 'eval_steps_per_second': 0.525, 'epoch': 2.0}
