In [None]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Load the dataset
df = pd.read_csv("BullyingFinal.csv")

# Keep only the text and label columns, drop rows with missing values, and
# expose the label as an integer "labels" column.
df = df[["Text", "oh_label"]].dropna()
df["labels"] = df["oh_label"].astype(int)

# dropna() can leave gaps in the index; without preserve_index=False the
# leftover pandas index may be carried into the Dataset as an extra
# "__index_level_0__" column, so the schema would depend on the data.
dataset = Dataset.from_pandas(df[["Text", "labels"]], preserve_index=False)

In [None]:
# Name of the pretrained checkpoint used for the tokenizer
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(example):
    """Tokenize the "Text" field of `example` with truncation enabled.

    Args:
        example: Mapping with a "Text" entry; because the dataset is mapped
            with batched=True below, this is a batch of rows.

    Returns:
        Whatever the tokenizer returns for the given text(s).
    """
    texts = example["Text"]
    return tokenizer(texts, truncation=True)


# Apply the tokenizer over the whole dataset, batch by batch
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# 70/30 train/eval split. The seed is fixed so that the same rows land in
# each partition on every run (otherwise metrics are not comparable between
# kernel restarts).
SPLIT_SEED = 42
split_dataset = tokenized_dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

In [None]:
# Show the row at index 5 of the tokenized dataset for inspection
print(tokenized_dataset[5])

In [None]:
# Load the pretrained checkpoint as a sequence-classification model with 2 output labels
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
import torch

# Report whether a CUDA GPU is visible. The device name is only queried when
# one is available: torch.cuda.get_device_name(0) raises on a machine without
# a GPU, which would abort a "Run All" of the notebook.
cuda_available = torch.cuda.is_available()
print(cuda_available)     # → True when a GPU is active
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected; training will run on CPU.")

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for an evaluation pass.

    Args:
        eval_pred: Pair that unpacks into (logits, labels). The predicted
            class of each row is the argmax of its logits over the last axis.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1";
        precision/recall/F1 are computed with average='binary'.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    # The fourth element (support) of the sklearn result is not reported.
    prf = precision_recall_fscore_support(gold, predicted, average='binary')

    metrics = {"accuracy": accuracy_score(gold, predicted)}
    metrics.update(zip(("precision", "recall", "f1"), prf[:3]))
    return metrics

In [None]:
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# (TrainingArguments) and `tokenizer` to `processing_class` (Trainer); the old
# names eventually raise TypeError. Pick whichever keyword the installed
# version accepts so this cell runs on both old and new releases.
_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_kw = (
    "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"
)
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kw = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="./xlmr_bullying_results",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    **{_eval_strategy_kw: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
    **{_tokenizer_kw: tokenizer},
)

# Run fine-tuning
trainer.train()