In [1]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
# Cargar el dataset
df = pd.read_csv("BullyingFinal.csv")

# Asegurar columnas correctas
df = df[["Text", "oh_label"]].dropna()
df["labels"] = df["oh_label"].astype(int)

df = df.sample(frac=0.25,random_state=42)
dataset = Dataset.from_pandas(df[["Text", "labels"]])

In [3]:
print(df['labels'].value_counts())

labels
0    64041
1    20681
Name: count, dtype: int64


In [None]:
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenizador
def tokenize_function(example):
    return tokenizer(example["Text"], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [63]:
# Dividir 70/30
split_dataset = tokenized_dataset.train_test_split(test_size=0.3)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

In [64]:
print(tokenized_dataset[5])

{'Text': 'thats not i try to say it was normal protests', 'labels': 0, '__index_level_0__': 145363, 'input_ids': [0, 450, 7, 959, 17, 9790, 47, 5154, 442, 509, 3638, 18782, 7, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [65]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
import torch
print(torch.cuda.is_available())     # → True si tienes GPU activa
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 4060 Laptop GPU


In [67]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [68]:
training_args = TrainingArguments(
    output_dir="./xlmr_bullying_results2",
    save_strategy="epoch",
    eval_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
)
from torch.nn import CrossEntropyLoss
import torch

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch = None):
        labels = inputs.pop("labels")
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # --- CLASS WEIGHTING ---
        # Calculate weights to penalize errors on the minority class more
        label_counts = df['labels'].value_counts().sort_index()
        total_samples = label_counts.sum()
        num_classes = len(label_counts)

        # weight = total_samples / (num_classes * count_of_that_class)
        weights = total_samples / (num_classes * label_counts)

        # Convert weights to a tensor and move to the correct device (CPU/GPU)
        class_weights = torch.tensor(weights.values, dtype=torch.float).to(self.args.device)

        # Define the loss function with the calculated weights
        loss_fct = CrossEntropyLoss(weight=class_weights)

        # Compute the loss
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = CustomTrainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 