In [2]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, EvalPrediction, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Subset 
from torch.optim import AdamW
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
import wandb
import optuna
import pandas as pd
import numpy as np

### Save the path to the different datasets

In [3]:
train_en_path = "./data_sources/train/train_en.csv"
test_en_path = "./data_sources/test/test_en.csv"

train_it_path = "./data_sources/train/train_it.csv"
test_it_path = "./data_sources/test/test_it.csv"

train_es_path = "./data_sources/train/train_es.csv"
test_es_path = "./data_sources/test/test_es.csv"

### Set up W&B

In [4]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33msravisconti[0m ([33msravisconti-projects[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

### Load data in DatasetDict

In [5]:
# Load train and test CSVs
# dataset_it = load_dataset("csv", data_files={
#     "train": train_it_path,
#     "test": test_it_path
# })

# Load CSV manually for the train split
train_df = pd.read_csv(train_it_path)
test_df = pd.read_csv(test_it_path)

# Split into train and val with stratification
train_df, val_df = train_test_split(
    train_df, test_size=0.2, stratify=train_df["label"], random_state=42
)

# Convert back to Hugging Face Datasets
dataset_it = DatasetDict({
    "train": Dataset.from_pandas(train_df.reset_index(drop=True)),
    "val": Dataset.from_pandas(val_df.reset_index(drop=True)),
    "test": Dataset.from_pandas(test_df)
})

In [5]:
print(dataset_it)

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'bio', 'label', 'lang'],
        num_rows: 694
    })
    val: Dataset({
        features: ['id', 'text', 'bio', 'label', 'lang'],
        num_rows: 174
    })
    test: Dataset({
        features: ['id', 'text', 'bio', 'label', 'lang'],
        num_rows: 218
    })
})


### Tokenize the dataset

In [6]:
model_name = "xlm-roberta-base"
# TODO: later try with "cardiffnlp/twitter-xlm-roberta-base"

# loads the correct tokenizer for the chosen model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# define a function to tokenize the text data
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

# applies tokenize function to batches of examples to tokenize the entire dataset
# Returns a dictionary with:
    # input_ids → token IDs (instead of raw text)
    # attention_mask → mask indicating which tokens are real vs padding
    # label → original label from dataset
tokenized_dataset_it = dataset_it.map(tokenize, batched=True)

# the type of the input_ids is list
# batch = tokenized_dataset_it["train"][0]
# print(type(batch["input_ids"]))

# set the format of the dataset to PyTorch tensors before feeding it to the model
tokenized_dataset_it.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
)

# now the type of the input_ids is torch.Tensor
# batch = tokenized_dataset_it["train"][0]
# print(type(batch["input_ids"]))

Map: 100%|██████████| 694/694 [00:00<00:00, 8017.18 examples/s]
Map: 100%|██████████| 174/174 [00:00<00:00, 8763.00 examples/s]
Map: 100%|██████████| 218/218 [00:00<00:00, 9096.10 examples/s]


### Define metrics

In [11]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions.argmax(-1)
    labels = p.label_ids
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")
    precision = precision_score(labels, preds, average="macro")
    recall = recall_score(labels, preds, average="macro")

    # For ROC AUC, we need probabilities
    if p.predictions.shape[1] == 2:  # binary classification
        probs = torch.softmax(torch.tensor(p.predictions), dim=1)[:,1].numpy()
        roc_auc = roc_auc_score(labels, probs)
    else:
        roc_auc = 0.0

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "roc_auc": roc_auc
    }

### Define custom Trainer

In [8]:
# Compute class weights from training set
labels = np.array(tokenized_dataset_it["train"]["label"])
print("Labels distribution in training set:", np.bincount(labels))
class_counts = np.bincount(labels)
total = len(labels)
# We want to give more weight to the class that has smaller count
class_weights = total / (len(class_counts) * class_counts)
weights_tensor = torch.tensor(class_weights, dtype=torch.float)

print("Class weights:", weights_tensor)

# Custom Trainer that uses weighted cross entropy
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss_fct = nn.CrossEntropyLoss(weight=weights_tensor.to(logits.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

Labels distribution in training set: [562 132]
Class weights: tensor([0.6174, 2.6288])


### Define objective function for Optuna

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_labels = 2  # binary classification
batch_size = 8
num_epochs = 4

# Defines one experiment (trial)
def objective(trial):
    # Create a new W&B run for this trial
    wandb.init(
        project="multi-pride-bert-baseline_fixed_epochs_and_batch_size", 
        name=f"trial-{trial.number}",
        reinit="return_previous"
    )

    # Sample hyperparameters for this trial
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    # batch_size = trial.suggest_categorical("batch_size", [8, 16])
    # num_epochs = trial.suggest_int("num_epochs", 2, 5)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)

    # Initialize new model for this trial
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    training_args = TrainingArguments(
        output_dir=f"./results/trial-{trial.number}",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="steps",   # <-- log training loss every few steps
        logging_steps=10,           # <-- frequency of training loss logging
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=weight_decay,
        num_train_epochs=num_epochs,
        logging_dir="./logs",
        report_to="wandb",
        run_name=f"trial-{trial.number}",
        load_best_model_at_end=True,
        metric_for_best_model="f1"
    )

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset_it["train"],
        eval_dataset=tokenized_dataset_it["val"],
        compute_metrics=compute_metrics
    )

    train_result = trainer.train()
    eval_result = trainer.evaluate()

    # ---- 8. Log both train & eval metrics to W&B ----
    wandb.log({
        "train/loss": train_result.training_loss,
        "eval/loss": eval_result["eval_loss"],
        "eval/f1": eval_result["eval_f1"],
        "eval/precision": eval_result["eval_precision"],
        "eval/recall": eval_result["eval_recall"],
        "eval/accuracy": eval_result["eval_accuracy"],
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "weight_decay": weight_decay,
        "num_epochs": num_epochs
    })

    # Finish this run cleanly
    wandb.finish()

    # Return metric Optuna should maximize
    return eval_result["eval_f1"]

### Optuna Tuning

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)  
print("Best trial:")
print(study.best_trial.params)

[I 2025-11-11 15:07:57,659] A new study created in memory with name: no-name-be6ccac4-a48c-43a2-b66f-d3908ac89198


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.478,0.255537,0.942529,0.914184,0.886485,0.952934,0.965184
2,0.3992,0.187726,0.95977,0.933786,0.93904,0.928756,0.981947
3,0.3879,0.256179,0.965517,0.943907,0.943907,0.943907,0.98775
4,0.3536,0.513945,0.948276,0.905589,0.97,0.863636,0.977649




[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
batch_size,▁
eval/accuracy,▁▆█▃██
eval/f1,▃▆█▁██
eval/loss,▂▁▂█▂▂
eval/precision,▁▅▆█▆▆
eval/recall,█▆▇▁▇▇
eval/roc_auc,▁▆█▅█
eval/runtime,▄▁▄▄█
eval/samples_per_second,▅█▄▅▁
eval/steps_per_second,▅█▄▅▁

0,1
batch_size,8
eval/accuracy,0.96552
eval/f1,0.94391
eval/loss,0.25618
eval/precision,0.94391
eval/recall,0.94391
eval/roc_auc,0.98775
eval/runtime,23.8301
eval/samples_per_second,7.302
eval/steps_per_second,0.923


[I 2025-11-11 15:41:16,752] Trial 0 finished with value: 0.9439071566731141 and parameters: {'learning_rate': 2.5838554043134063e-05, 'weight_decay': 0.11709120484330249}. Best is trial 0 with value: 0.9439071566731141.


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.5199,0.256699,0.954023,0.92521,0.92521,0.92521,0.988395
2,0.3394,0.265688,0.965517,0.939583,0.979592,0.909091,0.986031
3,0.3485,0.248787,0.954023,0.921498,0.944828,0.901999,0.986245
4,0.2584,0.384554,0.95977,0.928567,0.976351,0.893939,0.983666




[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
batch_size,▁
eval/accuracy,▁█▁▅██
eval/f1,▂█▁▄██
eval/loss,▁▂▁█▂▂
eval/precision,▁█▄███
eval/recall,█▄▃▁▄▄
eval/roc_auc,█▅▅▁▅
eval/runtime,█▅▁▃▂
eval/samples_per_second,▁▄█▆▇
eval/steps_per_second,▁▄█▆▇

0,1
batch_size,8
eval/accuracy,0.96552
eval/f1,0.93958
eval/loss,0.26569
eval/precision,0.97959
eval/recall,0.90909
eval/roc_auc,0.98603
eval/runtime,24.4797
eval/samples_per_second,7.108
eval/steps_per_second,0.899


[I 2025-11-11 16:16:07,413] Trial 1 finished with value: 0.9395833333333333 and parameters: {'learning_rate': 1.0592850462189524e-05, 'weight_decay': 0.05603714798343684}. Best is trial 0 with value: 0.9439071566731141.


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.4153,0.332366,0.867816,0.82098,0.78988,0.895229,0.897271
2,0.2302,0.614231,0.936782,0.884608,0.945833,0.844939,0.906727
3,0.0411,0.258583,0.95977,0.935306,0.930462,0.940361,0.985171
4,0.0892,0.414161,0.942529,0.899306,0.935752,0.871696,0.985171




[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
batch_size,▁
eval/accuracy,▁▆█▇██
eval/f1,▁▅█▆██
eval/loss,▂█▁▄▁▁
eval/precision,▁█▇█▇▇
eval/recall,▅▁█▃██
eval/roc_auc,▁▂███
eval/runtime,▃▂█▁▇
eval/samples_per_second,▅▇▁█▂
eval/steps_per_second,▆▇▁█▂

0,1
batch_size,8
eval/accuracy,0.95977
eval/f1,0.93531
eval/loss,0.25858
eval/precision,0.93046
eval/recall,0.94036
eval/roc_auc,0.98517
eval/runtime,23.7765
eval/samples_per_second,7.318
eval/steps_per_second,0.925


[I 2025-11-11 16:49:38,013] Trial 2 finished with value: 0.9353056780156159 and parameters: {'learning_rate': 2.657845678465721e-05, 'weight_decay': 0.006787039205170075}. Best is trial 0 with value: 0.9439071566731141.


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.4676,0.201274,0.936782,0.90652,0.877165,0.949387,0.983666
2,0.442,0.306738,0.95977,0.928567,0.976351,0.893939,0.984096
3,0.2108,0.223248,0.95977,0.933786,0.93904,0.928756,0.985386
4,0.1905,0.39228,0.95977,0.928567,0.976351,0.893939,0.984956




[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
batch_size,▁
eval/accuracy,▁█████
eval/f1,▁▇█▇██
eval/loss,▁▅▂█▂▂
eval/precision,▁█▅█▅▅
eval/recall,█▁▅▁▅▅
eval/roc_auc,▁▃█▆█
eval/runtime,██▅▅▁
eval/samples_per_second,▁▁▄▄█
eval/steps_per_second,▁▁▄▄█

0,1
batch_size,8
eval/accuracy,0.95977
eval/f1,0.93379
eval/loss,0.22325
eval/precision,0.93904
eval/recall,0.92876
eval/roc_auc,0.98539
eval/runtime,21.0745
eval/samples_per_second,8.256
eval/steps_per_second,1.044


[I 2025-11-11 17:22:46,072] Trial 3 finished with value: 0.9337863549877685 and parameters: {'learning_rate': 1.1552318327983594e-05, 'weight_decay': 0.03813173093123977}. Best is trial 0 with value: 0.9439071566731141.


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.5182,0.227777,0.971264,0.95379,0.948739,0.959059,0.987105
2,0.4235,0.278615,0.965517,0.939583,0.979592,0.909091,0.979368
3,0.3676,0.20973,0.95977,0.933786,0.93904,0.928756,0.988395
4,0.2557,0.34901,0.95977,0.928567,0.976351,0.893939,0.982377




[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
batch_size,▁
eval/accuracy,█▄▁▁██
eval/f1,█▄▂▁██
eval/loss,▂▄▁█▂▂
eval/precision,▃█▁▇▃▃
eval/recall,█▃▅▁██
eval/roc_auc,▇▁█▃▇
eval/runtime,▁▄▆▆█
eval/samples_per_second,█▄▃▃▁
eval/steps_per_second,█▄▃▃▁

0,1
batch_size,8
eval/accuracy,0.97126
eval/f1,0.95379
eval/loss,0.22778
eval/precision,0.94874
eval/recall,0.95906
eval/roc_auc,0.98711
eval/runtime,23.7604
eval/samples_per_second,7.323
eval/steps_per_second,0.926


[I 2025-11-11 17:55:00,039] Trial 4 finished with value: 0.9537897700111542 and parameters: {'learning_rate': 1.0327292968111504e-05, 'weight_decay': 0.06144150815589746}. Best is trial 4 with value: 0.9537897700111542.


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.3404,0.308886,0.885057,0.845553,0.811321,0.929078,0.91038
2,0.8426,0.265609,0.965517,0.942562,0.953756,0.932302,0.973995
3,0.1246,0.295609,0.95977,0.933786,0.93904,0.928756,0.987965
4,0.0594,0.344342,0.95977,0.933786,0.93904,0.928756,0.98775




[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
batch_size,▁
eval/accuracy,▁█████
eval/f1,▁█▇▇██
eval/loss,▅▁▄█▁▁
eval/precision,▁█▇▇██
eval/recall,▂█▁▁██
eval/roc_auc,▁▇██▇
eval/runtime,▆▁▄▂█
eval/samples_per_second,▃█▅▇▁
eval/steps_per_second,▃█▅▇▁

0,1
batch_size,8
eval/accuracy,0.96552
eval/f1,0.94256
eval/loss,0.26561
eval/precision,0.95376
eval/recall,0.9323
eval/roc_auc,0.974
eval/runtime,25.7079
eval/samples_per_second,6.768
eval/steps_per_second,0.856


[I 2025-11-11 18:29:13,295] Trial 5 finished with value: 0.9425616197183099 and parameters: {'learning_rate': 3.419249276404569e-05, 'weight_decay': 0.12480186858696667}. Best is trial 4 with value: 0.9537897700111542.


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.4562,0.211992,0.91954,0.88433,0.85168,0.938749,0.96153
2,0.5333,0.398391,0.954023,0.917241,0.973154,0.878788,0.942833
3,0.422,0.33459,0.965517,0.941123,0.965517,0.920696,0.970557
4,0.2568,0.417013,0.954023,0.917241,0.973154,0.878788,0.981732




[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
batch_size,▁
eval/accuracy,▁▆█▆██
eval/f1,▁▅█▅██
eval/loss,▁▇▅█▅▅
eval/precision,▁█████
eval/recall,█▁▆▁▆▆
eval/roc_auc,▄▁▆█▆
eval/runtime,▁▁▃▇█
eval/samples_per_second,██▆▂▁
eval/steps_per_second,██▆▂▁

0,1
batch_size,8
eval/accuracy,0.96552
eval/f1,0.94112
eval/loss,0.33459
eval/precision,0.96552
eval/recall,0.9207
eval/roc_auc,0.97056
eval/runtime,24.8679
eval/samples_per_second,6.997
eval/steps_per_second,0.885


[I 2025-11-11 19:03:38,578] Trial 6 finished with value: 0.941123392736296 and parameters: {'learning_rate': 1.2595311189837818e-05, 'weight_decay': 0.08732439085204093}. Best is trial 4 with value: 0.9537897700111542.


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.2897,0.2952,0.890805,0.849693,0.815968,0.921019,0.985386
2,0.2414,0.305873,0.948276,0.914868,0.919894,0.910058,0.980658
3,0.0667,0.365993,0.954023,0.921498,0.944828,0.901999,0.984741
4,0.0488,0.413705,0.942529,0.899306,0.935752,0.871696,0.983022




[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
batch_size,▁
eval/accuracy,▁▇█▇██
eval/f1,▁▇█▆██
eval/loss,▁▂▅█▅▅
eval/precision,▁▇████
eval/recall,█▆▅▁▅▅
eval/roc_auc,█▁▇▅▇
eval/runtime,▅▆█▅▁
eval/samples_per_second,▄▃▁▄█
eval/steps_per_second,▄▃▁▄█

0,1
batch_size,8
eval/accuracy,0.95402
eval/f1,0.9215
eval/loss,0.36599
eval/precision,0.94483
eval/recall,0.902
eval/roc_auc,0.98474
eval/runtime,22.6816
eval/samples_per_second,7.671
eval/steps_per_second,0.97


[I 2025-11-11 19:38:53,253] Trial 7 finished with value: 0.9214978569817279 and parameters: {'learning_rate': 2.5134362640363844e-05, 'weight_decay': 0.2588509643495115}. Best is trial 4 with value: 0.9537897700111542.


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.4167,0.191139,0.936782,0.90652,0.877165,0.949387,0.966043
2,0.4583,0.337532,0.95977,0.928567,0.976351,0.893939,0.984956
3,0.2281,0.381831,0.965517,0.939583,0.979592,0.909091,0.990974
4,0.1215,0.407513,0.954023,0.919444,0.957672,0.890393,0.987965


[W 2025-11-11 20:13:29,538] Trial 8 failed with parameters: {'learning_rate': 1.9651715790283235e-05, 'weight_decay': 0.10665126050623909} because of the following error: RuntimeError('[enforce fail at inline_container.cc:664] . unexpected pos 46720 vs 46612').
Traceback (most recent call last):
  File "c:\Users\sravi\Desktop\Projects\multi-pride\.venv\Lib\site-packages\torch\serialization.py", line 967, in save
    _save(
  File "c:\Users\sravi\Desktop\Projects\multi-pride\.venv\Lib\site-packages\torch\serialization.py", line 1268, in _save
    zip_file.write_record(name, storage, num_bytes)
RuntimeError: [enforce fail at inline_container.cc:863] . PytorchStreamWriter failed writing file data/1: file write failed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\sravi\Desktop\Projects\multi-pride\.venv\Lib\site-packages\optuna\study\_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
         

RuntimeError: [enforce fail at inline_container.cc:664] . unexpected pos 46720 vs 46612

### Train final model

In [12]:
# Merge train + val for final training
final_train_df = pd.concat([train_df, val_df])
final_train_dataset = Dataset.from_pandas(final_train_df)
final_train_dataset = final_train_dataset.map(tokenize, batched=True)
final_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Load best hyperparameters
# best_params = study.best_trial.params
best_params = {
    "learning_rate": 1.0327292968111504e-05,
    "weight_decay": 0.06144150815589746
}

final_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

final_args = TrainingArguments(
    output_dir="./final_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1, 
    logging_strategy="steps",   # <-- log training loss every few steps
    logging_steps=10,           # <-- frequency of training loss logging
    learning_rate=best_params["learning_rate"],
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=best_params["weight_decay"],
    num_train_epochs=num_epochs,
    logging_dir="./logs",
    report_to="wandb",
    run_name="final-model",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

final_trainer = WeightedTrainer(
    model=final_model,
    args=final_args,
    train_dataset=final_train_dataset,
    eval_dataset=tokenized_dataset_it["test"],  # final test only once!
    compute_metrics=compute_metrics,
)

wandb.init(
        project="multi-pride-bert-baseline_fixed_epochs_and_batch_size", 
        name=f"final model",
        reinit="return_previous",
        resume="allow"
    )

final_trainer.train()
final_results = final_trainer.evaluate()
wandb.log(final_results)
print("Final test metrics:", final_results)

Map: 100%|██████████| 868/868 [00:00<00:00, 11521.39 examples/s]
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.4491,0.390409,0.917431,0.867289,0.867289,0.867289,0.950758
2,0.5666,0.283283,0.93578,0.89486,0.902528,0.887716,0.96645
3,0.1803,0.431339,0.93578,0.89486,0.902528,0.887716,0.963609
4,0.3237,0.481175,0.93578,0.89486,0.902528,0.887716,0.966315




Final test metrics: {'eval_loss': 0.2832830548286438, 'eval_accuracy': 0.9357798165137615, 'eval_f1': 0.8948601350420284, 'eval_precision': 0.9025280898876404, 'eval_recall': 0.8877164502164503, 'eval_roc_auc': 0.9664502164502164, 'eval_runtime': 28.9454, 'eval_samples_per_second': 7.531, 'eval_steps_per_second': 0.967, 'epoch': 4.0}
