In [1]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, set_seed
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from torch.utils.data import DataLoader
import time
import random
import pandas as pd

# from utils import compute_metrics, run_experiment

# For reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
set_seed(seed)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [2]:
# Load tweet eval dataset

dataset = load_dataset("tweet_eval", "irony")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

irony/train-00000-of-00001.parquet:   0%|          | 0.00/183k [00:00<?, ?B/s]

irony/test-00000-of-00001.parquet:   0%|          | 0.00/54.0k [00:00<?, ?B/s]

irony/validation-00000-of-00001.parquet:   0%|          | 0.00/61.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2862 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/784 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/955 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2862
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 784
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 955
    })
})


In [3]:
# Class distribution

train_labels = dataset['train']['label']
print(f"  Non-ironic (0): {train_labels.count(0)} ({train_labels.count(0)/len(train_labels)*100:.1f}%)")
print(f"  Ironic (1): {train_labels.count(1)} ({train_labels.count(1)/len(train_labels)*100:.1f}%)")

  Non-ironic (0): 1417 (49.5%)
  Ironic (1): 1445 (50.5%)


In [4]:
# Define metrics for evaluation

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=-1)
    probs = torch.nn.functional.softmax(torch.tensor(pred.predictions), dim=-1)[:, 1].numpy()

    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds),
        "f1": f1_score(labels, preds, average="binary"),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "roc_auc": roc_auc_score(labels, probs)
    }


In [17]:
# Experiment setup

def run_experiment(model_name, strategy, learning_rate=2e-5, batch_size=16, epochs=3, lora_config=None):
    print(f"\nTraining {model_name} | {strategy} | lr={learning_rate} | batch_size={batch_size} | epochs={epochs}")

    # Initialize the tokenizer for this model
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenize dataset and convert to pytorch
    tokenized_data = dataset.map(
        lambda x: tokenizer(x['text'], padding='max_length', truncation=True, max_length=128),
        batched=True
    )
    tokenized_data = tokenized_data.rename_column('label', 'labels')
    tokenized_data.set_format('torch')

    # Load models
    if strategy == "full":
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    elif strategy == "lora":
        base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

        # Have to determine target modules for LoRA
        if model_name.startswith("distilbert"):
            target_modules = ["q_lin", "v_lin"]
        elif model_name.startswith("roberta"):
            target_modules = ["query", "value"]
        else:
            target_modules = None

        # LoRA configurations (create default ones if theres no lora_config)
        if lora_config is None:
            lora_config = LoraConfig(
                task_type=TaskType.SEQ_CLS,
                r=8,
                lora_alpha=16,
                lora_dropout=0.1,
                bias="none",
                target_modules=target_modules)
        else:
            # If target_modules not set in the provided config, fill it
            if getattr(lora_config, "target_modules", None) is None:
                lora_config.target_modules = target_modules

        model = get_peft_model(base_model, lora_config)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Trainable params: {trainable_params:,}/{total_params:,}")

    # Move model to GPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./results/{model_name}_{strategy}_lr{learning_rate}_bs{batch_size}_ep{epochs}",
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        logging_steps=100,
        warmup_steps=100,
        report_to="none",
        dataloader_num_workers=0)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data['train'],
        eval_dataset=tokenized_data['validation'],
        compute_metrics=compute_metrics)

    # Train & evaluate + track training time
    start_time = time.time()
    trainer.train()
    train_time = time.time() - start_time

    eval_results = trainer.evaluate(tokenized_data['test'])
    preds = trainer.predict(tokenized_data['test'])
    y_pred = np.argmax(preds.predictions, axis=-1)
    y_true = preds.label_ids
    cm = confusion_matrix(y_true, y_pred)

    test_texts = dataset['test']['text']
    df_test = pd.DataFrame({
        'text': test_texts,
        'true_label': y_true,
        'pred_label': y_pred
    })
    error_cases = df_test[df_test['true_label'] != df_test['pred_label']]

    return {
        "mode": model,
        "tokenizer": tokenizer,
        "model": model_name,
        "strategy": strategy,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "epochs": epochs,
        "train_time_s": train_time,
        "accuracy": eval_results.get('eval_accuracy'),
        "precision": eval_results.get('eval_precision'),
        "recall": eval_results.get('eval_recall'),
        "f1": eval_results.get('eval_f1'),
        "f1_macro": eval_results.get('eval_f1_macro'),
        "roc_auc": eval_results.get('eval_roc_auc'),
        "confusion_matrix": cm,
        "error_cases": error_cases.head(10)
    }


In [None]:
# First, run the baseline experiments for distilbert and roberta

baseline_results = []
baseline_models = ["distilbert-base-uncased", "roberta-base"]

for model_name in baseline_models:
    for strategy in ["full", "lora"]:
        lora_cfg = None
        if strategy == "lora":
            lora_cfg = LoraConfig(task_type=TaskType.SEQ_CLS, r=8, lora_alpha=16, lora_dropout=0.1, bias="none")
        result = run_experiment(model_name, strategy, learning_rate=2e-5, batch_size=16, epochs=3, lora_config=lora_cfg)
        baseline_results.append(result)


Training distilbert-base-uncased | full | lr=2e-05 | batch_size=16 | epochs=3


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2862 [00:00<?, ? examples/s]

Map:   0%|          | 0/784 [00:00<?, ? examples/s]

Map:   0%|          | 0/955 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable params: 66,955,010/66,955,010


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Macro,Roc Auc
1,0.6789,0.633159,0.609424,0.563166,0.811404,0.66487,0.598432,0.716499
2,0.5585,0.59866,0.68377,0.654,0.717105,0.6841,0.683769,0.753186
3,0.4251,0.626694,0.695288,0.659574,0.747807,0.700925,0.69518,0.76401



Training distilbert-base-uncased | lora | lr=2e-05 | batch_size=16 | epochs=3


Map:   0%|          | 0/784 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable params: 739,586/67,694,596


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Macro,Roc Auc
1,0.6945,0.683266,0.57801,0.549533,0.644737,0.59334,0.57741,0.624776
2,0.6756,0.673734,0.588482,0.560229,0.642544,0.59857,0.588222,0.627751
3,0.6687,0.670176,0.584293,0.561587,0.589912,0.575401,0.584111,0.629021



Training roberta-base | full | lr=2e-05 | batch_size=16 | epochs=3


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/2862 [00:00<?, ? examples/s]

Map:   0%|          | 0/784 [00:00<?, ? examples/s]

Map:   0%|          | 0/955 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable params: 124,647,170/124,647,170


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Macro,Roc Auc
1,0.6869,0.587299,0.681675,0.637184,0.774123,0.69901,0.680616,0.769605
2,0.5215,0.602435,0.717277,0.645768,0.903509,0.753199,0.711158,0.830635
3,0.3829,0.574406,0.742408,0.692308,0.828947,0.754491,0.741783,0.841877



Training roberta-base | lora | lr=2e-05 | batch_size=16 | epochs=3


Map:   0%|          | 0/784 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable params: 887,042/125,534,212


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Macro,Roc Auc
1,0.6936,0.692309,0.543455,0.512853,0.875,0.646677,0.500854,0.58207
2,0.6907,0.690357,0.568586,0.531519,0.813596,0.642981,0.549004,0.604832
3,0.6882,0.689239,0.579058,0.543269,0.743421,0.627778,0.57172,0.609161


In [None]:
df_baseline_results = pd.DataFrame(baseline_results)
df_baseline_results

Unnamed: 0,mode,tokenizer,model,strategy,learning_rate,batch_size,epochs,train_time_s,accuracy,precision,recall,f1,f1_macro,roc_auc,confusion_matrix
0,DistilBertForSequenceClassification(\n (disti...,DistilBertTokenizerFast(name_or_path='distilbe...,distilbert-base-uncased,full,2e-05,16,3,109.737457,0.677296,0.573232,0.729904,0.64215,0.674153,0.747429,"[[304, 169], [84, 227]]"
1,PeftModelForSequenceClassification(\n (base_m...,DistilBertTokenizerFast(name_or_path='distilbe...,distilbert-base-uncased,lora,2e-05,16,3,74.295968,0.568878,0.463612,0.553055,0.504399,0.561454,0.575128,"[[274, 199], [139, 172]]"
2,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,2e-05,16,3,249.504292,0.709184,0.616901,0.70418,0.657658,0.702443,0.784539,"[[337, 136], [92, 219]]"
3,PeftModelForSequenceClassification(\n (base_m...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,lora,2e-05,16,3,141.687181,0.545918,0.446301,0.601286,0.512329,0.543754,0.577592,"[[241, 232], [124, 187]]"


                     model strategy  accuracy  f1_macro  train_time_s
2             roberta-base     full  0.709184  0.702443    297.390193
0  distilbert-base-uncased     full  0.677296  0.674153    136.627033
1  distilbert-base-uncased     lora  0.568878  0.561454     70.781740
3             roberta-base     lora  0.545918  0.543754    136.509491


In [None]:
# Run RobERTA

roberta_results = []

# Baseline
baseline_lr = 2e-5
baseline_full_bs = 16
baseline_lora_bs = 16
baseline_epochs = 3
baseline_lora_dropout = 0.1

# Hyperparameters to tune

learning_rates = [2e-5, 3e-5, 4e-5]
full_batch_sizes = [8, 16, 32]
lora_batch_sizes = [8, 16]
epochs_list = [3, 4]
lora_dropouts = [0.05, 0.1]

# For fine-tuning, vary fine-tuning one factor at a time first
# For full fine-tuning
print("Full fine-tuning")

# 1. Learning rate
for lr in learning_rates:
    print(f"FULL: lr={lr}, bs={baseline_full_bs}, epochs={baseline_epochs}")
    result = run_experiment(
        model_name="roberta-base",
        strategy="full",
        learning_rate=lr,
        batch_size=baseline_full_bs,
        epochs=baseline_epochs,
        lora_config=None
    )
    roberta_results.append(result)


Full fine-tuning
FULL: lr=2e-05, bs=16, epochs=3

Training roberta-base | full | lr=2e-05 | batch_size=16 | epochs=3


Map:   0%|          | 0/955 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable params: 124,647,170/124,647,170


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Macro,Roc Auc
1,0.6869,0.587299,0.681675,0.637184,0.774123,0.69901,0.680616,0.769605
2,0.5215,0.602435,0.717277,0.645768,0.903509,0.753199,0.711158,0.830635
3,0.3829,0.574406,0.742408,0.692308,0.828947,0.754491,0.741783,0.841877


FULL: lr=3e-05, bs=16, epochs=3

Training roberta-base | full | lr=3e-05 | batch_size=16 | epochs=3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable params: 124,647,170/124,647,170


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Macro,Roc Auc
1,0.6834,0.589197,0.678534,0.632327,0.780702,0.698724,0.677084,0.7769
2,0.4965,0.6056,0.701571,0.63297,0.892544,0.740673,0.694628,0.835179
3,0.3366,0.581767,0.759162,0.713208,0.828947,0.766734,0.758908,0.850025


FULL: lr=4e-05, bs=16, epochs=3

Training roberta-base | full | lr=4e-05 | batch_size=16 | epochs=3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable params: 124,647,170/124,647,170


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Macro,Roc Auc
1,0.6772,0.610679,0.671204,0.663594,0.631579,0.647191,0.669674,0.729011
2,0.5224,0.607442,0.713089,0.652685,0.85307,0.739544,0.710098,0.8176
3,0.3589,0.571027,0.754974,0.713462,0.813596,0.760246,0.754855,0.846395


In [None]:
df_roberta_results = pd.DataFrame(roberta_results)
df_roberta_results

Unnamed: 0,mode,tokenizer,model,strategy,learning_rate,batch_size,epochs,train_time_s,accuracy,precision,recall,f1,f1_macro,roc_auc,confusion_matrix
0,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,2e-05,16,3,379.027443,0.709184,0.616901,0.70418,0.657658,0.702443,0.784539,"[[337, 136], [92, 219]]"
1,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,3e-05,16,3,345.098369,0.705357,0.60929,0.717042,0.658789,0.699765,0.774824,"[[330, 143], [88, 223]]"
2,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,4e-05,16,3,389.204346,0.71301,0.617486,0.726688,0.667651,0.707563,0.78131,"[[333, 140], [85, 226]]"


In [None]:
# 2. Batch size (use best LR from previous step manually)
best_lr_full = 4e-5
full_batch_sizes = [8, 32] # Already have for batch size 16

for bs in full_batch_sizes:
    print(f"FULL: lr={best_lr_full}, bs={bs}, epochs={baseline_epochs}")
    result = run_experiment(
        model_name="roberta-base",
        strategy="full",
        learning_rate=best_lr_full,
        batch_size=bs,
        epochs=baseline_epochs,
        lora_config=None
    )
    roberta_results.append(result)

FULL: lr=4e-05, bs=8, epochs=3

Training roberta-base | full | lr=4e-05 | batch_size=8 | epochs=3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable params: 124,647,170/124,647,170


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Macro,Roc Auc
1,0.6519,0.615022,0.673298,0.615756,0.839912,0.710575,0.667788,0.795169
2,0.45,0.562251,0.747644,0.731183,0.745614,0.738328,0.747324,0.829721
3,0.3121,0.80136,0.74555,0.702087,0.811404,0.752798,0.745331,0.846526


FULL: lr=4e-05, bs=32, epochs=3

Training roberta-base | full | lr=4e-05 | batch_size=32 | epochs=3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable params: 124,647,170/124,647,170


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Macro,Roc Auc
1,No log,0.596245,0.698429,0.702899,0.638158,0.668966,0.696021,0.765755
2,0.652400,0.554786,0.722513,0.652313,0.89693,0.755309,0.717437,0.84258
3,0.487200,0.543724,0.754974,0.715116,0.809211,0.759259,0.754896,0.852248


In [None]:
df_roberta_results = pd.DataFrame(roberta_results)
df_roberta_results

Unnamed: 0,mode,tokenizer,model,strategy,learning_rate,batch_size,epochs,train_time_s,accuracy,precision,recall,f1,f1_macro,roc_auc,confusion_matrix
0,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,2e-05,16,3,379.027443,0.709184,0.616901,0.70418,0.657658,0.702443,0.784539,"[[337, 136], [92, 219]]"
1,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,3e-05,16,3,345.098369,0.705357,0.60929,0.717042,0.658789,0.699765,0.774824,"[[330, 143], [88, 223]]"
2,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,4e-05,16,3,389.204346,0.71301,0.617486,0.726688,0.667651,0.707563,0.78131,"[[333, 140], [85, 226]]"
3,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,4e-05,8,3,347.295208,0.683673,0.623529,0.511254,0.561837,0.657166,0.72006,"[[377, 96], [152, 159]]"
4,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,4e-05,32,3,362.918413,0.72449,0.632312,0.729904,0.677612,0.718539,0.803947,"[[341, 132], [84, 227]]"


In [None]:
# 3. Epochs (use best LR & batch size)
best_bs_full = 32
result = run_experiment(
        model_name="roberta-base",
        strategy="full",
        learning_rate=best_lr_full,
        batch_size=best_bs_full,
        # Just need for epoch 4
        epochs=4,
        lora_config=None)

roberta_results.append(result)


Training roberta-base | full | lr=4e-05 | batch_size=32 | epochs=4


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable params: 124,647,170/124,647,170


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Macro,Roc Auc
1,No log,0.596245,0.698429,0.702899,0.638158,0.668966,0.696021,0.765755
2,0.652400,0.600163,0.680628,0.609579,0.921053,0.733624,0.667466,0.833074
3,0.490900,0.547239,0.773822,0.757511,0.774123,0.765727,0.773552,0.846781
4,0.299000,0.690192,0.764398,0.724272,0.817982,0.76828,0.764332,0.853307


In [None]:
df_roberta_results = pd.DataFrame(roberta_results)
df_roberta_results

Unnamed: 0,mode,tokenizer,model,strategy,learning_rate,batch_size,epochs,train_time_s,accuracy,precision,recall,f1,f1_macro,roc_auc,confusion_matrix
0,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,2e-05,16,3,379.027443,0.709184,0.616901,0.70418,0.657658,0.702443,0.784539,"[[337, 136], [92, 219]]"
1,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,3e-05,16,3,345.098369,0.705357,0.60929,0.717042,0.658789,0.699765,0.774824,"[[330, 143], [88, 223]]"
2,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,4e-05,16,3,389.204346,0.71301,0.617486,0.726688,0.667651,0.707563,0.78131,"[[333, 140], [85, 226]]"
3,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,4e-05,8,3,347.295208,0.683673,0.623529,0.511254,0.561837,0.657166,0.72006,"[[377, 96], [152, 159]]"
4,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,4e-05,32,3,362.918413,0.72449,0.632312,0.729904,0.677612,0.718539,0.803947,"[[341, 132], [84, 227]]"
5,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,4e-05,32,4,570.220926,0.706633,0.634551,0.614148,0.624183,0.691799,0.776102,"[[363, 110], [120, 191]]"


In [None]:
df_roberta_summary = df_roberta_results[["learning_rate", "batch_size", "epochs", "accuracy", "precision", "recall", "f1_macro",
                                         "train_time_s", "confusion_matrix"]].sort_values(by="f1_macro", ascending=False).reset_index(drop=True)

df_roberta_summary


Unnamed: 0,learning_rate,batch_size,epochs,accuracy,precision,recall,f1_macro,train_time_s,confusion_matrix
0,4e-05,32,3,0.72449,0.632312,0.729904,0.718539,362.918413,"[[341, 132], [84, 227]]"
1,4e-05,16,3,0.71301,0.617486,0.726688,0.707563,389.204346,"[[333, 140], [85, 226]]"
2,2e-05,16,3,0.709184,0.616901,0.70418,0.702443,379.027443,"[[337, 136], [92, 219]]"
3,3e-05,16,3,0.705357,0.60929,0.717042,0.699765,345.098369,"[[330, 143], [88, 223]]"
4,4e-05,32,4,0.706633,0.634551,0.614148,0.691799,570.220926,"[[363, 110], [120, 191]]"
5,4e-05,8,3,0.683673,0.623529,0.511254,0.657166,347.295208,"[[377, 96], [152, 159]]"


In [18]:
# Run the best model again

best_model = run_experiment(
        model_name="roberta-base",
        strategy="full",
        learning_rate=4e-4,
        batch_size=32,
        epochs=3,
        lora_config=None)


Training roberta-base | full | lr=0.0004 | batch_size=32 | epochs=3


Map:   0%|          | 0/2862 [00:00<?, ? examples/s]

Map:   0%|          | 0/784 [00:00<?, ? examples/s]

Map:   0%|          | 0/955 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable params: 124,647,170/124,647,170


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,F1 Macro,Roc Auc
1,No log,0.730992,0.573822,0.545624,0.642544,0.590131,0.573146,0.624064
2,0.687800,0.692454,0.522513,0.0,0.0,0.0,0.343191,0.483695
3,0.719500,0.69225,0.522513,0.0,0.0,0.0,0.343191,0.454899


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
# Extract the error cases from the best model run

best_model_results = []
best_model_results.append(best_model)
best_model_df = pd.DataFrame(best_model_results)
error_cases = best_model_df.loc[0, 'error_cases']
pd.set_option('display.max_colwidth', None)
error_cases.style.set_properties(**{'text-align': 'left'}).set_table_styles([{'selector': 'th', 'props': [('text-align', 'left')]}])


Unnamed: 0,text,true_label,pred_label
1,"Just walked in to #Starbucks and asked for a ""tall blonde"" Hahahaha #irony",1,0
4,So much #sarcasm at work mate 10/10 #boring 100% #dead mate full on #shit absolutely #sleeping mate can't handle the #sarcasm,1,0
5,Corny jokes are my absolute favorite,0,1
8,"if Christian expects Fifa to sleep in my bed with me tonight, he's wrong 👿",0,1
10,Most important thing I've learned in school,0,1
12,I love context and large ensemble Fridays!!!!! Der my most favourite #Sarcasm #GetTheFuckOut,1,0
15,"Always classy, never trashy and just a little sassy.",0,1
16,"you believe you can say something, provide no proof and its a fact, WRONG @user @user",0,1
18,@user Re: Jamie Grace has Tourette's? Thanks for sharing. I'm about to research her & post my tribute to this Young lady Gospel singer,0,1
19,@user Guess they didn't get the memo reg non-nuclear Baltic sea #sarcasm,1,0
