In [2]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, set_seed
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from torch.utils.data import DataLoader
import time
import random
import pandas as pd
import warnings
import transformers

transformers.logging.set_verbosity(transformers.logging.CRITICAL)
warnings.filterwarnings('ignore')


# from utils import compute_metrics, run_experiment

# For reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
set_seed(seed)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


## 1. Load dataset

In [3]:
# Load tweet eval dataset

dataset = load_dataset("tweet_eval", "irony")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2862
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 784
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 955
    })
})


In [4]:
# Class distribution

train_labels = dataset['train']['label']
print(f"  Non-ironic (0): {train_labels.count(0)} ({train_labels.count(0)/len(train_labels)*100:.1f}%)")
print(f"  Ironic (1): {train_labels.count(1)} ({train_labels.count(1)/len(train_labels)*100:.1f}%)")

  Non-ironic (0): 1417 (49.5%)
  Ironic (1): 1445 (50.5%)


## 2. Define metrics for evaluation and experiement setup

In [5]:
# Define metrics for evaluation

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=-1)
    probs = torch.nn.functional.softmax(torch.tensor(pred.predictions), dim=-1)[:, 1].numpy()

    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds),
        "f1": f1_score(labels, preds, average="binary"),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "roc_auc": roc_auc_score(labels, probs)
    }


In [18]:
# Experiment setup

def run_experiment(model_name, strategy, learning_rate=2e-5, batch_size=16, epochs=3, lora_config=None):
    print(f"\nTraining {model_name} | {strategy} | lr={learning_rate} | batch_size={batch_size} | epochs={epochs}")

    # Initialize the tokenizer for this model
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenize dataset and convert to pytorch
    tokenized_data = dataset.map(
        lambda x: tokenizer(x['text'], padding='max_length', truncation=True, max_length=128),
        batched=True
    )
    tokenized_data = tokenized_data.rename_column('label', 'labels')
    tokenized_data.set_format('torch')

    # Load models
    if strategy == "full":
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    elif strategy == "lora":
        base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

        # Have to determine target modules for LoRA
        if model_name.startswith("distilbert"):
            target_modules = ["q_lin", "v_lin"]
        elif model_name.startswith("roberta"):
            target_modules = ["query", "value"]
        else:
            target_modules = None

        # LoRA configurations (create default ones if theres no lora_config)
        if lora_config is None:
            lora_config = LoraConfig(
                task_type=TaskType.SEQ_CLS,
                r=8,
                lora_alpha=16,
                lora_dropout=0.1,
                bias="none",
                target_modules=target_modules)
        else:
            # If target_modules not set in the provided config, fill it
            if getattr(lora_config, "target_modules", None) is None:
                lora_config.target_modules = target_modules

        model = get_peft_model(base_model, lora_config)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Trainable params: {trainable_params:,}/{total_params:,}")

    # Move model to GPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./results/{model_name}_{strategy}_lr{learning_rate}_bs{batch_size}_ep{epochs}",
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        logging_steps=100,
        warmup_steps=100,
        report_to="none",
        dataloader_num_workers=0)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data['train'],
        eval_dataset=tokenized_data['validation'],
        compute_metrics=compute_metrics)

    # Train & evaluate + track training time
    start_time = time.time()
    trainer.train()
    train_time = time.time() - start_time

    eval_results = trainer.evaluate(tokenized_data['test'])
    preds = trainer.predict(tokenized_data['test'])
    y_pred = np.argmax(preds.predictions, axis=-1)
    y_true = preds.label_ids
    cm = confusion_matrix(y_true, y_pred)

    test_texts = dataset['test']['text']
    df_test = pd.DataFrame({
        'text': test_texts,
        'true_label': y_true,
        'pred_label': y_pred
    })
    error_cases = df_test[df_test['true_label'] != df_test['pred_label']]

    return {
        "mode": model,
        "tokenizer": tokenizer,
        "model": model_name,
        "strategy": strategy,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "epochs": epochs,
        "train_time_s": train_time,
        "accuracy": eval_results.get('eval_accuracy'),
        "precision": eval_results.get('eval_precision'),
        "recall": eval_results.get('eval_recall'),
        "f1": eval_results.get('eval_f1'),
        "f1_macro": eval_results.get('eval_f1_macro'),
        "roc_auc": eval_results.get('eval_roc_auc'),
        "confusion_matrix": cm,
        "error_cases": error_cases.head(10)
    }


## 3. Baseline experiments

In [7]:
# First, run the baseline experiments for distilbert and roberta

baseline_results = []
baseline_models = ["distilbert-base-uncased", "roberta-base"]

for model_name in baseline_models:
    for strategy in ["full", "lora"]:
        lora_cfg = None
        if strategy == "lora":
            lora_cfg = LoraConfig(task_type=TaskType.SEQ_CLS, r=8, lora_alpha=16, lora_dropout=0.1, bias="none")
        result = run_experiment(model_name, strategy, learning_rate=2e-5, batch_size=16, epochs=3, lora_config=lora_cfg)
        baseline_results.append(result)


Training distilbert-base-uncased | full | lr=2e-05 | batch_size=16 | epochs=3


Map:   0%|          | 0/955 [00:00<?, ? examples/s]

Trainable params: 66,955,010/66,955,010
{'loss': 0.6789, 'grad_norm': 3.1427061557769775, 'learning_rate': 1.98e-05, 'epoch': 0.5586592178770949}
{'eval_loss': 0.6331588625907898, 'eval_accuracy': 0.6094240837696335, 'eval_precision': 0.563165905631659, 'eval_recall': 0.8114035087719298, 'eval_f1': 0.6648697214734951, 'eval_f1_macro': 0.5984323513264589, 'eval_roc_auc': 0.7164987870477797, 'eval_runtime': 3.436, 'eval_samples_per_second': 277.941, 'eval_steps_per_second': 17.462, 'epoch': 1.0}
{'loss': 0.6235, 'grad_norm': 4.505254745483398, 'learning_rate': 1.5469107551487414e-05, 'epoch': 1.1173184357541899}
{'loss': 0.5585, 'grad_norm': 6.2447190284729, 'learning_rate': 1.0892448512585814e-05, 'epoch': 1.675977653631285}
{'eval_loss': 0.5986599326133728, 'eval_accuracy': 0.6837696335078534, 'eval_precision': 0.654, 'eval_recall': 0.7171052631578947, 'eval_f1': 0.6841004184100419, 'eval_f1_macro': 0.6837692867731551, 'eval_roc_auc': 0.7531861969553142, 'eval_runtime': 3.6451, 'eval_s

In [8]:
df_baseline_results = pd.DataFrame(baseline_results)
df_baseline_results

Unnamed: 0,mode,tokenizer,model,strategy,learning_rate,batch_size,epochs,train_time_s,accuracy,precision,recall,f1,f1_macro,roc_auc,confusion_matrix,error_cases
0,DistilBertForSequenceClassification(\n (disti...,DistilBertTokenizerFast(name_or_path='distilbe...,distilbert-base-uncased,full,2e-05,16,3,132.775394,0.677296,0.573232,0.729904,0.64215,0.674153,0.747429,"[[304, 169], [84, 227]]",...
1,PeftModelForSequenceClassification(\n (base_m...,DistilBertTokenizerFast(name_or_path='distilbe...,distilbert-base-uncased,lora,2e-05,16,3,75.673521,0.568878,0.463612,0.553055,0.504399,0.561454,0.575128,"[[274, 199], [139, 172]]",...
2,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,2e-05,16,3,258.400249,0.709184,0.616901,0.70418,0.657658,0.702443,0.784539,"[[337, 136], [92, 219]]",...
3,PeftModelForSequenceClassification(\n (base_m...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,lora,2e-05,16,3,146.291421,0.545918,0.446301,0.601286,0.512329,0.543754,0.577592,"[[241, 232], [124, 187]]",...


In [9]:
df_baseline = (df_baseline_results.drop(["mode", "tokenizer", "precision", "recall", "learning_rate", "batch_size", "epochs", "error_cases"], axis=1)
    .sort_values("f1_macro", ascending=False))
df_baseline

Unnamed: 0,model,strategy,train_time_s,accuracy,f1,f1_macro,roc_auc,confusion_matrix
2,roberta-base,full,258.400249,0.709184,0.657658,0.702443,0.784539,"[[337, 136], [92, 219]]"
0,distilbert-base-uncased,full,132.775394,0.677296,0.64215,0.674153,0.747429,"[[304, 169], [84, 227]]"
1,distilbert-base-uncased,lora,75.673521,0.568878,0.504399,0.561454,0.575128,"[[274, 199], [139, 172]]"
3,roberta-base,lora,146.291421,0.545918,0.512329,0.543754,0.577592,"[[241, 232], [124, 187]]"


## 4. RoBERTa experiments

In [10]:
# Run RobERTA

roberta_results = []

# Baseline
baseline_lr = 2e-5
baseline_full_bs = 16
baseline_lora_bs = 16
baseline_epochs = 3
baseline_lora_dropout = 0.1

# Hyperparameters to tune

learning_rates = [2e-5, 3e-5, 4e-5]
full_batch_sizes = [8, 16, 32]
lora_batch_sizes = [8, 16]
epochs_list = [3, 4]
lora_dropouts = [0.05, 0.1]

# For fine-tuning, vary fine-tuning one factor at a time first
# For full fine-tuning
print("Full fine-tuning")

# 1. Learning rate
for lr in learning_rates:
    print(f"FULL: lr={lr}, bs={baseline_full_bs}, epochs={baseline_epochs}")
    result = run_experiment(
        model_name="roberta-base",
        strategy="full",
        learning_rate=lr,
        batch_size=baseline_full_bs,
        epochs=baseline_epochs,
        lora_config=None
    )
    roberta_results.append(result)


Full fine-tuning
FULL: lr=2e-05, bs=16, epochs=3

Training roberta-base | full | lr=2e-05 | batch_size=16 | epochs=3
Trainable params: 124,647,170/124,647,170
{'loss': 0.6869, 'grad_norm': 9.592109680175781, 'learning_rate': 1.98e-05, 'epoch': 0.5586592178770949}
{'eval_loss': 0.5872989892959595, 'eval_accuracy': 0.6816753926701571, 'eval_precision': 0.6371841155234657, 'eval_recall': 0.7741228070175439, 'eval_f1': 0.699009900990099, 'eval_f1_macro': 0.6806160616061606, 'eval_roc_auc': 0.7696049994726295, 'eval_runtime': 6.2343, 'eval_samples_per_second': 153.184, 'eval_steps_per_second': 9.624, 'epoch': 1.0}
{'loss': 0.5948, 'grad_norm': 15.75192928314209, 'learning_rate': 1.5469107551487414e-05, 'epoch': 1.1173184357541899}
{'loss': 0.5215, 'grad_norm': 8.389067649841309, 'learning_rate': 1.0892448512585814e-05, 'epoch': 1.675977653631285}
{'eval_loss': 0.6024345755577087, 'eval_accuracy': 0.7172774869109948, 'eval_precision': 0.64576802507837, 'eval_recall': 0.9035087719298246, 'eva

In [11]:
df_roberta_results = pd.DataFrame(roberta_results)
df_roberta_results

Unnamed: 0,mode,tokenizer,model,strategy,learning_rate,batch_size,epochs,train_time_s,accuracy,precision,recall,f1,f1_macro,roc_auc,confusion_matrix,error_cases
0,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,2e-05,16,3,331.864919,0.709184,0.616901,0.70418,0.657658,0.702443,0.784539,"[[337, 136], [92, 219]]",...
1,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,3e-05,16,3,268.090776,0.705357,0.60929,0.717042,0.658789,0.699765,0.774824,"[[330, 143], [88, 223]]",...
2,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,4e-05,16,3,294.971504,0.71301,0.617486,0.726688,0.667651,0.707563,0.78131,"[[333, 140], [85, 226]]",...


In [12]:
# 2. Batch size (use best LR from previous step manually)
best_lr_full = 4e-5
full_batch_sizes = [8, 32] # Already have for batch size 16

for bs in full_batch_sizes:
    print(f"FULL: lr={best_lr_full}, bs={bs}, epochs={baseline_epochs}")
    result = run_experiment(
        model_name="roberta-base",
        strategy="full",
        learning_rate=best_lr_full,
        batch_size=bs,
        epochs=baseline_epochs,
        lora_config=None
    )
    roberta_results.append(result)

FULL: lr=4e-05, bs=8, epochs=3

Training roberta-base | full | lr=4e-05 | batch_size=8 | epochs=3
Trainable params: 124,647,170/124,647,170
{'loss': 0.6853, 'grad_norm': 7.8745622634887695, 'learning_rate': 3.96e-05, 'epoch': 0.27932960893854747}
{'loss': 0.651, 'grad_norm': 9.635052680969238, 'learning_rate': 3.593429158110883e-05, 'epoch': 0.5586592178770949}
{'loss': 0.6519, 'grad_norm': 3.712763786315918, 'learning_rate': 3.182751540041068e-05, 'epoch': 0.8379888268156425}
{'eval_loss': 0.6150221228599548, 'eval_accuracy': 0.6732984293193718, 'eval_precision': 0.6157556270096463, 'eval_recall': 0.8399122807017544, 'eval_f1': 0.7105751391465677, 'eval_f1_macro': 0.6677875695732838, 'eval_roc_auc': 0.7951692859403017, 'eval_runtime': 6.2539, 'eval_samples_per_second': 152.704, 'eval_steps_per_second': 19.188, 'epoch': 1.0}
{'loss': 0.5485, 'grad_norm': 39.85838317871094, 'learning_rate': 2.7720739219712527e-05, 'epoch': 1.1173184357541899}
{'loss': 0.4963, 'grad_norm': 24.21317481994

In [13]:
df_roberta_results = pd.DataFrame(roberta_results)
df_roberta_results

Unnamed: 0,mode,tokenizer,model,strategy,learning_rate,batch_size,epochs,train_time_s,accuracy,precision,recall,f1,f1_macro,roc_auc,confusion_matrix,error_cases
0,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,2e-05,16,3,331.864919,0.709184,0.616901,0.70418,0.657658,0.702443,0.784539,"[[337, 136], [92, 219]]",...
1,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,3e-05,16,3,268.090776,0.705357,0.60929,0.717042,0.658789,0.699765,0.774824,"[[330, 143], [88, 223]]",...
2,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,4e-05,16,3,294.971504,0.71301,0.617486,0.726688,0.667651,0.707563,0.78131,"[[333, 140], [85, 226]]",...
3,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,4e-05,8,3,324.780093,0.683673,0.623529,0.511254,0.561837,0.657166,0.72006,"[[377, 96], [152, 159]]",...
4,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,4e-05,32,3,263.185377,0.72449,0.632312,0.729904,0.677612,0.718539,0.803947,"[[341, 132], [84, 227]]",...


In [14]:
# 3. Epochs (use best LR & batch size)
best_bs_full = 32
result = run_experiment(
        model_name="roberta-base",
        strategy="full",
        learning_rate=best_lr_full,
        batch_size=best_bs_full,
        # Just need for epoch 4
        epochs=4,
        lora_config=None)

roberta_results.append(result)


Training roberta-base | full | lr=4e-05 | batch_size=32 | epochs=4
Trainable params: 124,647,170/124,647,170
{'eval_loss': 0.5962445735931396, 'eval_accuracy': 0.6984293193717277, 'eval_precision': 0.7028985507246377, 'eval_recall': 0.6381578947368421, 'eval_f1': 0.6689655172413793, 'eval_f1_macro': 0.6960212201591511, 'eval_roc_auc': 0.7657551945997257, 'eval_runtime': 5.6418, 'eval_samples_per_second': 169.273, 'eval_steps_per_second': 5.317, 'epoch': 1.0}
{'loss': 0.6524, 'grad_norm': 17.647567749023438, 'learning_rate': 3.96e-05, 'epoch': 1.1111111111111112}
{'eval_loss': 0.600162923336029, 'eval_accuracy': 0.680628272251309, 'eval_precision': 0.6095791001451378, 'eval_recall': 0.9210526315789473, 'eval_f1': 0.7336244541484717, 'eval_f1_macro': 0.6674658218454776, 'eval_roc_auc': 0.8330740428224871, 'eval_runtime': 5.6638, 'eval_samples_per_second': 168.616, 'eval_steps_per_second': 5.297, 'epoch': 2.0}
{'loss': 0.4909, 'grad_norm': 11.633252143859863, 'learning_rate': 2.476923076

In [15]:
df_roberta_results = pd.DataFrame(roberta_results)
df_roberta_results

Unnamed: 0,mode,tokenizer,model,strategy,learning_rate,batch_size,epochs,train_time_s,accuracy,precision,recall,f1,f1_macro,roc_auc,confusion_matrix,error_cases
0,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,2e-05,16,3,331.864919,0.709184,0.616901,0.70418,0.657658,0.702443,0.784539,"[[337, 136], [92, 219]]",...
1,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,3e-05,16,3,268.090776,0.705357,0.60929,0.717042,0.658789,0.699765,0.774824,"[[330, 143], [88, 223]]",...
2,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,4e-05,16,3,294.971504,0.71301,0.617486,0.726688,0.667651,0.707563,0.78131,"[[333, 140], [85, 226]]",...
3,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,4e-05,8,3,324.780093,0.683673,0.623529,0.511254,0.561837,0.657166,0.72006,"[[377, 96], [152, 159]]",...
4,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,4e-05,32,3,263.185377,0.72449,0.632312,0.729904,0.677612,0.718539,0.803947,"[[341, 132], [84, 227]]",...
5,RobertaForSequenceClassification(\n (roberta)...,RobertaTokenizerFast(name_or_path='roberta-bas...,roberta-base,full,4e-05,32,4,418.940327,0.706633,0.634551,0.614148,0.624183,0.691799,0.776102,"[[363, 110], [120, 191]]",...


In [16]:
df_roberta_summary = df_roberta_results[["learning_rate", "batch_size", "epochs", "accuracy", "precision", "recall", "f1_macro",
                                         "train_time_s", "confusion_matrix"]].sort_values(by="f1_macro", ascending=False).reset_index(drop=True)

df_roberta_summary


Unnamed: 0,learning_rate,batch_size,epochs,accuracy,precision,recall,f1_macro,train_time_s,confusion_matrix
0,4e-05,32,3,0.72449,0.632312,0.729904,0.718539,263.185377,"[[341, 132], [84, 227]]"
1,4e-05,16,3,0.71301,0.617486,0.726688,0.707563,294.971504,"[[333, 140], [85, 226]]"
2,2e-05,16,3,0.709184,0.616901,0.70418,0.702443,331.864919,"[[337, 136], [92, 219]]"
3,3e-05,16,3,0.705357,0.60929,0.717042,0.699765,268.090776,"[[330, 143], [88, 223]]"
4,4e-05,32,4,0.706633,0.634551,0.614148,0.691799,418.940327,"[[363, 110], [120, 191]]"
5,4e-05,8,3,0.683673,0.623529,0.511254,0.657166,324.780093,"[[377, 96], [152, 159]]"


## 5. Run the best model and retrieve error cases

In [19]:
# Run the best model again

best_model = run_experiment(
        model_name="roberta-base",
        strategy="full",
        learning_rate=4e-4,
        batch_size=32,
        epochs=3,
        lora_config=None)


Training roberta-base | full | lr=0.0004 | batch_size=32 | epochs=3


Map:   0%|          | 0/2862 [00:00<?, ? examples/s]

Map:   0%|          | 0/784 [00:00<?, ? examples/s]

Map:   0%|          | 0/955 [00:00<?, ? examples/s]

Trainable params: 124,647,170/124,647,170
{'eval_loss': 0.6924459338188171, 'eval_accuracy': 0.5225130890052356, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_f1_macro': 0.343191196698762, 'eval_roc_auc': 0.45707863094610274, 'eval_runtime': 12.1126, 'eval_samples_per_second': 78.844, 'eval_steps_per_second': 2.477, 'epoch': 1.0}
{'loss': 0.7022, 'grad_norm': 1.8877636194229126, 'learning_rate': 0.00039600000000000003, 'epoch': 1.1111111111111112}
{'eval_loss': 0.6923642754554749, 'eval_accuracy': 0.5225130890052356, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_f1_macro': 0.343191196698762, 'eval_roc_auc': 0.4665559188552544, 'eval_runtime': 12.1313, 'eval_samples_per_second': 78.722, 'eval_steps_per_second': 2.473, 'epoch': 2.0}
{'loss': 0.7074, 'grad_norm': 0.39850330352783203, 'learning_rate': 0.00016705882352941178, 'epoch': 2.2222222222222223}
{'eval_loss': 0.6921640634536743, 'eval_accuracy': 0.5225130890052356, 'eval_precision': 0.0, 'eval_

In [None]:
# Extract the error cases from the best model run

best_model_results = []
best_model_results.append(best_model)
best_model_df = pd.DataFrame(best_model_results)
error_cases = best_model_df.loc[0, 'error_cases']
pd.set_option('display.max_colwidth', None)
error_cases.style.set_properties(**{'text-align': 'left'}).set_table_styles([{'selector': 'th', 'props': [('text-align', 'left')]}])


Unnamed: 0,text,true_label,pred_label
1,"Just walked in to #Starbucks and asked for a ""tall blonde"" Hahahaha #irony",1,0
4,So much #sarcasm at work mate 10/10 #boring 100% #dead mate full on #shit absolutely #sleeping mate can't handle the #sarcasm,1,0
5,Corny jokes are my absolute favorite,0,1
8,"if Christian expects Fifa to sleep in my bed with me tonight, he's wrong 👿",0,1
10,Most important thing I've learned in school,0,1
12,I love context and large ensemble Fridays!!!!! Der my most favourite #Sarcasm #GetTheFuckOut,1,0
15,"Always classy, never trashy and just a little sassy.",0,1
16,"you believe you can say something, provide no proof and its a fact, WRONG @user @user",0,1
18,@user Re: Jamie Grace has Tourette's? Thanks for sharing. I'm about to research her & post my tribute to this Young lady Gospel singer,0,1
19,@user Guess they didn't get the memo reg non-nuclear Baltic sea #sarcasm,1,0
