
<h1> Sarcasm detection project </h1>



<h2> TF-IDF baseline </h2>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Plot styling: apply seaborn's "whitegrid" style to every figure drawn in this notebook
sns.set(style="whitegrid")

In [None]:
file_path = 'Sarcasm_Headlines_Dataset_v2.json'

# Load the JSON-lines dataset. Depending on the pandas version a missing file
# surfaces as FileNotFoundError or ValueError, and malformed JSON as ValueError,
# so both are handled here.
try:
    df = pd.read_json(file_path, lines=True)
except (ValueError, FileNotFoundError):
    print("Errore: Assicurati di aver caricato il file JSON e che il nome sia corretto.")
    # Re-raise: without `df` the next statement (and every later cell) would only
    # fail with a less helpful NameError.
    raise
else:
    print("Dataset caricato con successo!")
    print(f"Dimensioni del dataset: {df.shape}")

# Visualize first rows
df.head()

In [None]:
# Class balance: number of headlines per class
class_counts = df['is_sarcastic'].value_counts()
print(class_counts)

balance_ax = sns.countplot(data=df, x='is_sarcastic')
balance_ax.set_title('Class distributions (0 = Non Sarcastic, 1 = Sarcastic)')
plt.show()

# Missing values per column
print("\nNull values in datasets:")
null_counts = df.isnull().sum()
print(null_counts)

In [None]:
# Features (X) are the raw headlines, the target (y) is the sarcasm flag
X, y = df['headline'], df['is_sarcastic']

# Stratified split: 80% training, 20% test, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

for split_name, split in (("Training", X_train), ("Test", X_test)):
    print(f"{split_name} samples: {len(split)}")

In [None]:
# TF-IDF over unigrams and bigrams, English stop words removed,
# vocabulary capped at 10,000 features
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    stop_words='english',
)

# The vectorizer is fitted on the training headlines only;
# the test headlines are transformed with that fitted vectorizer
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("Vectorization completed.")
print(f"Training matrix shape: {X_train_tfidf.shape}")

In [None]:
# Baseline classifier: logistic regression (liblinear solver, fixed seed).
# `fit` returns the estimator itself, so construction and training are chained.
model = LogisticRegression(random_state=42, solver='liblinear').fit(X_train_tfidf, y_train)

print("Modello addestrato.")

In [None]:
# Predict on the held-out test set
y_pred = model.predict(X_test_tfidf)

# Headline metrics
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy della Baseline: {acc:.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix, drawn with the true class on the y axis and the predicted class on the x axis
class_names = ['Non Sarcastico', 'Sarcastico']
conf_mat = confusion_matrix(y_test, y_pred)
cm_ax = sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
cm_ax.set_ylabel('Reale')
cm_ax.set_xlabel('Predetto')
cm_ax.set_title('Matrice di Confusione')
plt.show()

In [None]:
def predict_sarcasm(text):
    """Classify one headline with the TF-IDF + logistic-regression baseline.

    Relies on the module-level `tfidf` vectorizer and `model` classifier.

    Args:
        text: the headline to classify.

    Returns:
        A `(label, probability)` tuple. `label` is "SARCASTICO" when the
        predicted class is 1 and "NON SARCASTICO" otherwise; `probability` is
        the `predict_proba` value in column 1 for this headline.
    """
    features = tfidf.transform([text])
    predicted_class = model.predict(features)[0]
    probability = model.predict_proba(features)[0][1]
    if predicted_class == 1:
        label = "SARCASTICO"
    else:
        label = "NON SARCASTICO"
    return label, probability

# Example headlines run through the baseline
sample_1 = "Local man discovers water is wet"
sample_2 = "My favorite hobby is sitting in a two-hour meeting that could have been a single sentence email."
sample_3 = "Politician promises to fix everything in 24 hours"

for sample in (sample_1, sample_2, sample_3):
    print(f"'{sample}' -> {predict_sarcasm(sample)}")

<h2>DistilBERT</h2>


In [None]:
!pip install -q transformers datasets accelerate scikit-learn

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device in uso: {device}")
if device.type != 'cuda':
    print("ATTENZIONE: Stai usando la CPU. Attiva la GPU in Runtime -> Cambia tipo di runtime -> T4 GPU")
else:
    print(torch.cuda.get_device_name(0))

Device in uso: cuda
Tesla T4


In [None]:
# Load the headlines dataset (JSON lines)
file_path = 'Sarcasm_Headlines_Dataset_v2.json'
df = pd.read_json(file_path, lines=True)

# Rename the columns for the Hugging Face pipeline ('labels' for the target,
# 'text' for the input) and keep only those two
df = df.rename(columns={'is_sarcastic': 'labels', 'headline': 'text'})[['text', 'labels']]

# Stratified 80/20 train/test split with a fixed seed
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['labels'],
    random_state=42,
)

# Wrap both splits as Hugging Face `Dataset` objects
train_dataset, test_dataset = (Dataset.from_pandas(frame) for frame in (train_df, test_df))

for split_name, split in (("Training", train_dataset), ("Test", test_dataset)):
    print(f"{split_name} samples: {len(split)}")

Training samples: 22895
Test samples: 5724


In [None]:
# Load the tokenizer that matches the DistilBERT checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    """Tokenize a batch of examples, padding/truncating each text to 128 tokens."""
    return tokenizer(
        examples['text'],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

# Tokenize each split, then drop the raw text and the pandas index column
# so that only the tokenizer outputs and the labels remain
columns_to_drop = ['text', '__index_level_0__']
tokenized_train = train_dataset.map(tokenize_function, batched=True).remove_columns(columns_to_drop)
tokenized_test = test_dataset.map(tokenize_function, batched=True).remove_columns(columns_to_drop)

# Have both datasets return PyTorch tensors
for split in (tokenized_train, tokenized_test):
    split.set_format("torch")

print("Tokenizzazione completed.")

Map:   0%|          | 0/22895 [00:00<?, ? examples/s]

Map:   0%|          | 0/5724 [00:00<?, ? examples/s]

Tokenizzazione completata.


In [None]:
# Pretrained DistilBERT with a 2-class classification head, moved to `device`
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2
).to(device)

# Report the device actually used instead of always claiming the GPU
print("Model loaded on GPU." if device.type == 'cuda' else "Model loaded on CPU.")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Modello caricato su GPU.


In [None]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the predicted class is the argmax over the last axis).

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'
        (precision/recall/F1 use `average='binary'`).
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_hat, average='binary'
    )
    return dict(
        accuracy=accuracy_score(y_true, y_hat),
        f1=f1,
        precision=precision,
        recall=recall,
    )

In [None]:
# Training configuration for the first Trainer.
# NOTE(review): in the visible notebook this `trainer` is never trained; the name
# is rebound by the hyperparameter-search cell before `trainer.train()` is called.
training_args = TrainingArguments(
    # Output locations and logging
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=50,
    report_to="none",                # Disable third-party logging (e.g., wandb, mlflow)
    # Optimization
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,                # Warmup steps for the learning-rate scheduler
    weight_decay=0.01,               # Weight decay (regularization)
    fp16=True,                       # Mixed precision
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

In [None]:
# Install optuna if not already present
!pip install -q optuna

import optuna
import torch
from transformers import TrainingArguments, Trainer, DistilBertForSequenceClassification

def model_init():
    return DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=2
    ).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

#Define the hyperparameter search space
def hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 4),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
        "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.1),
    }

# Base Training Arguments
args = TrainingArguments(
    output_dir='./results_search',
    eval_strategy="epoch",       # <--- Updated parameter name
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,                   # Optimized for T4 GPU
    disable_tqdm=False
)

# Initialize Trainer with model_init
trainer = Trainer(
    model=None,
    model_init=model_init,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run the search
print("Starting Hyperparameter Search...")
best_run = trainer.hyperparameter_search(
    direction="maximize",
    hp_space=hp_space,
    backend="optuna",
    n_trials=10
)

print("\nBest Hyperparameters found:")
print(best_run)

#Update the trainer with the best values for the final training
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

print("\nReady to train with best parameters.")

  trainer = Trainer(
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-12-16 18:34:34,054] A new study created in memory with name: no-name-9cf04a28-88f8-4bb8-9dd8-d75edfe39a5e


Starting Hyperparameter Search...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2203,0.198157,0.923131,0.917541,0.938291,0.89769
2,0.0896,0.244081,0.928022,0.923505,0.935314,0.911991
3,0.0506,0.34754,0.924528,0.91812,0.950177,0.888155
4,0.0116,0.412178,0.928721,0.923164,0.948897,0.89879


[I 2025-12-16 18:40:49,831] Trial 0 finished with value: 3.6995715266234623 and parameters: {'learning_rate': 4.4347502871907344e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'weight_decay': 0.054726515561503236}. Best is trial 0 with value: 3.6995715266234623.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▇▃█
eval/f1,▁█▂█
eval/loss,▁▃▆█
eval/precision,▂▁█▇
eval/recall,▄█▁▄
eval/runtime,▁▂██
eval/samples_per_second,█▇▁▁
eval/steps_per_second,█▇▁▁
train/epoch,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇████
train/global_step,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇████

0,1
eval/accuracy,0.92872
eval/f1,0.92316
eval/loss,0.41218
eval/precision,0.9489
eval/recall,0.89879
eval/runtime,7.2981
eval/samples_per_second,784.311
eval/steps_per_second,98.107
total_flos,3032841092229120.0
train/epoch,4


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2528,0.230235,0.905486,0.899536,0.911211,0.888155
2,0.1633,0.242188,0.905136,0.896037,0.9375,0.858086
3,0.1409,0.239977,0.914221,0.907932,0.92901,0.887789
4,0.098,0.269993,0.915618,0.909125,0.933539,0.885955


[I 2025-12-16 18:46:24,302] Trial 1 finished with value: 3.6442382410958034 and parameters: {'learning_rate': 1.0682295806539672e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'weight_decay': 0.07433754097425414}. Best is trial 0 with value: 3.6995715266234623.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁▁▇█
eval/f1,▃▁▇█
eval/loss,▁▃▃█
eval/precision,▁█▆▇
eval/recall,█▁█▇
eval/runtime,█▁▂▄
eval/samples_per_second,▁█▇▅
eval/steps_per_second,▁█▇▅
train/epoch,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇████
train/global_step,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇████

0,1
eval/accuracy,0.91562
eval/f1,0.90913
eval/loss,0.26999
eval/precision,0.93354
eval/recall,0.88596
eval/runtime,7.0139
eval/samples_per_second,816.098
eval/steps_per_second,102.084
total_flos,3032841092229120.0
train/epoch,4


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2302,0.20913,0.916492,0.911053,0.924821,0.89769
2,0.1189,0.325715,0.907582,0.896981,0.956395,0.844518
3,0.1025,0.325591,0.922432,0.916792,0.937524,0.896956


[I 2025-12-16 18:51:41,283] Trial 2 finished with value: 3.673703787867411 and parameters: {'learning_rate': 1.5009200258446927e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'weight_decay': 0.07460999453654459}. Best is trial 0 with value: 3.6995715266234623.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▅▁█
eval/f1,▆▁█
eval/loss,▁██
eval/precision,▁█▄
eval/recall,█▁█
eval/runtime,▇▁█
eval/samples_per_second,▂█▁
eval/steps_per_second,▂█▁
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████

0,1
eval/accuracy,0.92243
eval/f1,0.91679
eval/loss,0.32559
eval/precision,0.93752
eval/recall,0.89696
eval/runtime,7.3891
eval/samples_per_second,774.652
eval/steps_per_second,96.899
total_flos,2274630819171840.0
train/epoch,3


Epoch,Training Loss,Validation Loss


In [None]:
# Run the training loop of the object currently bound to `trainer`
print("Training started...")
trainer.train()
print("Training completed.")

In [None]:
# Evaluate on the trainer's evaluation dataset
eval_results = trainer.evaluate()

print("\nFinal Results:")
for title, key in (("Accuracy", "eval_accuracy"), ("F1 Score", "eval_f1"), ("Loss", "eval_loss")):
    print(f"{title}: {eval_results[key]:.4f}")

In [None]:
def predict_sarcasm_bert(text):
    """Classify one headline with the fine-tuned DistilBERT held by `trainer`.

    Args:
        text: the headline to classify.

    Returns:
        A `(label, score)` tuple. `label` is "SARCASTIC" when the argmax class
        is 1 and "NON SARCASTIC" otherwise; `score` is the softmax probability
        of class 1 as a Python float.
    """
    # Use the network owned by the trainer. The module-level `model` is not the
    # one that gets trained: the active trainer builds its own model through
    # `model_init`, so `model` would still carry a randomly initialized head.
    net = trainer.model
    net.eval()  # make sure dropout is disabled for inference
    net_device = next(net.parameters()).device

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(net_device)

    with torch.no_grad():
        outputs = net(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    score = probs[0][1].item()
    prediction = torch.argmax(probs, dim=-1).item()

    label = "SARCASTIC" if prediction == 1 else "NON SARCASTIC"
    return label, score

# Example headlines run through DistilBERT
sample_1 = "Local man discovers water is wet"
sample_2 = "My favorite hobby is sitting in a two-hour meeting that could have been a single sentence email."
sample_3 = "Politician promises to fix everything in 24 hours"

for sample in (sample_1, sample_2, sample_3):
    print(f"'{sample}' -> {predict_sarcasm_bert(sample)}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

print("Ricalcolo metriche Baseline...")
# Refit the baseline under its own name: `model` was rebound to the DistilBERT
# network earlier in the notebook
lr_model = LogisticRegression(random_state=42, solver='liblinear').fit(X_train_tfidf, y_train)
lr_preds = lr_model.predict(X_test_tfidf)

baseline_acc, baseline_f1 = accuracy_score(y_test, lr_preds), f1_score(y_test, lr_preds)

print("Recupero metriche DistilBERT...")
# Run the trainer's model over the whole test set
bert_output = trainer.predict(tokenized_test)
bert_labels = bert_output.label_ids
bert_preds = bert_output.predictions.argmax(axis=1)

bert_acc, bert_f1 = accuracy_score(bert_labels, bert_preds), f1_score(bert_labels, bert_preds)


# Long-format table (one row per model/metric pair) feeding the grouped bar plot
metrics_df = pd.DataFrame({
    'Modello': ['Baseline (LogReg)', 'DistilBERT', 'Baseline (LogReg)', 'DistilBERT'],
    'Metrica': ['Accuracy', 'Accuracy', 'F1 Score', 'F1 Score'],
    'Valore': [baseline_acc, bert_acc, baseline_f1, bert_f1]
})

plt.figure(figsize=(10, 6))
sns.barplot(x='Metrica', y='Valore', hue='Modello', data=metrics_df, palette="viridis")
plt.title('Confronto Performance: Baseline vs SOTA')
plt.ylim(0.7, 1.0)  # zoom the y axis to the 0.7-1.0 range
plt.ylabel('Score')
plt.show()

# Signed format: a hard-coded "+" would print "+-x.xx%" when DistilBERT is worse
accuracy_gain = (bert_acc - baseline_acc) * 100
print(f"Miglioramento Accuracy: {accuracy_gain:+.2f}%")

# Side-by-side confusion matrices: baseline on the left, DistilBERT on the right
fig, ax = plt.subplots(1, 2, figsize=(14, 5))

panels = (
    ('Baseline', confusion_matrix(y_test, lr_preds), baseline_acc, 'Blues'),
    ('DistilBERT', confusion_matrix(bert_labels, bert_preds), bert_acc, 'Greens'),
)
for panel_ax, (model_name, matrix, accuracy, cmap) in zip(ax, panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap=cmap, ax=panel_ax)
    panel_ax.set_title(f'{model_name} (Acc: {accuracy:.3f})')
    panel_ax.set_xlabel('Predetto')
    panel_ax.set_ylabel('Reale')

plt.show()

def compare_predictions(text):
    """Return the sarcasm probability of both models for one headline.

    Args:
        text: the headline to score.

    Returns:
        A `(lr_prob, bert_prob)` tuple: the class-1 probability from the
        logistic-regression baseline (`lr_model`) and the class-1 softmax
        probability from the DistilBERT held by `trainer`.
    """
    # Baseline prediction
    vec = tfidf.transform([text])
    lr_prob = lr_model.predict_proba(vec)[0][1]

    # DistilBERT prediction. Use the trainer's network: the module-level `model`
    # is not the one the active trainer trains (it builds its own via `model_init`).
    net = trainer.model
    net.eval()  # make sure dropout is disabled for inference
    net_device = next(net.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(net_device)
    with torch.no_grad():
        outputs = net(**inputs)
    bert_prob = torch.nn.functional.softmax(outputs.logits, dim=-1)[0][1].item()

    return lr_prob, bert_prob

sentences = [
    "Local man discovers water is wet", # Obvious sarcasm
    "Government increases taxes by 5%", # Real news
    "Man strictly following GPS drives car into lake", # Context based sarcasm
    "Study finds that breathing is linked to staying alive" # Logical sarcasm
]
# Expected label for each sentence above (1 = sarcastic, 0 = not sarcastic),
# taken from the inline annotations
expected_labels = [1, 0, 1, 1]

results = []
for s, target in zip(sentences, expected_labels):
    lr_p, bert_p = compare_predictions(s)
    # The winner is the model whose sarcasm probability lies closer to the
    # expected label. Measuring against the inverse of the baseline's own
    # prediction, as before, says nothing about which model is right.
    lr_err = abs(lr_p - target)
    bert_err = abs(bert_p - target)
    results.append({
        "Frase": s,
        "Expected label": target,
        "Baseline (Sarcasm Prob)": f"{lr_p:.4f}",
        "DistilBERT (Sarcasm Prob)": f"{bert_p:.4f}",
        "Winner": "DistilBERT" if bert_err < lr_err else "Baseline"
    })

pd.set_option('display.max_colwidth', None)
display(pd.DataFrame(results))