In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string
import re

In [2]:
# Upper bound on the number of rows kept, so experiments stay quick.
MAX_ROWS = 20000

# Load the corpus and discard every row that has a missing value in any column.
df = pd.read_csv('redditSarcasm/train-balanced-sarcasm.csv').dropna()

# Draw a fixed-seed random subset (all rows if fewer than MAX_ROWS remain)
# and renumber the index from zero.
sample_size = min(len(df), MAX_ROWS)
df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,1,Yes and the buff it desperately needs,bones7056,Rainbow6,4,4,0,2016-03,2016-03-15 02:02:06,Frost's shotgun!
1,0,Hilarious because i visited A2 one weekend las...,xRVAx,AnnArbor,2,2,0,2015-08,2015-08-21 04:24:18,R E J O I C E ... the guitar dude by Walgreens...
2,0,Have you asked them what they want to eat?,fareven,AskReddit,1,1,0,2015-10,2015-10-17 04:11:48,"What are some foods that old, white people lik..."
3,1,Because that worked out so well with the mater...,Error400BadRequest,Android,2,2,0,2015-09,2015-09-14 13:03:22,There are actually guidelines on how this shou...
4,1,But white people can't be discriminated against!,Flavius_Stilicho,The_Donald,1,-1,-1,2016-11,2016-11-18 15:56:52,Pretty much proof that they are discriminating...


In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Device setup: run on CUDA when PyTorch reports it as available, otherwise on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# 1. DATA SETUP (Binary Classification)
# Inputs come from the comment column; targets are the 0/1 labels
# (0 = not sarcastic, 1 = sarcastic).
X, y = df['comment'], df['label']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

class SarcasmDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item for classification.

    Args:
        texts: Iterable of comments; each entry is passed through ``str()``
            before tokenization.
        labels: Iterable of integer class ids (0 = not sarcastic,
            1 = sarcastic), aligned one-to-one with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")``.
            Its result must expose ``'input_ids'`` and ``'attention_mask'``
            tensors whose leading dimension can be squeezed away.
        max_len: Length each sequence is truncated or padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        # Materialise both inputs so they support len() and positional indexing
        # regardless of what the caller passed (Series, generator, ...).
        self.texts = list(texts)
        self.labels = list(labels)
        # Fail fast: a mismatch would otherwise surface as an IndexError at an
        # arbitrary batch (labels too short) or go unnoticed (labels too long).
        if len(self.texts) != len(self.labels):
            raise ValueError(
                "texts and labels must have the same length, "
                f"got {len(self.texts)} texts and {len(self.labels)} labels"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the comment at ``idx`` and return it with its label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer's
            tensors with dimension 0 squeezed out) and ``'labels'`` (a scalar
            ``torch.long`` tensor).
        """
        encoding = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {
            # Drop the leading dimension so the DataLoader can stack items
            # into a batch itself.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # integer labels for classification
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# 2. MODEL SETUP
# Seed PyTorch's RNGs before anything random happens so reruns are comparable:
# the classification head is freshly initialised (it is not part of the
# pretrained checkpoint) and the training DataLoader shuffles its samples.
SEED = 42
torch.manual_seed(SEED)

model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2  # binary classifier
).to(DEVICE)

# 3. TRAINING LOOP
train_dataset = SarcasmDataset(X_train, y_train, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optimizer = AdamW(model.parameters(), lr=2e-5)
EPOCHS = 3

print("Starting training (binary sarcasm classification)...")
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        # Move every tensor of the batch (inputs and labels) to the model's device.
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        optimizer.zero_grad()
        # The batch carries a 'labels' entry, and the loss is read from the output.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} average loss: {avg_loss:.4f}")

# 4. EVALUATION ON HOLD-OUT TEST SET
model.eval()
preds, targets = [], []
test_dataset = SarcasmDataset(X_test, y_test, tokenizer)
# shuffle=False keeps predictions in the same order as X_test / y_test.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

with torch.no_grad():
    for batch in test_loader:
        labels = batch['labels'].to(DEVICE)
        # Labels are withheld from the forward pass; only the logits are used here.
        inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != 'labels'}
        outputs = model(**inputs)
        logits = outputs.logits
        # Predicted class = index of the largest logit along the last dimension.
        pred_labels = torch.argmax(logits, dim=-1)
        preds.extend(pred_labels.cpu().tolist())
        targets.extend(labels.cpu().tolist())

acc = accuracy_score(targets, preds)
print(f"Accuracy: {acc:.4f}")
print(classification_report(targets, preds, target_names=["not sarcastic", "sarcastic"]))

# Side-by-side view of texts, true labels and predictions. The indices are reset
# so the Series line up positionally with the plain `preds` list.
comparison_df = pd.DataFrame({
    "text": X_test.reset_index(drop=True),
    "true_label": y_test.reset_index(drop=True),
    "pred_label": preds,
})
comparison_df.head()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training (binary sarcasm classification)...
Epoch 1 average loss: 0.5986
Epoch 2 average loss: 0.4805
Epoch 3 average loss: 0.3102
Accuracy: 0.7173
               precision    recall  f1-score   support

not sarcastic       0.72      0.71      0.71      1994
    sarcastic       0.71      0.73      0.72      2006

     accuracy                           0.72      4000
    macro avg       0.72      0.72      0.72      4000
 weighted avg       0.72      0.72      0.72      4000



Unnamed: 0,text,true_label,pred_label
0,Montessori math teaching is amazing.,0,0
1,*Intensifying intensifies* :o,0,0
2,Otherwise we might not be able to call someone...,1,1
3,"and then sometimes it's ""life in prison for sm...",1,0
4,"Yeah, she looks totally healthy.",1,1
