In [None]:
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [35]:
# Upper bound on the number of rows kept, so experiments stay quick.
MAX_ROWS = 10000

# Read the CSV, drop every row that has a missing value, then draw a
# fixed-seed random subset of at most MAX_ROWS rows with a fresh 0..n-1 index.
df = (
    pd.read_csv('go_emotions_dataset.csv')
    .dropna()
    .pipe(lambda frame: frame.sample(n=min(len(frame), MAX_ROWS), random_state=42))
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eeochpb,Literally everywhere. There's nothing especial...,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,eewnhep,"[NAME] and [NAME] may be stronger, but [NAME] ...",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,ee4utnt,Ik I’m crying rn,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,effsrc0,"He was cut yesterday, unfortunately",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,ee579yx,"Nice, I saw them during the Demo and was prett...",False,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# Display the column labels of the sampled frame.
df.columns

Index(['id', 'text', 'example_very_unclear', 'admiration', 'amusement',
       'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity',
       'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
       'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love',
       'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse',
       'sadness', 'surprise', 'neutral'],
      dtype='object')

In [39]:
# Per-column totals for every column after the first two (`id` and `text`):
# the `example_very_unclear` flag followed by the emotion indicator columns.
count_cols = df.columns[2:]

# Pad each name to the longest one so the counts line up in a single column;
# a fixed width of 12 is too short for names such as `example_very_unclear`.
name_width = max((len(col) for col in count_cols), default=0)

for col in count_cols:
    count = df[col].sum()
    print(f"{col:{name_width}s} : {int(count)}")


example_very_unclear : 143
admiration   : 829
amusement    : 415
anger        : 376
annoyance    : 633
approval     : 806
caring       : 277
confusion    : 345
curiosity    : 479
desire       : 191
disappointment : 403
disapproval  : 544
disgust      : 263
embarrassment : 126
excitement   : 256
fear         : 135
gratitude    : 556
grief        : 12
joy          : 378
love         : 389
nervousness  : 95
optimism     : 380
pride        : 52
realization  : 407
relief       : 65
remorse      : 114
sadness      : 300
surprise     : 243
neutral      : 2731


In [40]:
# 1. DATA SETUP (multi-label emotions)

# Emotion columns used as the prediction targets; each row may carry several.
EMOTION_COLS = ["admiration", "annoyance", "curiosity", "gratitude", "neutral"]

# Keep only the rows tagged with at least one of the target emotions.
mask = df.loc[:, EMOTION_COLS].sum(axis=1) > 0
df_condensed = df.loc[mask].reset_index(drop=True)

# 2. FEATURES + LABELS
# X holds the raw text; Y is the float32 indicator matrix with one column
# per entry of EMOTION_COLS.
X = df_condensed["text"]
Y = df_condensed[EMOTION_COLS].values.astype("float32")

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)


In [41]:
# Display (rows, columns) of the frame left after filtering on EMOTION_COLS.
df_condensed.shape

(5104, 31)

In [44]:

# Device setup: use the GPU when CUDA is available, otherwise stay on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


class SarcasmDataset(Dataset):
    """Dataset that pairs texts with label vectors and tokenizes on access.

    Despite the name, this notebook feeds it multi-label emotion targets.

    Args:
        texts: Iterable of input texts; copied into a list.
        labels: Iterable of per-text label vectors; copied into a list.
        tokenizer: Callable invoked with the text plus ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword
            arguments; its result must expose ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        # Materialise both iterables so items can be fetched by position.
        self.texts, self.labels = list(texts), list(labels)
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it with its float32 labels."""
        text = str(self.texts[idx])  # coerce non-string entries before tokenizing
        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # squeeze(0) removes the leading dimension of each returned tensor.
        item = {key: enc[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

# 2. MODEL SETUP
# One pretrained checkpoint name is shared by the tokenizer and the classifier.
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# One output per entry of EMOTION_COLS, configured for multi-label
# classification, then moved onto the selected device.
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=len(EMOTION_COLS),
)
model = model.to(DEVICE)

# 3. TRAINING LOOP
# NOTE(review): 50 epochs with no validation split; the recorded training loss
# falls to about 0.01, which looks like overfitting. Consider fewer epochs or
# early stopping. TODO confirm against held-out metrics.
EPOCHS = 50

train_dataset = SarcasmDataset(X_train, y_train, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optimizer = AdamW(model.parameters(), lr=2e-5)

print("Starting training...")
for epoch in range(EPOCHS):
    model.train()
    batch_losses = []
    for batch in train_loader:
        # Move every tensor of the batch (inputs and labels) onto DEVICE.
        batch = {name: tensor.to(DEVICE) for name, tensor in batch.items()}
        optimizer.zero_grad()
        # Labels are part of the batch, so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    # Mean of the per-batch losses over this epoch.
    total_loss = sum(batch_losses)
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} average loss: {avg_loss:.4f}")

# 4. EVALUATION ON HOLD-OUT TEST SET
model.eval()  # switch the model to evaluation mode before inference
preds, targets = [], []
test_dataset = SarcasmDataset(X_test, y_test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

with torch.no_grad():
    for batch in test_loader:
        # Labels are only collected for CPU-side metrics, so they are not
        # moved to DEVICE; only the model inputs are.
        labels = batch['labels']
        inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != 'labels'}
        outputs = model(**inputs)
        logits = outputs.logits                    # (B, len(EMOTION_COLS))
        probs = torch.sigmoid(logits)              # independent per-label probabilities
        pred_labels = (probs >= 0.5).int()         # multi-label prediction

        preds.append(pred_labels.cpu())
        targets.append(labels.int())

# Stack the per-batch tensors into (N, len(EMOTION_COLS)) 0/1 NumPy arrays.
preds = torch.vstack(preds).numpy()
targets = torch.vstack(targets).numpy()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...
Epoch 1 average loss: 0.3714
Epoch 2 average loss: 0.2653
Epoch 3 average loss: 0.2133
Epoch 4 average loss: 0.1563
Epoch 5 average loss: 0.1073
Epoch 6 average loss: 0.0774
Epoch 7 average loss: 0.0542
Epoch 8 average loss: 0.0485
Epoch 9 average loss: 0.0429
Epoch 10 average loss: 0.0374
Epoch 11 average loss: 0.0304
Epoch 12 average loss: 0.0262
Epoch 13 average loss: 0.0218
Epoch 14 average loss: 0.0223
Epoch 15 average loss: 0.0177
Epoch 16 average loss: 0.0168
Epoch 17 average loss: 0.0194
Epoch 18 average loss: 0.0187
Epoch 19 average loss: 0.0152
Epoch 20 average loss: 0.0204
Epoch 21 average loss: 0.0133
Epoch 22 average loss: 0.0113
Epoch 23 average loss: 0.0123
Epoch 24 average loss: 0.0130
Epoch 25 average loss: 0.0129
Epoch 26 average loss: 0.0166
Epoch 27 average loss: 0.0124
Epoch 28 average loss: 0.0124
Epoch 29 average loss: 0.0126
Epoch 30 average loss: 0.0133
Epoch 31 average loss: 0.0123
Epoch 32 average loss: 0.0093
Epoch 33 average loss: 0.009

In [46]:
# preds and targets are the (N, len(EMOTION_COLS)) 0/1 arrays built by the
# evaluation loop.

# Exact-match accuracy: a sample counts as correct only when every one of its
# labels is predicted correctly.
acc = accuracy_score(targets, preds)
micro_f1 = f1_score(targets, preds, average="micro", zero_division=0)
macro_f1 = f1_score(targets, preds, average="macro", zero_division=0)

print(f"Exact-match accuracy: {acc:.4f}")
print(f"Micro F1: {micro_f1:.4f}")
print(f"Macro F1: {macro_f1:.4f}")

print(classification_report(
    targets,
    preds,
    target_names=EMOTION_COLS,
    zero_division=0
))


def decode_rows(arr):
    """Convert each 0/1 row into the list of emotion names whose entry is 1.

    Args:
        arr: Iterable of rows, each aligned position-by-position with
            EMOTION_COLS.

    Returns:
        A list with one list of emotion names per input row.
    """
    return [
        [EMOTION_COLS[i] for i, v in enumerate(row) if v == 1]
        for row in arr
    ]


# Side-by-side view of the true and predicted emotion names for each test text.
comparison_df = pd.DataFrame({
    "text": X_test.reset_index(drop=True),
    "true_emotions": decode_rows(targets),
    "pred_emotions": decode_rows(preds),
})
comparison_df.head()



Exact-match accuracy: 0.6592
Micro F1: 0.6926
Macro F1: 0.6263
              precision    recall  f1-score   support

  admiration       0.73      0.64      0.68       184
   annoyance       0.42      0.33      0.37       124
   curiosity       0.54      0.50      0.52        90
   gratitude       0.77      0.82      0.79       113
     neutral       0.74      0.79      0.77       536

   micro avg       0.70      0.69      0.69      1047
   macro avg       0.64      0.62      0.63      1047
weighted avg       0.69      0.69      0.69      1047
 samples avg       0.69      0.69      0.69      1047



Exact-match accuracy: 0.0350
Micro F1: 0.0000
Macro F1: 0.0000


Exact-match accuracy: 0.0350
Micro F1: 0.0000
Macro F1: 0.0000
