In [1]:
from datasets import load_dataset
import pandas as pd

# Load GoEmotions
dataset = load_dataset("go_emotions")

# Check the keys (splits)
print(dataset.keys())  # ['train', 'validation', 'test']

  from .autonotebook import tqdm as notebook_tqdm


dict_keys(['train', 'validation', 'test'])


In [2]:
# View one sample from the training set
print(dataset['train'][4])

{'text': 'Dirty Southern Wankers', 'labels': [3], 'id': 'ed0bdzj'}


In [3]:
# List of all emotions 
label_names = dataset['train'].features['labels'].feature.names
print(label_names)

['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [4]:
# Get the list of emotion labels
label_names = dataset['train'].features['labels'].feature.names

# Example: Get emotion names for the first sample
sample_labels = dataset['train'][0]['labels']
sample_emotions = [label_names[i] for i in sample_labels]
print(sample_emotions)

['neutral']


In [5]:
# Check a few samples
for i in range(5):
    labels = dataset['train'][i]['labels']
    emotions = [label_names[j] for j in labels]
    print(f"Sample {i} emotions: {emotions}")

Sample 0 emotions: ['neutral']
Sample 1 emotions: ['neutral']
Sample 2 emotions: ['anger']
Sample 3 emotions: ['fear']
Sample 4 emotions: ['annoyance']


#### Now have to decide how to tackle this data, either: 

1) Option 1: Single-Label SVM

- Simplify and just use the first emotion (or only those samples with one label), then use SVM exactly like we did for Reddit - or - 

2) Option 2: Multi-Label Classifier (Optional for later)

- If we want to try multi-label classification later, you'd use:

    - A OneVsRestClassifier wrapper around SVC

    - Or switch to a deep learning model with sigmoid outputs

In [6]:
# Let's go with Option 1: Single-Label SVM for now 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Get only examples with ONE label
df = pd.DataFrame(dataset['train'])
df = df[df['labels'].apply(lambda x: len(x) == 1)]
df['label'] = df['labels'].apply(lambda x: x[0])
df['label_name'] = df['label'].apply(lambda i: label_names[i])

# TF-IDF
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['text'])
y = df['label']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# Train
clf = LinearSVC()
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=label_names))

                precision    recall  f1-score   support

    admiration       0.63      0.57      0.60       678
     amusement       0.71      0.79      0.75       413
         anger       0.42      0.30      0.35       256
     annoyance       0.34      0.09      0.14       363
      approval       0.48      0.13      0.20       468
        caring       0.38      0.14      0.20       162
     confusion       0.54      0.12      0.20       215
     curiosity       0.67      0.09      0.15       347
        desire       0.57      0.34      0.43        97
disappointment       0.47      0.05      0.09       177
   disapproval       0.24      0.03      0.05       351
       disgust       0.57      0.32      0.41       125
 embarrassment       0.57      0.16      0.25        51
    excitement       0.40      0.11      0.17       128
          fear       0.57      0.47      0.51       107
     gratitude       0.92      0.88      0.90       464
         grief       0.00      0.00      0.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
import matplotlib.pyplot as plt
import seaborn

ModuleNotFoundError: No module named 'matplotlib'