# EDA for Google Research GoEmotions dataset
```
conda install datasets
```

Labels:  
  '0': admiration  
  '1': amusement  
  '2': anger  
  '3': annoyance  
  '4': approval  
  '5': caring  
  '6': confusion  
  '7': curiosity  
  '8': desire  
  '9': disappointment  
  '10': disapproval  
  '11': disgust  
  '12': embarrassment  
  '13': excitement  
  '14': fear  
  '15': gratitude  
  '16': grief  
  '17': joy  
  '18': love  
  '19': nervousness  
  '20': optimism  
  '21': pride  
  '22': realization  
  '23': relief  
  '24': remorse  
  '25': sadness  
  '26': surprise  
  '27': neutral     

In [None]:
from datasets import load_dataset
import pandas as pd

# Both configurations are pulled from the same Hub repository; keep its id in one place.
GO_EMOTIONS_REPO = "google-research-datasets/go_emotions"

# "simplified" is the configuration analysed in the cells below.
dataset = load_dataset(GO_EMOTIONS_REPO, "simplified")
# The "raw" configuration is loaded alongside it under its own name.
raw_dataset = load_dataset(GO_EMOTIONS_REPO, "raw")

In [None]:
# Quick look at what was loaded: the split overview, the columns and one example.
train_split = dataset['train']
print(f"dataset: {dataset}")
print("Column names: ", train_split.column_names)
print("First dataset entry: ", train_split[0])

# Human-readable label names; the integer ids stored in the 'labels' column
# are used as positions into this list further down.
labels = train_split.features['labels'].feature.names
print(f"labels: {labels}")

In [None]:
# Load the train split into a pandas DataFrame for the usual pandas workflow.
# The analysis cells below read its 'text' and 'labels' columns.
train_df = pd.DataFrame(dataset['train'])

# Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme()
import warnings; warnings.filterwarnings("ignore")
import numpy as np
import copy

In [None]:
# Count how often each distinct 'labels' entry occurs, in absolute and relative terms.
label_series = train_df["labels"]
value_counts = label_series.value_counts()
relative_value_counts = label_series.value_counts(normalize=True)
print(labels)

# Translate label ids into label names.
# Entries carrying more than one label are skipped, so the resulting counts
# (and the plot below) cover single-label comments only.
renamed_value_counts = {
    labels[label_ids[0]]: count
    for label_ids, count in value_counts.items()
    if len(label_ids) <= 1
}

# Bar chart of the label distribution, in the order produced by value_counts.
label_names = list(renamed_value_counts)
label_totals = list(renamed_value_counts.values())
fig, ax = plt.subplots(figsize=(10, 3))
sns.barplot(x=label_names, y=label_totals, ax=ax)
ax.set_xticklabels(label_names, rotation=45, fontsize=12, ha="right")
ax.set_title("Label distribution", fontsize=16)
ax.set_ylabel("# of comments", fontsize=14)
plt.show()

The `'neutral'` label is by far the most frequent one, so the distribution is plotted again without it to show the remaining labels more clearly.

In [None]:
# Re-draw the label distribution with the 'neutral' bar removed.
# Work on a copy so that renamed_value_counts itself stays intact.
incomplete_value_counts = dict(renamed_value_counts)
incomplete_value_counts.pop('neutral')

remaining_names = list(incomplete_value_counts)
remaining_totals = list(incomplete_value_counts.values())

fig, ax = plt.subplots(figsize=(10, 3))
sns.barplot(x=remaining_names, y=remaining_totals, ax=ax)
ax.set_xticklabels(remaining_names, rotation=45, fontsize=12, ha="right")
ax.set_title("Label distribution (sans `neutral`)", fontsize=16)
ax.set_ylabel("# of comments", fontsize=14)
plt.show()

# What is the length distribution of the training examples in this dataset?


In [None]:
# Length of every training comment, measured in characters (not tokens).
lengths = train_df['text'].str.len()
print(f"Average dataset sentence length: {lengths.mean():.2f} +- {lengths.std():.2f}")

# Histogram of the comment lengths.
plt.hist(lengths, bins=100, label="GoEmotions dataset")
plt.xlabel("Input length [symbols]")
plt.ylabel("# of entries")
# Restrict the view to 0-250 characters; longer comments fall outside the plot.
plt.xlim([0, 250])
plt.legend()
# tight_layout must run after the labels and legend exist,
# otherwise they are not taken into account when the margins are computed.
plt.tight_layout()

plt.show()


## Label Co-occurrence analysis
This co-occurrence map only considers the multi-labeled examples of the dataset.

In [None]:
from itertools import combinations

# Keep only the comments that carry at least two labels; only those can form a pair.
all_label_lists = train_df['labels']
stripped_labels = all_label_lists[all_label_lists.apply(len) > 1]

# Count every label pair once in the order it appears ...
n_labels = len(labels)
directed_counts = np.zeros((n_labels, n_labels))
for label_instances in stripped_labels:
    for first_id, second_id in combinations(label_instances, 2):
        directed_counts[first_id, second_id] += 1

# ... then mirror the counts so that the matrix is symmetric.
co_occurrence_matrix = directed_counts + directed_counts.T

# Label the rows and columns with the label names for plotting.
co_occurrence_df = pd.DataFrame(co_occurrence_matrix, index=labels, columns=labels)

# Heatmap of how often two labels are assigned to the same comment.
plt.figure(figsize=(15, 10))
sns.heatmap(co_occurrence_df, cmap="YlGnBu")
plt.xlabel("Labels")
plt.ylabel("Labels")
plt.title("Label Co-occurrence Matrix")
plt.show()

## Top words per sentiment in a wordcloud

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Select the comments that carry exactly one label; multi-labeled ones are left out.
# A boolean mask with .loc selects by index label, so this stays correct even if
# train_df does not have a default 0..n-1 index.
is_single_label = train_df['labels'].apply(len) == 1
single_labels = train_df.loc[is_single_label, 'labels']

# Build a new frame whose 'labels' column holds the plain label id (e.g. 8)
# instead of a one-element list (e.g. [8]). Using .assign creates a fresh frame
# rather than writing into a slice of train_df.
single_df = train_df.loc[is_single_label].assign(
    labels=single_labels.map(lambda label_ids: label_ids[0])
)

# Usernames are anonymised in the data, so the placeholder word "NAME" shows up
# a lot; treat it as a stopword on top of the wordcloud defaults.
custom_stopwords = STOPWORDS | {"NAME"}

# One word cloud per label, built from the single-labeled comments of that label.
for label, label_name in enumerate(labels):
    text = single_df.loc[single_df['labels'] == label, 'text']
    if text.empty:
        # Nothing to draw for this label; generating a cloud needs at least one word.
        continue

    cloud = WordCloud(
        stopwords=custom_stopwords, background_color="black", collocations=False,
        width=500, height=300).generate(" ".join(text))

    plt.figure(figsize=(10, 3))
    plt.imshow(cloud)
    plt.axis("off")
    plt.title(f"{label_name}")
    # Show each figure as soon as it is drawn, like the other plotting cells do,
    # instead of keeping one open figure per label until the cell finishes.
    plt.show()
