In [3]:
from datasets import load_dataset
import pandas as pd

# Load the GoEmotions dataset by its dataset id. Ending the cell on `ds`
# displays the available splits with their columns and row counts.
ds = load_dataset("google-research-datasets/go_emotions")
ds


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})

In [4]:
# Peek at the first training record to see the row layout:
# raw text, a list of integer label ids, and a string id.
ds["train"][0]


{'text': "My favourite food is anything I didn't have to cook myself.",
 'labels': [27],
 'id': 'eebbqej'}

In [5]:
# Label names taken from the `labels` feature of the train split. The list is
# indexed by the integer ids stored in `labels` (id2label[i] is the name of id i).
id2label = ds["train"].features["labels"].feature.names
id2label

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [6]:
def to_df(split):
    """Return a DataFrame with the ``text`` and ``labels`` columns of one split.

    Args:
        split: Key of the split to read from the notebook-level ``ds``
            (the call in this notebook uses ``"train"``).

    Returns:
        A ``pandas.DataFrame`` with columns ``text`` and ``labels``, in that
        order; other columns of the split are not copied.
    """
    split_data = ds[split]
    wanted_columns = ("text", "labels")
    return pd.DataFrame({column: split_data[column] for column in wanted_columns})

# Materialise the train split as a DataFrame and preview the first rows.
df_train = to_df("train")
df_train.head()

Unnamed: 0,text,labels
0,My favourite food is anything I didn't have to...,[27]
1,"Now if he does off himself, everyone will thin...",[27]
2,WHY THE FUCK IS BAYLESS ISOING,[2]
3,To make her feel threatened,[14]
4,Dirty Southern Wankers,[3]


In [7]:
# Number of label ids attached to each sample, then how often each count occurs.
labels_per_sample = df_train["labels"].map(len)
df_train["n_labels"] = labels_per_sample
df_train["n_labels"].value_counts()

n_labels
1    36308
2     6541
3      532
4       28
5        1
Name: count, dtype: int64

In [8]:
# Show a few samples that carry more than one label.
has_multiple_labels = df_train["n_labels"] > 1
df_train.loc[has_multiple_labels].head(5)

Unnamed: 0,text,labels,n_labels
7,We need more boards and to create a bit more s...,"[8, 20]",2
11,"Aww... she'll probably come around eventually,...","[1, 4]",2
15,"Shit, I guess I accidentally bought a Pay-Per-...","[3, 12]",2
19,Maybe that’s what happened to the great white ...,"[6, 22]",2
20,"I never thought it was at the same moment, but...","[6, 9, 27]",3


In [16]:
# Coarse classes in priority order: the first group that contains any of a
# sample's fine-grained labels wins, which is how multi-label rows are resolved.
_COARSE_CLASS_PRIORITY = (
    ("anger", frozenset({"anger", "annoyance"})),
    # "contempt" does not appear in the label names listed by `id2label`; it is
    # kept so the mapping still applies if a label list containing it is passed.
    ("disgust", frozenset({"disgust", "disapproval", "contempt"})),
    ("fear", frozenset({"fear", "nervousness"})),
    ("sadness", frozenset({"sadness", "grief", "disappointment", "remorse"})),
    ("joy", frozenset({"joy", "amusement", "love", "excitement"})),
    ("surprise", frozenset({"surprise", "realization"})),
)


def map_to_six_classes(label_ids, label_names=None):
    """Collapse a sample's fine-grained label ids into a single coarse class.

    Despite the name, seven values can be returned: the six emotion classes
    ``"anger"``, ``"disgust"``, ``"fear"``, ``"sadness"``, ``"joy"``,
    ``"surprise"``, plus ``"neutral"`` as the fallback.

    When a sample has several labels, the groups are checked in the order
    listed above and the first match wins (e.g. a sample tagged with both
    ``joy`` and ``anger`` maps to ``"anger"``). Any label that belongs to no
    group (for example ``gratitude`` or ``admiration``) contributes nothing,
    so a sample with only such labels, or with no labels, maps to
    ``"neutral"``.

    Args:
        label_ids: Iterable of integer label ids for one sample.
        label_names: Optional sequence mapping id -> label name. Defaults to
            the notebook-level ``id2label`` so existing calls keep working.

    Returns:
        The coarse class name as a string.

    Raises:
        IndexError: If an id is outside the range of ``label_names``.
    """
    if label_names is None:
        # Resolved at call time so the function follows the notebook's current list.
        label_names = id2label

    present = {label_names[i] for i in label_ids}
    for coarse_class, members in _COARSE_CLASS_PRIORITY:
        if not members.isdisjoint(present):
            return coarse_class
    return "neutral"

In [17]:
# Attach the coarse class to every sample and preview it next to the raw labels.
df_train["mapped"] = df_train["labels"].map(map_to_six_classes)
preview_columns = ["text", "labels", "mapped"]
df_train[preview_columns].head(10)


Unnamed: 0,text,labels,mapped
0,My favourite food is anything I didn't have to...,[27],neutral
1,"Now if he does off himself, everyone will thin...",[27],neutral
2,WHY THE FUCK IS BAYLESS ISOING,[2],anger
3,To make her feel threatened,[14],fear
4,Dirty Southern Wankers,[3],anger
5,OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe...,[26],surprise
6,Yes I heard abt the f bombs! That has to be wh...,[15],neutral
7,We need more boards and to create a bit more s...,"[8, 20]",neutral
8,Damn youtube and outrage drama is super lucrat...,[0],neutral
9,It might be linked to the trust factor of your...,[27],neutral


In [18]:
# Class balance after the mapping: number of training samples per coarse class.
df_train["mapped"].value_counts()

mapped
neutral     25853
joy          6147
anger        3768
sadness      2678
disgust      2449
surprise     1824
fear          691
Name: count, dtype: int64