# Dataset
In this section we load and pre-process the data.

In [3]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import OrdinalEncoder

In [4]:
# Path definitions
# Two candidate dataset folders are declared; only `local_path` is used to
# build `data_path` here, `colab_path` is kept as an alternative.
local_path = './data/'
colab_path = '/content/drive/MyDrive/NLP/project/'
data_path = Path(local_path)

# One JSON file per split, each located directly under `data_path`
train_data_path = data_path.joinpath('MELD_train_efr.json')
val_data_path = data_path.joinpath('MELD_val_efr.json')
test_data_path = data_path.joinpath('MELD_test_efr.json')

In [23]:
# Read the training split from its JSON file into a DataFrame, then preview
# the first five rows (the default of `head`, spelled out here)
train_data_raw = pd.read_json(train_data_path)
train_data_raw.head(n=5)

Unnamed: 0,episode,speakers,emotions,utterances,triggers
0,utterance_0,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise]",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 1.0, 0.0]"
1,utterance_1,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
2,utterance_2,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,utterance_3,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,utterance_4,"[Joey, Rachel, Joey, Rachel]","[surprise, sadness, surprise, fear]",[But then who? The waitress I went out with la...,"[0.0, 0.0, 1.0, 0.0]"


## Emotion encoding
The emotions are encoded by mapping each of them to a specific integer value. Positive emotions are assigned positive values, negative emotions are assigned negative values, and `neutral` is mapped to 0.

In [24]:
# Each row of the 'emotions' column holds a list of labels: flatten the lists
# to one label per row, then keep the distinct values
emotions = (
    train_data_raw['emotions']
    .explode()
    .unique()
)
print(emotions)

['neutral' 'surprise' 'fear' 'sadness' 'joy' 'disgust' 'anger']


In [25]:
# Encode emotions
# Integer code assigned to each emotion label found in the training data
encoding = {
    "neutral": 0,
    "joy": 1,
    "sadness": -1,
    "anger": -2,
    "fear": -3,
    "disgust": -4,
    "surprise": 2
}

# Derive the encoded frame from `train_data_raw` rather than updating a
# `train_data` variable that no earlier cell defines. `assign` returns a new
# DataFrame, so the raw frame stays untouched and this cell can be re-run safely.
# A label missing from `encoding` raises KeyError instead of being dropped silently.
train_data = train_data_raw.assign(
    emotions=train_data_raw['emotions'].apply(
        lambda labels: [encoding[label] for label in labels]
    )
)

In [26]:
# Preview the first five rows of the encoded training frame
train_data.head(n=5)

Unnamed: 0,episode,speakers,emotions,utterances,triggers
0,utterance_0,"[Chandler, The Interviewer, Chandler, The Inte...","[0, 0, 0, 0, 2]",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 1.0, 0.0]"
1,utterance_1,"[Chandler, The Interviewer, Chandler, The Inte...","[0, 0, 0, 0, 2, 0, 0]",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
2,utterance_2,"[Chandler, The Interviewer, Chandler, The Inte...","[0, 0, 0, 0, 2, 0, 0, 0, 0, 0, -3]",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,utterance_3,"[Chandler, The Interviewer, Chandler, The Inte...","[0, 0, 0, 0, 2, 0, 0, 0, 0, 0, -3, 0, 2]",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,utterance_4,"[Joey, Rachel, Joey, Rachel]","[2, -1, 2, -3]",[But then who? The waitress I went out with la...,"[0.0, 0.0, 1.0, 0.0]"
