# Dataset
In this section we load and pre-process the data.

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import numpy as np
from utils import train_model
import torch

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Dataset locations.
# `colab_path` is kept for runs on Google Colab; `local_path` is the one actually used here.
colab_path = '/content/drive/MyDrive/NLP/project/'
local_path = './data/'
data_path = Path(local_path)

# One JSON file per split, all living directly under `data_path`.
train_data_path = data_path.joinpath('MELD_train_efr.json')
val_data_path = data_path.joinpath('MELD_val_efr.json')
test_data_path = data_path.joinpath('MELD_test_efr.json')

In [4]:
# Read the three JSON splits into DataFrames, then preview the training split.
train_data_raw, val_data_raw, test_data_raw = (
    pd.read_json(split_path)
    for split_path in (train_data_path, val_data_path, test_data_path)
)
train_data_raw.head()

Unnamed: 0,episode,speakers,emotions,utterances,triggers
0,utterance_0,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise]",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 1.0, 0.0]"
1,utterance_1,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
2,utterance_2,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,utterance_3,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,utterance_4,"[Joey, Rachel, Joey, Rachel]","[surprise, sadness, surprise, fear]",[But then who? The waitress I went out with la...,"[0.0, 0.0, 1.0, 0.0]"


## Emotion encoding
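Each emotion label is mapped to an integer class index with scikit-learn's `LabelEncoder`. The indices follow the alphabetical order of the labels (see the mapping printed below), so they are plain class identifiers: the sign or magnitude of an index does not indicate whether the emotion is positive or negative.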

In [5]:
# Flatten the per-dialogue emotion lists and keep each distinct label once.
emotions = pd.unique(train_data_raw['emotions'].explode())
print(emotions)

['neutral' 'surprise' 'fear' 'sadness' 'joy' 'disgust' 'anger']


In [6]:

# Fit the label encoder on the emotion vocabulary and print the label -> index table.
encoder = LabelEncoder().fit(emotions)
print("Mapping:" + str({
    label: code
    for label, code in zip(encoder.classes_, encoder.transform(encoder.classes_))
}))

Mapping:{'anger': 0, 'disgust': 1, 'fear': 2, 'joy': 3, 'neutral': 4, 'sadness': 5, 'surprise': 6}


In [7]:
def pre_process(data, test=False, label_encoder=None):
    """Return a pre-processed copy of a raw data split.

    The input frame is never modified. The returned frame has:

    * ``emotions``: every list of labels replaced by the output of
      ``label_encoder.transform`` for that list;
    * ``triggers``: missing entries (``None`` or NaN) replaced by ``0``
      (skipped when ``test`` is true);
    * the ``episode`` column removed.

    Args:
        data: DataFrame with at least the columns ``episode`` and ``emotions``,
            plus ``triggers`` when ``test`` is false.
        test: When true, the ``triggers`` column is left untouched.
        label_encoder: Object exposing ``transform(labels)``. Defaults to the
            notebook-level ``encoder``.

    Returns:
        A new DataFrame; ``data`` itself is left unchanged.
    """
    if label_encoder is None:
        label_encoder = encoder

    processed = data.copy()
    processed['emotions'] = data['emotions'].apply(label_encoder.transform)
    if not test:
        # `value == np.nan` is always False (NaN never compares equal to anything),
        # so missing values must be detected with pd.isna, which also covers None.
        processed['triggers'] = data['triggers'].apply(
            lambda flags: [0 if pd.isna(flag) else flag for flag in flags]
        )
    # Return a new frame instead of mutating in place.
    return processed.drop(columns=['episode'])

In [8]:
# Train and validation splits get the full treatment (including trigger clean-up);
# the test split is processed with test=True, which leaves its triggers as they are.
train_data, val_data = (pre_process(split) for split in (train_data_raw, val_data_raw))
test_data = pre_process(test_data_raw, test=True)

In [9]:
# Sanity check: number of trigger entries that are still missing after pre-processing
pd.isna(train_data['triggers'].explode()).sum()

0

In [10]:
class UtteranceDataset(Dataset):
    """Expose the rows of a DataFrame as items of a torch ``Dataset``.

    The frame is stored by reference (no copy); item ``i`` is the ``i``-th row
    by position, as returned by ``DataFrame.iloc``.
    """

    def __init__(self, data):
        # Kept as-is: rows are looked up lazily in __getitem__.
        self.data = data

    def __len__(self):
        # Number of rows in the wrapped frame.
        return self.data.shape[0]

    def __getitem__(self, idx):
        # Positional (not label-based) row access.
        row = self.data.iloc[idx]
        return row

In [11]:
%load_ext autoreload
%autoreload 2

In [12]:
from baselines import BertBaseline 
import torch

In [13]:
# Batch-building function for the DataLoader: pads the label sequences to a common length
def collate(batch):
    """Merge a list of samples into one batch dictionary.

    Every sample must unpack into four fields, in this order: speakers,
    emotions, utterances, triggers.

    Returns a dict with the keys:
        'speakers'   - tuple with each sample's speakers, unchanged
        'emotions'   - long tensor, samples stacked along dim 0, padded with -1
        'utterances' - tuple with each sample's utterances, unchanged
        'triggers'   - long tensor, samples stacked along dim 0, padded with -1
    """
    speakers, emotions, utterances, triggers = zip(*batch)

    def _stack_padded(sequences):
        # Convert each sequence to a long tensor, then right-pad with -1 so
        # that all rows share the length of the longest one.
        as_tensors = [torch.tensor(seq, dtype=torch.long) for seq in sequences]
        return torch.nn.utils.rnn.pad_sequence(as_tensors, batch_first=True, padding_value=-1)

    padded_emotions = _stack_padded(emotions)
    padded_triggers = _stack_padded(triggers)
    return dict(
        speakers=speakers,
        emotions=padded_emotions,
        utterances=utterances,
        triggers=padded_triggers,
    )

In [14]:
# NOTE(review): the training cell below passes the BertBaseline class itself to
# train_model, not this instance - confirm whether `model` is still needed.
model = BertBaseline()

# Both loaders share the same settings: small fixed-order batches, loaded in the
# main process, padded by `collate`.
loader_kwargs = dict(batch_size=2, shuffle=False, num_workers=0, collate_fn=collate)
train_dataset = UtteranceDataset(train_data)
train_loader = DataLoader(train_dataset, **loader_kwargs)
val_loader = DataLoader(UtteranceDataset(val_data), **loader_kwargs)

In [15]:
# Switches for fixing sources of randomness - currently DISABLED (left commented out):
# torch.use_deterministic_algorithms(True, warn_only=True)

# torch.backends.cudnn.benchmark = False
# torch.backends.cudnn.deterministic = True


# Smoke test: print only the first collated training batch to inspect its structure.
for batch in train_loader:
    print(batch)
    break

{'speakers': (['Chandler', 'The Interviewer', 'Chandler', 'The Interviewer', 'Chandler'], ['Chandler', 'The Interviewer', 'Chandler', 'The Interviewer', 'Chandler', 'The Interviewer', 'Chandler']), 'emotions': tensor([[ 4,  4,  4,  4,  6, -1, -1],
        [ 4,  4,  4,  4,  6,  4,  4]]), 'utterances': (["also I was the point person on my company's transition from the KL-5 to GR-6 system.", "You must've had your hands full.", 'That I did. That I did.', "So let's talk a little bit about your duties.", 'My duties?  All right.'], ["also I was the point person on my company's transition from the KL-5 to GR-6 system.", "You must've had your hands full.", 'That I did. That I did.', "So let's talk a little bit about your duties.", 'My duties?  All right.', "Now you'll be heading a whole division, so you'll have a lot of duties.", 'I see.']), 'triggers': tensor([[ 0,  0,  0,  1,  0, -1, -1],
        [ 0,  0,  0,  0,  0,  1,  0]])}


In [16]:
# Training settings for the baseline run
lr = 1e-3
epochs = 2

# train_model receives the model class (not an instance) together with a run name,
# the two loaders, the number of epochs and the hyperparameter dictionary.
train_model(
    BertBaseline,
    "bert_baseline",
    train_loader,
    val_loader,
    epochs=epochs,
    hyperparameters=dict(lr=lr),
)

Seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name        | Type      | Params
------------------------------------------
0 | model       | BertModel | 109 M 
1 | emotion_clf | CLF       | 99.3 K
2 | trigger_clf | CLF       | 98.8 K
------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
438.722   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

C:\Users\Matteo\AppData\Local\Programs\Python\Python310\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


torch.Size([2, 11, 7])
torch.Size([2, 11])
torch.Size([2, 3, 11])
torch.Size([2, 11])


C:\Users\Matteo\AppData\Local\Programs\Python\Python310\lib\site-packages\lightning\pytorch\utilities\data.py:77: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 2. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


torch.Size([2, 14, 7])
torch.Size([2, 14])
torch.Size([2, 3, 14])
torch.Size([2, 14])


C:\Users\Matteo\AppData\Local\Programs\Python\Python310\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

C:\Users\Matteo\AppData\Local\Programs\Python\Python310\lib\site-packages\lightning\pytorch\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...
