# Dataset
In this section we will load and pre-process the data

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import numpy as np
from utils import train_model
import torch

In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [3]:
# Dataset locations.
# NOTE(review): `colab_path` is presumably the data folder when running on
# Google Colab; it is not used in this cell, which always reads from `local_path`.
colab_path = '/content/drive/MyDrive/NLP/project/'
local_path = './data/'
data_path = Path(local_path)

# One JSON file per split, all stored directly under `data_path`.
train_data_path = data_path.joinpath('MELD_train_efr.json')
val_data_path = data_path.joinpath('MELD_val_efr.json')
test_data_path = data_path.joinpath('MELD_test_efr.json')

In [4]:
# Read the three splits (train, validation, test) into raw DataFrames.
train_data_raw, val_data_raw, test_data_raw = (
    pd.read_json(split_path)
    for split_path in (train_data_path, val_data_path, test_data_path)
)

# Preview the first rows of the training split.
train_data_raw.head()

Unnamed: 0,episode,speakers,emotions,utterances,triggers
0,utterance_0,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise]",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 1.0, 0.0]"
1,utterance_1,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
2,utterance_2,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,utterance_3,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,utterance_4,"[Joey, Rachel, Joey, Rachel]","[surprise, sadness, surprise, fear]",[But then who? The waitress I went out with la...,"[0.0, 0.0, 1.0, 0.0]"


## Emotion encoding
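Each emotion label is encoded as an integer id using scikit-learn's `LabelEncoder`, which assigns the ids 0–6 following the alphabetical order of the label names (see the printed mapping below). Note that the ids carry no polarity: positive and negative emotions are not mapped to positive and negative values.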

In [5]:
# Flatten the per-dialogue emotion lists and keep each distinct label once.
flattened_emotions = train_data_raw['emotions'].explode()
emotions = flattened_emotions.unique()
print(emotions)

['neutral' 'surprise' 'fear' 'sadness' 'joy' 'disgust' 'anger']


In [6]:

# Fit a label encoder on the emotion names found in the training split and
# print the resulting label -> integer id mapping.
encoder = LabelEncoder().fit(emotions)
label_to_id = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print("Mapping:" + str(label_to_id))

Mapping:{'anger': 0, 'disgust': 1, 'fear': 2, 'joy': 3, 'neutral': 4, 'sadness': 5, 'surprise': 6}


In [7]:
def pre_process(data, test=False, emotion_encoder=None):
    """Return a pre-processed copy of a raw split; `data` itself is not modified.

    Steps:
      * every list in the 'emotions' column is passed through the encoder's
        ``transform`` method;
      * unless ``test`` is true, missing values in the 'triggers' lists are
        replaced by 0 and the remaining values are cast to ``int``;
      * the 'episode' column is dropped.

    Args:
        data: DataFrame with at least the columns 'episode' and 'emotions',
            plus 'triggers' when ``test`` is false.
        test: when true, the 'triggers' column is left untouched.
        emotion_encoder: object exposing ``transform(labels)``. Defaults to the
            notebook-level ``encoder``.

    Returns:
        A new DataFrame with the transformations above applied.
    """
    if emotion_encoder is None:
        emotion_encoder = encoder  # notebook-level encoder fitted on the training labels

    processed = data.copy()
    processed['emotions'] = data['emotions'].apply(lambda labels: emotion_encoder.transform(labels))
    if not test:
        # NaN never compares equal to anything (including itself), so the check
        # must use pd.isna; `flag == np.nan` is always False and would turn
        # every label into 1.
        processed['triggers'] = data['triggers'].apply(
            lambda flags: [0 if pd.isna(flag) else int(flag) for flag in flags]
        )
    return processed.drop(columns=['episode'])

In [8]:
# Pre-process every split. Only the test split is processed with `test=True`,
# which makes `pre_process` leave its 'triggers' column untouched.
train_data = pre_process(train_data_raw, test=False)
val_data = pre_process(val_data_raw, test=False)
test_data = pre_process(test_data_raw, test=True)

In [9]:
# Sanity check: count the trigger labels that are still missing in the
# pre-processed training split (expected to be 0).
flattened_triggers = train_data['triggers'].explode()
flattened_triggers.isna().sum()

0

In [10]:
class UtteranceDataset(Dataset):
    """Thin `Dataset` wrapper that serves the rows of a DataFrame by position."""

    def __init__(self, data):
        """Keep a reference to `data`; no copy is made."""
        self.data = data

    def __len__(self):
        """Return the number of rows of the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the row at integer position `idx`, as given by `.iloc`."""
        row = self.data.iloc[idx]
        return row

In [11]:
# Load IPython's autoreload extension and select mode 2, in which imported
# modules are reloaded before each cell is executed, so that edits to imported
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [12]:
from baselines import BertBaseline 
import torch

In [13]:
def collate(batch):
    """Merge a list of samples into one batch, padding the label sequences.

    Every sample must unpack into four fields, in this order: speakers,
    emotions, utterances, triggers. Speakers and utterances are returned
    untouched as tuples (one entry per sample). Emotions and triggers are
    converted to tensors and right-padded with -1 up to the longest sequence
    in the batch, giving tensors whose first dimension is the batch.
    """
    speakers, emotions, utterances, triggers = zip(*batch)

    def _to_padded(sequences):
        # One tensor per sample, then pad them all to a common length.
        as_tensors = list(map(torch.tensor, sequences))
        return torch.nn.utils.rnn.pad_sequence(as_tensors, batch_first=True, padding_value=-1)

    padded_emotions = _to_padded(emotions)
    padded_triggers = _to_padded(triggers)
    return {
        'speakers': speakers,
        'emotions': padded_emotions,
        'utterances': utterances,
        'triggers': padded_triggers,
    }

In [14]:
# Settings shared by the training and validation loaders: small batches, no
# shuffling, two worker processes, and the padding-aware `collate` function.
loader_kwargs = dict(batch_size=2, shuffle=False, num_workers=2, collate_fn=collate)

model = BertBaseline()
train_dataset = UtteranceDataset(train_data)
train_loader = DataLoader(train_dataset, **loader_kwargs)
val_loader = DataLoader(UtteranceDataset(val_data), **loader_kwargs)

In [15]:
# Smoke test: call the model on the first training batch only, to check that
# the collated batch format is accepted before starting a full training run.
for batch in train_loader:
    model(batch)
    break  # a single batch is enough for this check

In [16]:
# Fix the sources of randomness that PyTorch controls.
# Request deterministic algorithms. With `warn_only=True`, operations that have
# no deterministic implementation emit a warning instead of raising a
# RuntimeError, so training can still proceed.
torch.use_deterministic_algorithms(True, warn_only=True)

# Disable cuDNN auto-tuning (benchmarking may select different kernels between
# runs) and restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [17]:
# Training configuration for the BERT baseline.
# NOTE(review): 1e-3 is a high learning rate if all BERT weights are trained
# (the summary reports 109 M trainable parameters) - confirm it is intended.
lr = 1e-3
epochs = 2
hyperparameters = {"lr": lr}

# The model class itself (not an instance) is handed to `train_model`, together
# with the run name and the two data loaders.
train_model(
    BertBaseline,
    "bert_baseline",
    train_loader,
    val_loader,
    epochs=epochs,
    hyperparameters=hyperparameters,
)

Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type      | Params
------------------------------------------
0 | model       | BertModel | 109 M 
1 | emotion_clf | CLF       | 99.2 K
2 | trigger_clf | CLF       | 98.6 K
------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
438.720   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


torch.Size([2, 11, 6])
torch.Size([2, 11])
torch.Size([2, 11])
torch.Size([2, 11])


RuntimeError: nll_loss2d_forward_out_cuda_template does not have a deterministic implementation, but you set 'torch.use_deterministic_algorithms(True)'. You can turn off determinism just for this operation, or you can use the 'warn_only=True' option, if that's acceptable for your application. You can also file an issue at https://github.com/pytorch/pytorch/issues to help us prioritize adding deterministic support for this operation.