# Setup

In [1]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False
# For colab
if IN_COLAB:
  import sys
  !git clone https://github.com/elements72/EDiReF-subtask-III.git
  %pip install lightning
  %pip install wandb
  sys.path.append('./EDiReF-subtask-III')
  %cd ./EDiReF-subtask-III
else:
  pass

In [31]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import numpy as np
from utils import train_model
import torch
from baselines import BertBaseline 
import torch

In [4]:
import wandb
# Authenticate this session with Weights & Biases (interactive: may prompt for an API key).
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\Acer\.netrc


True

In [5]:
# Path definitions
# NOTE(review): colab_path is defined but not used in the visible cells;
# data_path is always built from local_path — confirm this is intended on Colab.
colab_path = '/content/drive/MyDrive/NLP/project/'
local_path = './data/'
data_path = Path(local_path)

# Only the training file is loaded; validation and test splits are carved out of it later.
train_data_path = data_path / 'MELD_train_efr.json'
#val_data_path = data_path / 'MELD_val_efr.json'
#test_data_path = data_path / 'MELD_test_efr.json'

## Emotion encoding
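Each emotion label is mapped to a specific integer value. The original intent was to assign positive values to positive emotions and negative values to negative ones; note, however, that the `LabelEncoder` used below assigns consecutive indices (0–6) to the labels in alphabetical order.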

In [6]:
# Load the training dialogues from the JSON file into a DataFrame.
train_data_raw = pd.read_json(train_data_path)
#val_data_raw = pd.read_json(val_data_path)
#test_data_raw = pd.read_json(test_data_path)
# Preview the first rows.
train_data_raw.head()

Unnamed: 0,episode,speakers,emotions,utterances,triggers
0,utterance_0,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise]",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 1.0, 0.0]"
1,utterance_1,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
2,utterance_2,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,utterance_3,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,utterance_4,"[Joey, Rachel, Joey, Rachel]","[surprise, sadness, surprise, fear]",[But then who? The waitress I went out with la...,"[0.0, 0.0, 1.0, 0.0]"


In [11]:
# Data shape of the raw training frame, as (rows, columns)
train_data_raw.shape

(4000, 5)

In [9]:
# Split of the train set: the first 80% of the rows (file order is kept, no shuffling)
# become train_data; the remaining 20% is held in rest_data for the validation/test split.
from sklearn.model_selection import train_test_split
train_data, rest_data = train_test_split(train_data_raw, train_size=0.8, shuffle=False)
train_data.head()

(3200, 5)

In [10]:
# Check the shape of the train set; it must be (3200, 5), i.e. 80% of the 4000 raw rows
train_data.shape

(3200, 5)

In [16]:
# Split of the validation and test set: rest_data is halved in its existing order
# (no shuffling); the first half becomes validation_data, the second half test_data.
validation_data, test_data = train_test_split(rest_data, test_size=0.5, shuffle=False)

In [17]:
# Flatten the per-dialogue emotion lists and collect the distinct labels
# that occur in the training split.
emotions = train_data['emotions'].explode().unique()
print(emotions)

['neutral' 'surprise' 'fear' 'sadness' 'joy' 'disgust' 'anger']


In [21]:
# Encode emotions: fit a LabelEncoder on the labels found in the training split
# and print the resulting label -> integer mapping.
encoder = LabelEncoder()
encoder.fit(emotions)
print("Mapping:" + str(dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))))

Mapping:{'anger': 0, 'disgust': 1, 'fear': 2, 'joy': 3, 'neutral': 4, 'sadness': 5, 'surprise': 6}


In [22]:
def pre_process(data, test=False):
    """Encode emotions, clean trigger labels and drop the ``episode`` column.

    Args:
        data: DataFrame with ``episode``, ``emotions`` and ``triggers`` columns.
            Each ``emotions``/``triggers`` cell holds one list per dialogue.
        test: When True, the ``triggers`` column is left untouched.

    Returns:
        A new DataFrame; ``data`` itself is not modified.
    """
    tmp = data.copy()
    # Map every emotion label to its integer id using the module-level encoder.
    tmp['emotions'] = data['emotions'].apply(lambda labels: encoder.transform(labels))
    if not test:
        # Missing trigger annotations are treated as "not a trigger" (0).
        # `value == np.nan` is always False because NaN never compares equal,
        # so pd.isna is used to catch both None and NaN.
        tmp['triggers'] = data['triggers'].apply(
            lambda values: [0 if pd.isna(value) else value for value in values]
        )
    # Return a new frame instead of dropping in place.
    return tmp.drop(columns=['episode'])

In [23]:
# Apply the preprocessing to the three splits. With test=True the triggers of the
# test split are left as they are.
# NOTE: train_data and test_data are overwritten by their processed versions, so
# re-running this cell without re-running the split cells fails (the 'episode'
# column has already been dropped and the emotions are already encoded).
train_data = pre_process(train_data)
val_data = pre_process(validation_data)
test_data = pre_process(test_data, test=True)

In [24]:
# Count the missing (NaN/None) trigger values left in the training split after preprocessing
train_data['triggers'].explode().isna().sum()

0

In [25]:
class UtteranceDataset(Dataset):
    """Thin ``Dataset`` view over a DataFrame: one row is one sample."""

    def __init__(self, data):
        # The frame is stored by reference, not copied.
        self.data = data

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        n_rows = len(self.data.index)
        return n_rows

    def __getitem__(self, idx):
        """Return the row at integer position ``idx``."""
        row = self.data.iloc[idx]
        return row

In [26]:
# Label ids used for padded positions. Both lie outside the real label ranges
# seen in this notebook (emotions are encoded as 0-6, triggers as 0/1).
padding_value_emotion = 7
padding_value_trigger = 3


# This collate function takes care of adding padding to the sequences
def collate(batch):
    """Merge a list of dialogue samples into one padded batch.

    Args:
        batch: Iterable of samples, each unpacking to
            ``(speakers, emotions, utterances, triggers)``.

    Returns:
        dict with
        - ``'speakers'``: tuple of the per-dialogue speaker lists (unpadded),
        - ``'emotions'``: long tensor padded with ``padding_value_emotion``,
        - ``'utterances'``: tuple of utterance lists, each padded with ``''``
          to the longest dialogue in the batch,
        - ``'triggers'``: long tensor padded with ``padding_value_trigger``.
    """
    speakers, emotions, utterances, triggers = zip(*batch)

    emotions = [torch.tensor(e, dtype=torch.long) for e in emotions]
    triggers = [torch.tensor(t, dtype=torch.long) for t in triggers]

    emotions = torch.nn.utils.rnn.pad_sequence(emotions, batch_first=True, padding_value=padding_value_emotion)
    triggers = torch.nn.utils.rnn.pad_sequence(triggers, batch_first=True, padding_value=padding_value_trigger)

    # Pad with a PAD sentence. New lists are built on purpose: the incoming
    # lists are the very objects stored in the dataset, and appending to them
    # in place would permanently alter the data seen by later batches/epochs.
    max_len_utterances = max(len(u) for u in utterances)
    utterances = tuple(
        list(u) + [''] * (max_len_utterances - len(u)) for u in utterances
    )

    return {
        'speakers': speakers,
        'emotions': emotions,
        'utterances': utterances,
        'triggers': triggers
    }

In [27]:
batch_size = 32
# NOTE(review): this instance is not passed to train_model in the visible cells
# (the BertBaseline class itself is passed) — confirm whether it is still needed.
model = BertBaseline()
train_dataset = UtteranceDataset(train_data)
# Both loaders keep the dataset order (shuffle=False) and rely on the custom
# collate function to pad each batch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=collate)
val_loader = DataLoader(UtteranceDataset(val_data), batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=collate)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.52MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order 

In [28]:
# Training hyperparameters
lr = 1e-5
epochs = 20

# Launch training. The model class (not an instance), a run name, both loaders,
# the epoch count and the hyperparameter dict are handed to train_model.
# NOTE(review): train_model comes from utils and is not visible here — presumably
# it instantiates the model and runs the training loop; confirm.
train_model(BertBaseline, 
            "bert_baseline", 
            train_loader,
            val_loader,
            epochs=epochs, 
            hyperparameters={"lr":lr})

Seed set to 42
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[34m[1mwandb[0m: Current


   | Name                        | Type              | Params
-------------------------------------------------------------------
0  | backbone                    | BertModel         | 109 M 
1  | emotion_clf                 | CLF               | 99.3 K
2  | trigger_clf                 | CLF               | 98.8 K
3  | f1_train_cumulative_emotion | F1ScoreCumulative | 0     
4  | f1_train_cumulative_trigger | F1ScoreCumulative | 0     
5  | f1_train_dialogues_emotion  | F1ScoreDialogues  | 0     
6  | f1_train_dialogues_trigger  | F1ScoreDialogues  | 0     
7  | f1_val_cumulative_emotion   | F1ScoreCumulative | 0     
8  | f1_val_cumulative_trigger   | F1ScoreCumulative | 0     
9  | f1_val_dialogues_emotion    | F1ScoreDialogues  | 0     
10 | f1_val_dialogues_trigger    | F1ScoreDialogues  | 0     
11 | f1_test_cumulative_emotion  | F1ScoreCumulative | 0     
12 | f1_test_cumulative_trigger  | F1ScoreCumulative | 0     
13 | f1_test_dialogues_emotion   | F1ScoreDialogues  | 0     
1

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

c:\Users\Acer\AppData\Local\Programs\Python\Python311\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:11<00:11,  0.09it/s]

c:\Users\Acer\AppData\Local\Programs\Python\Python311\Lib\site-packages\lightning\pytorch\utilities\data.py:77: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 32. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


                                                                           

c:\Users\Acer\AppData\Local\Programs\Python\Python311\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0:  47%|████▋     | 47/100 [11:23<12:51,  0.07it/s, v_num=t2py]

c:\Users\Acer\AppData\Local\Programs\Python\Python311\Lib\site-packages\lightning\pytorch\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...
