# Setup

In [1]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False
# For colab
if IN_COLAB:
  import sys
  !git clone https://github.com/elements72/EDiReF-subtask-III.git
  %pip install lightning
  %pip install wandb
  sys.path.append('./EDiReF-subtask-III')
  %cd ./EDiReF-subtask-III
  num_workers = 2
else:
  num_workers = 0
  pass

In [2]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import numpy as np
from utils import train_model
import torch
from baselines import BertBaseline
import torch
from sklearn.model_selection import train_test_split


Using device: cpu


In [3]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mmatteo-vannucchi11[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Dataset

In [4]:
# Path definitions
local_path = './data/'
data_path = Path(local_path)

dataset_path = data_path / 'MELD_efr.json'
train_data_path = data_path / 'MELD_train_efr.json'
val_data_path = data_path / 'MELD_val_efr.json'       
test_data_path = data_path / 'MELD_test_efr.json'

In [5]:
random_state = 42
# Check if val and test sets are available
if not val_data_path.exists() or not test_data_path.exists() or not train_data_path.exists():
  print('Generating val and test sets...')
  data = pd.read_json(dataset_path)
  # Split of the train set
  train_data_raw, tmp = train_test_split(data, train_size=0.8, shuffle=True, random_state=random_state)
  # Split of the validation and test set
  validation_data_raw, test_data_raw = train_test_split(tmp, test_size=0.5, shuffle=True, random_state=random_state)
  # Save the data
  train_data_raw.to_json(train_data_path, indent=2, orient="records", force_ascii=False)
  validation_data_raw.to_json(val_data_path, indent=2, orient="records", force_ascii=False)
  test_data_raw.to_json(test_data_path, indent=2, orient="records", force_ascii=False)
else:
  # Load the data
  print('Loading data...')
  train_data_raw = pd.read_json(train_data_path)
  validation_data_raw = pd.read_json(val_data_path)
  test_data_raw = pd.read_json(test_data_path)

Loading data...


In [6]:
print('Train data shape: ', train_data_raw.shape)
print('Validation data shape: ', validation_data_raw.shape)
print('Test data shape: ', test_data_raw.shape)

Train data shape:  (3200, 5)
Validation data shape:  (400, 5)
Test data shape:  (400, 5)


In [7]:
train_data_raw.head()

Unnamed: 0,episode,speakers,emotions,utterances,triggers
0,utterance_3994,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[neutral, joy, neutral, neutral, surprise, dis...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
1,utterance_423,"[Monica, Monica]","[surprise, fear]","[And y'know what, I just realised, in the last...","[1.0, 1.0]"
2,utterance_2991,"[Rachel, Mr. Treeger, Rachel]","[fear, neutral, joy]","[Oh! Hey, Mr. Treeger., : What are you doing?,...","[1.0, 0.0, 1.0]"
3,utterance_1221,"[Chandler, Monica, Chandler, Monica, Chandler,...","[neutral, neutral, neutral, surprise, sadness,...","[Okay to come in?, Yeah, come on, eat, whateve...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,utterance_506,"[Ross, Chandler, Ross, Chandler]","[neutral, neutral, neutral, joy]","[Ok, bye. Well, Monica's not coming, it's jus...","[1.0, 0.0, 0.0, 0.0]"


## Emotion encoding
The emotions are encoded mapping each of them with a specific value. Positive emotion are assigned to positive values the negatives with negatives

In [8]:
emotions = test_data_raw['emotions'].explode().unique()
print(emotions)

['neutral' 'sadness' 'joy' 'surprise' 'anger' 'disgust' 'fear']


In [9]:
# Encode emotions
encoder = LabelEncoder()
encoder.fit(emotions)
print("Mapping:" + str(dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))))

Mapping:{'anger': 0, 'disgust': 1, 'fear': 2, 'joy': 3, 'neutral': 4, 'sadness': 5, 'surprise': 6}


In [10]:
def pre_process(data):
    tmp = data.copy()
    tmp['emotions'] = data['emotions'].apply(lambda x: encoder.transform(x))
    tmp['triggers'] = data['triggers'].apply(lambda x: [0 if i == np.nan or i == None else i for i in x])
    tmp.drop(columns=['episode'], inplace=True)
    return tmp

In [11]:
train_data = pre_process(train_data_raw)
val_data = pre_process(validation_data_raw)
test_data = pre_process(test_data_raw)

In [12]:
train_data['triggers'].explode().isna().sum()

0

In [13]:
class UtteranceDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data.index)

    def __getitem__(self, idx):
        return self.data.iloc[idx]

In [14]:
padding_value_emotion = len(encoder.classes_)
padding_value_trigger = 2
# This collate function takes care of adding padding to the sequences
def collate(batch):
    speakers, emotions, utterances, triggers = zip(*batch)

    emotions = [torch.tensor(e, dtype=torch.long) for e in emotions]
    triggers = [torch.tensor(t, dtype=torch.long) for t in triggers]

    emotions = torch.nn.utils.rnn.pad_sequence(emotions, batch_first=True, padding_value=padding_value_emotion)
    triggers = torch.nn.utils.rnn.pad_sequence(triggers, batch_first=True, padding_value=padding_value_trigger)
    # Pad with a PAD sentence

    max_len_utterances = max([len(u) for u in utterances])
    for i, u in enumerate(utterances):
        for _ in range(max_len_utterances - len(u)):
            u.append('')

    return {
        'speakers': speakers,
        'emotions': emotions,
        'utterances': utterances,
        'triggers': triggers
    }

In [15]:
from sklearn.utils.class_weight import compute_class_weight

class_weights_emotion = compute_class_weight('balanced', classes=np.unique(train_data['emotions'].explode()),
                                             y=train_data['emotions'].explode())
class_weights_emotion = torch.tensor(class_weights_emotion, dtype=torch.float)

class_weights_trigger = compute_class_weight('balanced', classes=np.unique(train_data['triggers'].explode()), 
                                             y=train_data['triggers'].explode())
class_weights_trigger = torch.tensor(class_weights_trigger, dtype=torch.float)

class_weights_emotion

tensor([1.2383, 4.6772, 4.4615, 0.7955, 0.3287, 1.8815, 1.0825])

In [16]:
batch_size = 2
model = BertBaseline(class_weights_emotion=class_weights_emotion, class_weights_trigger=class_weights_trigger)
train_dataset = UtteranceDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, collate_fn=collate)
val_loader = DataLoader(UtteranceDataset(val_data), batch_size=batch_size, shuffle=False, num_workers=num_workers, collate_fn=collate)

True


In [17]:
lr = 1e-5
epochs = 20

train_model(BertBaseline,
            "bert_baseline",
            train_loader,
            val_loader,
            epochs=epochs,
            hyperparameters={"lr":lr})

Seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


True


Missing logger folder: C:\Users\Matteo\PycharmProjects\EDiReF-subtask-III\lightning_logs

   | Name                        | Type              | Params
-------------------------------------------------------------------
0  | backbone                    | BertModel         | 109 M 
1  | emotion_clf                 | CLF               | 99.3 K
2  | trigger_clf                 | CLF               | 98.7 K
3  | f1_train_cumulative_emotion | F1ScoreCumulative | 0     
4  | f1_train_cumulative_trigger | F1ScoreCumulative | 0     
5  | f1_train_dialogues_emotion  | F1ScoreDialogues  | 0     
6  | f1_train_dialogues_trigger  | F1ScoreDialogues  | 0     
7  | f1_val_cumulative_emotion   | F1ScoreCumulative | 0     
8  | f1_val_cumulative_trigger   | F1ScoreCumulative | 0     
9  | f1_val_dialogues_emotion    | F1ScoreDialogues  | 0     
10 | f1_val_dialogues_trigger    | F1ScoreDialogues  | 0     
11 | f1_test_cumulative_emotion  | F1ScoreCumulative | 0     
12 | f1_test_cumulative_trigger  | F

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

C:\Users\Matteo\AppData\Local\Programs\Python\Python310\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
C:\Users\Matteo\AppData\Local\Programs\Python\Python310\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

C:\Users\Matteo\AppData\Local\Programs\Python\Python310\lib\site-packages\lightning\pytorch\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...
