# Task 1: ERC

In [1]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings("ignore")

In [2]:
class EmotionDataset(Dataset):
    def __init__(self, json_file, model_name='all-MiniLM-L6-v2'):
        self.data = []
        self.speaker_encoder = LabelEncoder()
        self.model = SentenceTransformer(model_name)
        self.emotion_class_to_idx = {'neutral': 0, 'joy': 1, 'sadness': 2, 'anger': 3, 'fear': 4, 'disgust': 5, 'surprise': 6}
        with open(json_file, 'r') as f:
            data = json.load(f)
            for entry in data:
                self.data.append(entry)
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = self.data[idx]
        speakers = torch.tensor(self.speaker_encoder.fit_transform(entry["speakers"]), dtype=torch.long)
        emotions = torch.tensor([self.emotion_class_to_idx[emotion] for emotion in entry["emotions"]], dtype=torch.long)
        utterance_embeddings = torch.tensor(self.model.encode(entry["utterances"]), dtype=torch.float)
        return torch.tensor(len(entry["speakers"])), speakers, utterance_embeddings, emotions


# Initialize the dataset and dataloader
train_dataset = EmotionDataset("./Data/train_file.json")
val_dataset = EmotionDataset("./Data/val_file.json")
print('Length of train dataset:', len(train_dataset))
print('Length of val dataset:', len(val_dataset))

Length of train dataset: 6740
Length of val dataset: 843


In [3]:
import torch.nn as nn

def collate_fn(batch):
    length, speakers, utterance_embeddings, emotions = zip(*batch)
    length = torch.tensor(length, dtype=torch.long)
    speakers = nn.utils.rnn.pad_sequence(speakers, batch_first=True)
    utterance_embeddings = nn.utils.rnn.pad_sequence(utterance_embeddings, batch_first=True)        
    emotions = nn.utils.rnn.pad_sequence(emotions, batch_first=True)
    return length, speakers, utterance_embeddings, emotions


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
print('Length of train_loader:', len(train_loader))
print('Length of val_loader:', len(val_loader))

Length of train_loader: 211
Length of val_loader: 27


In [4]:
with open("./Data/train_file.json", 'r') as f:
    train_json = json.load(f)
with open("./Data/val_file.json", 'r') as f:
    val_json = json.load(f)
print('Train Samples:', len(train_json))
print('Val Samples:', len(val_json))

Train Samples: 6740
Val Samples: 843


## Model - 1

In [5]:
import torch
import torch.nn as nn

class GRUClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(GRUClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2*hidden_size, num_classes)
    
    def forward(self, x):
        h0 = torch.zeros(2, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.gru(x, h0)
        out = self.fc(out)
        return out

In [6]:
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
from sklearn.metrics import f1_score, accuracy_score

sentence_emb_model = SentenceTransformer('all-MiniLM-L6-v2')
emotion_class_to_idx = {'neutral': 0, 'joy': 1, 'sadness': 2, 'anger': 3, 'fear': 4, 'disgust': 5, 'surprise': 6}
emotion_idx_to_class = {v: k for k, v in emotion_class_to_idx.items()}

In [7]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_EPOCH = 10
INPUT_SIZE = 384
HIDDEN_SIZE = 128
OUTPUT_SIZE = 7

trained_model = GRUClassifier(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE).to(DEVICE)
trained_model.load_state_dict(torch.load("M1.pt"))

<All keys matched successfully>

In [8]:
def conversation_inference_m1(model:GRUClassifier, conversation:dict):
    model.eval()
    utterances = conversation["utterances"]
    true_emotions = conversation["emotions"]
    speakers = conversation["speakers"]
    
    # Obtain the utterance embeddings and predict the emotions of all the utterances
    utterance_embeddings = torch.tensor(sentence_emb_model.encode(utterances), dtype=torch.float)
    utterance_embeddings = utterance_embeddings.unsqueeze(0).to(DEVICE)
    predicted_emotions = model(utterance_embeddings)
    _, predicted_emotions = torch.max(predicted_emotions, 2)
    predicted_emotions = predicted_emotions.squeeze(0)
    predicted_emotions = [emotion_idx_to_class[emotion.item()] for emotion in predicted_emotions]

    # Show the Accuracy and F1 Score of the prediction
    accuracy = accuracy_score(true_emotions, predicted_emotions)
    f1 = f1_score(true_emotions, predicted_emotions, average='macro')
    print('True emotions:', true_emotions)
    print('Predicted emotions:', predicted_emotions)
    print('Accuracy of Prediction:', accuracy)
    print('F1 Score of Prediction:', f1)
    
    # Predicting the Emotion Flips
    print('\n---------------------------------')
    print('Detecting Speaker Emotion Flips')
    print('speakers in conversation:')
    print(speakers)
    print('Predicted Emotions:')
    print(predicted_emotions)
    print()
    speakers_prev_emotion = {speaker:None for speaker in speakers}
    for i, (speaker, true_emotion, predicted_emotion) in enumerate(zip(speakers, true_emotions, predicted_emotions)):
        if speakers_prev_emotion[speaker] is None:
            speakers_prev_emotion[speaker] = predicted_emotion
        else:
            if speakers_prev_emotion[speaker] != predicted_emotion:
                print(f'{speaker} changed emotion from {speakers_prev_emotion[speaker]} to {predicted_emotion}')
                speakers_prev_emotion[speaker] = predicted_emotion
    return    

ind = 200
conversation = val_json[ind]
conversation_inference_m1(trained_model, conversation)    

True emotions: ['anger', 'joy', 'neutral', 'joy', 'joy']
Predicted emotions: ['joy', 'surprise', 'joy', 'joy', 'joy']
Accuracy of Prediction: 0.4
F1 Score of Prediction: 0.14285714285714288

---------------------------------
Detecting Speaker Emotion Flips
speakers in conversation:
['Phoebe', 'Monica', 'Phoebe', 'Monica', 'Phoebe']
Predicted Emotions:
['joy', 'surprise', 'joy', 'joy', 'joy']

Monica changed emotion from surprise to joy


In [9]:
criterion = nn.CrossEntropyLoss()
y_pred = []
y_true = []
for _, _, utterance_embeddings, emotions in val_loader:
    emotions = emotions.to(DEVICE)
    utterance_embeddings = utterance_embeddings.to(DEVICE)
    trained_model = trained_model.to(DEVICE)
    outputs = trained_model(utterance_embeddings)
    loss = criterion(outputs.view(-1, outputs.size(-1)), emotions.view(-1))
    _, preds = torch.max(outputs, 2)
    target_expanded = emotions.view(-1).cpu().numpy()
    preds_expanded = preds.view(-1).cpu().numpy()
    y_true.extend(target_expanded)
    y_pred.extend(preds_expanded)
print('Validation Accuracy:', accuracy_score(y_true, y_pred))
print('Validation Macro-F1 Score:', f1_score(y_true, y_pred, average='macro'))
print('Validation Weighted-F1 Score:', f1_score(y_true, y_pred, average='weighted'))

Validation Accuracy: 0.9578074507651972
Validation Macro-F1 Score: 0.876162606551148
Validation Weighted-F1 Score: 0.9572777261377494


## Model - 2

In [10]:
import torch
import torch.nn as nn

class TransformerGRUClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_dim):
        super(TransformerGRUClassifier, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_dim = output_dim
        self.tranformer_encoder_layer = nn.TransformerEncoderLayer(d_model=input_size, nhead=input_size//4)
        self.transformer_encoder = nn.TransformerEncoder(self.tranformer_encoder_layer, num_layers=2)
        self.GRU = nn.GRU(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, output_dim)

    def forward(self, x):
        transform_out = self.transformer_encoder(x) + x
        h0 = torch.zeros(2, x.size(0), self.hidden_size).to(x.device)
        gru_out, _ = self.GRU(transform_out, h0)
        gru_out = self.fc(gru_out)
        return gru_out

In [11]:
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
from sklearn.metrics import f1_score, accuracy_score

sentence_emb_model = SentenceTransformer('all-MiniLM-L6-v2')
emotion_class_to_idx = {'neutral': 0, 'joy': 1, 'sadness': 2, 'anger': 3, 'fear': 4, 'disgust': 5, 'surprise': 6}
emotion_idx_to_class = {v: k for k, v in emotion_class_to_idx.items()}

In [12]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_EPOCH = 10
INPUT_SIZE = 384
HIDDEN_SIZE = 128
OUTPUT_SIZE = 7

trained_model = TransformerGRUClassifier(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE).to(DEVICE)
trained_model.load_state_dict(torch.load("M2.pt"))

<All keys matched successfully>

In [13]:
def conversation_inference_m2(model:TransformerGRUClassifier, conversation:dict):
    model.eval()
    utterances = conversation["utterances"]
    true_emotions = conversation["emotions"]
    speakers = conversation["speakers"]
    
    # Obtain the utterance embeddings and predict the emotions of all the utterances
    utterance_embeddings = torch.tensor(sentence_emb_model.encode(utterances), dtype=torch.float)
    utterance_embeddings = utterance_embeddings.unsqueeze(0).to(DEVICE)
    predicted_emotions = model(utterance_embeddings)
    _, predicted_emotions = torch.max(predicted_emotions, 2)
    predicted_emotions = predicted_emotions.squeeze(0)
    predicted_emotions = [emotion_idx_to_class[emotion.item()] for emotion in predicted_emotions]

    # Show the Accuracy and F1 Score of the prediction
    accuracy = accuracy_score(true_emotions, predicted_emotions)
    f1 = f1_score(true_emotions, predicted_emotions, average='macro')
    print('True emotions:', true_emotions)
    print('Predicted emotions:', predicted_emotions)
    print('Accuracy of Prediction:', accuracy)
    print('F1 Score of Prediction:', f1)
    
    # Predicting the Emotion Flips
    print('\n---------------------------------')
    print('Detecting Speaker Emotion Flips')
    print('speakers in conversation:')
    print(speakers)
    print('Predicted Emotions:')
    print(predicted_emotions)
    print()
    speakers_prev_emotion = {speaker:None for speaker in speakers}
    for i, (speaker, true_emotion, predicted_emotion) in enumerate(zip(speakers, true_emotions, predicted_emotions)):
        if speakers_prev_emotion[speaker] is None:
            speakers_prev_emotion[speaker] = predicted_emotion
        else:
            if speakers_prev_emotion[speaker] != predicted_emotion:
                print(f'{speaker} changed emotion from {speakers_prev_emotion[speaker]} to {predicted_emotion}')
                speakers_prev_emotion[speaker] = predicted_emotion
    return    

ind = 200
conversation = val_json[ind]
conversation_inference_m2(trained_model, conversation)    

True emotions: ['anger', 'joy', 'neutral', 'joy', 'joy']
Predicted emotions: ['anger', 'joy', 'neutral', 'joy', 'anger']
Accuracy of Prediction: 0.8
F1 Score of Prediction: 0.8222222222222223

---------------------------------
Detecting Speaker Emotion Flips
speakers in conversation:
['Phoebe', 'Monica', 'Phoebe', 'Monica', 'Phoebe']
Predicted Emotions:
['anger', 'joy', 'neutral', 'joy', 'anger']

Phoebe changed emotion from anger to neutral
Phoebe changed emotion from neutral to anger


In [14]:
criterion = nn.CrossEntropyLoss()
y_pred = []
y_true = []
for _, _, utterance_embeddings, emotions in val_loader:
    emotions = emotions.to(DEVICE)
    utterance_embeddings = utterance_embeddings.to(DEVICE)
    trained_model = trained_model.to(DEVICE)
    outputs = trained_model(utterance_embeddings)
    loss = criterion(outputs.view(-1, outputs.size(-1)), emotions.view(-1))
    _, preds = torch.max(outputs, 2)
    target_expanded = emotions.view(-1).cpu().numpy()
    preds_expanded = preds.view(-1).cpu().numpy()
    y_true.extend(target_expanded)
    y_pred.extend(preds_expanded)
print('Validation Accuracy:', accuracy_score(y_true, y_pred))
print('Validation F1 Score:', f1_score(y_true, y_pred, average='macro'))
print('Validation Weighted-F1 Score:', f1_score(y_true, y_pred, average='weighted'))

Validation Accuracy: 0.9964026583744894
Validation F1 Score: 0.98930110022866
Validation Weighted-F1 Score: 0.9963929671576677


# Task 2:  ERF

In [15]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings("ignore")

In [16]:
with open("./Data/train_file.json", 'r') as f:
    train_json = json.load(f)
with open("./Data/val_file.json", 'r') as f:
    val_json = json.load(f)
print('Train Samples:', len(train_json))
print('Val Samples:', len(val_json))

Train Samples: 6740
Val Samples: 843


## Model - 3

In [17]:
class EmotionDataset(Dataset):
    def __init__(self, json_file, model_name="all-MiniLM-L6-v2"):
        self.data = []
        self.model = SentenceTransformer(model_name)
        self.speaker_encoder = LabelEncoder()
        self.emotion_class_to_idx = {'neutral': 0, 'joy': 1, 'sadness': 2, 'anger': 3, 'fear': 4, 'disgust': 5, 'surprise': 6}
        with open(json_file, 'r') as f:
            data = json.load(f)
            for entry in data:
                self.data.append(entry)
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = self.data[idx]
        context_size = 5
        speakers = torch.tensor(self.speaker_encoder.fit_transform(entry["speakers"]), dtype=torch.long)[-(context_size):]
        emotions = torch.tensor([self.emotion_class_to_idx[emotion] for emotion in entry["emotions"]], dtype=torch.long)[-(context_size):]
        utterance_embeddings = torch.tensor(self.model.encode(entry["utterances"]), dtype=torch.float)[-(context_size):]
        utterance_embeddings = utterance_embeddings + utterance_embeddings[-1]
        triggers = [a if a != None else 0.0 for a in entry['triggers']]
        triggers = torch.tensor(triggers, dtype=torch.long)[-(context_size):]
        return torch.tensor(len(entry["speakers"])), speakers, emotions, utterance_embeddings, triggers


# Initialize the dataset and dataloader
model_name = 'all-MiniLM-L6-v2' # all-mpnet-base-v2
train_dataset = EmotionDataset("./Data/train_file.json", model_name=model_name)
val_dataset = EmotionDataset("./Data/val_file.json", model_name=model_name)
print('Length of train dataset:', len(train_dataset))
print('Length of val dataset:', len(val_dataset))

Length of train dataset: 6740
Length of val dataset: 843


In [18]:
import torch.nn as nn

def collate_fn(batch):
    length, speakers, emotions, utterance_embeddings, triggers = zip(*batch)
    length = torch.tensor(length, dtype=torch.long)
    speakers = nn.utils.rnn.pad_sequence(speakers, batch_first=True)
    emotions = nn.utils.rnn.pad_sequence(emotions, batch_first=True)
    triggers = nn.utils.rnn.pad_sequence(triggers, batch_first=True)
    utterance_embeddings = nn.utils.rnn.pad_sequence(utterance_embeddings, batch_first=True)        
    return length, speakers, emotions, utterance_embeddings, triggers


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
print('Length of train_loader:', len(train_loader))
print('Length of val_loader:', len(val_loader))

Length of train_loader: 211
Length of val_loader: 27


In [19]:
import torch
import torch.nn as nn

class CustomClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_dim):
        super(CustomClassifier, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_dim = output_dim
        self.input_layer = nn.Linear(input_size, input_size)
        self.tranformer_encoder_layer = nn.TransformerEncoderLayer(d_model=input_size, nhead=input_size//8)
        self.transformer_encoder = nn.TransformerEncoder(self.tranformer_encoder_layer, num_layers=2)
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, output_dim)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.transformer_encoder(x) + x
        h0 = torch.zeros(2, x.size(0), self.hidden_size).to(x.device)
        x, _ = self.gru(x, h0)
        x = self.fc(x)
        x = self.softmax(x)
        return x

In [20]:
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
from sklearn.metrics import f1_score, accuracy_score

sentence_emb_model = SentenceTransformer('all-MiniLM-L6-v2')

In [21]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_EPOCH = 10
INPUT_SIZE = 384
HIDDEN_SIZE = 128
OUTPUT_SIZE = 2

trained_model = CustomClassifier(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE).to(DEVICE)
trained_model.load_state_dict(torch.load("M3.pt"))

<All keys matched successfully>

In [22]:
def emotion_flip_inference_m3(model:CustomClassifier, conversation:dict):
    model.eval()
    utterances = conversation["utterances"]
    true_emotions = conversation["emotions"]
    speakers = conversation["speakers"]
    triggers = conversation["triggers"]
    context_size = 5
    
    # Obtain the utterance embeddings and predict the emotion flip reason of last utterance for context size 5
    utterance_embeddings = torch.tensor(sentence_emb_model.encode(utterances), dtype=torch.float)[-(context_size):]
    utterance_embeddings = utterance_embeddings + utterance_embeddings[-1]
    triggers_true_label = [a if a != None else 0.0 for a in triggers]
    triggers_true_label = torch.tensor(triggers_true_label, dtype=torch.long)[-(context_size):]
    
    # Obtain the predicted triggers
    utterance_embeddings = utterance_embeddings.unsqueeze(0).to(DEVICE)
    triggers_predictions = model(utterance_embeddings)
    _, triggers_predictions = torch.max(triggers_predictions, 2)
    triggers_predictions = triggers_predictions.view(-1).cpu()

    print('Speakers:', speakers[-context_size:])
    print('Utterances:', utterances[-context_size:])
    print('Emotions:', true_emotions[-context_size:])
    print()
    print('True triggers:', triggers_true_label)
    print('Predicted triggers:', triggers_predictions)

    # Show the Accuracy and F1 Score of the prediction
    accuracy = accuracy_score(triggers_true_label.cpu().numpy(), triggers_predictions.numpy())
    f1 = f1_score(triggers_true_label.cpu().numpy(), triggers_predictions.numpy(), average='macro')
    print('Accuracy of Prediction:', accuracy)
    print('F1 Score of Prediction:', f1)
    

ind = 0
conversation = val_json[ind]
emotion_flip_inference_m3(trained_model, conversation)    

Speakers: ['Joey', 'Chandler', 'Joey', 'Chandler', 'Joey']
Utterances: ["Oh God. Uh, okay, here's the thing, this is the thing, okay, the thing is...", 'What is the thing?', "Okay. I went down to the \x91Mattress King' showroom and, and I saw Janice, kissing her ex-husband.", 'What?', 'They were in his office.']
Emotions: ['neutral', 'sadness', 'sadness', 'surprise', 'neutral']

True triggers: tensor([0, 0, 0, 1, 0])
Predicted triggers: tensor([0, 0, 1, 0, 0])
Accuracy of Prediction: 0.6
F1 Score of Prediction: 0.375


In [23]:
y_pred = []
y_true = []
for _, speakers, emotions, utterance_embeddings, targets in val_loader:
    utterance_embeddings = utterance_embeddings.to(DEVICE)
    targets = targets.to(DEVICE)
    trained_model = trained_model.to(DEVICE)
    outputs = trained_model(utterance_embeddings)
    _, preds = torch.max(outputs, 2)
    target_expanded = targets.view(-1).cpu().numpy()
    preds_expanded = preds.view(-1).cpu().numpy()
    y_true.extend(target_expanded)
    y_pred.extend(preds_expanded)
print('Validation Accuracy:', accuracy_score(y_true, y_pred))
print('Validation Macro-F1 Score:', f1_score(y_true, y_pred, average='macro'))
print('Validation Weighted-F1 Score:', f1_score(y_true, y_pred, average='weighted'))

Validation Accuracy: 0.829655990510083
Validation Macro-F1 Score: 0.7525376103387056
Validation Weighted-F1 Score: 0.8226422491770297


## Model - 4

In [24]:
import numpy as np

class EmotionDataset(Dataset):
    def __init__(self, json_file, model_name="all-MiniLM-L6-v2"):
        self.data = []
        self.model = SentenceTransformer(model_name)
        self.speaker_encoder = LabelEncoder()
        self.emotion_class_to_idx = {'neutral': 0, 'joy': 1, 'sadness': 2, 'anger': 3, 'fear': 4, 'disgust': 5, 'surprise': 6}

        with open(json_file, 'r') as f:
            data = json.load(f)
            for entry in data:
                self.data.append(entry)
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = self.data[idx]
        speakers = torch.tensor(self.speaker_encoder.fit_transform(entry["speakers"]), dtype=torch.long)
        emotions = torch.tensor([self.emotion_class_to_idx[emotion] for emotion in entry["emotions"]], dtype=torch.long)
        utterance_embeddings = torch.tensor(self.model.encode(entry["utterances"]), dtype=torch.float)
        triggers = np.array(entry["triggers"], dtype=float)
        np.nan_to_num(triggers, copy=False)
        triggers = torch.tensor(triggers, dtype=torch.int64)
        return torch.tensor(len(entry["speakers"])), speakers, emotions, utterance_embeddings, triggers


# Initialize the dataset and dataloader
model_name = 'all-mpnet-base-v2' # all-MiniLM-L6-v2
train_dataset = EmotionDataset("./Data/train_file.json", model_name=model_name)
val_dataset = EmotionDataset("./Data/val_file.json", model_name=model_name)
print('Length of train dataset:', len(train_dataset))
print('Length of val dataset:', len(val_dataset))

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Length of train dataset: 6740
Length of val dataset: 843


In [25]:
import torch.nn as nn
def collate_fn(batch):
    length, speakers, emotions, utterance_embeddings, triggers = zip(*batch)
    length = torch.tensor(length, dtype=torch.long)
    speakers = nn.utils.rnn.pad_sequence(speakers, batch_first=True)
    emotions = nn.utils.rnn.pad_sequence(emotions, batch_first=True)
    triggers = nn.utils.rnn.pad_sequence(triggers, batch_first=True)
    utterance_embeddings = nn.utils.rnn.pad_sequence(utterance_embeddings, batch_first=True)
    return length, speakers, emotions, utterance_embeddings, triggers   

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
print('Length of train_loader:', len(train_loader))
print('Length of val_loader:', len(val_loader))

Length of train_loader: 211
Length of val_loader: 27


In [26]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(BiLSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, output_size)
    
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        return out

In [27]:
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
from sklearn.metrics import f1_score, accuracy_score

sentence_emb_model = SentenceTransformer('all-mpnet-base-v2')

In [28]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_EPOCH = 10
INPUT_SIZE = 768
HIDDEN_SIZE = 256
OUTPUT_SIZE = 2

trained_model = BiLSTMClassifier(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE).to(DEVICE)
trained_model.load_state_dict(torch.load("M4.pt"))

<All keys matched successfully>

In [29]:
def emotion_flip_inference_m4(model:BiLSTMClassifier, conversation:dict):
    model.eval()
    utterances = conversation["utterances"]
    true_emotions = conversation["emotions"]
    speakers = conversation["speakers"]
    triggers = conversation["triggers"]
    context_size = 5
    
    # Obtain the utterance embeddings and predict the emotion flip reason of last utterance for context size 5
    utterance_embeddings = torch.tensor(sentence_emb_model.encode(utterances), dtype=torch.float)[-(context_size):]
    utterance_embeddings = utterance_embeddings + utterance_embeddings[-1]
    triggers_true_label = [a if a != None else 0.0 for a in triggers]
    triggers_true_label = torch.tensor(triggers_true_label, dtype=torch.long)[-(context_size):]
    
    # Obtain the predicted triggers
    utterance_embeddings = utterance_embeddings.unsqueeze(0).to(DEVICE)
    triggers_predictions = model(utterance_embeddings)
    _, triggers_predictions = torch.max(triggers_predictions, 2)
    triggers_predictions = triggers_predictions.view(-1).cpu()

    print('Speakers:', speakers[-context_size:])
    print('Utterances:', utterances[-context_size:])
    print('Emotions:', true_emotions[-context_size:])
    print()
    print('True triggers:', triggers_true_label)
    print('Predicted triggers:', triggers_predictions)

    # Show the Accuracy and F1 Score of the prediction
    accuracy = accuracy_score(triggers_true_label.cpu().numpy(), triggers_predictions.numpy())
    f1 = f1_score(triggers_true_label.cpu().numpy(), triggers_predictions.numpy(), average='macro')
    print('Accuracy of Prediction:', accuracy)
    print('F1 Score of Prediction:', f1)
    

ind = 0
conversation = val_json[ind]
emotion_flip_inference_m4(trained_model, conversation)    

Speakers: ['Joey', 'Chandler', 'Joey', 'Chandler', 'Joey']
Utterances: ["Oh God. Uh, okay, here's the thing, this is the thing, okay, the thing is...", 'What is the thing?', "Okay. I went down to the \x91Mattress King' showroom and, and I saw Janice, kissing her ex-husband.", 'What?', 'They were in his office.']
Emotions: ['neutral', 'sadness', 'sadness', 'surprise', 'neutral']

True triggers: tensor([0, 0, 0, 1, 0])
Predicted triggers: tensor([0, 0, 0, 1, 1])
Accuracy of Prediction: 0.8
F1 Score of Prediction: 0.7619047619047619


In [30]:
y_pred = []
y_true = []
for _, speakers, emotions, utterance_embeddings, targets in val_loader:
    utterance_embeddings = utterance_embeddings.to(DEVICE)
    targets = targets.to(DEVICE)
    trained_model = trained_model.to(DEVICE)
    outputs = trained_model(utterance_embeddings)
    _, preds = torch.max(outputs, 2)
    target_expanded = targets.view(-1).cpu().numpy()
    preds_expanded = preds.view(-1).cpu().numpy()
    y_true.extend(target_expanded)
    y_pred.extend(preds_expanded)
print('Validation Accuracy:', accuracy_score(y_true, y_pred))
print('Validation Macro-F1 Score:', f1_score(y_true, y_pred, average='macro'))
print('Validation Weighted-F1 Score:', f1_score(y_true, y_pred, average='weighted'))

Validation Accuracy: 0.961038961038961
Validation Macro-F1 Score: 0.83866663167085
Validation Weighted-F1 Score: 0.9602079532016111
