<a href="https://colab.research.google.com/github/tae898/MELD/blob/master/notebooks/MELD_with_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download the pre-computed features and models

In [3]:
# Download the archive. -O pins the output file name so that re-running the
# cell overwrites any earlier (possibly partial) download instead of saving a
# second copy as "MELD.Features.Models.tar.gz.1" and then unpacking the stale one.
!wget -O MELD.Features.Models.tar.gz http://web.eecs.umich.edu/~mihalcea/downloads/MELD.Features.Models.tar.gz
# untar
!tar -zxvf MELD.Features.Models.tar.gz
# remove the tar
!rm -rf MELD.Features.Models.tar.gz

# move the features and the models to the current directory
# NOTE(review): the Dataloader / bc_LSTM classes below read from
# ./data/pickles and ./data/models, not ./features and ./models -- confirm
# which layout is intended before running the training/testing cell.
!mv MELD.Features.Models/features/ ./features
!mv MELD.Features.Models/models/ ./models

--2020-10-11 10:38:31--  http://web.eecs.umich.edu/~mihalcea/downloads/MELD.Features.Models.tar.gz
Resolving web.eecs.umich.edu (web.eecs.umich.edu)... 141.212.113.214
Connecting to web.eecs.umich.edu (web.eecs.umich.edu)|141.212.113.214|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 922694594 (880M) [application/x-gzip]
Saving to: ‘MELD.Features.Models.tar.gz.1’


2020-10-11 10:38:48 (51.5 MB/s) - ‘MELD.Features.Models.tar.gz.1’ saved [922694594/922694594]

MELD.Features.Models/
MELD.Features.Models/models/
MELD.Features.Models/models/audio_weights_emotion.hdf5
MELD.Features.Models/models/text_weights_emotion.hdf5
MELD.Features.Models/models/bimodal_weights_emotion.hdf5
MELD.Features.Models/models/text_weights_sentiment.hdf5
MELD.Features.Models/models/audio_weights_sentiment.hdf5
MELD.Features.Models/models/bimodal_weights_sentiment.hdf5
MELD.Features.Models/features/
MELD.Features.Models/features/text_glove_CNN_sentiment.pkl
MELD.Features.Models/features/te

# Import the necessary packages, define classes and functions, and set the values of the hyperparameters

The classes and functions below are mostly copied from the [MELD baseline code](https://github.com/declare-lab/MELD/tree/master/baseline).

In [18]:
import numpy as np
import pandas as pd
import pickle
import os
import sys
from collections import Counter, defaultdict
import argparse
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D, Lambda, LSTM, TimeDistributed, Masking, Bidirectional
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam
from keras.models import Model, load_model
import keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import os
import pickle
import numpy as np

###################################################################################################################################

# Hyperparams
# Number of word indices kept per sentence: Dataloader.get_word_indices
# zero-pads shorter sentences and truncates longer ones to this length.
max_length = 50  # Maximum length of the sentence


class Dataloader:
    """Load the MELD pickles and build padded, dialogue-level arrays.

    The constructor reads ``./data/pickles/data_<mode>.p`` and indexes every
    utterance of the train/val/test splits by ``"<dialog>_<utterance>"``.
    Afterwards exactly one of ``load_text_data``, ``load_audio_data`` or
    ``load_bimodal_data`` is expected to be called; each of them fills

    - ``{train,val,test}_dialogue_features``
    - ``{train,val,test}_dialogue_label``
    - ``{train,val,test}_dialogue_length``
    - ``{train,val,test}_mask``

    NOTE: the data files are read with ``pickle``, which can execute arbitrary
    code while loading. Only use pickles that come from a trusted source.
    """

    def __init__(self, mode=None):
        """Read the dataset pickle for ``mode`` and split it into train/val/test.

        Args:
            mode: classification mode. Its lower-cased form is used to build
                the pickle file names; the hint printed below names
                'Sentiment' and 'Emotion' as the expected values.

        Raises:
            SystemExit: if ``mode`` is None (after printing a hint).
        """
        # Explicit check instead of `assert`: assertions are stripped when
        # Python runs with -O, which would silently skip this validation.
        # `sys.exit` is used because the bare `exit` helper is only injected
        # by the `site` module and is not guaranteed to exist.
        if mode is None:
            print("Set mode as 'Sentiment' or 'Emotion'")
            sys.exit()

        self.MODE = mode  # Sentiment or Emotion classification mode
        self.max_l = max_length

        # Loading the dataset:
        #   - revs: each revs[i] is a dictionary with keys/values:
        #       - text: original sentence
        #       - split: train/val/test :: denotes which split the tuple belongs to
        #       - y: label of the sentence
        #       - dialog: ID of the dialog the utterance belongs to
        #       - utterance: utterance number of the dialog ID
        #       - num_words: number of words in the utterance
        #   - W: glove embedding matrix
        #   - vocab: the vocabulary of the dataset
        #   - word_idx_map: mapping of each word from vocab to its index in W
        #   - label_index: mapping of each label (emotion or sentiment) to its
        #     assigned index, eg. label_index['neutral']=0
        data_path = "./data/pickles/data_{}.p".format(self.MODE.lower())
        # `with` guarantees the file handle is closed even if unpickling fails.
        with open(data_path, "rb") as data_file:
            x = pickle.load(data_file)
        revs, self.W, self.word_idx_map, self.vocab, _, label_index = x[
            0], x[1], x[2], x[3], x[4], x[5]
        self.num_classes = len(label_index)
        print("Labels used for this classification: ", label_index)

        # Preparing data: utterance id -> (padded word indices, label index)
        self.train_data, self.val_data, self.test_data = {}, {}, {}
        split_to_store = {
            "train": self.train_data,
            "val": self.val_data,
            "test": self.test_data,
        }
        for i in range(len(revs)):
            rev = revs[i]
            utterance_id = rev['dialog'] + "_" + rev['utterance']
            sentence_word_indices = self.get_word_indices(rev['text'])
            label = label_index[rev['y']]

            # Records whose split is none of train/val/test are ignored.
            store = split_to_store.get(rev['split'])
            if store is not None:
                store[utterance_id] = (sentence_word_indices, label)

        # Creating dialogue:[utterance_1, utterance_2, ...] ids
        self.train_dialogue_ids = self.get_dialogue_ids(self.train_data.keys())
        self.val_dialogue_ids = self.get_dialogue_ids(self.val_data.keys())
        self.test_dialogue_ids = self.get_dialogue_ids(self.test_data.keys())

        # Max utterances in a dialog in the dataset
        self.max_utts = self.get_max_utts(
            self.train_dialogue_ids, self.val_dialogue_ids, self.test_dialogue_ids)

    def get_word_indices(self, data_x):
        """Map a whitespace-tokenised sentence to exactly ``self.max_l`` indices.

        Shorter sentences are right-padded with 0; longer ones are truncated.
        A word missing from ``self.word_idx_map`` raises ``KeyError``.
        """
        words = data_x.split()
        indices = [self.word_idx_map[word] for word in words]
        # A negative repeat count yields an empty list, so long sentences get
        # no padding and are simply cut by the slice below.
        indices += [0] * (self.max_l - len(words))
        return np.array(indices)[:self.max_l]

    def get_dialogue_ids(self, keys):
        """Group ``"<dialog>_<utterance>"`` keys by dialog.

        Returns:
            defaultdict mapping each dialog id to its utterance numbers, sorted
            numerically and converted back to strings.
        """
        ids = defaultdict(list)
        for key in keys:
            parts = key.split("_")
            ids[parts[0]].append(int(parts[1]))
        for ID, utts in ids.items():
            # Sorting as ints keeps "10" after "2".
            ids[ID] = [str(utt) for utt in sorted(utts)]
        return ids

    def get_max_utts(self, train_ids, val_ids, test_ids):
        """Return the largest number of utterances in any dialog of any split.

        An empty split contributes 0 instead of raising.
        """
        max_utts_train = max((len(utts) for utts in train_ids.values()), default=0)
        max_utts_val = max((len(utts) for utts in val_ids.values()), default=0)
        max_utts_test = max((len(utts) for utts in test_ids.values()), default=0)
        return np.max([max_utts_train, max_utts_val, max_utts_test])

    def get_one_hot(self, label):
        """Return a fresh one-hot list of length ``self.num_classes`` for ``label``."""
        label_arr = [0] * self.num_classes
        label_arr[label] = 1
        return label_arr

    def get_dialogue_audio_embs(self):
        """Build ``*_dialogue_features`` from the per-utterance audio embeddings.

        Every dialog is padded with zero vectors up to ``self.max_utts``
        utterances. Utterances without an audio embedding are reported on
        stdout and replaced by a zero vector.
        """
        key = list(self.train_audio_emb.keys())[0]
        pad = [0] * len(self.train_audio_emb[key])

        def get_emb(dialogue_id, audio_emb):
            dialogue_audio = []
            for vid in dialogue_id.keys():
                local_audio = []
                for utt in dialogue_id[vid]:
                    utterance_id = vid + "_" + str(utt)
                    try:
                        local_audio.append(audio_emb[utterance_id][:])
                    except KeyError:
                        # Only a missing embedding is tolerated; any other
                        # error is a real problem and must surface.
                        print(utterance_id)
                        local_audio.append(pad[:])
                for _ in range(self.max_utts - len(local_audio)):
                    local_audio.append(pad[:])
                dialogue_audio.append(local_audio[:self.max_utts])
            return np.array(dialogue_audio)

        self.train_dialogue_features = get_emb(
            self.train_dialogue_ids, self.train_audio_emb)
        self.val_dialogue_features = get_emb(
            self.val_dialogue_ids, self.val_audio_emb)
        self.test_dialogue_features = get_emb(
            self.test_dialogue_ids, self.test_audio_emb)

    def get_dialogue_text_embs(self):
        """Build ``*_dialogue_features`` from the padded word-index sequences.

        Every dialog is padded with all-zero sequences up to ``self.max_utts``
        utterances.
        """
        key = list(self.train_data.keys())[0]
        pad = [0] * len(self.train_data[key][0])

        def get_emb(dialogue_id, local_data):
            dialogue_text = []
            for vid in dialogue_id.keys():
                local_text = []
                for utt in dialogue_id[vid]:
                    local_text.append(local_data[vid + "_" + str(utt)][0][:])
                for _ in range(self.max_utts - len(local_text)):
                    local_text.append(pad[:])
                dialogue_text.append(local_text[:self.max_utts])
            return np.array(dialogue_text)

        self.train_dialogue_features = get_emb(
            self.train_dialogue_ids, self.train_data)
        self.val_dialogue_features = get_emb(
            self.val_dialogue_ids, self.val_data)
        self.test_dialogue_features = get_emb(
            self.test_dialogue_ids, self.test_data)

    def get_dialogue_labels(self):
        """Build one-hot ``*_dialogue_label`` arrays, one row per dialog.

        Padding positions receive the one-hot vector of class 1 as a dummy
        label; the masks from ``get_masks`` mark which positions are real.
        """

        def get_labels(ids, data):
            dialogue_label = []

            for vid, utts in ids.items():
                local_labels = []
                for utt in utts:
                    local_labels.append(self.get_one_hot(
                        data[vid + "_" + str(utt)][1]))
                for _ in range(self.max_utts - len(local_labels)):
                    local_labels.append(self.get_one_hot(1))  # Dummy label
                dialogue_label.append(local_labels[:self.max_utts])
            return np.array(dialogue_label)

        self.train_dialogue_label = get_labels(
            self.train_dialogue_ids, self.train_data)
        self.val_dialogue_label = get_labels(
            self.val_dialogue_ids, self.val_data)
        self.test_dialogue_label = get_labels(
            self.test_dialogue_ids, self.test_data)

    def get_dialogue_lengths(self):
        """Record the number of utterances of every dialog, per split."""
        self.train_dialogue_length = [
            len(utts) for utts in self.train_dialogue_ids.values()]
        self.val_dialogue_length = [
            len(utts) for utts in self.val_dialogue_ids.values()]
        self.test_dialogue_length = [
            len(utts) for utts in self.test_dialogue_ids.values()]

    def _build_mask(self, lengths):
        """Return a (len(lengths), max_utts) float array with 1.0 on real utterances."""
        mask = np.zeros((len(lengths), self.max_utts), dtype='float')
        for row, length in enumerate(lengths):
            mask[row, :length] = 1.0
        return mask

    def get_masks(self):
        """Build ``*_mask`` from ``*_dialogue_length`` (1.0 = real, 0.0 = padding)."""
        self.train_mask = self._build_mask(self.train_dialogue_length)
        self.val_mask = self._build_mask(self.val_dialogue_length)
        self.test_mask = self._build_mask(self.test_dialogue_length)

    def load_audio_data(self, ):
        """Load the audio embeddings pickle and derive features, labels and masks."""
        AUDIO_PATH = "./data/pickles/audio_embeddings_feature_selection_{}.pkl".format(
            self.MODE.lower())
        with open(AUDIO_PATH, "rb") as audio_file:
            self.train_audio_emb, self.val_audio_emb, self.test_audio_emb = pickle.load(
                audio_file)

        self.get_dialogue_audio_embs()
        self.get_dialogue_lengths()
        self.get_dialogue_labels()
        self.get_masks()

    def load_text_data(self, ):
        """Derive text features, labels and masks from the data read in ``__init__``."""
        self.get_dialogue_text_embs()
        self.get_dialogue_lengths()
        self.get_dialogue_labels()
        self.get_masks()

    def load_bimodal_data(self,):
        """Load the saved unimodal text and audio outputs and concatenate them.

        Reads ``./data/pickles/text_<mode>.pkl`` and
        ``./data/pickles/audio_<mode>.pkl`` (the files ``bc_LSTM.test_model``
        writes for the text and audio modalities), each holding one dict per
        split keyed by dialog id.
        """
        TEXT_UNIMODAL = "./data/pickles/text_{}.pkl".format(self.MODE.lower())
        AUDIO_UNIMODAL = "./data/pickles/audio_{}.pkl".format(
            self.MODE.lower())

        # Load features
        with open(TEXT_UNIMODAL, "rb") as text_file:
            train_text_x, val_text_x, test_text_x = pickle.load(
                text_file, encoding='latin1')
        with open(AUDIO_UNIMODAL, "rb") as audio_file:
            train_audio_x, val_audio_x, test_audio_x = pickle.load(
                audio_file, encoding='latin1')

        def concatenate_fusion(ID, text, audio):
            # Join the two modalities along axis 1 of each per-dialog array.
            bimodal = []
            for vid in ID.keys():
                bimodal.append(np.concatenate((text[vid], audio[vid]), axis=1))
            return np.array(bimodal)

        self.train_dialogue_features = concatenate_fusion(
            self.train_dialogue_ids, train_text_x, train_audio_x)
        self.val_dialogue_features = concatenate_fusion(
            self.val_dialogue_ids, val_text_x, val_audio_x)
        self.test_dialogue_features = concatenate_fusion(
            self.test_dialogue_ids, test_text_x, test_audio_x)

        self.get_dialogue_lengths()
        self.get_dialogue_labels()
        self.get_masks()


# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"]="0"


class bc_LSTM:
    """Utterance classifier built from bidirectional LSTMs over a dialogue.

    One instance handles a single (classification mode, modality) pair. The
    modality selects the network (``get_text_model``, ``get_audio_model`` or
    ``get_bimodal_model``); weights are stored at ``self.PATH`` and the
    activations of the layer named "utter" are dumped to ``self.OUTPUT_PATH``.
    """

    def __init__(self, args):
        """Store the run configuration and derive the weight / output paths.

        Args:
            args: object with ``classify`` and ``modality`` string attributes
                (e.g. an ``argparse.Namespace``).
        """
        self.classification_mode = args.classify
        self.modality = args.modality
        mode_tag = self.classification_mode.lower()
        self.PATH = "./data/models/{}_weights_{}.hdf5".format(
            args.modality, mode_tag)
        self.OUTPUT_PATH = "./data/pickles/{}_{}.pkl".format(
            args.modality, mode_tag)
        print("Model initiated for {} classification".format(
            self.classification_mode))

    def load_data(self,):
        """Load the features, labels, masks and ids for ``self.modality``.

        Raises:
            SystemExit: if the modality is not 'text', 'audio' or 'bimodal'.
        """
        # Validate before touching the disk so a typo fails fast and with a
        # message instead of exiting silently after the dataset was loaded.
        if self.modality not in ("text", "audio", "bimodal"):
            print("Unknown modality {!r}; expected 'text', 'audio' or 'bimodal'".format(
                self.modality))
            sys.exit()

        print('Loading data')
        self.data = Dataloader(mode=self.classification_mode)

        if self.modality == "text":
            self.data.load_text_data()
        elif self.modality == "audio":
            self.data.load_audio_data()
        else:
            self.data.load_bimodal_data()

        self.train_x = self.data.train_dialogue_features
        self.val_x = self.data.val_dialogue_features
        self.test_x = self.data.test_dialogue_features

        self.train_y = self.data.train_dialogue_label
        self.val_y = self.data.val_dialogue_label
        self.test_y = self.data.test_dialogue_label

        self.train_mask = self.data.train_mask
        self.val_mask = self.data.val_mask
        self.test_mask = self.data.test_mask

        self.train_id = self.data.train_dialogue_ids.keys()
        self.val_id = self.data.val_dialogue_ids.keys()
        self.test_id = self.data.test_dialogue_ids.keys()

        # Axis 1 of the features is the (padded) number of utterances per dialog.
        self.sequence_length = self.train_x.shape[1]

        # Axis 2 of the one-hot labels is the number of classes.
        self.classes = self.train_y.shape[2]

    def calc_test_result(self, pred_label, test_label, test_mask):
        """Print confusion matrix, classification report and weighted scores.

        Only positions where ``test_mask`` equals 1 are scored, so padded
        utterances do not count.

        Args:
            pred_label: per-utterance class scores, indexed [dialog, utterance, class].
            test_label: one-hot ground truth with the same layout.
            test_mask: array indexed [dialog, utterance]; 1 marks a real utterance.
        """
        valid = np.asarray(test_mask) == 1
        # Boolean indexing walks the array in row-major order, i.e. dialog by
        # dialog and utterance by utterance.
        true_label = np.argmax(test_label, axis=-1)[valid]
        predicted_label = np.argmax(pred_label, axis=-1)[valid]

        print("Confusion Matrix :")
        print(confusion_matrix(true_label, predicted_label))
        print("Classification Report :")
        print(classification_report(true_label, predicted_label, digits=4))
        print('Weighted FScore: \n ', precision_recall_fscore_support(
            true_label, predicted_label, average='weighted'))

    def get_audio_model(self):
        """Build the audio network: masking, two stacked Bi-LSTMs, softmax per utterance."""

        # Modality specific hyperparameters
        self.epochs = 100
        self.batch_size = 50

        # Modality specific parameters
        self.embedding_dim = self.train_x.shape[2]

        print("Creating Model...")

        inputs = Input(shape=(self.sequence_length,
                              self.embedding_dim), dtype='float32')
        # All-zero utterance vectors are the padding added by the Dataloader.
        masked = Masking(mask_value=0)(inputs)
        lstm = Bidirectional(
            LSTM(300, activation='tanh', return_sequences=True, dropout=0.4))(masked)
        # The layer is named "utter" so test_model can read its activations.
        lstm = Bidirectional(LSTM(
            300, activation='tanh', return_sequences=True, dropout=0.4), name="utter")(lstm)
        output = TimeDistributed(
            Dense(self.classes, activation='softmax'))(lstm)

        model = Model(inputs, output)
        return model

    def get_text_model(self):
        """Build the text network: a shared sentence CNN followed by two Bi-LSTMs.

        Each utterance is embedded with the frozen matrix ``self.data.W``, run
        through three convolutions (filter heights 3, 4 and 5) with max-pooling,
        and reduced to a 100-d vector; the sequence of these vectors is then
        fed to the stacked bidirectional LSTMs.
        """

        # Modality specific hyperparameters
        self.epochs = 100
        self.batch_size = 50

        # Modality specific parameters
        self.embedding_dim = self.data.W.shape[1]

        # For text model
        self.vocabulary_size = self.data.W.shape[0]
        self.filter_sizes = [3, 4, 5]
        self.num_filters = 512

        print("Creating Model...")

        sentence_length = self.train_x.shape[2]

        # Initializing sentence representation layers. They are created once
        # and applied to every utterance position, so weights are shared.
        embedding = Embedding(input_dim=self.vocabulary_size, output_dim=self.embedding_dim, weights=[
                              self.data.W], input_length=sentence_length, trainable=False)
        conv_0 = Conv2D(self.num_filters, kernel_size=(
            self.filter_sizes[0], self.embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')
        conv_1 = Conv2D(self.num_filters, kernel_size=(
            self.filter_sizes[1], self.embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')
        conv_2 = Conv2D(self.num_filters, kernel_size=(
            self.filter_sizes[2], self.embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')
        # Pool over the whole convolution output so one value per filter remains.
        maxpool_0 = MaxPool2D(pool_size=(
            sentence_length - self.filter_sizes[0] + 1, 1), strides=(1, 1), padding='valid')
        maxpool_1 = MaxPool2D(pool_size=(
            sentence_length - self.filter_sizes[1] + 1, 1), strides=(1, 1), padding='valid')
        maxpool_2 = MaxPool2D(pool_size=(
            sentence_length - self.filter_sizes[2] + 1, 1), strides=(1, 1), padding='valid')
        dense_func = Dense(100, activation='tanh', name="dense")
        # NOTE(review): dense_final and reshape_func are never applied below.
        # They are kept because creating them may influence Keras'
        # auto-generated layer names -- confirm before removing.
        dense_final = Dense(units=self.classes, activation='softmax')
        reshape_func = Reshape((sentence_length, self.embedding_dim, 1))

        def slicer(x, index):
            # Select the word indices of utterance `index` for the whole batch.
            return x[:, K.constant(index, dtype='int32'), :]

        def slicer_output_shape(input_shape):
            shape = list(input_shape)
            assert len(shape) == 3  # batch, seq_len, sent_len
            new_shape = (shape[0], shape[2])
            return new_shape

        def reshaper(x):
            # Add a trailing channel axis for Conv2D.
            return K.expand_dims(x, axis=3)

        def flattener(x):
            x = K.reshape(x, [-1, x.shape[1]*x.shape[3]])
            return x

        def flattener_output_shape(input_shape):
            shape = list(input_shape)
            new_shape = (shape[0], 3*shape[3])
            return new_shape

        inputs = Input(shape=(self.sequence_length,
                              sentence_length), dtype='int32')
        cnn_output = []
        for ind in range(self.sequence_length):

            local_input = Lambda(slicer, output_shape=slicer_output_shape, arguments={
                                 "index": ind})(inputs)  # Batch, word_indices

            # cnn-sent
            emb_output = embedding(local_input)
            reshape = Lambda(reshaper)(emb_output)
            concatenated_tensor = Concatenate(axis=1)([maxpool_0(
                conv_0(reshape)), maxpool_1(conv_1(reshape)), maxpool_2(conv_2(reshape))])
            flatten = Lambda(flattener, output_shape=flattener_output_shape,)(
                concatenated_tensor)
            dense_output = dense_func(flatten)
            dropout = Dropout(0.5)(dense_output)
            cnn_output.append(dropout)

        def stack(x):
            # Re-assemble the per-utterance vectors along a new axis 1.
            return K.stack(x, axis=1)
        cnn_outputs = Lambda(stack)(cnn_output)

        masked = Masking(mask_value=0)(cnn_outputs)
        lstm = Bidirectional(
            LSTM(300, activation='relu', return_sequences=True, dropout=0.3))(masked)
        # The layer is named "utter" so test_model can read its activations.
        lstm = Bidirectional(LSTM(
            300, activation='relu', return_sequences=True, dropout=0.3), name="utter")(lstm)
        output = TimeDistributed(
            Dense(self.classes, activation='softmax'))(lstm)

        model = Model(inputs, output)
        return model

    def get_bimodal_model(self):
        """Build the bimodal network: masking, one Bi-LSTM, softmax per utterance."""

        # Modality specific hyperparameters
        self.epochs = 100
        self.batch_size = 10

        # Modality specific parameters
        self.embedding_dim = self.train_x.shape[2]

        print("Creating Model...")

        inputs = Input(shape=(self.sequence_length,
                              self.embedding_dim), dtype='float32')
        masked = Masking(mask_value=0)(inputs)
        lstm = Bidirectional(LSTM(
            300, activation='tanh', return_sequences=True, dropout=0.4), name="utter")(masked)
        output = TimeDistributed(
            Dense(self.classes, activation='softmax'))(lstm)

        model = Model(inputs, output)
        return model

    def train_model(self):
        """Train the model for ``self.modality``, checkpoint the best epoch, then test.

        Raises:
            ValueError: if ``self.modality`` is not 'audio', 'text' or 'bimodal'.
        """
        checkpoint = ModelCheckpoint(
            self.PATH, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

        if self.modality == "audio":
            model = self.get_audio_model()
            optimizer = 'adadelta'
        elif self.modality == "text":
            model = self.get_text_model()
            optimizer = 'adadelta'
        elif self.modality == "bimodal":
            model = self.get_bimodal_model()
            optimizer = 'adam'
        else:
            # Previously this fell through and failed later with a NameError.
            raise ValueError(
                "Unknown modality {!r}; expected 'audio', 'text' or 'bimodal'".format(
                    self.modality))

        # 'temporal' sample weights let the utterance masks weight each timestep.
        model.compile(
            optimizer=optimizer, loss='categorical_crossentropy', sample_weight_mode='temporal')

        early_stopping = EarlyStopping(monitor='val_loss', patience=10)
        model.fit(self.train_x, self.train_y,
                  epochs=self.epochs,
                  batch_size=self.batch_size,
                  sample_weight=self.train_mask,
                  shuffle=True,
                  callbacks=[early_stopping, checkpoint],
                  validation_data=(self.val_x, self.val_y, self.val_mask))

        self.test_model()

    def test_model(self):
        """Load the saved model, dump the "utter" activations and print test metrics.

        The activations of every split are pickled to ``self.OUTPUT_PATH`` as
        ``[train, val, test]`` dicts keyed by dialog id.
        """
        model = load_model(self.PATH)
        # `inputs=` / `outputs=` are the keyword names of the Keras 2 API; the
        # singular `input=` / `output=` spellings are legacy aliases.
        intermediate_layer_model = Model(
            inputs=model.input, outputs=model.get_layer("utter").output)

        intermediate_output_train = intermediate_layer_model.predict(
            self.train_x)
        intermediate_output_val = intermediate_layer_model.predict(self.val_x)
        intermediate_output_test = intermediate_layer_model.predict(
            self.test_x)

        # Row idx of each prediction array belongs to the idx-th dialog id.
        train_emb = {ID: intermediate_output_train[idx]
                     for idx, ID in enumerate(self.train_id)}
        val_emb = {ID: intermediate_output_val[idx]
                   for idx, ID in enumerate(self.val_id)}
        test_emb = {ID: intermediate_output_test[idx]
                    for idx, ID in enumerate(self.test_id)}

        # `with` makes sure the pickle is flushed and the handle is closed.
        with open(self.OUTPUT_PATH, "wb") as output_file:
            pickle.dump([train_emb, val_emb, test_emb], output_file)

        self.calc_test_result(model.predict(self.test_x),
                              self.test_y, self.test_mask)

# Run either train or test


In [19]:
# List the working directory to check which files and folders are present
!ls

features  MELD.Features.Models	models	sample_data


In [None]:



if __name__ == "__main__":

    # Setup argument parser
    parser = argparse.ArgumentParser()
    parser.required = True
    parser.add_argument(
        "-classify", help="Set the classifiction to be 'Emotion' or 'Sentiment'", required=True)
    parser.add_argument(
        "-modality", help="Set the modality to be 'text' or 'audio' or 'bimodal'", required=True)
    parser.add_argument("-train", default=False,
                        action="store_true", help="Flag to intiate training")
    parser.add_argument("-test", default=False,
                        action="store_true", help="Flag to initiate testing")
    args = parser.parse_args()

    if args.classify.lower() not in ["emotion", "sentiment"]:
        print("Classification mode hasn't been set properly. Please set the classifiction flag to be: -classify Emotion/Sentiment")
        exit()
    if args.modality.lower() not in ["text", "audio", "bimodal"]:
        print("Modality hasn't been set properly. Please set the modality flag to be: -modality text/audio/bimodal")
        exit()

    args.classify = args.classify.title()
    args.modality = args.modality.lower()

    # Check directory existence
    for directory in ["./data/pickles", "./data/models"]:
        if not os.path.exists(directory):
            os.makedirs(directory)

    model = bc_LSTM(args)
    model.load_data()

    if args.test:
        model.test_model()
    else:
        model.train_model()