## BERT

In [10]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

import torch
from transformers import BertModel, BertTokenizer

In [2]:
# Load the train/dev/test splits; the first CSV column is used as the index.
df_train = pd.read_csv('train_sent_emo.csv', index_col=0)
df_dev = pd.read_csv('dev_sent_emo.csv', index_col=0)
df_test = pd.read_csv('test_sent_emo.csv', index_col=0)

In [3]:
# Display the training DataFrame to inspect its columns.
df_train

Unnamed: 0_level_0,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
Sr No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,also I was the point person on my companys tr...,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731"
2,You mustve had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442"
3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,"00:16:23,442","00:16:26,389"
4,So lets talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,"00:16:26,820","00:16:29,572"
5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917"
...,...,...,...,...,...,...,...,...,...,...
10474,You or me?,Chandler,neutral,neutral,1038,13,2,3,"00:00:48,173","00:00:50,799"
10475,"I got it. Uh, Joey, women don't have Adam's ap...",Ross,neutral,neutral,1038,14,2,3,"00:00:51,009","00:00:53,594"
10476,"You guys are messing with me, right?",Joey,surprise,positive,1038,15,2,3,"00:01:00,518","00:01:03,520"
10477,Yeah.,All,neutral,neutral,1038,16,2,3,"00:01:05,398","00:01:07,274"


In [4]:
def processed(df):
    """Convert an utterance-level DataFrame into per-dialogue BERT features.

    Rows are grouped by ``Dialogue_ID``. Within each dialogue every utterance
    is tokenized, the token id lists are right-padded to a common length, and
    the model is run once on the whole dialogue. The hidden state at
    position 0 of the model's first output ([CLS] for BERT tokenizers) is
    used as the utterance feature.

    The module-level names ``tokenizer``, ``model`` and ``emotion_dict`` are
    read at call time and must be defined before this function is called.

    Args:
        df: DataFrame with ``Dialogue_ID``, ``Utterance``, ``Emotion``,
            ``StartTime`` and ``EndTime`` columns. Time values are strings
            matching ``'%H:%M:%S,%f'``.

    Returns:
        A list with one tuple per dialogue,
        ``(feature, emotion_label, stime_list, etime_list)``. ``feature`` is
        a torch tensor with one row per utterance; the other three entries
        are numpy arrays with one element per utterance. Both time arrays are
        in seconds, measured from the start of the dialogue's first
        utterance.

    Raises:
        KeyError: if a required column is missing or an emotion is not a key
            of ``emotion_dict``.
        ValueError: if a timestamp does not match ``'%H:%M:%S,%f'``.
    """

    def _to_seconds(timestamps):
        # Parse 'HH:MM:SS,fff' strings into seconds since 00:00:00.
        parsed = [datetime.strptime(t, '%H:%M:%S,%f') for t in timestamps]
        return np.array([pt.second + pt.minute*60 + pt.hour*3600 + pt.microsecond*1e-6 for pt in parsed])

    # Fall back to 0 (what the zero-initialised batch used before) when the
    # tokenizer does not define a padding id.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = 0

    dataset = []
    for name, group in df.groupby('Dialogue_ID'):
        utterance = group['Utterance'].values
        emotion = group['Emotion'].values
        startTime = group['StartTime'].values
        endTime = group['EndTime'].values

        # extract feature from bert
        encoded = [tokenizer.encode(u, add_special_tokens=True) for u in utterance]
        max_len = max(len(ids) for ids in encoded)
        inputs_ids = torch.full((len(encoded), max_len), pad_id, dtype=torch.long)
        attention_mask = torch.zeros((len(encoded), max_len), dtype=torch.long)
        for row, ids in enumerate(encoded):
            # Keep each sequence contiguous (special tokens included) and
            # mark only its real tokens as attendable.
            inputs_ids[row, :len(ids)] = torch.tensor(ids, dtype=torch.long)
            attention_mask[row, :len(ids)] = 1

        with torch.no_grad():
            # The mask keeps padding out of self-attention, so a feature does
            # not depend on how long the other utterances in the batch are.
            feature = model(inputs_ids, attention_mask=attention_mask)[0][:, 0, :]

        # str to label
        emotion_label = np.array([emotion_dict[e] for e in emotion])

        # date to second
        stime_list = _to_seconds(startTime)
        etime_list = _to_seconds(endTime)
        # Shift start AND end times by the same origin (first start time) so
        # that utterance durations and overlaps are preserved.
        bias = stime_list[0]
        stime_list = stime_list - bias
        etime_list = etime_list - bias

        dataset.append((feature, emotion_label, stime_list, etime_list))
    return dataset

In [5]:
category_index = 0
# Distinct emotion labels found in the training split, in sorted order.
category_list = sorted(set(df_train['Emotion']))

In [6]:
# Map each emotion name to its position in the sorted category list.
emotion_dict = {label: idx for idx, label in enumerate(category_list)}

In [7]:
# Checkpoint name shared by the tokenizer and the model so they stay in sync.
pretrained_weights = 'bert-base-uncased'
# Both objects are read as module-level globals inside processed().
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertModel.from_pretrained(pretrained_weights)

In [8]:
# Build the per-dialogue datasets for each split (one tuple per Dialogue_ID).
train_data = processed(df_train)
dev_data = processed(df_dev)
test_data = processed(df_test)

In [11]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing the three splits.
os.makedirs('data/', exist_ok=True)
torch.save(train_data, os.path.join('data/', 'train.pt'))
torch.save(dev_data, os.path.join('data/', 'dev.pt'))
torch.save(test_data, os.path.join('data/', 'test.pt'))

## GloVe

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import pickle
import os

import torch

In [2]:
root = './data/MELD/raw/'
# The pickle holds three objects (train, val, test) that processed() indexes
# with '<Dialogue_ID>_<Utterance_ID>' keys.
# NOTE: pickle.load can execute arbitrary code; only load this file from a
# trusted source.
# Use a context manager so the file handle is closed after loading.
with open(root+'text_glove_average_emotion.pkl', 'rb') as emb_file:
    train_text_avg_emb, val_text_avg_emb, test_text_avg_emb = pickle.load(emb_file)
# Load the train/dev/test splits; the first CSV column is used as the index.
df_train = pd.read_csv(root+'train_sent_emo.csv', index_col=0)
df_dev = pd.read_csv(root+'dev_sent_emo.csv', index_col=0)
df_test = pd.read_csv(root+'test_sent_emo.csv', index_col=0)

In [4]:
def processed(df, text_avg_emb):
    """Assemble per-dialogue GloVe features, labels and relative timings.

    Rows are grouped by ``Dialogue_ID``. Each utterance's feature is looked
    up in ``text_avg_emb`` under the key ``'<Dialogue_ID>_<Utterance_ID>'``.
    The module-level ``emotion_dict`` is read to turn emotion names into
    integer labels.

    Args:
        df: DataFrame with ``Dialogue_ID``, ``Utterance_ID``, ``Emotion``,
            ``StartTime`` and ``EndTime`` columns. Time values are strings
            matching ``'%H:%M:%S,%f'``.
        text_avg_emb: mapping from ``'<Dialogue_ID>_<Utterance_ID>'`` to a
            value accepted by ``torch.tensor``.

    Returns:
        A list with one tuple of four tensors per dialogue:
        ``(feature, emotion_label, stime_list, etime_list)``. Both time
        tensors are in seconds, shifted so the dialogue's first start time
        is zero.
    """
    global emotion_dict

    def _seconds(stamps):
        # 'HH:MM:SS,fff' strings -> tensor of seconds since 00:00:00.
        return torch.tensor([
            pt.second + pt.minute*60 + pt.hour*3600 + pt.microsecond*1e-6
            for pt in [datetime.strptime(s, '%H:%M:%S,%f') for s in stamps]
        ])

    dataset = []
    for _, dialogue in df.groupby('Dialogue_ID'):
        emotions = dialogue['Emotion'].values
        starts = dialogue['StartTime'].values
        ends = dialogue['EndTime'].values
        dialogue_ids = dialogue['Dialogue_ID'].values
        utterance_ids = dialogue['Utterance_ID'].values

        # extract feature from glove
        keys = [str(d) + '_' + str(u) for d, u in zip(dialogue_ids, utterance_ids)]
        feature = torch.stack([torch.tensor(text_avg_emb[k]) for k in keys], 0)

        # str to label
        emotion_label = torch.tensor([emotion_dict[e] for e in emotions])

        # date to second; start and end share the first start time as origin
        start_sec = _seconds(starts)
        bias = start_sec[0]
        end_sec = _seconds(ends)

        dataset.append((feature, emotion_label, start_sec - bias, end_sec - bias))
    return dataset

In [3]:
category_index = 0
# Distinct emotion labels found in the training split, in sorted order.
category_list = sorted(set(df_train['Emotion']))

In [4]:
# Map each emotion name to its position in the sorted category list.
emotion_dict = {label: idx for idx, label in enumerate(category_list)}

In [7]:
# Build the per-dialogue datasets, pairing each split with its own GloVe
# embedding lookup.
train_data = processed(df_train, train_text_avg_emb)
dev_data = processed(df_dev, val_text_avg_emb)
test_data = processed(df_test, test_text_avg_emb)

# Bundle the three splits under fixed keys in a single dict.
data = {
    'train_data': train_data,
    'dev_data': dev_data,
    'test_data': test_data
}

In [9]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing the bundled splits.
os.makedirs('data/MELD/processed', exist_ok=True)
torch.save(data, os.path.join('data/MELD/processed', 'data.pt'))

## End

In [5]:
# Show the emotion-name -> integer-label mapping.
emotion_dict

{'anger': 0,
 'disgust': 1,
 'fear': 2,
 'joy': 3,
 'neutral': 4,
 'sadness': 5,
 'surprise': 6}

In [10]:
# Inspect the first dialogue tuple: (feature, emotion_label, stime_list, etime_list).
train_data[0]

(tensor([[ 0.0002,  0.0934, -0.0510,  ...,  0.0136,  0.0032,  0.0425],
         [-0.0026, -0.0166, -0.0269,  ..., -0.0131,  0.0370, -0.0158],
         [ 0.0087,  0.0417, -0.0417,  ...,  0.0118,  0.0323,  0.0345],
         ...,
         [-0.0287,  0.1645, -0.1817,  ..., -0.0631,  0.0778,  0.0809],
         [-0.0113,  0.0207, -0.0006,  ..., -0.0075,  0.0066,  0.0108],
         [ 0.0092,  0.0202, -0.0338,  ..., -0.0031,  0.0223, -0.0009]]),
 tensor([4, 4, 4, 4, 6, 4, 4, 4, 4, 4, 2, 4, 6, 4]),
 tensor([ 0.0000,  5.8810,  7.3830, 10.7610, 18.3930, 25.0670, 32.7410, 32.7410,
         43.4180, 44.4190, 46.7970, 48.9660, 57.4319, 61.5200]),
 tensor([ 5.6720,  7.3830, 10.3300, 13.5130, 24.8580, 28.2780, 35.8270, 38.4550,
         44.4190, 46.6600, 48.7990, 57.2650, 60.4770, 64.6480]))