In [121]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [122]:
def label_map(label):
    """Translate an emotion name into its integer class id.

    The ids follow the fixed order neutral, anger, joy, surprise, sadness,
    disgust, fear (0 through 6).  Any other value yields ``None``.
    """
    ordered_emotions = (
        "neutral", "anger", "joy", "surprise", "sadness", "disgust", "fear",
    )
    for class_id, emotion in enumerate(ordered_emotions):
        if label == emotion:
            return class_id
    return None

# Speaker name -> integer id; ids follow the order of the list below.
s2idx = {
    name: position
    for position, name in enumerate(
        ["Chandler", "Joey", "Rachel", "Monica", "Phoebe", "Ross"]
    )
}
    
def encode(text, word2index, label, N, speaker):
    """Tokenize ``text`` and turn it into a fixed-length list of word ids.

    Parameters
    ----------
    text : str
        Utterance to encode; it is split with ``word_tokenize``.
    word2index : dict
        Token -> integer id.  Tokens missing from the mapping receive the id
        stored under ``'unk'`` (or 0 when that key is absent).
    label
        Returned unchanged as the second element of the result.
    N : int
        Output length.  Longer token sequences are truncated, shorter ones
        are right-padded with 0.
    speaker
        Not used by the encoding; kept so existing call sites keep working.

    Returns
    -------
    tuple
        ``(ids, label)`` where ``ids`` is the padded/truncated id list.
    """
    unk_id = word2index.get('unk', 0)
    # A plain dict.get(token) yields None for unseen tokens; a None inside the
    # id list makes np.array(...) an object array that torch.from_numpy
    # rejects, so fall back to the 'unk' id instead.
    ids = [word2index.get(token, unk_id) for token in word_tokenize(text)][:N]
    return (ids + [0] * (N - len(ids)), label)

def encode_test(text, word2index, N, speaker):
    """Tokenize ``text`` and turn it into a fixed-length list of word ids.

    Parameters
    ----------
    text : str
        Utterance to encode; it is split with ``word_tokenize``.
    word2index : dict
        Token -> integer id.  Tokens missing from the mapping receive the id
        stored under ``'unk'`` (or 0 when that key is absent, so the result
        never contains ``None``).
    N : int
        Output length.  Longer token sequences are truncated, shorter ones
        are right-padded with 0.
    speaker
        Not used by the encoding; kept so existing call sites keep working.

    Returns
    -------
    list
        The padded/truncated id list.
    """
    unk_id = word2index.get('unk', 0)
    # One lookup per token with a default, instead of rewriting the token
    # list in place and looking every token up a second time.
    ids = [word2index.get(token, unk_id) for token in word_tokenize(text)][:N]
    return ids + [0] * (N - len(ids))


def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` and report mean metrics.

    Every batch is indexed as ``batch[0]`` (model input) and ``batch[1]``
    (class ids).  Both are moved to the module-level ``device`` before the
    forward pass; the class ids are converted to a LongTensor first.

    Returns
    -------
    tuple
        ``(mean_loss, mean_accuracy)``, each averaged over the number of
        batches (``len(iterator)``), not over individual samples.
    """
    model.train()

    loss_sum = 0
    acc_sum = 0
    for batch in iterator:
        optimizer.zero_grad()

        inputs = batch[0].to(device)
        labels = batch[1].type(torch.LongTensor).to(device)

        logits = model(inputs)
        batch_loss = criterion(logits, labels)
        # Index of the highest score along dim 1 is the predicted class.
        predicted = torch.max(logits, 1)[1]
        batch_acc = accuracy_score(predicted.tolist(), labels.tolist())

        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        acc_sum += batch_acc

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

def evaluate(model, iterator, criterion):
    """Score ``model`` on every batch of ``iterator`` without updating it.

    Every batch is indexed as ``batch[0]`` (model input) and ``batch[1]``
    (class ids).  Both are moved to the module-level ``device``; the class
    ids are converted to a LongTensor first.  The model is switched to eval
    mode and left in that mode on return.

    Returns
    -------
    tuple
        ``(mean_loss, mean_accuracy)``, each averaged over the number of
        batches (``len(iterator)``), not over individual samples.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()
    # No gradients are needed for scoring; without no_grad() every forward
    # pass would still record an autograd graph and hold on to activations.
    with torch.no_grad():
        for batch in iterator:
            text = batch[0].to(device)
            target = batch[1].type(torch.LongTensor).to(device)

            preds = model(text)
            loss = criterion(preds, target)
            # Index of the highest score along dim 1 is the predicted class.
            _, pred = torch.max(preds, 1)
            acc = accuracy_score(pred.tolist(), target.tolist())

            epoch_loss += loss.item()
            epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [123]:
train_df = pd.read_csv('train_HW2dataset.csv')
dev_df = pd.read_csv('dev_HW2dataset.csv')

# Keep only the columns used below, in the order the records are unpacked.
train_df = train_df[['Emotion', 'Utterance', 'Speaker']]
dev_df = dev_df[['Emotion', 'Utterance', 'Speaker']]

train_set = list(train_df.to_records(index=False))
dev_set = list(dev_df.to_records(index=False))
ps = PorterStemmer()
lm = WordNetLemmatizer()
counts = Counter()

# Plain copies of the records.  NOTE(review): no lower-casing, stemming or
# lemmatisation is applied anywhere below, so the vocabulary is case-sensitive.
# (Iterating over `text` yields characters, not words, so a per-"word"
# transform written as `[f(w) for w in text]` would not do what it suggests.)
new_train_set = [(label, text, speaker) for label, text, speaker in train_set]
new_dev_set = [(label, text, speaker) for label, text, speaker in dev_set]

# Token frequencies over train AND dev; only the key order is used afterwards.
for ds in [train_set, dev_set]:
    for label, text, speaker in ds:
        counts.update(word_tokenize(text))

# Id 0 is reserved for 'unk' (and is also the padding value used by encode()).
# setdefault keeps that reservation even if the literal token 'unk' occurs in
# the corpus; assigning i+1 unconditionally would move 'unk' off id 0 and leave
# the largest id equal to len(word2index), i.e. outside an embedding table of
# that size.
word2index = {'unk': 0}
for word in counts:
    word2index.setdefault(word, len(word2index))
index2word = {v: k for k, v in word2index.items()}

# Every utterance is truncated / right-padded to 15 token ids.
train_encoded = [(encode(Utterance,word2index,label_map(label),15,s)) for label, Utterance, s in train_set]
dev_encoded   = [(encode(Utterance,word2index,label_map(label),15,s)) for label, Utterance,s in dev_set]

train_x = np.array([tweet for tweet, label in train_encoded])
train_y = np.array([label for tweet, label in train_encoded])
dev_x = np.array([tweet for tweet, label in dev_encoded])
dev_y = np.array([label for tweet, label in dev_encoded])

# Disabled code kept as a bare string literal: nothing inside it is executed.
# Because the string is the last expression of the cell, the notebook echoes
# its repr as the cell output.  Equivalent loaders are built per fold by
# cross_val().
'''batch_size = 32

train_ds = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
dev_ds = TensorDataset(torch.from_numpy(dev_x), torch.from_numpy(dev_y))

train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size, drop_last=True)
dev_dl = DataLoader(dev_ds, shuffle=True, batch_size=batch_size, drop_last=True)'''


'batch_size = 32\n\ntrain_ds = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))\ndev_ds = TensorDataset(torch.from_numpy(dev_x), torch.from_numpy(dev_y))\n\ntrain_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size, drop_last=True)\ndev_dl = DataLoader(dev_ds, shuffle=True, batch_size=batch_size, drop_last=True)'

In [124]:
# Pool the train and dev splits row-wise; cv_idx()/cross_val() draw their
# folds from this combined set.
total_x = np.concatenate((train_x, dev_x), axis=0)
total_y = np.concatenate((train_y, dev_y), axis=0)

In [125]:
# Model / optimisation hyperparameters read as globals by the LSTM class and
# the training cell.

# Number of entries in word2index; size of the embedding table.
src_vocab_size = len(word2index)
# Embedding width and LSTM input size.
dimension_model = 300
# Number of stacked LSTM layers.
num_layers = 5
# LSTM hidden size; also the input width of the final linear layer.
hidden_size = 200
# Only referenced by a commented-out second linear layer in the LSTM class.
linear_hidden_size = 30
# Output width of the classifier (label_map produces ids 0..6).
classes = 7
# Passed to torch.nn.LSTM(dropout=...).
dropout = 0.2        
# Learning rate handed to torch.optim.Adam.
lr = 1e-3
# Only referenced by a commented-out attention call in LSTM.forward.
attention_width = 15

In [126]:
# Disabled code kept as a bare string literal: nothing inside it is executed.
# It reads 'glove.42B.300d.txt', keeps the vectors of words found in
# word2index (zeros for the rest and for 'unk') and saves the matrix to
# 'embs_npa.npy'.  A later cell loads that file, so it must already exist on
# disk when the notebook is run from a fresh kernel.
'''embeddings = [[]]*src_vocab_size
count = 0
with open('glove.42B.300d.txt','rt',encoding="utf-8") as fi:
    for line in fi:
        i_word = line.split(' ')[0]
        i_embeddings = [float(val) for val in line.split(' ')[1:]]
        if(i_word in word2index):
            count+=1
            embeddings[word2index[i_word]] = i_embeddings
for i,emb in enumerate(embeddings):
    if emb == []:
        embeddings[i] = [0]*dimension_model
embeddings[word2index['unk']] = [0]*dimension_model
embs_npa = np.array(embeddings)

with open('embs_npa.npy','wb') as f:
    np.save(f,embs_npa)'''

'embeddings = [[]]*src_vocab_size\ncount = 0\nwith open(\'glove.42B.300d.txt\',\'rt\',encoding="utf-8") as fi:\n    for line in fi:\n        i_word = line.split(\' \')[0]\n        i_embeddings = [float(val) for val in line.split(\' \')[1:]]\n        if(i_word in word2index):\n            count+=1\n            embeddings[word2index[i_word]] = i_embeddings\nfor i,emb in enumerate(embeddings):\n    if emb == []:\n        embeddings[i] = [0]*dimension_model\nembeddings[word2index[\'unk\']] = [0]*dimension_model\nembs_npa = np.array(embeddings)\n\nwith open(\'embs_npa.npy\',\'wb\') as f:\n    np.save(f,embs_npa)'

In [127]:
# Vocabulary size: number of word2index entries, including 'unk'.
print(src_vocab_size)
# `count` is only assigned inside the disabled GloVe string block, so this
# stays commented out.
#print(count)

6933


In [128]:
# Load the cached embedding matrix from disk.
# NOTE(review): the only visible code that writes 'embs_npa.npy' is the
# disabled string block above, and the model's from_pretrained(...) line is
# commented out, so this array appears to be unused — confirm before relying
# on it.
embs_npa = np.load('embs_npa.npy')

In [129]:
from torch.autograd import Variable
def batch_matmul(seq, weight, nonlinearity=''):
    """Multiply every slice ``seq[i]`` by ``weight`` and stack the results.

    Parameters
    ----------
    seq : torch.Tensor
        Iterated along dim 0; each ``seq[i]`` must be a 2-D matrix that
        ``torch.mm`` can multiply with ``weight``.
    weight : torch.Tensor
        2-D right-hand operand shared by every slice.
    nonlinearity : str
        ``'tanh'`` applies ``torch.tanh`` to each product; any other value
        leaves the products untouched.

    Returns
    -------
    torch.Tensor
        The per-slice products joined along a new leading dim and then passed
        through ``squeeze()``, so every size-1 dimension is removed.

    Raises
    ------
    ValueError
        If ``seq`` has no slices along dim 0.
    """
    if seq.size(0) == 0:
        raise ValueError("batch_matmul() needs at least one slice along dim 0")

    apply_tanh = nonlinearity == 'tanh'
    products = []
    for i in range(seq.size(0)):
        product = torch.mm(seq[i], weight)
        if apply_tanh:
            product = torch.tanh(product)
        products.append(product.unsqueeze(0))
    # Concatenate once at the end; growing the result with torch.cat inside the
    # loop re-copies everything accumulated so far on every iteration.
    return torch.cat(products, 0).squeeze()

class AttentionLayer(torch.nn.Module):
    """Re-weights a sliding window of previous time steps.

    Parameters (all registered, so they appear in ``state_dict``):

    * ``weight_W``   -- ``(layer_size, layer_size)``, uniform in [-0.1, 0.1]
    * ``weight_proj`` -- ``(layer_size, 1)``, uniform in [-0.1, 0.1]
    * ``bias``       -- ``(layer_size,)``, zeros; not used by ``forward``
    """

    def __init__(self, layer_size):
        super(AttentionLayer, self).__init__()
        self.layer_size = layer_size
        self.weight_W = torch.nn.Parameter(torch.empty(layer_size, layer_size))
        # forward() never reads the bias.  It stays registered so the set of
        # state_dict keys is unchanged, but it is zero-filled instead of being
        # left as uninitialised memory (which can hold NaN/inf garbage that
        # would then be written into checkpoints).
        self.bias = torch.nn.Parameter(torch.zeros(layer_size))
        self.weight_proj = torch.nn.Parameter(torch.empty(layer_size, 1))
        self.softmax = torch.nn.Softmax(dim=1)
        self.weight_W.data.uniform_(-0.1, 0.1)
        self.weight_proj.data.uniform_(-0.1, 0.1)

    def forward(self, inputs, attention_width=3):
        """Apply windowed attention weights along dim 0 of ``inputs``.

        ``inputs`` must be 3-D with a last dimension of ``layer_size`` (it is
        multiplied by ``weight_W``); presumably ``(seq_len, batch, layer_size)``
        — TODO confirm against the caller.

        For every index ``t`` along dim 0:

        * ``t < attention_width``: ``inputs[t]`` is passed through unchanged
          (one output row);
        * otherwise the window ``inputs[t - attention_width : t]`` is scored
          with ``tanh(window @ weight_W) @ weight_proj``, the scores go through
          ``Softmax(dim=1)`` and the window is scaled by them
          (``attention_width`` output rows — the weighted window is NOT summed).

        NOTE(review): the scores have shape ``(window, dim1)`` and the softmax
        runs over dim 1.  If dim 1 of ``inputs`` is the batch axis this
        normalises across the batch rather than across the window — confirm
        the intended axis.

        Returns the pieces concatenated along dim 0, or ``None`` when
        ``inputs`` has no rows.  Runs on whatever device ``inputs`` lives on.
        """
        pieces = []
        for step in range(inputs.size(0)):
            if step < attention_width:
                pieces.append(inputs[step].unsqueeze(0))
                continue

            # Plain slicing keeps the window on the same device as `inputs`.
            window = inputs[step - attention_width:step]
            hidden = torch.tanh(torch.matmul(window, self.weight_W))
            # squeeze(-1) drops only the projection axis, so a batch of one
            # keeps its batch dimension.
            scores = torch.matmul(hidden, self.weight_proj).squeeze(-1)
            weights = self.softmax(scores)
            # Broadcast each (window, dim1) weight over the feature axis.
            pieces.append(weights.unsqueeze(-1) * window)

        if not pieces:
            return None
        return torch.cat(pieces, 0)

In [130]:
class LSTM(torch.nn.Module):
    """Embedding -> stacked LSTM -> linear classifier.

    All sizes come from module-level hyperparameters: ``src_vocab_size``,
    ``dimension_model``, ``hidden_size``, ``num_layers``, ``dropout`` and
    ``classes``.
    """

    def __init__(self):
        super().__init__()
        # Trainable, randomly initialised embedding table (initialising from
        # the pretrained matrix is currently switched off).
        self.embed = nn.Embedding(
            num_embeddings=src_vocab_size,
            embedding_dim=dimension_model,
        )
        self.lstm = nn.LSTM(
            dimension_model,
            hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.linear = nn.Linear(in_features=hidden_size, out_features=classes)
        # Registered as a submodule but not called from forward().
        self.AttentionLayer = AttentionLayer(hidden_size)

    def forward(self, data):
        """Return one row of ``classes`` scores per input sequence.

        ``data`` holds token ids; the first two axes of the embedded tensor
        are swapped before it is fed to the LSTM, and only the LSTM output at
        the last position is classified.
        """
        embedded = self.embed(data)
        outputs, _ = self.lstm(embedded.transpose(0, 1))
        last_step = outputs[-1]
        return self.linear(last_step)

In [131]:
import os
# Debugging aid: asks CUDA to run kernel launches synchronously so an error
# surfaces at the call that triggered it.
# NOTE(review): presumably this must be set before CUDA is first initialised
# to have any effect — confirm; it also slows GPU execution, so consider
# removing it once debugging is done.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [132]:
k = 5  # default number of cross-validation folds


def cv_idx(n_samples=None, n_folds=None, seed=None):
    """Shuffle sample indices and split them into train/validation folds.

    Parameters
    ----------
    n_samples : int, optional
        Number of samples to split.  Defaults to ``len(total_y)``.
    n_folds : int, optional
        Number of folds.  Defaults to the module-level ``k``.
    seed : int, optional
        When given, the shuffle uses its own ``np.random.RandomState(seed)``
        and is reproducible.  When omitted, the global NumPy random state is
        used via ``np.random.shuffle``.

    Returns
    -------
    list
        One ``[train_indices, val_indices]`` pair per fold.  Fold sizes differ
        by at most one: the first ``n_samples % n_folds`` folds receive the
        extra element.  ``train_indices`` is every other fold, in fold order.
    """
    if n_samples is None:
        n_samples = len(total_y)
    if n_folds is None:
        n_folds = k

    idxarr = np.arange(n_samples)
    if seed is None:
        np.random.shuffle(idxarr)
    else:
        np.random.RandomState(seed).shuffle(idxarr)

    # array_split hands the remainder to the leading chunks, one element each.
    chunks = np.array_split(idxarr, n_folds)

    ret = []
    for i in range(n_folds):
        others = [chunk for j, chunk in enumerate(chunks) if j != i]
        if others:
            train = np.concatenate(others)
        else:
            # Single-fold case: nothing is left for training.  Keep an integer
            # dtype so the result can still be used as an index array.
            train = np.array([], dtype=idxarr.dtype)
        ret.append([train, chunks[i].copy()])
    return ret

# Draw the fold assignment once so every cross_val(i) call works on the same
# split.  NOTE(review): no random seed is set in the visible code, so the split
# changes from one kernel run to the next.
kfold_data = cv_idx()

def cross_val(i):
    """Build the training and validation DataLoaders for fold ``i``.

    Row indices are taken from ``kfold_data[i]`` (position 0: training rows,
    position 1: validation rows) and select from the module-level ``total_x``
    and ``total_y``.  Both loaders shuffle, use batches of 32 and drop the
    final incomplete batch.

    Returns ``(train_loader, validation_loader)``.
    """
    def make_loader(rows):
        features = torch.from_numpy(total_x[rows])
        labels = torch.from_numpy(total_y[rows])
        return DataLoader(
            TensorDataset(features, labels),
            shuffle=True,
            batch_size=32,
            drop_last=True,
        )

    train_rows = kfold_data[i][0]
    val_rows = kfold_data[i][1]
    return make_loader(train_rows), make_loader(val_rows)

In [133]:
# Use the GPU when one is available.  train() and evaluate() read this global.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
# Never updated anywhere in this cell.
total_v_acc = 0
# Best validation accuracy seen so far.  It is not reset per fold, so it would
# be shared across folds if more than one fold were run.
best_acc = 0

for i in range(k):
    # Fresh model, loss and optimizer for the fold.
    model = LSTM().to(device)                           
    criterion = torch.nn.CrossEntropyLoss()                  
    optimizer = torch.optim.Adam(model.parameters(),lr=lr) 

    train_dl, dev_dl = cross_val(i)
    
    
    
    # Fixed budget of 10 epochs; each epoch is one training pass followed by
    # one validation pass.
    for epoch in range(10):
        train_loss, train_acc = train(model, train_dl,
                                        optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, dev_dl,
                                            criterion)
        

        print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc * 100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc * 100:.2f}%')    
        # `<=` means a tie with the previous best also writes a new checkpoint.
        # PATH is a global: the inference cell reloads whatever file was
        # written last.  valid_acc is a fraction (0..1) although the file name
        # appends a '%' sign.
        if best_acc <= valid_acc:
            best_acc = valid_acc
            PATH=f"epoch{epoch+1}_{i}_val.accuracy{valid_acc:.3f}%.pt"
            torch.save({
                    'epoch': epoch+1,
                    'i': i,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': valid_loss,
                    }, PATH)
    # NOTE(review): this unconditional break leaves the fold loop after the
    # first fold (i == 0), so only one of the k folds is ever trained — confirm
    # that this is intended.
    break

Epoch: 01, Train Loss: 1.433, Train Acc: 51.60%, Val. Loss: 1.423, Val. Acc: 51.90%
Epoch: 02, Train Loss: 1.334, Train Acc: 53.25%, Val. Loss: 1.357, Val. Acc: 52.53%
Epoch: 03, Train Loss: 1.208, Train Acc: 57.77%, Val. Loss: 1.301, Val. Acc: 54.88%
Epoch: 04, Train Loss: 1.056, Train Acc: 64.67%, Val. Loss: 1.266, Val. Acc: 57.83%
Epoch: 05, Train Loss: 0.909, Train Acc: 70.69%, Val. Loss: 1.323, Val. Acc: 60.04%
Epoch: 06, Train Loss: 0.777, Train Acc: 75.77%, Val. Loss: 1.297, Val. Acc: 62.22%
Epoch: 07, Train Loss: 0.663, Train Acc: 79.25%, Val. Loss: 1.327, Val. Acc: 61.27%
Epoch: 08, Train Loss: 0.580, Train Acc: 81.82%, Val. Loss: 1.399, Val. Acc: 62.71%
Epoch: 09, Train Loss: 0.499, Train Acc: 84.32%, Val. Loss: 1.372, Val. Acc: 63.41%
Epoch: 10, Train Loss: 0.435, Train Acc: 86.33%, Val. Loss: 1.403, Val. Acc: 64.26%


In [134]:
test_df = pd.read_csv('test_HW2dataset.csv')
test_df = test_df[['Utterance', 'Speaker']]
test_set = list(test_df.to_records(index=False))
# Plain copy of the records; no text normalisation is applied.
new_test_set = [(text, speaker) for text, speaker in test_set]

# Pad/truncate to the same length as the training sequences.  The classifier
# reads the LSTM output at the last position, so a different length at test
# time changes how many padding steps precede that read-out.
seq_len = train_x.shape[1]
test_encoded = [encode_test(Utterance, word2index, seq_len, s) for Utterance, s in test_set]
test_x = np.array(test_encoded)
test_ds = TensorDataset(torch.from_numpy(test_x))
# shuffle=False keeps predictions in file order, which the submission relies on.
test_dl = DataLoader(test_ds, shuffle=False, batch_size=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTM().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# PATH is the checkpoint most recently written by the training cell.
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only host.
checkpoint = torch.load(PATH, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']

model.eval()
predict = []
a = 0  # number of batches processed (printed below)
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    for deta in test_dl:
        a += 1
        text = deta[0].to(device)
        preds = model(text)
        # Index of the highest score along dim 1 is the predicted class id.
        _, pred = torch.max(preds, 1)
        predict.extend(pred.tolist())
print(a)

1700


In [135]:
print(len(predict))
# Pair each prediction with its position: [[0, p0], [1, p1], ...].
ans = list(map(list, enumerate(predict)))


3400


In [136]:
import csv
h = ['index', 'emotion']
# Header row first, then one [index, predicted class id] row per test example.
with open('predict.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows([h] + ans)