In [47]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

In [48]:
def label_map(label):
    if label == "neutral":
        return 0
    elif label == "anger":
        return 1
    elif label == "joy":
        return 2
    elif label == "surprise":
        return 3
    elif label == "sadness":
        return 4
    elif label == "disgust":
        return 5
    elif label == "fear":
        return 6

s2idx = {"Chandler":0, "The Interviewer":1, "Joey":2, "Rachel":3, "Monica":4, "Phoebe":5, "Ross":6, "Jade":7, "Mona":8, "Charlie":9}
    
    
def encode(text, word2index, label, N, speaker):
    tokenized = word_tokenize(text)
    encoded = [0]*N
    enc1 = [word2index.get(word) for word in tokenized]
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    if(speaker in s2idx):
        idx = s2idx[speaker]
    else:
        idx = 10
    encoded.insert(0,idx)
    return (encoded,label)

def encode_test(text, word2index, N, speaker):
    tokenized = word_tokenize(text)
    for i,word in enumerate(tokenized):
        if word2index.get(word)==None:
            tokenized[i]='unk'

    encoded = [0]*N

    enc1 = [word2index.get(word) for word in tokenized]

    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    if(speaker in s2idx):
        idx = s2idx[speaker]
    else:
        idx = 10
    encoded.insert(0,idx)
    return encoded


def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        text = batch[0].to(device)
        target = batch[1]
        target = target.type(torch.LongTensor)
        target = target.to(device)
        preds = model(text)
        loss = criterion(preds, target)
        _, pred = torch.max(preds, 1)
        acc = accuracy_score(pred.tolist(), target.tolist())

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()
    for batch in iterator:
        text = batch[0].to(device)
        target = batch[1]
        target = target.type(torch.LongTensor)
        target = target.to(device)
        preds = model(text)
        loss = criterion(preds, target)
        _, pred = torch.max(preds, 1)
        acc = accuracy_score(pred.tolist(), target.tolist())

        epoch_loss += loss.item()
        epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [58]:
train_df = pd.read_csv('train_HW2dataset.csv')
dev_df = pd.read_csv('dev_HW2dataset.csv')

train_df=train_df[['Emotion','Utterance','Speaker']]
dev_df=dev_df[['Emotion','Utterance','Speaker']]

train_set = list(train_df.to_records(index=False))
dev_set = list(dev_df.to_records(index=False))

counts = Counter()
for ds in [train_set, dev_set]:
    for label,text,speaker in ds:
        counts.update(word_tokenize(text))

word2index = {'unk':0}
i = 1
for word in counts.keys():
    #if(counts[word]>1):
    if(1):
        word2index[word] = i
        i += 1
index2word = {v:k for k,v in word2index.items()}

train_encoded = [(encode(Utterance,word2index,label_map(label),12,s)) for label, Utterance, s in train_set]
dev_encoded   = [(encode(Utterance,word2index,label_map(label),12,s)) for label, Utterance,s in dev_set]

train_x = np.array([tweet for tweet, label in train_encoded])
train_y = np.array([label for tweet, label in train_encoded])
dev_x = np.array([tweet for tweet, label in dev_encoded])
dev_y = np.array([label for tweet, label in dev_encoded])

batch_size = 32
print(type(train_x))
#train_x = train_x.astype(int)
#train_y = train_y.astype(int)
#dev_x = dev_x.astype(int)
#dev_y = dev_y.astype(int)
train_ds = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
dev_ds = TensorDataset(torch.from_numpy(dev_x), torch.from_numpy(dev_y))

train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size, drop_last=True)
dev_dl = DataLoader(dev_ds, shuffle=True, batch_size=batch_size, drop_last=True)


<class 'numpy.ndarray'>


In [50]:
src_vocab_size = len(word2index)      
dimension_model = 128                             
num_layers = 5                       
hidden_size = 60                         
linear_hidden_size = 30              
classes = 7                          
dropout = 0.2                            
lr = 1e-3

In [51]:
class LSTM(torch.nn.Module):
    def __init__(self):
        super(LSTM, self).__init__()
        self.embed = torch.nn.Embedding(src_vocab_size, dimension_model)                        
        self.lstm = torch.nn.LSTM(input_size=dimension_model, hidden_size=hidden_size,num_layers=num_layers,dropout=dropout)
        self.linear = torch.nn.Linear(hidden_size, linear_hidden_size)
        self.linear1 = torch.nn.Linear(linear_hidden_size, classes)
    def forward(self,data):
        x = self.embed(data)                          
        x,(h_n, c_n) = self.lstm(x.transpose(0, 1))  
                                                         
                                                    
        x = self.linear(x[-1])                          
        x = self.linear1(x)                            
        return x

In [52]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [59]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
#device = 'cpu'
model = LSTM().to(device)                           
criterion = torch.nn.CrossEntropyLoss()                  
optimizer = torch.optim.Adam(model.parameters(),lr=lr) 

best_acc = 0
for epoch in range(10):
    train_loss, train_acc = train(model, train_dl,
                                  optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dev_dl,
                                     criterion)

    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc * 100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc * 100:.2f}%')    
    if best_acc <= valid_acc:
        best_acc = valid_acc
        PATH=f"epoch{epoch+1}_val.accuracy{valid_loss:.3f}%.pt"
        torch.save({
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': valid_loss,
                }, PATH)

Epoch: 01, Train Loss: 1.459, Train Acc: 49.68%, Val. Loss: 1.472, Val. Acc: 48.75%
Epoch: 02, Train Loss: 1.323, Train Acc: 53.67%, Val. Loss: 1.509, Val. Acc: 46.25%
Epoch: 03, Train Loss: 1.213, Train Acc: 58.17%, Val. Loss: 1.498, Val. Acc: 47.92%
Epoch: 04, Train Loss: 1.110, Train Acc: 61.28%, Val. Loss: 1.523, Val. Acc: 47.99%
Epoch: 05, Train Loss: 0.999, Train Acc: 65.63%, Val. Loss: 1.674, Val. Acc: 45.97%
Epoch: 06, Train Loss: 0.902, Train Acc: 69.79%, Val. Loss: 1.728, Val. Acc: 45.83%
Epoch: 07, Train Loss: 0.803, Train Acc: 73.79%, Val. Loss: 1.871, Val. Acc: 46.32%
Epoch: 08, Train Loss: 0.720, Train Acc: 76.85%, Val. Loss: 1.878, Val. Acc: 46.94%
Epoch: 09, Train Loss: 0.637, Train Acc: 79.50%, Val. Loss: 2.126, Val. Acc: 44.58%
Epoch: 10, Train Loss: 0.570, Train Acc: 82.00%, Val. Loss: 2.118, Val. Acc: 47.15%


In [60]:
test_df = pd.read_csv('test_HW2dataset.csv')
test_df=test_df[['Utterance','Speaker']]
#test_set = test_df.values.tolist()
test_set = list(test_df.to_records(index=False))
#print(test_set)
test_encoded=[]
test_encoded+=[encode_test(Utterance, word2index, 10, s) for Utterance,s in test_set]
test_x = np.array(test_encoded)
test_ds = TensorDataset(torch.from_numpy(test_x))
test_dl = DataLoader(test_ds, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
model = LSTM().to(device)                               
criterion = torch.nn.CrossEntropyLoss()                  
optimizer = torch.optim.Adam(model.parameters(),lr=lr) 
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']

model.eval()
predict=[]
for deta in test_dl:
    text = deta[0].to(device)
    preds = model(text)
    _, pred = torch.max(preds, 1)
    predict.append(pred.item())

In [55]:
print(checkpoint)

{'epoch': 2, 'model_state_dict': OrderedDict([('embed.weight', tensor([[ 0.5784,  0.4773,  0.9698,  ...,  0.9511,  1.4772,  0.0563],
        [-0.2156, -1.3910, -0.2349,  ...,  0.2803,  0.1844, -0.0525],
        [ 0.3552, -0.7626,  0.1286,  ...,  0.1894,  1.2521,  0.4697],
        ...,
        [-0.0141, -0.3147, -0.8574,  ...,  0.6471, -0.3274,  1.1028],
        [ 0.0098, -0.3627, -0.4126,  ...,  0.1110,  0.4236, -0.5340],
        [-1.0170, -1.5434,  0.6221,  ..., -1.1312,  1.1682,  0.2477]],
       device='cuda:0')), ('lstm.weight_ih_l0', tensor([[-0.0571, -0.1234, -0.0435,  ..., -0.1000,  0.1124,  0.1143],
        [-0.1513,  0.0518, -0.0840,  ...,  0.1136, -0.0417, -0.1631],
        [ 0.1329, -0.0638, -0.1000,  ...,  0.0507, -0.0074, -0.0068],
        ...,
        [-0.0696,  0.0135, -0.1492,  ...,  0.0765,  0.0672, -0.0163],
        [-0.1418, -0.1178, -0.1026,  ...,  0.0473,  0.0194,  0.0723],
        [-0.0796,  0.0753, -0.0804,  ...,  0.1328,  0.1582,  0.0621]],
       device='cuda:0

In [61]:
print(len(predict))
ans = [[i, pre] for [i,pre] in enumerate(predict)]


3400


In [62]:
import csv
h = ['index', 'emotion']
with open('predict.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(h)
    writer.writerows(ans)