In [50]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

In [51]:
def label_map(label):
    if label == "neutral":
        return 0
    elif label == "anger":
        return 1
    elif label == "joy":
        return 2
    elif label == "surprise":
        return 3
    elif label == "sadness":
        return 4
    elif label == "disgust":
        return 5
    elif label == "fear":
        return 6

s2idx = {"Chandler":0, "The Interviewer":1, "Joey":2, "Rachel":3, "Monica":4, "Phoebe":5, "Ross":6, "Jade":7, "Mona":8, "Charlie":9}
    
    
def encode(text, word2index, label, N, speaker):
    tokenized = word_tokenize(text)
    encoded = [0]*N
    enc1 = [word2index.get(word) for word in tokenized]
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    if(speaker in s2idx):
        idx = s2idx[speaker]
    else:
        idx = 10
    encoded.insert(0,idx)
    return (encoded,label)

def encode_test(text, word2index, N):
    tokenized = word_tokenize(text)
    for i,word in enumerate(tokenized):
        if word2index.get(word)==None:
            tokenized[i]='unk'

    encoded = [0]*N
    enc1 = [word2index.get(word) for word in tokenized]

    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded


def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        text = batch[0].to(device)
        target = batch[1]
        target = target.type(torch.LongTensor)
        target = target.to(device)
        preds = model(text)
        loss = criterion(preds, target)
        _, pred = torch.max(preds, 1)
        acc = accuracy_score(pred.tolist(), target.tolist())

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()
    for batch in iterator:
        text = batch[0].to(device)
        target = batch[1]
        target = target.type(torch.LongTensor)
        target = target.to(device)
        preds = model(text)
        loss = criterion(preds, target)
        _, pred = torch.max(preds, 1)
        acc = accuracy_score(pred.tolist(), target.tolist())

        epoch_loss += loss.item()
        epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [52]:
train_df = pd.read_csv('train_HW2dataset.csv')
dev_df = pd.read_csv('dev_HW2dataset.csv')

train_df=train_df[['Emotion','Utterance','Speaker']]
dev_df=dev_df[['Emotion','Utterance','Speaker']]

train_set = list(train_df.to_records(index=False))
dev_set = list(dev_df.to_records(index=False))

counts = Counter()
for ds in [train_set, dev_set]:
    for label,text,speaker in ds:
        counts.update(word_tokenize(text))

word2index = {'unk':0}
for i,word in enumerate(counts.keys()):
    word2index[word] = i+1
index2word = {v:k for k,v in word2index.items()}

train_encoded = [(encode(Utterance,word2index,label_map(label),12,s)) for label, Utterance, s in train_set]
dev_encoded   = [(encode(Utterance,word2index,label_map(label),12,s)) for label, Utterance,s in dev_set]

train_x = np.array([tweet for tweet, label in train_encoded])
train_y = np.array([label for tweet, label in train_encoded])
dev_x = np.array([tweet for tweet, label in dev_encoded])
dev_y = np.array([label for tweet, label in dev_encoded])

batch_size = 32

train_ds = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
dev_ds = TensorDataset(torch.from_numpy(dev_x), torch.from_numpy(dev_y))

train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size, drop_last=True)
dev_dl = DataLoader(dev_ds, shuffle=True, batch_size=batch_size, drop_last=True)


In [53]:
print(dev_x.shape)
print(dev_y.shape)

(1462, 13)
(1462,)


In [54]:
print(type(train_set))

<class 'list'>


In [55]:
src_vocab_size = len(word2index)      
dimension_model = 32                             
num_layers = 5                       
hidden_size = 30                         
linear_hidden_size = 10               
classes = 7                          
dropout = 0.2                            
lr = 1e-3

In [56]:
class LSTM(torch.nn.Module):
    def __init__(self):
        super(LSTM, self).__init__()
        self.embed = torch.nn.Embedding(src_vocab_size, dimension_model)                        
        self.lstm = torch.nn.LSTM(input_size=dimension_model, hidden_size=hidden_size,num_layers=num_layers,dropout=dropout)
        self.linear = torch.nn.Linear(hidden_size, linear_hidden_size)
        self.linear1 = torch.nn.Linear(linear_hidden_size, classes)
    def forward(self,data):
        x = self.embed(data)                          
        x,(h_n, c_n) = self.lstm(x.transpose(0, 1))  
                                                         
                                                    
        x = self.linear(x[-1])                          
        x = self.linear1(x)                            
        return x

In [57]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
model = LSTM().to(device)                           
criterion = torch.nn.CrossEntropyLoss()                  
optimizer = torch.optim.Adam(model.parameters(),lr=lr) 

best_acc = 0
for epoch in range(10):
    train_loss, train_acc = train(model, train_dl,
                                  optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dev_dl,
                                     criterion)

    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc * 100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc * 100:.2f}%')    
    if best_acc <= valid_acc:
        best_acc = valid_acc
        PATH=f"epoch{epoch+1}_val.accuracy{valid_loss:.3f}%.pt"
        torch.save({
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': valid_loss,
                }, PATH)

Epoch: 01, Train Loss: 1.513, Train Acc: 47.32%, Val. Loss: 1.493, Val. Acc: 47.15%
Epoch: 02, Train Loss: 1.399, Train Acc: 53.05%, Val. Loss: 1.518, Val. Acc: 46.67%
Epoch: 03, Train Loss: 1.366, Train Acc: 53.54%, Val. Loss: 1.473, Val. Acc: 46.88%
Epoch: 04, Train Loss: 1.329, Train Acc: 54.40%, Val. Loss: 1.486, Val. Acc: 48.75%
Epoch: 05, Train Loss: 1.280, Train Acc: 55.89%, Val. Loss: 1.567, Val. Acc: 48.61%
Epoch: 06, Train Loss: 1.236, Train Acc: 57.39%, Val. Loss: 1.520, Val. Acc: 47.85%
Epoch: 07, Train Loss: 1.194, Train Acc: 58.57%, Val. Loss: 1.554, Val. Acc: 46.39%
Epoch: 08, Train Loss: 1.158, Train Acc: 59.83%, Val. Loss: 1.564, Val. Acc: 47.15%
Epoch: 09, Train Loss: 1.130, Train Acc: 61.00%, Val. Loss: 1.605, Val. Acc: 45.07%
Epoch: 10, Train Loss: 1.102, Train Acc: 62.49%, Val. Loss: 1.623, Val. Acc: 44.72%


In [58]:
test_df = pd.read_csv('test_HW2dataset.csv')
test_df=test_df[['Utterance']]
test_set = test_df.values.tolist()
test_encoded=[]
for sentence in test_set:
    test_encoded+=[encode_test(Utterance, word2index, 10) for Utterance in sentence]
test_x = np.array(test_encoded)
test_ds = TensorDataset(torch.from_numpy(test_x))
test_dl = DataLoader(test_ds, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
model = LSTM().to(device)                               
criterion = torch.nn.CrossEntropyLoss()                  
optimizer = torch.optim.Adam(model.parameters(),lr=lr) 
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']

model.eval()
predict=[]
for deta in test_dl:
    text = deta[0].to(device)
    preds = model(text)
    _, pred = torch.max(preds, 1)
    predict.append(pred.item())

In [59]:
print(checkpoint)

{'epoch': 4, 'model_state_dict': OrderedDict([('embed.weight', tensor([[-1.5797, -1.4423, -0.4430,  ..., -1.6993,  0.7786,  0.2246],
        [-0.1985, -0.1513, -0.3783,  ..., -0.0615, -0.5038, -1.8056],
        [-1.6948,  0.0181, -1.4189,  ...,  1.7508,  0.3057, -0.8188],
        ...,
        [-0.2839,  1.5234, -0.1709,  ..., -0.6172,  0.8009,  2.1462],
        [-0.5400,  0.7030, -0.6753,  ...,  1.6072, -0.8323, -1.7899],
        [-1.7127, -0.2272,  0.1858,  ..., -0.7374,  1.9107,  0.7697]],
       device='cuda:0')), ('lstm.weight_ih_l0', tensor([[ 0.0392,  0.1144, -0.0415,  ..., -0.1423,  0.1407,  0.1048],
        [-0.2436, -0.0682,  0.1299,  ..., -0.0695,  0.1201,  0.0442],
        [-0.0673, -0.2719, -0.1611,  ..., -0.0516,  0.1564,  0.2159],
        ...,
        [-0.1138, -0.3235,  0.0042,  ..., -0.1366,  0.1884,  0.1161],
        [-0.0761, -0.3112,  0.0713,  ..., -0.0238, -0.0393, -0.0673],
        [-0.0638,  0.0110, -0.1951,  ..., -0.0164, -0.0992,  0.0607]],
       device='cuda:0

In [60]:
print(len(predict))
ans = [[i, pre] for [i,pre] in enumerate(predict)]


3400


In [61]:
import csv
h = ['index', 'emotion']
with open('predict.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(h)
    writer.writerows(ans)