In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

In [2]:
def label_map(label):
    if label == "neutral":
        return 0
    elif label == "anger":
        return 1
    elif label == "joy":
        return 2
    elif label == "surprise":
        return 3
    elif label == "sadness":
        return 4
    elif label == "disgust":
        return 5
    elif label == "fear":
        return 6

s2idx = {"Chandler":0, "The Interviewer":1, "Joey":2, "Rachel":3, "Monica":4, "Phoebe":5, "Ross":6, "Jade":7, "Mona":8, "Charlie":9}
    
    
def encode(text, word2index, label, N, speaker):
    
    tokenized = word_tokenize(text)
    for i,word in enumerate(tokenized):
        if word2index.get(word)==None:
            tokenized[i]='unk'
    encoded = [0]*N
    enc1 = [word2index.get(word) for word in tokenized]
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    if(speaker in s2idx):
        idx = s2idx[speaker]
    else:
        idx = 10
    encoded.insert(0,idx)
    return (encoded,label)

def encode_test(text, word2index, N, speaker):
    tokenized = word_tokenize(text)
    for i,word in enumerate(tokenized):
        if word2index.get(word)==None:
            tokenized[i]='unk'

    encoded = [0]*N
    enc1 = [word2index.get(word) for word in tokenized]

    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    if(speaker in s2idx):
        idx = s2idx[speaker]
    else:
        idx = 10
    encoded.insert(0,idx)
    return encoded


def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        text = batch[0].to(device)
        target = batch[1]
        target = target.type(torch.LongTensor)
        target = target.to(device)
        preds = model(text)
        loss = criterion(preds, target)
        _, pred = torch.max(preds, 1)
        acc = accuracy_score(pred.tolist(), target.tolist())

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()
    for batch in iterator:
        text = batch[0].to(device)
        target = batch[1]
        target = target.type(torch.LongTensor)
        target = target.to(device)
        preds = model(text)
        loss = criterion(preds, target)
        _, pred = torch.max(preds, 1)
        acc = accuracy_score(pred.tolist(), target.tolist())

        epoch_loss += loss.item()
        epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [3]:
train_df = pd.read_csv('train_HW2dataset.csv')
dev_df = pd.read_csv('dev_HW2dataset.csv')

train_df=train_df[['Emotion','Utterance','Speaker']]
dev_df=dev_df[['Emotion','Utterance','Speaker']]

train_set = list(train_df.to_records(index=False))
dev_set = list(dev_df.to_records(index=False))

counts = Counter()
for ds in [train_set, dev_set]:
    for label,text,speaker in ds:
        counts.update(word_tokenize(text))

word2index = {'unk':0}
for i,word in enumerate(counts.keys()):
    if(counts[word]>3):
        word2index[word] = i+1
index2word = {v:k for k,v in word2index.items()}

train_encoded = [(encode(Utterance,word2index,label_map(label),12,s)) for label, Utterance, s in train_set]
dev_encoded   = [(encode(Utterance,word2index,label_map(label),12,s)) for label, Utterance,s in dev_set]

train_x = np.array([tweet for tweet, label in train_encoded])
train_y = np.array([label for tweet, label in train_encoded])
dev_x = np.array([tweet for tweet, label in dev_encoded])
dev_y = np.array([label for tweet, label in dev_encoded])

batch_size = 32
print(type(train_x))
train_x = train_x.astype(int)
train_y = train_y.astype(int)
dev_x = dev_x.astype(int)
dev_y = dev_y.astype(int)
train_ds = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
dev_ds = TensorDataset(torch.from_numpy(dev_x), torch.from_numpy(dev_y))

train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size, drop_last=True)
dev_dl = DataLoader(dev_ds, shuffle=True, batch_size=batch_size, drop_last=True)


<class 'numpy.ndarray'>


In [4]:
src_vocab_size = len(word2index)      
dimension_model = 128                             
num_layers = 5                       
hidden_size = 60                         
linear_hidden_size = 30              
classes = 7                          
dropout = 0.2                            
lr = 1e-3

In [5]:
class LSTM(torch.nn.Module):
    def __init__(self):
        super(LSTM, self).__init__()
        self.embed = torch.nn.Embedding(src_vocab_size, dimension_model)                        
        self.lstm = torch.nn.LSTM(input_size=dimension_model, hidden_size=hidden_size,num_layers=num_layers,dropout=dropout)
        self.linear = torch.nn.Linear(hidden_size, linear_hidden_size)
        self.linear1 = torch.nn.Linear(linear_hidden_size, classes)
    def forward(self,data):
        x = self.embed(data)                          
        x,(h_n, c_n) = self.lstm(x.transpose(0, 1))  
                                                         
                                                    
        x = self.linear(x[-1])                          
        x = self.linear1(x)                            
        return x

In [6]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [8]:
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
device = 'cpu'
model = LSTM().to(device)                           
criterion = torch.nn.CrossEntropyLoss()                  
optimizer = torch.optim.Adam(model.parameters(),lr=lr) 

best_acc = 0
for epoch in range(10):
    train_loss, train_acc = train(model, train_dl,
                                  optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dev_dl,
                                     criterion)

    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc * 100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc * 100:.2f}%')    
    if best_acc <= valid_acc:
        best_acc = valid_acc
        PATH=f"epoch{epoch+1}_val.accuracy{valid_loss:.3f}%.pt"
        torch.save({
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': valid_loss,
                }, PATH)

IndexError: index out of range in self

In [None]:
test_df = pd.read_csv('test_HW2dataset.csv')
test_df=test_df[['Utterance','Speaker']]
#test_set = test_df.values.tolist()
test_set = list(test_df.to_records(index=False))
#print(test_set)
test_encoded=[]
test_encoded+=[encode_test(Utterance, word2index, 10, s) for Utterance,s in test_set]
test_x = np.array(test_encoded)
test_ds = TensorDataset(torch.from_numpy(test_x))
test_dl = DataLoader(test_ds, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
model = LSTM().to(device)                               
criterion = torch.nn.CrossEntropyLoss()                  
optimizer = torch.optim.Adam(model.parameters(),lr=lr) 
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']

model.eval()
predict=[]
for deta in test_dl:
    text = deta[0].to(device)
    preds = model(text)
    _, pred = torch.max(preds, 1)
    predict.append(pred.item())

In [None]:
print(checkpoint)

{'epoch': 3, 'model_state_dict': OrderedDict([('embed.weight', tensor([[ 1.2513, -1.4973,  0.1638,  ...,  1.0216, -1.3446,  0.8944],
        [-0.4079,  0.8722, -1.1470,  ...,  0.0627,  0.8336, -0.0408],
        [ 0.1251,  0.5372, -0.8063,  ..., -0.5627,  0.0426, -3.6253],
        ...,
        [-0.0241,  1.0167, -0.1347,  ..., -0.0938,  0.2886,  0.4427],
        [-0.5913,  0.4733, -0.0281,  ...,  2.1596,  1.0488, -0.5972],
        [-2.3653, -0.6908, -1.3227,  ...,  0.8520, -0.6019,  0.7295]],
       device='cuda:0')), ('lstm.weight_ih_l0', tensor([[ 0.0421,  0.1095, -0.0328,  ..., -0.0131, -0.1078, -0.0019],
        [-0.0074, -0.1113,  0.1875,  ..., -0.0100,  0.2072,  0.0844],
        [ 0.0631,  0.0612,  0.1653,  ...,  0.0642,  0.0438,  0.1161],
        ...,
        [ 0.0967, -0.1051, -0.0159,  ..., -0.0335, -0.0964,  0.1297],
        [-0.1863,  0.0467,  0.1118,  ..., -0.0115,  0.1834, -0.1249],
        [ 0.0855,  0.0170, -0.0900,  ..., -0.0465,  0.0074, -0.0789]],
       device='cuda:0

In [None]:
print(len(predict))
ans = [[i, pre] for [i,pre] in enumerate(predict)]


3400


In [None]:
import csv
h = ['index', 'emotion']
with open('predict.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(h)
    writer.writerows(ans)