In [29]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [30]:
def label_map(label):
    if label == "neutral":
        return 0
    elif label == "anger":
        return 1
    elif label == "joy":
        return 2
    elif label == "surprise":
        return 3
    elif label == "sadness":
        return 4
    elif label == "disgust":
        return 5
    elif label == "fear":
        return 6

s2idx = {"Chandler":0, "The Interviewer":1, "Joey":2, "Rachel":3, "Monica":4, "Phoebe":5, "Ross":6, "Jade":7, "Mona":8, "Charlie":9}
    
    
def encode(text, word2index, label, N, speaker):
    tokenized = word_tokenize(text)
    encoded = [0]*N
    enc1 = [word2index.get(word) for word in tokenized]
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    '''AttributeErrorif(speaker in s2idx):
        idx = s2idx[speaker]
    else:
        idx = 10
    encoded.insert(0,idx)'''
    return (encoded,label)

def encode_test(text, word2index, N, speaker):
    tokenized = word_tokenize(text)
    for i,word in enumerate(tokenized):
        if word2index.get(word)==None:
            tokenized[i]='unk'

    encoded = [0]*N
    enc1 = [word2index.get(word) for word in tokenized]

    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    '''if(speaker in s2idx):
        idx = s2idx[speaker]
    else:
        idx = 10
    encoded.insert(0,idx)'''
    return encoded


def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        text = batch[0].to(device)
        target = batch[1]
        target = target.type(torch.LongTensor)
        target = target.to(device)
        preds = model(text)
        loss = criterion(preds, target)
        _, pred = torch.max(preds, 1)
        acc = accuracy_score(pred.tolist(), target.tolist())

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()
    for batch in iterator:
        text = batch[0].to(device)
        target = batch[1]
        target = target.type(torch.LongTensor)
        target = target.to(device)
        preds = model(text)
        loss = criterion(preds, target)
        _, pred = torch.max(preds, 1)
        acc = accuracy_score(pred.tolist(), target.tolist())

        epoch_loss += loss.item()
        epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [31]:
train_df = pd.read_csv('train_HW2dataset.csv')
dev_df = pd.read_csv('dev_HW2dataset.csv')

train_df=train_df[['Emotion','Utterance','Speaker']]
dev_df=dev_df[['Emotion','Utterance','Speaker']]

train_set = list(train_df.to_records(index=False))
dev_set = list(dev_df.to_records(index=False))
ps = PorterStemmer()
lm = WordNetLemmatizer()
counts = Counter()

new_train_set = []
for label,text,speaker in train_set:
    #newText = [ps.stem(w) for w in text]
    newText = [lm.lemmatize(w) for w in text]
    new_train_set.append((label,text,speaker))
new_dev_set = []
for label,text,speaker in dev_set:
    #newText = [ps.stem(w) for w in text]
    newText = [lm.lemmatize(w) for w in text]
    new_dev_set.append((label,text,speaker))

for ds in [train_set, dev_set]:
    for label,text,speaker in ds:
        counts.update(word_tokenize(text))

word2index = {'unk':0}
for i,word in enumerate(counts.keys()):
    word2index[word] = i+1
index2word = {v:k for k,v in word2index.items()}

train_encoded = [(encode(Utterance,word2index,label_map(label),12,s)) for label, Utterance, s in train_set]
dev_encoded   = [(encode(Utterance,word2index,label_map(label),12,s)) for label, Utterance,s in dev_set]

train_x = np.array([tweet for tweet, label in train_encoded])
train_y = np.array([label for tweet, label in train_encoded])
dev_x = np.array([tweet for tweet, label in dev_encoded])
dev_y = np.array([label for tweet, label in dev_encoded])

'''batch_size = 32

train_ds = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
dev_ds = TensorDataset(torch.from_numpy(dev_x), torch.from_numpy(dev_y))

train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size, drop_last=True)
dev_dl = DataLoader(dev_ds, shuffle=True, batch_size=batch_size, drop_last=True)'''


KeyboardInterrupt: 

In [None]:
total_x = np.append(train_x, dev_x, axis=0)
total_y = np.append(train_y, dev_y, axis=0)

In [None]:
src_vocab_size = len(word2index)
dimension_model = 300
num_layers = 5
hidden_size = 100
linear_hidden_size = 30
classes = 7
dropout = 0.2        
lr = 1e-3

In [None]:
'''vocab,embeddings = [],[]
with open('glove.6B.300d.txt','rt',encoding="utf-8") as fi:
    full_content = fi.read().strip().split('\n')
for i in range(len(full_content)):
    i_word = full_content[i].split(' ')[0]
    i_embeddings = [float(val) for val in full_content[i].split(' ')[1:]]
    vocab.append(i_word)
    embeddings.append(i_embeddings)
vocab_npa = np.array(vocab)
embs_npa = np.array(embeddings)
#insert '<pad>' and '<unk>' tokens at start of vocab_npa.
vocab_npa = np.insert(vocab_npa, 0, '<pad>')
vocab_npa = np.insert(vocab_npa, 1, '<unk>')
print(vocab_npa[:10])

pad_emb_npa = np.zeros((1,embs_npa.shape[1]))   #embedding for '<pad>' token.
unk_emb_npa = np.mean(embs_npa,axis=0,keepdims=True)    #embedding for '<unk>' token.

#insert embeddings for pad and unk tokens at top of embs_npa.
embs_npa = np.vstack((pad_emb_npa,unk_emb_npa,embs_npa))
with open('vocab_npa.npy','wb') as f:
    np.save(f,vocab_npa)

with open('embs_npa.npy','wb') as f:
    np.save(f,embs_npa)'''

'vocab,embeddings = [],[]\nwith open(\'glove.6B.300d.txt\',\'rt\',encoding="utf-8") as fi:\n    full_content = fi.read().strip().split(\'\n\')\nfor i in range(len(full_content)):\n    i_word = full_content[i].split(\' \')[0]\n    i_embeddings = [float(val) for val in full_content[i].split(\' \')[1:]]\n    vocab.append(i_word)\n    embeddings.append(i_embeddings)\nvocab_npa = np.array(vocab)\nembs_npa = np.array(embeddings)\n#insert \'<pad>\' and \'<unk>\' tokens at start of vocab_npa.\nvocab_npa = np.insert(vocab_npa, 0, \'<pad>\')\nvocab_npa = np.insert(vocab_npa, 1, \'<unk>\')\nprint(vocab_npa[:10])\n\npad_emb_npa = np.zeros((1,embs_npa.shape[1]))   #embedding for \'<pad>\' token.\nunk_emb_npa = np.mean(embs_npa,axis=0,keepdims=True)    #embedding for \'<unk>\' token.\n\n#insert embeddings for pad and unk tokens at top of embs_npa.\nembs_npa = np.vstack((pad_emb_npa,unk_emb_npa,embs_npa))\nwith open(\'vocab_npa.npy\',\'wb\') as f:\n    np.save(f,vocab_npa)\n\nwith open(\'embs_npa.npy

In [None]:
class LSTM(torch.nn.Module):
    def __init__(self):
        super(LSTM, self).__init__()
        #self.embed = torch.nn.Embedding.from_pretrained(torch.from_numpy(embs_npa).float())
        self.embed = torch.nn.Embedding(src_vocab_size, dimension_model)                  
        self.lstm = torch.nn.LSTM(input_size=dimension_model, hidden_size=hidden_size,num_layers=num_layers,dropout=dropout)
        self.linear = torch.nn.Linear(hidden_size, classes)
        #self.linear1 = torch.nn.Linear(linear_hidden_size, classes)
    def forward(self,data):
        x = self.embed(data)                          
        x,(h_n, c_n) = self.lstm(x.transpose(0, 1))  
                                                         
                                                    
        x = self.linear(x[-1])                          
        #x = self.linear1(x)                            
        return x

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
k=5
def cv_idx():
    idxarr = np.arange(len(total_y))
    np.random.shuffle(idxarr)
    Q = int( len(total_y) / k )
    #print(Q)
    rem = len(total_y) % k
    
    ret = [[] for _ in range(k)]
    foldsize = np.zeros(k)
    for i in range(k):
        if k < rem:
            foldsize[i] = Q + 1
        else:
            foldsize[i] = Q
    for i in range(k):
        train = []
        val = []
        start = 0
        for j in range(k): 
            if j == i:
                for s in range(int(foldsize[j])):
                    val.append(idxarr[start])
                    start += 1
            else:
                for s in range(int(foldsize[j])):
                    train.append(idxarr[start])
                    start += 1
        ret[i].append(np.array(train))
        ret[i].append(np.array(val))
    return ret

kfold_data = cv_idx()

def cross_val(i):
    batch_size = 32
    train_x = total_x[kfold_data[i][0]]
    train_y = total_y[kfold_data[i][0]]
    dev_x = total_x[kfold_data[i][1]]
    dev_y = total_y[kfold_data[i][1]]
    train_ds = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
    dev_ds = TensorDataset(torch.from_numpy(dev_x), torch.from_numpy(dev_y))

    train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size, drop_last=True)
    dev_dl = DataLoader(dev_ds, shuffle=True, batch_size=batch_size, drop_last=True)
    return train_dl, dev_dl

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
total_v_acc = 0
best_acc = 0

for i in range(k):
    model = LSTM().to(device)                           
    criterion = torch.nn.CrossEntropyLoss()                  
    optimizer = torch.optim.Adam(model.parameters(),lr=lr) 

    train_dl, dev_dl = cross_val(i)
    
    
    
    for epoch in range(10):
        train_loss, train_acc = train(model, train_dl,
                                        optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, dev_dl,
                                            criterion)
        

        print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc * 100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc * 100:.2f}%')    
        if best_acc <= valid_acc:
            best_acc = valid_acc
            PATH=f"epoch{epoch+1}_{i}_val.accuracy{valid_acc:.3f}%.pt"
            torch.save({
                    'epoch': epoch+1,
                    'i': i,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': valid_loss,
                    }, PATH)
    total_v_acc += best_acc
total_v_acc /= k

Epoch: 01, Train Loss: 1.442, Train Acc: 49.92%, Val. Loss: 1.354, Val. Acc: 52.35%
Epoch: 02, Train Loss: 1.296, Train Acc: 55.24%, Val. Loss: 1.287, Val. Acc: 55.16%


KeyboardInterrupt: 

In [None]:
print(total_v_acc)

0.5748595505617978


In [None]:
test_df = pd.read_csv('test_HW2dataset.csv')
test_df=test_df[['Utterance','Speaker']]
#test_set = test_df.values.tolist()
test_set = list(test_df.to_records(index=False))
#print(test_set)
test_encoded=[]
test_encoded+=[encode_test(Utterance, word2index, 10, s) for Utterance,s in test_set]
test_x = np.array(test_encoded)
test_ds = TensorDataset(torch.from_numpy(test_x))
test_dl = DataLoader(test_ds, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
model = LSTM().to(device)                               
criterion = torch.nn.CrossEntropyLoss()                  
optimizer = torch.optim.Adam(model.parameters(),lr=lr) 
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']

model.eval()
predict=[]
for deta in test_dl:
    text = deta[0].to(device)
    preds = model(text)
    _, pred = torch.max(preds, 1)
    predict.append(pred.item())

In [None]:
print(len(predict))
ans = [[i, pre] for [i,pre] in enumerate(predict)]


3400


In [None]:
import csv
h = ['index', 'emotion']
with open('predict.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(h)
    writer.writerows(ans)