In [1]:
import torch.nn as nn
import numpy as np
from torch.autograd import Variable
import torch
from autocorrect import spell
import itertools
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
from IPython.display import clear_output

In [2]:
import csv

trainData = []
trainValue = []
with open('data/train.txt') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        try:
            text,sentiment = row[0].split("\t")
            trainData.append(text)
            trainValue.append(sentiment)
        except(ValueError):
            continue
        
print(len(trainData))
print(len(trainValue))

680
680


In [3]:
import gensim
word_to_ix = gensim.models.KeyedVectors.load_word2vec_format('data/fasttext/wiki.en.vec')

In [4]:
def prepare_wordvec(seq):
    word_vector=[]
    for w in seq:
        if w in word_to_ix:
            word_vector.append(word_to_ix[w])
        else:
            w = spell(w)  #spelling errors
            if w in word_to_ix:
                word_vector.append(word_to_ix[w])
            else:
                word_vector.append(word_to_ix["none"]) #if the word cant be figured out
    c = len(word_vector)
    word_vector = np.array(word_vector)
    word_vector = word_vector.reshape(c,300)
    vec = Variable(torch.from_numpy(word_vector))
    return vec

In [5]:
class LSTMClassifier(nn.Module):
    def __init__(self):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(300, 128, 3, dropout=0.2, batch_first=True)
        self.fcn1 = nn.Linear(128, 64)
        self.fcn2 = nn.Linear(64, 32)
        self.fcn3 = nn.Linear(32, 2)
        self.dropout = nn.Dropout(p=0.25)
        
    def init_hidden(self, batch_size):
        return (Variable(torch.zeros(3,batch_size,128)), Variable(torch.zeros(3,batch_size,128)))
    
    def forward(self, x, batch_size):
        self.hidden = self.init_hidden(batch_size)
        output, hn = self.lstm(x, self.hidden)
        output, seq_index = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        new_tensor = Variable(torch.zeros(len(seq_index), 128))

        for idx, batch_element in enumerate(output):
            col_index = seq_index[idx] - 1
            col_element = batch_element[col_index]
            new_tensor[idx] = col_element
        
        output = self.fcn1(new_tensor)
        output = self.dropout(output)
        output = self.fcn2(output)
        output = self.dropout(output)
        output = self.fcn3(output)
        output = F.log_softmax(output, dim=1)
        return output

In [6]:
EMBEDDING_DIM = 300
HIDDEN_DIM = 32

In [7]:
import unicodedata
import re
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"won't", "will not", s)
    s = re.sub(r"can\'t", "can not", s)

    # general
    s = re.sub(r"n\'t", " not", s)
    s = re.sub(r"\'re", " are", s)
    s = re.sub(r"\'s", " is", s)
    s = re.sub(r"\'d", " would", s)
    s = re.sub(r"\'ll", " will", s)
    s = re.sub(r"\'t", " not", s)
    s = re.sub(r"\'ve", " have", s)
    s = re.sub(r"\'m", " am", s)
    s = re.sub(r"([.!?])", r" ", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

In [8]:
training_data = []
for sentence in trainData:
    s = normalizeString(sentence)
    length = len(s.split())
    inputs = prepare_wordvec(s.split())
    test_utterance = inputs.view(1, length, 300)
    training_data.append(test_utterance)

In [9]:
tag_to_ix = {"BAD": 0, "GOOD": 1}
def prepare_sequence(ix):
    if ix =="0":
        idxs = [0]
    else:
        idxs = [1]
    return torch.tensor(idxs, dtype=torch.long)
training_result = []
for value in trainValue:
    x = prepare_sequence(value)
    training_result.append(x)
    

In [10]:
model = LSTMClassifier()
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [11]:
for epoch in range(95):  
    if(epoch == 0):
        print("#epoch: 0")
    else:
        print("#epoch: "+str(epoch)+"/95   accuraccy%: "+str(correct*100/len(training_data)))
    correct = 0
    for idx,sentence in enumerate(training_data):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()
        k = nn.utils.rnn.pack_padded_sequence(sentence, [len(sentence[0])], batch_first=True)
        # Step 3. Run our forward pass.
        tag_scores = model(k,1)
        if(torch.Tensor.item(tag_scores.argmax())==int(trainValue[idx])):
            correct+= 1
        else:
            print(trainData[idx]+" "+trainValue[idx])
            print(tag_scores)
        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, training_result[idx])
        loss.backward()
        optimizer.step()
    clear_output()


#epoch: 90/300   accuraccy%: 100.0


KeyboardInterrupt: 

In [24]:
test = "Good food, good wine. perfect!"
test = normalizeString(test)
length = len(test.split())
test = prepare_wordvec(test.split())
test = test.view(1, length, 300)
k = nn.utils.rnn.pack_padded_sequence(test, [length], batch_first=True)
tag_scores = model(k,1)
print(torch.exp(tag_scores))
print(torch.Tensor.item(tag_scores.argmax()))

tensor([[ 3.0664e-16,  1.0000e+00]])
1


torch.save(model.state_dict(), "./sentiment.pt")

testData = []
with open('data/test.txt') as csvfile:
    readCSV = csv.reader(csvfile)
    for row in readCSV:
        s = normalizeString(row[0])
        length = len(s.split())
        inputs = prepare_wordvec(s.split())
        test_utterance = inputs.view(1, length, 300)
        testData.append(test_utterance)
        
for idx,sentence in enumerate(testData):
        k = nn.utils.rnn.pack_padded_sequence(sentence, [len(sentence[0])], batch_first=True)
        # Step 3. Run our forward pass.
        tag_scores = model(k,1)
        print(idx+1,torch.exp(tag_scores))
        