# LSTM Text Classifier
## Sentiment Analysis

In [505]:
import sys
print(sys.prefix)
!which python

/Users/eunbeejang/anaconda3
/usr/local/bin/python


In [740]:
import torch
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe, Vectors
from torch.autograd import Variable
from torch.nn import functional

import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemma = WordNetLemmatizer()

from tqdm import tqdm # progress bar


In [743]:
def load_data():
    tokenize = lambda x: lemma.lemmatize(re.sub(r'<.*?>|[^\w\s]|\d+', '', x)).split()
    
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True,
                      fix_length=200, include_lengths=True, batch_first=True, dtype=torch.long)
    LABEL = data.LabelField(dtype=torch.float, sequential=False)

    train, test = datasets.IMDB.splits(TEXT, LABEL)
    
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300)) # Glove Embedding
    LABEL.build_vocab(train)
    word_emb = TEXT.vocab.vectors
    
    train, valid = train.split()
    train_data, valid_data, test_data = data.BucketIterator.splits((train, valid, test),
                                                                   batch_size=32, repeat=False, shuffle=True)

    vocab_size = len(TEXT.vocab)

    print ("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print ("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print ("Label Length: " + str(len(LABEL.vocab)))
    print ("\nSize of train set: {} \nSize of validation set: {} \nSize of test set: {}".format(len(train_data.dataset), len(valid_data.dataset), len(test_data.dataset)))
    print(LABEL.vocab.freqs.most_common(2))

    return TEXT, word_emb, train_data, valid_data, test_data, vocab_size

In [744]:
TEXT, word_emb, train_data, valid_data, test_data, vocab_size = load_data()

Loading vectors from .vector_cache/glove.6B.300d.txt.pt


[('pos', 12500), ('neg', 12500)]
Length of Text Vocabulary: 138163
Vector size of Text Vocabulary:  torch.Size([138163, 300])
Label Length: 2

Size of train set: 17500 
Size of validation set: 7500 
Size of test set: 25000


In [746]:
for a in train_data:
    b = a.text
    break
b[0].shape

torch.Size([32, 200])

In [747]:
b[0][:, :10]

tensor([[ 4.0000e+00,  2.1324e+04,  8.4260e+03,  5.6230e+03,  4.2660e+03,
          4.4170e+03,  6.1800e+03,  3.0760e+03,  5.3300e+02,  3.0000e+00],
        [ 8.6690e+03,  1.1911e+04,  7.0000e+00,  2.8000e+01,  5.0000e+00,
          1.4200e+02,  1.2160e+03,  9.3000e+01,  1.2000e+01,  2.8880e+03],
        [ 1.0000e+01,  7.0000e+00,  4.0000e+00,  4.9000e+01,  1.9000e+01,
          1.6000e+01,  5.0000e+00,  2.0000e+00,  5.7780e+03,  1.1000e+01],
        [ 1.0000e+01,  7.0000e+00,  2.8000e+01,  5.0000e+00,  1.4200e+02,
          3.8100e+02,  5.3111e+04,  6.2050e+03,  1.2000e+01,  4.0100e+02],
        [ 2.3400e+02,  1.0000e+01,  1.7000e+01,  1.3000e+01,  4.4700e+02,
          2.0000e+01,  4.0000e+00,  2.7800e+02,  6.3000e+01,  5.0000e+00],
        [ 1.1000e+01,  6.6000e+01,  1.0900e+02,  5.3600e+02,  5.0000e+00,
          6.5100e+02,  2.2350e+03,  1.4872e+04,  1.5500e+02,  2.0610e+03],
        [ 4.1000e+01,  1.1229e+05,  1.0000e+01,  1.1000e+01,  8.9000e+01,
          1.7700e+02,  6.0000e+0

In [748]:
TEXT.vocab.itos[1]

'<pad>'

In [749]:
TEXT.vocab.itos[35]

'who'

In [750]:
word_emb.size()

torch.Size([138163, 300])

In [751]:
print(TEXT.vocab.freqs.most_common(20))

[('the', 328121), ('and', 161575), ('a', 161309), ('of', 145166), ('to', 134822), ('is', 106799), ('in', 92187), ('it', 76313), ('this', 73186), ('i', 72475), ('that', 69198), ('was', 47988), ('as', 46058), ('with', 43724), ('for', 43701), ('movie', 41826), ('but', 40999), ('film', 37487), ('on', 33340), ('not', 30012)]


In [752]:
print(TEXT.vocab.itos[:20])

['<unk>', '<pad>', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'this', 'i', 'that', 'was', 'as', 'with', 'for', 'movie', 'but', 'film']


In [753]:
class LSTM(torch.nn.Module):
    def __init__(self, seq_len, emb_dim, hidden_dim, output_dim, embedding, batch_size):
        super(LSTM, self).__init__()
        
        self.seq_len = seq_len # vocabulary dim
        self.emb_dim = emb_dim # glove dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.embedding = embedding # glove embedding
        self.batch_size = batch_size
        
        # Initalize look-up table
        self.word_emb = torch.nn.Embedding(seq_len, emb_dim, padding_idx=1)
        # connect look-up table to glove embedding
        self.word_emb.weight = torch.nn.Parameter(embedding, requires_grad=False)
        
        # Layers: one LSTM, one Fully-connected
        self.lstm = nn.LSTM(emb_dim, hidden_dim)
        #self.lstm = nn.LSTM(seg_len, hidden_dim)
        self.out = nn.Linear(hidden_dim, output_dim)
        #print("testing")
        self.dropout = nn.Dropout(0.5)


        
    def forward(self, x, batch_size):
        
        # input_emb (num_seq, batch_size, emb_dim)
        #print("X: ", x.shape)
        emb_input = self.word_emb(x).permute(1, 0, 2)  
        #print("emb_input: ", emb_input.shape)
        hidden_state = Variable(torch.zeros(1, batch_size, self.hidden_dim))
        cell_state = Variable(torch.zeros(1, batch_size, self.hidden_dim))
        #print("hidden_state: ", hidden_state.shape)
        out, (hidden_out, cell_out) = self.lstm(emb_input, (hidden_state, cell_state))
        #print("out: ", out.shape)
        #print("hidden_out: ", hidden_out.shape)
        y_pred = (self.out(hidden_out[-1]))
        #print("y_pred: ", y_pred)
        return y_pred
        """
        embedded = self.dropout(self.embedding(x))
        outputs, (hidden, cell) = self.LSTM(embedded)
        return outputs, hidden
    """

In [766]:
seq_len = 200 #vocab_size
emb_dim = 300
hidden_dim = 256
output_dim = 2
embedding = word_emb
batch_size = 32

model = LSTM(seq_len, emb_dim, hidden_dim, output_dim, embedding, batch_size)
criterion = torch.nn.BCELoss(size_average=True)
#criterion = nn.CrossEntropyLoss(ignore_index=TEXT.vocab.stoi['<pad>'])
#criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(filter(lambda param: param.requires_grad, model.parameters()),lr=0.01)

In [767]:
model.parameters()

<generator object Module.parameters at 0x1a2c8617d8>

In [821]:
def train(model, data_iter, epoch_size, criterion, optimizer):
    
    model.train()
    #epoch = 0
    #for epoch in tqdm(range(epoch_size)):
    for epoch in range(epoch_size):
        loss = 0
        acc = 0
        for i, batch in enumerate(data_iter):
            #print("\n batch {}: ".format(i))
            batch_size = len(batch.text[0])
            pred = model(batch.text[0], batch_size)
            pred = torch.max(pred, 1)[1]
            #print("\tprediction: ", pred)
            #print("\ttarget: ", batch.label)
            loss = Variable(criterion(pred.float(), batch.label.float()), requires_grad = True)
            correct = ((pred.float() == batch.label).sum()).numpy()
            #correct = (pred == batch.label.type(torch.LongTensor)).sum().data
            #print("Correct: ", correct)
            #print(correct)
            acc = correct/pred.shape[0]
            #print(acc)
            optimizer.zero_grad()      
            loss.backward()
            optimizer.step()
        
            #print("This Batch {}/{}, Loss: {:.4f}, Accuracy: {:.4f}".format(epoch+1,epoch_size, loss, acc))
        print("------epoch {}/{}, Loss: {:.4f}, Accuracy: {:.4f}".format(epoch+1,epoch_size, loss, acc))



        

In [822]:
train(model, valid_data, 10, criterion, optimizer)

------epoch 1/10, Loss: 18.4207, Accuracy: 0.3333
------epoch 2/10, Loss: 18.4207, Accuracy: 0.3333
------epoch 3/10, Loss: 18.4207, Accuracy: 0.3333
------epoch 4/10, Loss: 18.4207, Accuracy: 0.3333
------epoch 5/10, Loss: 18.4207, Accuracy: 0.3333
------epoch 6/10, Loss: 18.4207, Accuracy: 0.3333


KeyboardInterrupt: 