# LSTM Text Classifier
## Sentiment Analysis

In [19]:
import sys
print(sys.prefix)
!which python

/Users/eunbeejang/anaconda3
/usr/local/bin/python


In [75]:
import torch
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe, Vectors
from torch.autograd import Variable
from torch.nn import functional

import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemma = WordNetLemmatizer()


In [110]:
def load_data():
    tokenize = lambda x: lemma.lemmatize(re.sub(r'<.*?>|[^\w\s]|\d+', '', x)).split()
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True,
                      fix_length=200, include_lengths=True, batch_first=True)
    LABEL = data.LabelField(dtype=torch.float, sequential=False)

    train, test = datasets.IMDB.splits(TEXT, LABEL)
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train)
    word_emb = TEXT.vocab.vectors
    print(LABEL.vocab.freqs.most_common(2))
    print ("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print ("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print ("Label Length: " + str(len(LABEL.vocab)))
    train, valid = train.split()
    train_data, valid_data, test_data = data.BucketIterator.splits((train, valid, test),
                                                                   batch_size=32, repeat=False, shuffle=True)

    vocab_size = len(TEXT.vocab)

    return TEXT, word_emb, train_data, valid_data, test_data, vocab_size

In [111]:
TEXT, word_emb, train_data, valid_data, test_data, vocab_size = load_data()

Loading vectors from .vector_cache/glove.6B.300d.txt.pt


[('pos', 12500), ('neg', 12500)]
Length of Text Vocabulary: 138163
Vector size of Text Vocabulary:  torch.Size([138163, 300])
Label Length: 2


In [112]:
word_emb.size()

torch.Size([138163, 300])

In [113]:
print(TEXT.vocab.freqs.most_common(3))

[('the', 328121), ('and', 161575), ('a', 161309)]


In [114]:
print(TEXT.vocab.freqs.most_common(20))

[('the', 328121), ('and', 161575), ('a', 161309), ('of', 145166), ('to', 134822), ('is', 106799), ('in', 92187), ('it', 76313), ('this', 73186), ('i', 72475), ('that', 69198), ('was', 47988), ('as', 46058), ('with', 43724), ('for', 43701), ('movie', 41826), ('but', 40999), ('film', 37487), ('on', 33340), ('not', 30012)]


In [115]:
print(TEXT.vocab.itos[:20])

['<unk>', '<pad>', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'this', 'i', 'that', 'was', 'as', 'with', 'for', 'movie', 'but', 'film']


In [286]:
class LSTM(torch.nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, output_dim, embedding, batch_size):
        super(LSTM, self).__init__()
        
        self.input_dim = input_dim # vocabulary dim
        self.emb_dim = emb_dim # glove dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.embedding = embedding # glove embedding
        self.batch_size = batch_size
        
        # Initalize look-up table
        self.word_emb = torch.nn.Embedding(input_dim, emb_dim)
        # connect look-up table to glove embedding
        self.word_emb.weight = torch.nn.Parameter(embedding, requires_grad=False)
        
        # Layers: one LSTM, one Fully-connected
        self.lstm = nn.LSTM(emb_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        #print("testing")
        
    def forward(self, x):
        # input_emb (num_seq, batch_size, emb_dim)
        emb_input = self.word_emb(x).permute(1, 0, 2)
        #print(emb_input.shape)
        hidden_state = Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        cell_state = Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        _, (hidden_out, cell_out) = self.lstm(emb_input, (hidden_state, cell_state))
        #print("hidden shape: ", hidden_out.shape)
        y_pred = functional.sigmoid(self.fc(hidden_out[-1]))
        #print("y_pred shape: ", y_pred.shape)

        return y_pred # (batch_size, output_dim)

In [287]:
input_dim = vocab_size
emb_dim = 300
hidden_dim = 256
output_dim = 2
embedding = word_emb
batch_size = 32

model = LSTM(input_dim, emb_dim, hidden_dim, output_dim, embedding, batch_size)
criterion = torch.nn.BCELoss(size_average=True)
optimizer = torch.optim.Adam(filter(lambda param: param.requires_grad, model.parameters()),lr=0.01)

In [288]:
model.parameters()

<generator object Module.parameters at 0x1a7dd01830>

In [307]:
def train(model, data_iter, epoch_size, criterion, optimizer):
    
    model.train()
    #epoch = 0
    for epoch in range(epoch_size):
        loss = 0
        acc = 0
        for i, batch in enumerate(data_iter):
            #print(i, batch.text[0])
            pred = model(batch.text[0]).squeeze(1)
            pred = torch.max(pred, 1)[1].type(torch.FloatTensor)
            #print(pred.type)
            #print(batch.label.type)
            loss = Variable(criterion(pred, batch.label.type(torch.FloatTensor)), requires_grad = True)
            correct = (pred == batch.label.type(torch.FloatTensor)).float().sum()
            acc = correct/pred.shape[0]
            optimizer.zero_grad()      
            loss.backward()
            optimizer.step()
        
            #print("This Batch {}/{}, Loss: {:.4f}, Accuracy: {:.4f}".format(epoch+1,epoch_size, loss, acc))
        print("------epoch {}/{}, Loss: {:.4f}, Accuracy: {:.4f}".format(epoch+1,epoch_size, loss, acc))


        

In [309]:
train(model, train_data, 10, criterion, optimizer)

RuntimeError: Expected hidden[0] size (1, 28, 256), got (1, 32, 256)

In [None]:
def eval()