# LSTM Text Classifier
## Sentiment Analysis

In [6]:
import sys
print(sys.prefix)
!which python

/Users/eunbeejang/anaconda3
/usr/local/bin/python


In [8]:
import torch
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe, Vectors
from torch.autograd import Variable
from torch.nn import functional
import copy
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemma = WordNetLemmatizer()

from tqdm import tqdm # progress bar


In [84]:
def load_data():
    tokenize = lambda x: lemma.lemmatize(re.sub(r'<.*?>|[^\w\s]|\d+', '', x)).split()
    
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True,
                       include_lengths=True, batch_first=True, dtype=torch.long) #fix_length=200,
    LABEL = data.LabelField(dtype=torch.float, sequential=False)

    train, test = datasets.IMDB.splits(TEXT, LABEL)
    
    TEXT.build_vocab(train, max_size=25000, vectors=GloVe(name='6B', dim=300)) # Glove Embedding
    LABEL.build_vocab(train)
    word_emb = TEXT.vocab.vectors
    
    train, valid = train.split()
    train_data, valid_data, test_data = data.BucketIterator.splits((train, valid, test),
                                                                   batch_size=64, repeat=False, shuffle=True)

    vocab_size = len(TEXT.vocab)

    print ("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print ("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print ("Label Length: " + str(len(LABEL.vocab)))
    print ("\nSize of train set: {} \nSize of validation set: {} \nSize of test set: {}".format(len(train_data.dataset), len(valid_data.dataset), len(test_data.dataset)))
    print(LABEL.vocab.freqs.most_common(2))

    return TEXT, word_emb, train_data, valid_data, test_data, vocab_size

In [85]:
TEXT, word_emb, train_data, valid_data, test_data, vocab_size = load_data()

Length of Text Vocabulary: 25002
Vector size of Text Vocabulary:  torch.Size([25002, 300])
Label Length: 2

Size of train set: 17500 
Size of validation set: 7500 
Size of test set: 25000
[('pos', 12500), ('neg', 12500)]


In [86]:
for a in train_data:
    b = a.text
    break
b[0].shape

torch.Size([64, 823])

In [87]:
b[0][:, :10]

tensor([[    10,     17,     90,     69,     51,    664,     29,   1127,
             21,      6],
        [    10,   3405,   3544,   2758,     19,     43,   5118,    394,
              4,    493],
        [    11,   1016,     10,     19,    527,     20,      4,   1708,
            309,   2653],
        [    11,    208,     10,     17,     20,    746,   1098,    729,
              9,   2063],
        [   796,   4160,    283,   1795,      0,   8463,      4,   3052,
             16,      4],
        [     2,     19,   2143,   2557,     20,      4,   9969,    651,
           1807,     14],
        [   976,      0,     11,    976,  18726,      2,   7157,      3,
              2,   1798],
        [    10,     19,      7,    464,      3,     11,     57,    365,
              6,    508],
        [    11,    208,     10,   1640,     19,     52,      9,     13,
              8,   9446],
        [  6025,      5,      2,    444,   3080,     64,   8255,     45,
             22,    208],
        [ 

In [88]:
TEXT.vocab.itos[1]

'<pad>'

In [89]:
TEXT.vocab.itos[35]

'who'

In [90]:
word_emb.size()

torch.Size([25002, 300])

In [91]:
print(TEXT.vocab.freqs.most_common(20))

[('the', 328121), ('and', 161575), ('a', 161309), ('of', 145166), ('to', 134822), ('is', 106799), ('in', 92187), ('it', 76313), ('this', 73186), ('i', 72475), ('that', 69198), ('was', 47988), ('as', 46058), ('with', 43724), ('for', 43701), ('movie', 41826), ('but', 40999), ('film', 37487), ('on', 33340), ('not', 30012)]


In [92]:
print(TEXT.vocab.itos[:20])

['<unk>', '<pad>', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'this', 'i', 'that', 'was', 'as', 'with', 'for', 'movie', 'but', 'film']


In [154]:
class LSTM(torch.nn.Module):
    def __init__(self, seq_len, emb_dim, hidden_dim, output_dim, embedding, batch_size):
        super(LSTM, self).__init__()
        self.seq_len = seq_len # vocabulary dim
        self.emb_dim = emb_dim # glove dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.embedding = embedding # glove embedding
        self.batch_size = batch_size
        
        # Initalize look-up table
        self.word_emb = torch.nn.Embedding(seq_len, emb_dim, padding_idx=1)
        # connect look-up table to glove embedding
        self.word_emb.weight = torch.nn.Parameter(embedding, requires_grad=False)
        
        # Layers: one LSTM, one Fully-connected
        self.lstm = torch.nn.LSTM(emb_dim, hidden_dim, dropout=0.5)
        #self.lstm = nn.LSTM(seg_len, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)
        #print("testing")
        self.dropout = torch.nn.Dropout(0.5)
        #self.init_weights()


        
    def forward(self, x, batch_size):
        
        # input_emb (num_seq, batch_size, emb_dim)
        #print("X: ", x.shape)
        emb_input = self.word_emb(x).permute(1, 0, 2)  
        #print("emb_input: ", emb_input.shape)
        hidden_state = (torch.zeros(1, batch_size, self.hidden_dim))
        cell_state = (torch.zeros(1, batch_size, self.hidden_dim))
        #print("hidden_state: ", hidden_state.shape)
        output, (hidden_out, cell_out) = self.lstm(emb_input, (hidden_state, cell_state))
        #print("out: ", out.shape)
        #print("hidden_out: ", hidden_out.shape)
        output = self.dropout(output)
        y_pred = (self.fc(hidden_out[-1]))
        #print("y_pred: ", y_pred)
        return y_pred
        """
        embedded = self.dropout(self.embedding(x))
        outputs, (hidden, cell) = self.LSTM(embedded)
        return outputs, hidden
    """
                

In [155]:
seq_len = 200 #vocab_size
emb_dim = 300
hidden_dim = 256
output_dim = 2
embedding = word_emb
batch_size = 64

model = LSTM(seq_len, emb_dim, hidden_dim, output_dim, embedding, batch_size)
init_lstm_weights = copy.deepcopy(model.state_dict())
#criterion = torch.nn.BCEWithLogitsLoss()
criterion = torch.nn.BCELoss(size_average=True)
#criterion = torch.nn.CrossEntropyLoss(ignore_index=TEXT.vocab.stoi['<pad>'])
optimizer = torch.optim.SGD(filter(lambda param: param.requires_grad, model.parameters()), lr=0.01)
#optimizer = torch.optim.Adam(filter(lambda param: param.requires_grad, model.parameters()) ) #,lr=0.1

  "num_layers={}".format(dropout, num_layers))


In [156]:
model.parameters()

<generator object Module.parameters at 0x1a306c52b0>

In [162]:
def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)
        
def train(model, data_iter, epoch_size, criterion, optimizer):
    
    model.train()
    total_loss = []
    total_acc = []
    #for epoch in tqdm(range(epoch_size)):
    for epoch in range(epoch_size):
        epoch_loss = []
        epoch_acc = []
        for i, batch in enumerate(data_iter):
            optimizer.zero_grad() 
            loss = 0
            correct = 0
            #print("\n batch {}: ".format(i))
            batch_size = len(batch.text[0])
            pred = model(batch.text[0], batch_size)
            #pred = torch.max(pred, 1)[1].view(-1)
            #print("\tprediction: ", pred)
            pred = torch.max(pred, 1)[1]
            #print("\tprediction: ", pred)
            #print("\ttarget: ", batch.label.view(-1))
            loss = Variable(criterion(pred.float(), batch.label.float()), requires_grad = True)
            epoch_loss.append(loss)
            #loss = (criterion(pred.float(), batch.label.float()))

            correct = ((pred.float() == batch.label).sum()).numpy()
            #correct = (pred == batch.label.type(torch.LongTensor)).sum().data
            #print("Correct: ", correct)
            #print(correct)
            acc = correct/pred.shape[0]
            epoch_acc.append(acc)
            #print(acc)
     
            loss.backward()
            clip_gradient(model, 0.25)
            optimizer.step()

            #print("------Batch {}/{}, Batch Loss: {:.4f}, Accuracy: {:.4f}".format(i+1,len(data_iter), loss, acc))
        total_loss.append((sum(epoch_loss)/len(data_iter)))
        total_acc.append((sum(total_acc)/len(data_iter)))
        print("****** Epoch {} Loss: {}, Epoch {} Acc: {}".format(epoch, (sum(epoch_loss)/len(data_iter)),
                                                                  epoch, (sum(epoch_acc)/len(data_iter))))          
    return total_loss, total_acc


        

In [163]:
train_loss, train_acc = train(model, train_data, 20, criterion, optimizer)

****** Epoch 0 Loss: 13.894750595092773, Epoch 0 Acc: 0.49713242961418147
****** Epoch 1 Loss: 13.893407821655273, Epoch 1 Acc: 0.4971813086548487
****** Epoch 2 Loss: 13.901503562927246, Epoch 2 Acc: 0.49688803441084467
****** Epoch 3 Loss: 13.87471866607666, Epoch 3 Acc: 0.49785746871741404
****** Epoch 4 Loss: 13.901054382324219, Epoch 4 Acc: 0.49690432742440044
****** Epoch 5 Loss: 13.88597297668457, Epoch 5 Acc: 0.4974501433785193
****** Epoch 6 Loss: 13.887322425842285, Epoch 6 Acc: 0.497401264337852


KeyboardInterrupt: 

In [138]:
"""
Hyperparameter optimization
a) learning rates
b) different numbers of hidden layers
c) different sizes of hidden layers
""" 

'\nHyperparameter optimization\na) learning rates\nb) different numbers of hidden layers\nc) different sizes of hidden layers\n'