# LSTM Text Classifier
## Sentiment Analysis

In [6]:
import sys
print(sys.prefix)
!which python

/Users/eunbeejang/anaconda3
/usr/local/bin/python


In [4]:
import torch
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe, Vectors
from torch.autograd import Variable
from torch.nn import functional
import copy
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemma = WordNetLemmatizer()

from tqdm import tqdm # progress bar


In [437]:
def load_data():
    tokenize = lambda x: lemma.lemmatize(re.sub(r'<.*?>|[^\w\s]|\d+', '', x)).split()
    
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True,
                       include_lengths=True, batch_first=True, dtype=torch.long) #fix_length=200,
    LABEL = data.LabelField(batch_first=True, sequential=False)

    train, test = datasets.IMDB.splits(TEXT, LABEL)
    
    TEXT.build_vocab(train, max_size=25000, vectors=GloVe(name='6B', dim=300)) # Glove Embedding
    LABEL.build_vocab(train)
    word_emb = TEXT.vocab.vectors
    
    train, valid = train.split()
    train_data, valid_data, test_data = data.BucketIterator.splits((train, valid, test),
                                                                   batch_size=64, repeat=False, shuffle=True)

    vocab_size = len(TEXT.vocab)

    print ("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print ("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print ("Label Length: " + str(len(LABEL.vocab)))
    print ("\nSize of train set: {} \nSize of validation set: {} \nSize of test set: {}".format(len(train_data.dataset), len(valid_data.dataset), len(test_data.dataset)))
    print(LABEL.vocab.freqs.most_common(2))

    return TEXT, word_emb, train_data, valid_data, test_data, vocab_size

In [438]:
TEXT, word_emb, train_data, valid_data, test_data, vocab_size = load_data()

Length of Text Vocabulary: 25002
Vector size of Text Vocabulary:  torch.Size([25002, 300])
Label Length: 2

Size of train set: 17500 
Size of validation set: 7500 
Size of test set: 25000
[('pos', 12500), ('neg', 12500)]


In [439]:
for a in train_data:
    b = a.text
    break
b[0].shape

torch.Size([64, 994])

In [440]:
b[0][:, :10]

tensor([[    16,      2,   2274,    230,     40,     38,     11,     13,
            146,      9],
        [     2,    231,    116,      4,    169,      5,    432,   1570,
             16,      4],
        [    33,   6848,  22956,  14019,    251,     32,      4,    176,
              3,    805],
        [  1250,  22470,    108,   1810,    647,      6,      2,    245,
             36,      2],
        [   484,    130,     85,    476,   6536,     66,   2762,     14,
             32,     10],
        [     2,    426,     15,      2,      0,      7,   7548,     55,
             32,      2],
        [    10,     17,     13,      4,    517,   1096,      9,     13,
            196,   4465],
        [  1760,   5437,      7,      4,    293,     60,   3360,   1253,
           6528,   1980],
        [   208,     10,     17,     52,      9,    364,     44,      8,
            304,      4],
        [   386,     44,      5,      4,    618,      3,     12,      7,
            106,   2087],
        [ 

In [441]:
TEXT.vocab.itos[1]

'<pad>'

In [444]:
print(TEXT.vocab.freqs.most_common(20))

[('the', 328121), ('and', 161575), ('a', 161309), ('of', 145166), ('to', 134822), ('is', 106799), ('in', 92187), ('it', 76313), ('this', 73186), ('i', 72475), ('that', 69198), ('was', 47988), ('as', 46058), ('with', 43724), ('for', 43701), ('movie', 41826), ('but', 40999), ('film', 37487), ('on', 33340), ('not', 30012)]


In [445]:
print(TEXT.vocab.itos[:20])

['<unk>', '<pad>', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'this', 'i', 'that', 'was', 'as', 'with', 'for', 'movie', 'but', 'film']


In [534]:
class LSTM(torch.nn.Module):
    def __init__(self, seq_len, emb_dim, hidden_dim, output_dim, embedding, batch_size, num_layers=1, dropout=0.2, bidirectional=False):
        super(LSTM, self).__init__()
        self.seq_len = seq_len 
        self.emb_dim = emb_dim # glove dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.embedding = embedding # glove embedding
        self.batch_size = batch_size
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        
        # Initalize look-up table and assign weight
        self.word_emb = torch.nn.Embedding(25002, emb_dim)
        #self.word_emb.weight = torch.nn.Parameter(embedding)
        # Layers: one LSTM, one Fully-connected
        self.lstm = torch.nn.LSTM(emb_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)
        self.dropout = torch.nn.Dropout(dropout)
        
    
    def forward(self, x, batch):
        x = self.word_emb(x).permute(1, 0, 2)
        h_0 = self._init_state(batch_size=batch)
        #print("h_0 = ", h_0)
        out, (hidden_out, cell_out) = self.lstm(x, h_0)
        #print("h_t = ", hidden_out)

        self.dropout(hidden_out)
        y_pred = self.fc(hidden_out[-1])
        return y_pred

    
    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return (
            weight.new(self.num_layers, batch_size, self.hidden_dim).zero_(),
            weight.new(self.num_layers, batch_size, self.hidden_dim).zero_()
        )


In [535]:
seq_len = 200
emb_dim = 300
hidden_dim = 256
output_dim = 2
embedding = word_emb
lr = 0.001
max_grad_norm = 5


model = LSTM(seq_len, emb_dim, hidden_dim, output_dim, embedding, batch_size)
print(model)
optimizer = torch.optim.Adam(filter(lambda param: param.requires_grad, model.parameters()), lr=lr ) #,lr=0.1

LSTM(
  (word_emb): Embedding(25002, 300)
  (lstm): LSTM(300, 256)
  (fc): Linear(in_features=256, out_features=2, bias=True)
  (dropout): Dropout(p=0.2)
)


In [None]:
"""
def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)
        
"""

def train(model, data_iter, epoch_size, optimizer):
    #init_lstm_weights = copy.deepcopy(model.state_dict())
    model.train()
    total_loss = []
    total_acc = []
#    for epoch in tqdm(range(epoch_size)):
    for epoch in range(epoch_size):
        epoch_loss = []
        epoch_acc = []
        
        for i, batch in enumerate(data_iter):
            batch_size = len(batch.text[0])
            optimizer.zero_grad() 

            #loss = 0
            #correct = 0
            pred = model(batch.text[0],batch_size)
            loss = functional.cross_entropy(pred, batch.label, size_average=False)            
            correct = ((torch.max(pred, 1)[1] == batch.label)).sum().numpy()
            acc = correct/pred.shape[0]
            
            epoch_loss.append(loss.item())
            epoch_acc.append(acc)
     
            loss.backward() # calculate the gradient
        
            #clip_gradient(model, 0.25) # limit the norm
            # Clip to the gradient to avoid exploding gradient.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            
            optimizer.step() # update param

            print("------TRAINBatch {}/{}, Batch Loss: {:.4f}, Accuracy: {:.4f}".format(i+1,len(data_iter), loss, acc))
        
        total_loss.append((sum(epoch_loss)/len(data_iter)))
        total_acc.append((sum(total_acc)/len(data_iter)))
        print("****** Epoch {} Loss: {}, Epoch {} Acc: {}".format(epoch, (sum(epoch_loss)/len(data_iter)),
                                                                  epoch, (sum(epoch_acc)/len(data_iter))))          
    return total_loss, total_acc

In [None]:
def evaluate(model, val_iter):
    total_loss = []
    total_acc = []
    model.eval()

    for i, batch in enumerate(val_iter):
        batch_size = len(batch.text[0])
        pred = model(batch.text[0],batch_size)
        loss = functional.cross_entropy(pred, batch.label, size_average=False)            
        correct = ((torch.max(pred, 1)[1] == batch.label)).sum().numpy()
        acc = correct/pred.shape[0]
        total_loss.append(loss.item())
        total_acc.append(acc)
        print("++++++EVAL Batch {}/{}, Batch Loss: {:.4f}, Accuracy: {:.4f}".format(i+1,len(val_iter), loss, acc))
    print("Average EVAL Loss: ", (sum(total_loss) / len(val_iter))) 
    print("Average EVAL Acc: ", (sum(total_acc) / len(val_iter))) 
    return avg_total_loss, total_loss, total_acc

In [None]:
def hyperpram_tune(hidden_dim_lst = [64,128,256], lr_lst = [0.1,0.01,0.001], max_grad_norm_lst = [3,4,5], epoch_lst = [10,20,30,40,50]):
    best_valid_loss = 0
    best_model = model
    for dim in hidden_dim_lst:
        for rate in lr_lst:
            for norm in max_grad_norm_lst:
                for epoch in epoch_lst:
                    lr = rate
                    max_grad_norm = norm
                    print("&&&& hidden_dim {}, lr {}, max_grad_norm {}, epoch {}".format(dim,rate,norm,epoch))
                    this_model = LSTM(seq_len, emb_dim, dim, output_dim, embedding, batch_size)
                    _, _ = train(model, train_data, epoch , optimizer)  
                    avg_valid_loss, _, _ = evaluate(model, valid_data)
                    if avg_valid_loss > best_valid_loss:
                        best_model = this_model
    return best_model
                    

In [551]:
#train_loss, train_acc = train(model, train_data, 20, optimizer)

In [552]:
#valid_total_loss, valid_total_acc = evaluate(model, valid_data)

In [553]:
best_model = hyperpram_tune()

------Batch 1/274, Batch Loss: 44.4360, Accuracy: 0.4531
------Batch 2/274, Batch Loss: 53.1576, Accuracy: 0.5938


KeyboardInterrupt: 


TRY
a) learning rates
b) different numbers of hidden layers
c) different dim of hidden layers
d) try gradient clipping instead of changing the learning rate


weight updates
epoch batch