<a href="https://colab.research.google.com/github/deekshakoul/Examples-of-DL-NLP-using-Pytorch/blob/master/Sentiment_Analysis_using_LSTM_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import nltk
nltk.download('stopwords')
nltk.download('punkt')

In [2]:
df = pd.read_csv('/content/drive/My Drive/iisc/summer/datasets/train.csv')
df = df.sample(frac=1).reset_index(drop=True) #shuffle rows
df_test = pd.read_csv('/content/drive/My Drive/iisc/summer/datasets/test.csv')
df_test = df_test.sample(frac=1).reset_index(drop=True) #shuffle rows

Pytorch - Text Classification

---

* Handling OOV words - pytorch supports feature that replaces the rare words in our training data with Unknown token.

* Handling Variable Length sequences - As we deal with static networks(models), so we convert the variable length  input sentences into sentences with the same length(threshod also called as sequence length) by adding padding tokens.
The issue with padding tokens is that now our network also tries to model the padding like as any other information.
This is taken care by **Pytorch's Packed Padding sequence** 

Packed padding ignores the input timesteps with padding token. These values are never shown to the Recurrent Neural Network which helps us in building a dynamic Recurrent Neural Network.


![alt text](https://cdn.analyticsvidhya.com/wp-content/uploads/2020/01/Untitled-Diagram1.png)



In [3]:
#deal with tensors
import torch   

#handling text data
from torchtext import data  

#Reproducing same results
SEED = 147

#Torch
torch.manual_seed(SEED)

#Cuda algorithms
torch.backends.cudnn.deterministic = True  

Pre-processing

In [4]:
#Punctuations
import string
sym = list(string.punctuation)
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
#import nltk
from nltk.corpus import stopwords
#nltk.download("stopwords")

stopwords = set(stopwords.words("english")) | set(["br"]) | set(sym)  | set(["/><br",'\'s'])
#Tokens to discard during the preprocessing step

In [9]:
TEXT = data.Field(tokenize='spacy',lower=True,batch_first=True,include_lengths=True,stop_words=stopwords)
LABEL = data.LabelField(dtype = torch.float,batch_first=True)
fields = [('label', LABEL), ('text',TEXT)]

In [10]:
train_data = data.TabularDataset(skip_header = True,path='/content/drive/My Drive/iisc/summer/datasets/train.csv',format = 'csv', fields=fields)#defines data stored in csv in terms of column
#torchtext.data.dataset.TabularDataset
test_data = data.TabularDataset(skip_header = True,path='/content/drive/My Drive/iisc/summer/datasets/test.csv',format = 'csv', fields=fields)#defines data stored in csv in terms of column

In [18]:
print(vars(train_data.examples[1])) #Defines a single training or test example

{'label': '0', 'text': ['wanted', 'see', 'movie', 'article', 'film', 'magazine', "n't", 'highly', 'recommended', 'one', 'critic', 'storyline', 'different', 'sure', 'could', 'good', 'movie', 'right', 'hands', 'directing', 'acting', 'awful', 'feeling', 'watching', 'movie', 'made', 'bunch', 'amateurs', 'although', 'movie', 'started', 'promisingly', 'got', 'worse', 'worse', 'think', 'unoriginal', 'movie', 'awkward', 'characters', '..', 'still', 'think', 'worth', 'watching', "n't", 'seen', 'films', 'subjecting', 'gay', 'porn', "n't", 'keep', 'expectations', 'high', 'though', 'disappointed']}


Creating Vocab and indexing words while taking care of above said problems of OOV words and  variable length sequence

In [19]:
#Initializing Glove embeddings
TEXT.build_vocab(train_data, min_freq=3, vectors = "glove.6B.100d") 
LABEL.build_vocab(train_data)

In [22]:
#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
print(TEXT.vocab.stoi)   # mapping token strings to numerical identifiers

Size of TEXT vocabulary: 41753
Size of LABEL vocabulary: 2
[('movie', 43063), ('film', 39285), ("n't", 32846), ('one', 26135), ('like', 20100), ('good', 14905), ('would', 13458), ('even', 12429), ('time', 12338), ('story', 11725)]
defaultdict(<function _default_unk_index at 0x7ff6a81e3b70>, {'<unk>': 0, '<pad>': 1, 'movie': 2, 'film': 3, "n't": 4, 'one': 5, 'like': 6, 'good': 7})


Iterator over batch sizes

In [23]:
#check whether cuda is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  
print("cuda? ",torch.cuda.is_available())

#A key to use for sorting examples in order to batch together examples with similar lengths and minimize padding
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data), 
    batch_size = 50,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

#or use a simply torchtext.data.Iterator(dataset, batch_size, sort_key=None, device=None, batch_size_fn=None, 
#train=True, repeat=False, shuffle=None, sort=None, sort_within_batch=None)
#https://torchtext.readthedocs.io/en/latest/data.html#torchtext.data.Iterator 

#can also use torch.utils.data.DataLoader ??

#explaining about train_iterator
''' 
in batches
 for every i in batch of 50:
    we have a i.text of length 2 
        *remember i is basically a train_data.examples[1] that has "text" and "label"
        i.text[0]  -  contains numerical tokens
        i.text[1]  -  length of review 
    we also have i.label that is tensor either of 0 or 1       
'''        

cuda?  True


' \nin batches\n for every i in batch of 50:\n    we have a i.text of length 2 \n        *remember i is basically a train_data.examples[1] that has "text" and "label"\n        i.text[0]  -  contains numerical tokens\n        i.text[1]  -  length of review \n    we also have i.label that is tensor either of 0 or 1       \n'

In [29]:
for i in train_iterator:
  print(i.text)
  break

(tensor([[ 253,    3,   99,  ...,   11,    5, 5115],
        [ 178,    2,  255,  ...,  619,   16,  589],
        [  41,    2,   37,  ...,    4,   54,  259],
        ...,
        [1844,   43,  113,  ...,    1,    1,    1],
        [1195,    7,   37,  ...,    1,    1,    1],
        [1367, 4761,  429,  ...,    1,    1,    1]], device='cuda:0'), tensor([26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25,
        25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24,
        24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23],
       device='cuda:0'))


**torch.nn.utils.rnn.pack_padded_sequence**(input, lengths, batch_first=False, enforce_sorted=True)

* Packs a Tensor containing padded sequences of variable length.
* Input can be of size T x B x * where T is the length of the longest sequence 
* If batch_first is True, B x T x * input is expected.
* Parameters:
    * input (Tensor) – padded batch of variable length sequences
    * lengths (Tensor) – list of sequences lengths of each batch element.
    * batch_first
    * enforce_sorted (bool, optional) – if True, the input is expected to contain sequences sorted by length in a decreasing order. If False, the input will get sorted unconditionally. Default: True.
* Returns a PackedSequence object    
* Internally packed sequence is a tuple of two lists. One contains the elements of sequences. Elements are interleaved by time steps

In [None]:
#example to understand packed sequence better
a = [torch.tensor([1,2,3]), torch.tensor([3,4])]
b = torch.nn.utils.rnn.pad_sequence(a, batch_first=True)
c = torch.nn.utils.rnn.pack_padded_sequence(b, batch_first=True, lengths=[3,2])
#Data part is just all the tensors concatenated along the time axis. Batch_size is actually the array of batch sizes at each time step
#The batch_sizes=[2, 2, 1] represents grouping [1, 3] [2, 4] and [3] respectively(according to time step)

 **Modellling NN**
 
 Two functions defined as:
 * init : arguments passed to the class are initialized by this constructor
 * forward : defines forward pass of the inputs.

**Layers of Model:**
* Embedding layer : Get word embeddings for each indexed word, pytorch has nn.embedding which takes input as: 
    * num_embeddings: No. of unique words in dictionary
    * embedding_dim:  No. of dimensions for representing a word (user-defined)

* LSTM:
Detail explanation of paramaters, input, output of lstm can be found on this stackoverflow [SO link](https://stackoverflow.com/questions/45022734/understanding-a-simple-lstm-pytorch)
 
* Linear Layer
* Pack Padding: As explained above, it defines a dynamic recurrent neural network.  It simply ignores the values and returns the hidden state of the non padded element.




In [59]:
class LSTMNetwork(nn.Module):
  #input_size: size of input per time step, in this case size of xt
  def __init__(self, output_size, vocab_size, embedding_size, hidden_size, nlayers,dropout):
    super(LSTMNetwork,self).__init__();
    #embedding 
    #returns a matrix of size vocab_size x embedding_size
    self.embeds =  nn.Embedding(vocab_size, embedding_size)  


    self.lstm = nn.LSTM(embedding_size, 
                        hidden_size, 
                        num_layers=nlayers, 
                        batch_first=True,dropout=dropout);
   
    #fully connected layer
    self.fc = nn.Linear(hidden_size, output_size)
    #sigmoid layer
    #self.sigmoid = nn.Sigmoid()

  def forward(self, text, text_lengths):
    #text = [batch size,sent_length]
    # embedded shape = (batch_size, sentence_length,  embedding_size)
    embedded = self.embeds(text) 

    #using packed packing sequence
    embedding_packed = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)# If batch_first is True, B x T x * input is expected.

    #input, (h_0, c_0) where input is of shape (seq_len, batch, input_size) or  also be a packed variable length sequenc
    #h_0 of shape (num_layers * num_directions, batch, hidden_size)
    #c_0 of shape (num_layers * num_directions, batch, hidden_size)
    #no need to provide as by default they are initialized as zero 
    packed_output, (final_hidden_state, final_cell_state)  = self.lstm(embedding_packed)

    #output, (h_n, c_n): output of shape (seq_len, batch, num_directions * hidden_size) or packedSequence object
    #h_n of shape (num_layers * num_directions, batch, hidden_size)  : hidden state when  t = seq_len
    #c_n of shape (num_layers * num_directions, batch, hidden_size)  : cell state when  t = seq_len
        # final_hidden_state is 1 x batch_size x hidden_size
        # therefore doing final_hidden_state[-1] will give dimesnions as 2x3 and thus can now be easily passed to linear function(fc) as it takes only 2D tensors 

    out_fc = self.fc(final_hidden_state[-1]) #batch_size x hidden_size
    #out_fc  will be (batch_size, output_size) : this is similar to FNN
    #out = self.sigmoid(out_fc); # to get op between 0 and 1 as we use BCELoss

    return out_fc ; 

**Instantiate the model class**

In [74]:
output_size = 2
vocab_size = len(TEXT.vocab)
embedding_size = 100 #this is TEXT.vocab.vectors.shape, glove100d
hidden_size = 400
nlayers = 2
dropout=0.4
model = LSTMNetwork(output_size, vocab_size, embedding_size, hidden_size, nlayers,dropout)

In [75]:
print(model)

LSTMNetwork(
  (embeds): Embedding(41753, 100)
  (lstm): LSTM(100, 400, num_layers=2, batch_first=True, dropout=0.4)
  (fc): Linear(in_features=400, out_features=2, bias=True)
)


**Pretrained word embeddings**

vocab.vectors : This returns a torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings.

In [76]:
pretrained_embeddings = TEXT.vocab.vectors
model.embeds.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape) # each words is rep by 100 size vector

torch.Size([41753, 100])


**Instantiate loss and optimizer Class**

Here I  have used CrossEntropyLoss with output size of LL as 2, the CEloss itself calculates softmax and corresponding label

I could have used BCE loss if output size of my last linear layer is kept as 1.

In [77]:
criterion = nn.CrossEntropyLoss()
learning_rate = 5e-4

In [78]:
#push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

**TRAINING PHASE**

In [79]:
#define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)
    correct = (rounded_preds == y).float() 
    acc = correct.sum() / len(correct)
    return acc

def trainingNetwork(model, iterator, criterion):
  epoch_loss = 0.0
  epoch_accuracy = 0.0
  cou = 0
  optimizer = optim.Adam(model.parameters())
  model.train()
  for batch in iterator:
    texts, text_lengths = batch.text # we have set inlude_lengths as True while defining data.TEXT
    target = batch.label.type(torch.LongTensor)

    if torch.cuda.is_available():
      texts = texts.cuda()
      target = target.cuda()    

    optimizer.zero_grad() 
    
    predictions_batch  =  model(texts, text_lengths)
    
    
    #compute the loss
    loss = criterion(predictions_batch, target)
    num_corrects = (torch.max(predictions_batch, 1)[1].view(target.size()).data == target.data).float().sum()
    acc = 100.0 * num_corrects/len(batch)

    #backpropage the loss and compute the gradients
    loss.backward()       
        
    #nn.utils.clip_grad_norm_(model.parameters(), clip);

    #update the weights
    optimizer.step()   
    
    #loss and accuracy
    epoch_loss += loss.item()  
    epoch_accuracy += acc.item()     

  return epoch_loss / len(iterator), epoch_accuracy / len(iterator)  

Evaluation Phase

In [80]:
def evaluate(model, iterator):#, criterion):
    
    #initialize every epoch
    total_epoch_loss = 0
    total_epoch_acc = 0

    #deactivating dropout layers
    model.eval()
    
    #deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            #retrieve text and no. of words
            text, text_lengths = batch.text

            target = batch.label.type(torch.LongTensor)

            if torch.cuda.is_available():
              text = text.cuda()
              target = target.cuda()    

            #convert to 1d tensor
            predictions = model(text, text_lengths)
            
            #compute loss and accuracy
            loss = criterion(predictions, target)
            #acc = binary_accuracy(predictions, batch.label)
            num_corrects = (torch.max(predictions, 1)[1].view(target.size()).data == target.data).sum()
            acc = 100.0 * num_corrects/len(batch)
            total_epoch_loss += loss.item()
            
            total_epoch_acc += acc.item()

    return total_epoch_loss/len(iterator), total_epoch_acc/len(iterator)


In [81]:
#train_iterator - BucketIterator 
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #train the model

    train_loss, train_acc= trainingNetwork(model, train_iterator, criterion)
    #print(train_loss,"#", 'epoch: ',epoch)
    #test the model
    test_loss, test_acc = evaluate(model, test_iterator)
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Test Loss: {test_loss:3f}, test Acc: {test_acc:.2f}%')    

Epoch: 01, Train Loss: 0.680, Train Acc: 57.14%, Test Loss: 0.606280, test Acc: 67.54%
Epoch: 02, Train Loss: 0.449, Train Acc: 79.08%, Test Loss: 0.359293, test Acc: 85.17%
Epoch: 03, Train Loss: 0.266, Train Acc: 89.28%, Test Loss: 0.291130, test Acc: 87.78%
Epoch: 04, Train Loss: 0.171, Train Acc: 93.46%, Test Loss: 0.314287, test Acc: 87.28%
Epoch: 05, Train Loss: 0.110, Train Acc: 96.11%, Test Loss: 0.353058, test Acc: 86.96%
