<a href="https://colab.research.google.com/github/rabimist/Deep-Learning-for-Natural-Language-Processing/blob/main/Movie_Review_using_RNN_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Name:** Deen Mohammad Abdullah

The purpose of this project is to train and evaluate a sentiment classifier on the IMDB Movie Review dataset using a recurrent neural network (LSTM).

**Deep Learning for Natural Language Processing**


In [None]:
############################################ Required Packages #####################################################################
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords') # ------------------------------------ downloading the stopwords from NLTK
nltk.download('wordnet')  # ------------------------------------- for lematization we need to download wordnet (WordNetLemmatizer)
nltk.download('punkt') # ---------------------------------------- used for word tokenization
from nltk.corpus import stopwords # ----------------------------- importing the stopwords
from nltk.stem import WordNetLemmatizer # ----------------------- importing wordNetLemmatizer for lemmatization
from nltk.tokenize import word_tokenize # ----------------------- tokenize words from text
from nltk.corpus import stopwords #------------------------------ importing stop words 
import re # ----------------------------------------------------- importing regularExpression to extract only text from the dataset
from sklearn.model_selection import train_test_split #----------- split the dataset for training and validation set
import torch
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler, SequentialSampler)
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import random
from tqdm.notebook import tqdm
from collections import Counter
######################################################################################################################################

########################################### Function Definitions (I have defined nine functions) #####################################
#------ (1) This function removes special characters from text ---------
def removeSpecialCharacter(word_list):
  """Filter a token list down to lower-cased tokens that start alphanumerically.

  A token is kept when the pattern ``[a-zA-Z0-9]+`` matches at its beginning,
  i.e. when its first character is an ASCII letter or digit. Kept tokens are
  lower-cased; characters after the first one are not inspected, so a token
  such as ``it's`` passes through intact.

  Args:
    word_list: iterable of string tokens.

  Returns:
    A new list with the surviving tokens, lower-cased, in their original order.
  """
  return [
      token.lower()
      for token in word_list
      if re.match('[a-zA-Z0-9]+', token) is not None
  ]
#-----------------------------------------------------------------------

#------ (2) This function removes stop words from text -----------------
def removeStopWords(word_list):
  """Return the tokens of ``word_list`` that are not NLTK English stop words.

  The comparison is an exact (case-sensitive) membership test against the
  NLTK 'english' stop-word list, so callers should lower-case tokens first.

  Args:
    word_list: iterable of string tokens.

  Returns:
    A new list with the non-stop-word tokens in their original order.
  """
  english_stops = frozenset(stopwords.words('english'))
  return [token for token in word_list if token not in english_stops]
#------------------------------------------------------------------------

#------ (3) This function uses WordNet and lemmatizes the text ----------
def lemmatize(word_list):
  """Lemmatize every token with NLTK's WordNetLemmatizer.

  Each token is passed to ``WordNetLemmatizer.lemmatize`` with no explicit
  part-of-speech argument, so the lemmatizer's default is used.

  Args:
    word_list: iterable of string tokens.

  Returns:
    A new list holding the lemma of each token, in the original order.
  """
  wordnet = WordNetLemmatizer()
  return [wordnet.lemmatize(token) for token in word_list]
#------------------------------------------------------------------------

#--- (4) This function tokenize text, build vocabulary, preparing input  
def _to_object_array(sequences):
  """Pack variable-length lists into a 1-D object array, one list per element."""
  # np.array() on ragged nested lists raises ValueError on NumPy >= 1.24, and
  # would silently build a 2-D array if all lists happened to share a length.
  # Filling element by element always yields a 1-D array of Python lists.
  packed = np.empty(len(sequences), dtype=object)
  for position, sequence in enumerate(sequences):
    packed[position] = sequence
  return packed


def extractFeature(document, category, vocab_size=1000, test_size=0.2):
  """Clean the reviews, split them, build a vocabulary and encode both splits.

  Every review is tokenized, filtered with removeSpecialCharacter and
  removeStopWords, and lemmatized. The cleaned reviews are split with
  ``train_test_split`` (no ``random_state`` is passed, so the split differs
  between runs). The vocabulary is built from the training split only and maps
  the ``vocab_size`` most frequent words to ids ``1..vocab_size``; id 0 is left
  free for padding. Words outside the vocabulary are dropped when encoding.

  Args:
    document: sequence of raw review strings.
    category: sequence of sentiment labels; ``'positive'`` maps to 1 and
      anything else to 0.
    vocab_size: maximum number of words kept in the vocabulary.
    test_size: fraction (or count) of samples held out for validation,
      forwarded to ``train_test_split``.

  Returns:
    Tuple ``(x_train, y_train, x_val, y_val, vocab)`` where ``x_train`` and
    ``x_val`` are 1-D object arrays of variable-length id lists, ``y_train``
    and ``y_val`` are integer label arrays, and ``vocab`` is the word-to-id
    dict.
  """
  print ('Processing data!!! Please wait...')

  texts = []
  for raw_text in tqdm(document):
    tokens = removeSpecialCharacter(word_tokenize(raw_text))
    tokens = removeStopWords(tokens)
    tokens = lemmatize(tokens)
    texts.append(' '.join(tokens))

  labels = [1 if sentiment == 'positive' else 0 for sentiment in category]
  x_train, x_val, y_train, y_val = train_test_split(texts, labels, test_size=test_size)

  # Vocabulary comes from the training split only, so no information from the
  # validation reviews leaks into the feature definition.
  counts = Counter(word for sent in x_train for word in sent.lower().split())
  vocab = {word: rank + 1 for rank, (word, _) in enumerate(counts.most_common(vocab_size))}

  def encode(sentences):
    # Out-of-vocabulary words are skipped rather than mapped to an <unk> id.
    return [[vocab[word] for word in sent.lower().split() if word in vocab]
            for sent in sentences]

  return (_to_object_array(encode(x_train)), np.array(y_train),
          _to_object_array(encode(x_val)), np.array(y_val), vocab)
#------------------------------------------------------------------------

# ---- (5) Purpose of this function is to convert training ----------------------
#---------- and validation dataset into torchTensor -----------------------------
def _pad_sequences(sequences, seq_len):
  """Left-pad with zeros (and truncate to the first ``seq_len`` ids) each sequence."""
  padded = np.zeros((len(sequences), seq_len), dtype=int)
  for row, review in enumerate(sequences):
    kept = np.asarray(review)[:seq_len]
    # An empty slice must be skipped: ``-0:`` would address the whole row.
    if len(kept) != 0:
      padded[row, -len(kept):] = kept
  return padded


def data_loader(x_train, x_test, y_train, y_test, seq_len=500, batch_size=50):
  """Pad the encoded reviews and wrap both splits in PyTorch DataLoaders.

  Each review is truncated to its first ``seq_len`` ids and left-padded with
  zeros so that the real tokens sit at the end of the row.

  Args:
    x_train: sequence of variable-length id sequences for training.
    x_test: sequence of variable-length id sequences for validation.
    y_train: array-like of training labels.
    y_test: array-like of validation labels.
    seq_len: fixed length every review is padded/truncated to.
    batch_size: number of reviews per batch in both loaders.

  Returns:
    Tuple ``(train_loader, valid_loader, batch_size)``. The training loader
    is shuffled; the validation loader keeps dataset order so evaluation is
    reproducible.
  """
  train_data = TensorDataset(torch.from_numpy(_pad_sequences(x_train, seq_len)),
                             torch.from_numpy(np.asarray(y_train)))
  valid_data = TensorDataset(torch.from_numpy(_pad_sequences(x_test, seq_len)),
                             torch.from_numpy(np.asarray(y_test)))

  train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
  # Shuffling has no benefit for validation; a fixed order keeps runs comparable.
  valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)

  return train_loader, valid_loader, batch_size

#--------------------------------------------------------------------------------

########### LSTM Model Architecture starts from here #############################
class LSTM(nn.Module):
    """Bidirectional multi-layer LSTM classifier over padded token-id sequences.

    Pipeline: embedding -> bidirectional LSTM -> dropout -> linear -> sigmoid.
    The prediction is read from the last timestep, which holds the final real
    token when the inputs are left-padded.

    Args:
        no_layers: number of stacked LSTM layers.
        vocab_size: number of rows in the embedding table (vocabulary + padding id).
        hidden_dim: hidden size of each LSTM direction.
        embedding_dim: size of each token embedding.
        output_dim: number of output units of the final linear layer.
        drop_prob: dropout probability applied before the linear layer.
    """

    def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,output_dim,drop_prob=0.5):
        super(LSTM,self).__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim

        self.no_layers = no_layers
        self.vocab_size = vocab_size
        # The LSTM below is bidirectional, so it has two directions per layer.
        self.num_directions = 2

        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_dim, num_layers=no_layers, batch_first=True,  bidirectional=True)

        # dropout layer
        self.dropout = nn.Dropout(drop_prob)

        # linear and sigmoid layer; a bidirectional LSTM emits the forward and
        # backward features concatenated, i.e. 2 * hidden_dim values per step.
        self.fc = nn.Linear(self.hidden_dim * self.num_directions, output_dim)
        self.sig = nn.Sigmoid()

    def forward(self,x,hidden):
        """Run a batch through the network.

        Args:
            x: integer tensor of token ids, indexed [batch, sequence].
            hidden: ``(h0, c0)`` tuple as produced by ``init_hidden``.

        Returns:
            ``(sig_out, hidden)``: ``sig_out`` has shape [batch] and holds the
            sigmoid of the last output unit; ``hidden`` is the LSTM's final
            ``(h_n, c_n)`` state.
        """
        batch_size = x.size(0)
        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Keep the complete feature vector (both directions) of the last
        # timestep. Flattening to hidden_dim-wide rows would split each step's
        # 2*hidden_dim features in two and feed only one half to the classifier.
        last_step = lstm_out[:, -1, :]

        # dropout and fully connected layer
        out = self.dropout(last_step)
        out = self.fc(out)

        # sigmoid function
        sig_out = self.sig(out)

        # one value per sample: the last output unit, shape [batch_size]
        sig_out = sig_out.view(batch_size, -1)[:, -1]

        # return last sigmoid output and hidden state
        return sig_out, hidden

    def init_hidden(self, batch_size):
        """Return zero-filled ``(h0, c0)`` states for a batch of ``batch_size``.

        The tensors are created on the same device as the model's parameters.
        """
        target_device = next(self.parameters()).device
        state_shape = (self.no_layers * self.num_directions, batch_size, self.hidden_dim)

        # hidden state
        h0 = torch.zeros(state_shape, device=target_device)

        # cell state
        c0 = torch.zeros(state_shape, device=target_device)

        hidden = (h0,c0)
        return hidden
#---------------------------------------------------------------------------------
# Binary cross-entropy loss used by both train() and evaluate(); it is applied
# to the sigmoid output that the model's forward pass returns.
criterion = nn.BCELoss()
# -------- (6) This function is initializing the LSTM model ---------------------------
def initilize_model(vocab):
  """Build the LSTM classifier and its Adam optimizer.

  Args:
    vocab: word-to-id mapping; its size plus one (for the padding id) sets
      the embedding table size.

  Returns:
    Tuple ``(model, optimizer)`` with the model already moved to ``device``.
  """
  settings = dict(
      no_layers=2,
      vocab_size=len(vocab) + 1,  # extra 1 for padding
      hidden_dim=256,
      embedding_dim=64,
      output_dim=1,
      drop_prob=0.5,
  )
  network = LSTM(**settings).to(device)
  adam = torch.optim.Adam(network.parameters(), lr=0.001)
  return network, adam
#--------------------------------------------------------------------------------
############ RNN Model architecture ends here #####################################

###################### Training and Evaluation of Model ###########################
#---- (7) this function evaluates the model -------------------------------------
def evaluate(model, valid_loader, batch_size):
  """Compute per-batch losses and the number of correct predictions.

  The model is switched to eval mode and left in it; callers that keep
  training must call ``model.train()`` again.

  Args:
    model: network exposing ``init_hidden(batch_size)`` and returning
      ``(output, hidden)`` from its forward pass.
    valid_loader: DataLoader yielding ``(inputs, labels)`` batches.
    batch_size: kept for backward compatibility; the hidden state is sized
      from each actual batch instead.

  Returns:
    Tuple ``(val_losses, val_acc)``: list of per-batch loss values and the
    total count of correctly classified samples.
  """
  val_losses = []
  val_acc = 0.0
  model.eval()
  # No gradients are needed for validation; skipping them saves memory and time.
  with torch.no_grad():
    for inputs, labels in valid_loader:
      inputs, labels = inputs.to(device), labels.to(device)
      # Batches hold unrelated reviews, so each starts from a fresh zero state
      # sized to the real batch (the final batch may be smaller).
      val_h = model.init_hidden(inputs.size(0))
      output, _ = model(inputs, val_h)
      # view(-1) keeps a 1-D shape even for a batch of one, unlike squeeze().
      output = output.view(-1)
      labels = labels.view(-1)
      val_loss = criterion(output, labels.float())
      val_losses.append(val_loss.item())
      pred = torch.round(output)
      val_acc += torch.sum(pred == labels).item()

  return val_losses, val_acc

#------ (8) this function train the model -------------------------------------- 
def train(model, optimizer, train_loader, valid_loader, batch_size, epochs):
  """Train the model and print loss/accuracy for both splits after each epoch.

  Args:
    model: network exposing ``init_hidden(batch_size)`` and returning
      ``(output, hidden)`` from its forward pass.
    optimizer: optimizer over the model's parameters.
    train_loader: DataLoader yielding ``(inputs, labels)`` training batches.
    valid_loader: DataLoader passed to ``evaluate`` after every epoch.
    batch_size: forwarded to ``evaluate``; the hidden state here is sized
      from each actual batch.
    epochs: number of passes over the training data.
  """
  clip = 5  # maximum gradient norm

  print ('\n Training Starts...')
  for epoch in range(epochs):
    train_losses = []
    train_acc = 0.0
    model.train()
    for inputs, labels in train_loader:
      inputs, labels = inputs.to(device), labels.to(device)
      # Batches hold unrelated reviews, so each starts from a fresh zero state
      # sized to the real batch (the final batch may be smaller).
      h = model.init_hidden(inputs.size(0))

      optimizer.zero_grad()
      output, _ = model(inputs, h)
      # view(-1) keeps a 1-D shape even for a batch of one, unlike squeeze().
      output = output.view(-1)
      labels = labels.view(-1)

      # calculate the loss and perform backprop
      loss = criterion(output, labels.float())
      loss.backward()
      train_losses.append(loss.item())

      pred = torch.round(output)
      train_acc += torch.sum(pred == labels).item()

      # Clip before stepping to guard against exploding gradients.
      nn.utils.clip_grad_norm_(model.parameters(), clip)
      optimizer.step()

    val_losses, val_acc = evaluate(model, valid_loader, batch_size)

    avg_train_loss = np.mean(train_losses)
    val_loss = np.mean(val_losses)
    train_accuracy = train_acc/len(train_loader.dataset)*100
    val_accuracy = val_acc/len(valid_loader.dataset)*100

    print ("Epoch: " + str (epoch + 1))
    print ("Training Loss: {:.5f}, Validation Loss: {:.5f}".format(avg_train_loss, val_loss))
    print ("Training Accuracy: {:.2f}%, Validation Accuracy: {:.2f}%".format(train_accuracy, val_accuracy))
    print ("-------------------------------------------------------------------")
    
###############            Main Function              #########################
############### Executable statements start from here #########################
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the IMDB reviews; the CSV provides 'review' and 'sentiment' columns.
data = '/content/drive/MyDrive/IMDB Dataset.csv'
df = pd.read_csv(data)
df.head()

document = df['review'].values
category = df['sentiment'].values

# Pipeline: preprocess + encode -> batch -> build model -> train.
x_train, y_train, x_test, y_test, vocab = extractFeature(document, category)
train_loader, valid_loader, batch_size = data_loader(x_train, x_test, y_train, y_test)
lstm, optimizer = initilize_model(vocab)
train(lstm, optimizer, train_loader, valid_loader, batch_size, epochs=3)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Processing data!!! Please wait...


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))







 Training Starts...
Epoch: 1
Training Loss: 0.52733, Validation Loss: 0.44731
Training Accuracy: 73.89%, Validation Accuracy: 78.88%
-------------------------------------------------------------------
Epoch: 2
Training Loss: 0.37944, Validation Loss: 0.35703
Training Accuracy: 83.56%, Validation Accuracy: 84.51%
-------------------------------------------------------------------
Epoch: 3
Training Loss: 0.33225, Validation Loss: 0.34810
Training Accuracy: 86.00%, Validation Accuracy: 85.34%
-------------------------------------------------------------------
