<a href="https://colab.research.google.com/github/rabimist/Deep-Learning-for-Natural-Language-Processing/blob/main/Movie_Review_using_CNN_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Name:** Deen Mohammad Abdullah

The purpose of this project is to train and evaluate a convolutional neural network (CNN) on the NLTK Movie Reviews dataset.

**Deep Learning for Natural Language Processing**


Execute the following code to run the project:

In [None]:
############################################ Required Packages #####################################################################
import numpy as np
import nltk
nltk.download('movie_reviews') # -------------------------------- downloading the movie_review dataset
nltk.download('stopwords') # ------------------------------------ downloading the stopwords from NLTK
nltk.download('wordnet')  # ------------------------------------- for lematization we need to download wordnet (WordNetLemmatizer)
nltk.download('punkt') # ---------------------------------------- used for word tokenization
from nltk.corpus import movie_reviews # ------------------------- importing the movie_review dataset
from nltk.corpus import stopwords # ----------------------------- importing the stopwords
from nltk.stem import WordNetLemmatizer # ----------------------- importing wordNetLemmatizer for lemmatization
from nltk.tokenize import word_tokenize # ----------------------- tokenize words from text
import re # ----------------------------------------------------- importing regularExpression to extract only text from the dataset
from sklearn.model_selection import train_test_split #----------- split the dataset for training and validation set
import torch
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler, SequentialSampler)
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import random
######################################################################################################################################

########################################### Function Definitions (I have defined nine functions) #####################################
#------ (1) This function removes special characters from text ---------
def removeSpecialCharacter(word_list):
  """Return the lower-cased words of word_list that start with a letter or digit.

  A word is kept when the pattern matches at its beginning (re.match anchors
  at the start only), so a token such as "x-" is kept while "-x" and "!"
  are dropped.
  """
  starts_alphanumeric = re.compile('[a-zA-Z0-9]+').match
  return [word.lower() for word in word_list if starts_alphanumeric(word)]
#-----------------------------------------------------------------------

#------ (2) This function removes stop words from text -----------------
def removeStopWords (word_list):
  """Return the words of word_list that are not NLTK English stop words.

  The comparison is an exact string match against the stop-word set, and the
  original word order is kept.
  """
  english_stops = set(stopwords.words('english'))
  return [word for word in word_list if word not in english_stops]
#------------------------------------------------------------------------

#------ (3) This function uses WordNet and lemmatizes the text ----------
def lemmatize (word_list):
  """Return a new list holding the WordNet lemma of every word in word_list.

  One WordNetLemmatizer is created per call and applied to each word in
  order, using the lemmatizer's default arguments.
  """
  to_lemma = WordNetLemmatizer().lemmatize
  return [to_lemma(word) for word in word_list]
#------------------------------------------------------------------------

#--- (4) This function tokenizes the texts, builds the vocabulary, prepares
#----------- the input for the CNN layer and calculates the max text length
def extractFeature(texts):
    """Tokenize texts, build a vocabulary and encode every text as padded ids.

    Args:
        texts: iterable of strings, one per document.

    Returns:
        A tuple (tokenized_texts, word2idx, input_ids, max_len) where
        tokenized_texts is the list of token lists (each one padded in place
        with '<pad>' up to max_len), word2idx maps token -> integer id with
        '<pad>' fixed at 0, input_ids is a NumPy array built from the id
        rows, and max_len is the length of the longest tokenized text.
    """
    word2idx = {'<pad>': 0}
    tokenized_texts = []
    max_len = 0

    # First pass: tokenize, grow the vocabulary, track the longest text.
    for sent in texts:
        tokens = word_tokenize(sent)
        tokenized_texts.append(tokens)

        for token in tokens:
            # Ids are handed out in first-seen order; len(word2idx) is always
            # the next unused id because ids start at 0 and are contiguous.
            word2idx.setdefault(token, len(word2idx))

        max_len = max(max_len, len(tokens))

    # Second pass: pad every token list (in place) and translate it to ids.
    id_rows = []
    for tokens in tokenized_texts:
        tokens.extend(['<pad>'] * (max_len - len(tokens)))
        id_rows.append([word2idx.get(token) for token in tokens])
    input_ids = np.array(id_rows)

    return tokenized_texts, word2idx, input_ids, max_len
#----------------------------------------------------------------------

# ---- (5) This function loads fastText as pretrained vector ----------
#------------- and prepares input embedding ---------------------------
def pretrained_vectors(word2idx):
  """Build an embedding matrix for word2idx from the fastText crawl-300d-2M vectors.

  If the "fastText" directory does not exist, the vector archive is downloaded
  into it and unpacked. Rows for words found in the vector file are filled
  with the pretrained vector; every other row keeps a random value drawn
  uniformly from [-0.25, 0.25), and the '<pad>' row is all zeros.

  Args:
    word2idx: dict mapping token -> row index; must contain '<pad>'.

  Returns:
    A torch tensor of shape (len(word2idx), d), where d is the vector
    dimension read from the first line of the vector file.
  """
  # Imported locally so the function is self-contained; the download uses the
  # standard library instead of IPython shell escapes so it also runs as
  # plain Python.
  import urllib.request
  import zipfile

  URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
  FILE = "fastText"

  if os.path.isdir(FILE):
    print("fastText exists.")
  else:
    print ('fastText pretrained vector does not exist. It will take time (few minutes) to download it ...')
    os.makedirs(FILE, exist_ok=True)
    archive_path = os.path.join(FILE, "crawl-300d-2M.vec.zip")
    urllib.request.urlretrieve(URL, archive_path)
    with zipfile.ZipFile(archive_path) as archive:
      archive.extractall(FILE)

  fname = "fastText/crawl-300d-2M.vec"
  print ("Using pretrained vectors for embedding. It will take some minutes. Please wait...")

  # The context manager guarantees the (very large) file is closed even if
  # parsing fails part-way through.
  with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
    # Header line is "<number of words> <dimension>"; only the dimension is needed.
    _, d = map(int, fin.readline().split())

    embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))
    embeddings[word2idx['<pad>']] = np.zeros((d,))

    for line in fin:
      # Peel off the word first so the remaining fields are only split and
      # parsed for words that are actually in the vocabulary.
      word, _, vector_text = line.rstrip().partition(' ')
      if word in word2idx:
        embeddings[word2idx[word]] = np.array(vector_text.split(' '), dtype=np.float32)

  embeddings = torch.tensor(embeddings)
  print("Done Embedding")
  return embeddings
#--------------------------------------------------------------------------------

# ---- (6) Purpose of this function is to convert training ----------------------
#---------- and validation dataset into torchTensor -----------------------------
def data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size = 50):
    """Wrap the training and validation splits in PyTorch DataLoaders.

    Args:
        train_inputs, val_inputs: array-likes of token ids accepted by
            torch.tensor.
        train_labels, val_labels: array-likes of labels, aligned with the
            corresponding inputs.
        batch_size: number of samples per batch for both loaders
            (defaults to 50, the value previously hard-coded).

    Returns:
        (train_dataloader, val_dataloader). The training loader draws samples
        with a RandomSampler; the validation loader uses a SequentialSampler.
    """
    train_inputs, val_inputs, train_labels, val_labels = (
        torch.tensor(data)
        for data in (train_inputs, val_inputs, train_labels, val_labels)
    )

    train_data = TensorDataset(train_inputs, train_labels)
    train_dataloader = DataLoader(train_data, sampler = RandomSampler(train_data), batch_size = batch_size)

    val_data = TensorDataset(val_inputs, val_labels)
    val_dataloader = DataLoader(val_data, sampler = SequentialSampler(val_data), batch_size = batch_size)

    return train_dataloader, val_dataloader
#--------------------------------------------------------------------------------

########### CNN Model Architecture starts from here #############################
class CNN_MODEL(nn.Module):
  """1-D convolutional text classifier: embedding -> parallel Conv1d + ReLU ->
  max-over-time pooling -> dropout -> linear layer producing class logits.
  """

  def __init__(self, pretrained_embedding = None, freeze_embedding = False, vocab_size = None, embed_dim = 300, filter_sizes = (3, 4, 5), num_filters = (100, 100, 100), num_classes = 2, dropout = 0.5):
    """Build the layers.

    Args:
      pretrained_embedding: optional 2-D tensor (vocab_size, embed_dim) used
        to initialise the embedding layer. When given, vocab_size and
        embed_dim are taken from its shape and the arguments of the same
        name are ignored.
      freeze_embedding: if True the embedding weights are not trained.
      vocab_size: vocabulary size; required when pretrained_embedding is None.
      embed_dim: embedding dimension used when pretrained_embedding is None.
      filter_sizes: kernel width of each convolution.
      num_filters: number of output channels of each convolution; must have
        the same length as filter_sizes.
      num_classes: number of output logits.
      dropout: dropout probability applied before the linear layer.

    Raises:
      ValueError: if filter_sizes and num_filters differ in length, or if
        neither pretrained_embedding nor vocab_size is provided.
    """
    super(CNN_MODEL, self).__init__()

    if len(filter_sizes) != len(num_filters):
      raise ValueError("filter_sizes and num_filters need to be of the same length.")

    # Initializes input embedding layer
    if pretrained_embedding is not None:
      self.vocab_size, self.embed_dim = pretrained_embedding.shape
      self.embedding = nn.Embedding.from_pretrained(pretrained_embedding, freeze = freeze_embedding)
    elif vocab_size is not None:
      self.vocab_size, self.embed_dim = vocab_size, embed_dim
      # Index 0 is treated as padding, matching the '<pad>' id used when the
      # vocabulary is built elsewhere in this file.
      self.embedding = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embed_dim, padding_idx = 0)
      self.embedding.weight.requires_grad_(not freeze_embedding)
    else:
      raise ValueError("Either pretrained_embedding or vocab_size must be provided.")

    # Initializes convolution Layer
    self.conv1d_list = nn.ModuleList([
      nn.Conv1d(in_channels = self.embed_dim, out_channels = out_channels, kernel_size = kernel_size)
      for kernel_size, out_channels in zip(filter_sizes, num_filters)
    ])

    # Initializes fully connected layer; its input is the concatenation of
    # one pooled vector per convolution. Built-in sum keeps the size a plain int.
    self.fc = nn.Linear(int(sum(num_filters)), num_classes) # there are two classes (pos/neg) in movie review
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, input_ids):
    """Return logits of shape (batch, num_classes) for a (batch, seq_len)
    tensor of token ids. seq_len must be at least the largest kernel width.
    """
    # Embedding from input_ids; cast to float32 because a pretrained matrix
    # may be stored in another floating dtype.
    x_embed = self.embedding(input_ids).float()

    # Conv1d expects (batch, channels, length), so move embed_dim to axis 1
    x_reshaped = x_embed.permute(0, 2, 1)

    # Convolution Layer and ReLU layer
    x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]

    # Max pooling over the whole remaining length (one value per filter)
    x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2]) for x_conv in x_conv_list]

    # Concatenate the pooled outputs and feed them to the fully connected layer
    x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list], dim=1)
    logits = self.fc(self.dropout(x_fc))

    return logits

# -------- (7) This function is initializing the CNN model ----------------------------
def initilize_model(pretrained_embedding):
  """Create the CNN classifier on the global device together with its optimizer.

  Args:
    pretrained_embedding: embedding matrix handed to CNN_MODEL; the embedding
      layer is created frozen (freeze_embedding=True).

  Returns:
    (cnn_model, optimizer) where optimizer is Adadelta over the model's
    parameters with lr=0.1 and rho=0.95.
  """
  kernel_widths = [3, 4, 5]
  filters_per_width = [100, 100, 100]
  step_size = 0.1

  assert (len(kernel_widths) == len(filters_per_width)), "filter_sizes and num_filters need to be of the same length."

  # Here cnn_model is the instance of CNN_MODEL Class
  cnn_model = CNN_MODEL(
    pretrained_embedding = pretrained_embedding,
    freeze_embedding = True,
    vocab_size = None,
    embed_dim = 300,
    filter_sizes = kernel_widths,
    num_filters = filters_per_width,
    num_classes = 2,
    dropout = 0.5,
  )
  cnn_model.to(device)

  optimizer = optim.Adadelta(cnn_model.parameters(), lr = step_size, rho = 0.95)

  return cnn_model, optimizer
#--------------------------------------------------------------------------------
############ CNN Model architecture ends here #####################################

###################### Training and Evaluation of Model ###########################
#---- (8) this function evaluates the model -------------------------------------
def evaluate(model, val_dataloader):
  """Compute the average loss and accuracy of model over val_dataloader.

  Both figures are averaged per sample rather than per batch, so a smaller
  final batch does not get the same weight as a full one.

  Args:
    model: the network to evaluate; it is switched to eval mode.
    val_dataloader: iterable yielding (input_ids, labels) batches.

  Returns:
    (val_loss, val_accuracy) as floats, accuracy in percent. Both are NaN
    when the dataloader yields no samples.
  """
  model.eval()
  loss_sum = 0.0
  correct = 0
  n_samples = 0

  # No gradients are needed for either the forward pass or the loss.
  with torch.no_grad():
    for batch in val_dataloader:
      b_input_ids, b_labels = tuple(t.to(device) for t in batch)
      logits = model(b_input_ids)

      batch_len = b_labels.size(0)
      # loss_fn (module-level nn.CrossEntropyLoss with default settings)
      # returns the batch mean, so scale it back to a per-sample sum.
      loss_sum += loss_fn(logits, b_labels).item() * batch_len

      preds = torch.argmax(logits, dim=1).flatten()
      correct += (preds == b_labels).sum().item()
      n_samples += batch_len

  if n_samples == 0:
    return float("nan"), float("nan")

  val_loss = loss_sum / n_samples
  val_accuracy = 100.0 * correct / n_samples

  return val_loss, val_accuracy
#-------------------------------------------------------------------------------

#------ (9) This function trains the model ------------------------------------- 
# Shared loss used by both train() and evaluate().
loss_fn = nn.CrossEntropyLoss()
def train(model, optimizer, train_dataloader, val_dataloader, epochs):
  """Train model for the given number of epochs, validating after each one.

  After every epoch the average training loss (per batch), the validation
  loss and the validation accuracy are printed; the best validation accuracy
  seen is printed once training finishes. Nothing is returned.

  Args:
    model: network to optimise.
    optimizer: optimizer already bound to the model's parameters.
    train_dataloader: yields (input_ids, labels) training batches.
    val_dataloader: passed to evaluate() after each epoch.
    epochs: number of passes over train_dataloader.
  """
  separator = "-------------------------------------------------------------------"
  best_accuracy = 0

  print("Training starts...")

  for epoch_i in range(epochs):
    model.train()
    running_loss = 0

    for batch in train_dataloader:
      b_input_ids, b_labels = (t.to(device) for t in batch)
      model.zero_grad() # clear gradients left over from the previous step
      batch_loss = loss_fn(model(b_input_ids), b_labels)
      running_loss += batch_loss.item()
      batch_loss.backward()
      optimizer.step()

    avg_train_loss = running_loss / len(train_dataloader)
    val_loss, val_accuracy = evaluate(model, val_dataloader)
    best_accuracy = max(best_accuracy, val_accuracy)

    print(f"Epoch: {epoch_i + 1}")
    print(f"Training Loss: {avg_train_loss:.2f}, Validation Loss: {val_loss:.2f}")
    print(f"Model Accuracy: {val_accuracy!s}%")
    print(separator)

  print("\n")
  print("Training is Done")
  print(f"Best Model Accuracy: {best_accuracy:.2f}%")



###############            Main Function              #########################
############### Executable statements start from here #########################
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One (words, category) pair per category of every file in the corpus.
document = [
    (movie_reviews.words(file_id), category)
    for file_id in movie_reviews.fileids()
    for category in movie_reviews.categories(file_id)
]

texts = []
labels = []
for word_list, category in document:
  # Clean -> drop stop words -> lemmatize, then rebuild a space-separated
  # string (each word followed by one space) for the tokenizer.
  word_list = lemmatize(removeStopWords(removeSpecialCharacter(word_list)))
  texts.append(''.join(w + ' ' for w in word_list))

  # Label encoding: 'neg' -> 0, anything else -> 1.
  labels.append(0 if category == 'neg' else 1)

tokenized_texts, word2idx, input_ids, max_len = extractFeature(texts)
train_inputs, val_inputs, train_labels, val_labels = train_test_split(input_ids, labels, test_size=0.1)
train_dataloader, val_dataloader = data_loader(train_inputs, val_inputs, train_labels, val_labels)
embeddings = pretrained_vectors(word2idx)
cnn_model, optimizer = initilize_model(pretrained_embedding=embeddings)
train(cnn_model, optimizer, train_dataloader, val_dataloader, epochs=25)

######################## End of Main function ##################################


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
fastText exists.
Using pretrained vectors for embedding. It will take some minutes. Please wait...
Done Embedding
Training starts...
Epoch: 1
Training Loss: 0.71, Validation Loss: 0.69
Model Accuracy: 47.0%
-------------------------------------------------------------------
Epoch: 2
Training Loss: 0.68, Validation Loss: 0.67
Model Accuracy: 73.0%
-------------------------------------------------------------------
Epoch: 3
Training Loss: 0.66, Validation Loss: 0.67
Model Accuracy: 69.0%
-------------------------------------