#Review Analysis using RNN/LSTM


###Mounting Drive

In [1]:
from google.colab import files, drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


###Importing Required libraries

In [2]:
import numpy as np
import pandas as pd
import os, sys
import sklearn
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
import glob
from torch.utils.data import Dataset, DataLoader,TensorDataset
from sklearn.model_selection import train_test_split
from torch.utils.data import random_split
from torchvision import transforms,utils
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tensorflow as tf
from __future__ import unicode_literals, print_function, division

###Processing Data

In [3]:
# loading 500 files from each folder.

neg_reviews = glob.glob("/content/drive/My Drive/reviews/neg/*.txt")[:500]
pos_reviews = glob.glob("/content/drive/My Drive/reviews/pos/*.txt")[:500]

In [4]:
max_features = 200

#Creating corpus and assigning labels
corpus = []
labels = []

for i in pos_reviews:
  with open(i) as fh:
    corpus.append(fh.read().replace('\n',' '))
    labels.append([1,0])

for i in neg_reviews:
  with open(i) as fh:
    corpus.append(fh.read().replace('\n',' '))
    labels.append([1,0])

In [5]:
## using `TfidfVectorizer` from sklearn to generate tf-idf values for every word in each document.
vectorizer = TfidfVectorizer(max_features=200, stop_words='english')

X = vectorizer.fit_transform(corpus)
y = np.array(labels)

print(X.shape,y.shape)

(1000, 200) (1000, 2)


In [6]:
seq_length = -1

word_tokenizer = vectorizer.build_tokenizer()
vocab = vectorizer.vocabulary_

#max_features = 200

doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in corpus]
docs = []
for i in range(len(doc_terms_list_train)):
  terms = []
  for j in range(len(doc_terms_list_train[i])):
    w = doc_terms_list_train[i][j]
    if w in vocab:
      terms.append(w)
  if len(terms) > seq_length:
    seq_length=len(terms)
  docs.append(terms)

datasets = np.zeros((X.shape[0],seq_length,max_features))

for i in range(len(docs)):
  n_padding = seq_length - len(docs[i])

  for j in range(len(docs[i])):
    w = docs[i][j]
    idx = vocab[w]
    tfidf_val = X[i,idx]
    datasets[i,j+n_padding,idx] = tfidf_val

datasets = datasets.astype(np.float32)
y = y.astype(np.float32)

X_train, X_val, y_train, y_val = train_test_split(datasets, y, test_size=0.3, random_state = 1012)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(700, 311, 200) (300, 311, 200) (700, 2) (300, 2)


###Creating Dataloader objects

In [7]:
#create train/val dataloader

batch_size = 16

train_data = TensorDataset(torch.from_numpy(X_train),torch.from_numpy(y_train))
val_data = TensorDataset(torch.from_numpy(X_val),torch.from_numpy(y_val))

train_loader = DataLoader(train_data,shuffle=True,batch_size=batch_size)
val_loader = DataLoader(val_data,shuffle=True,batch_size=batch_size)


###Creating RNN Model

In [8]:
class Model(nn.Module):

  def __init__(self,input_size,output_size,hidden_size,n_layers):
    super().__init__()

    self.hidden_size = hidden_size
    self.n_layers = n_layers

    self.rnn = nn.RNN(input_size,hidden_size,n_layers,batch_first=True)
    self.fc1 = nn.Linear(hidden_size,output_size)
    self.fc2 = nn.Linear(output_size,2)

  def forward(self,x,hidden):
    batch_size = x.size()[0]

    hidden = self.init_hidden(batch_size)
    #print(hidden.size())
    rnn_out,hidden = self.rnn(x,hidden)

    rnn_out = self.fc1(rnn_out)

    last_out = rnn_out[:,-1,:].view(batch_size,-1)

    out = F.softmax(self.fc2(last_out))

    return out,hidden

  def init_hidden(self,batch_size):
    hidden = torch.zeros(self.n_layers,batch_size,self.hidden_size).cuda()
    return hidden

model = Model(200,32,256,2)
print(model)

Model(
  (rnn): RNN(200, 256, num_layers=2, batch_first=True)
  (fc1): Linear(in_features=256, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=2, bias=True)
)


###Please Activate GPU before running

In [9]:
#Using GPU
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
  model.to(device) 

In [11]:
# defining hyperparameters

n_epochs = 10
lr = 1e-4
counter = 0
clip = 5

# defining loss and optimzier functions

criterion = nn.CrossEntropyLoss()
optimzier = torch.optim.Adam(model.parameters(), lr=lr)

model.train()
for epoch in range(n_epochs):
  #initialize hidden state
  h = model.init_hidden(batch_size)

  #batch loop
  for inputs, labels in train_loader:
    counter+=1
    inputs, labels = inputs.to(device), labels.to(device)
    model.zero_grad()
    outputs,h = model(inputs,h)
    loss = criterion(outputs,torch.max(labels,1)[1])
    loss.backward()

    #using clip grad norm which prevents exploding gradient prob in RNN
    nn.utils.clip_grad_norm_(model.parameters(),clip)
    optimzier.step()

    #validation loss
    
    val_h = model.init_hidden(batch_size).cuda()
    val_losses =[]

    model.eval()

    for inputs,labels in val_loader:
  
      inputs,labels = inputs.to(device), labels.to(device)
      val_outputs,val_h = model(inputs,val_h)
      val_loss = criterion(val_outputs,torch.max(labels,1)[1])
      val_losses.append(val_loss.item())

    model.train()

    print('Epoch:{}/{}'.format(epoch+1,n_epochs),
          'Batch:{}'.format(counter),
          'Train Loss:{:.5f}'.format(loss.item()),
          'Val Loss:{:.5f}'.format(np.mean(val_losses)))



Epoch:1/10 Batch:1 Train Loss:0.74687 Val Loss:0.73718
Epoch:1/10 Batch:2 Train Loss:0.73665 Val Loss:0.72694
Epoch:1/10 Batch:3 Train Loss:0.72692 Val Loss:0.71632
Epoch:1/10 Batch:4 Train Loss:0.71615 Val Loss:0.70519
Epoch:1/10 Batch:5 Train Loss:0.70481 Val Loss:0.69342
Epoch:1/10 Batch:6 Train Loss:0.69344 Val Loss:0.68086
Epoch:1/10 Batch:7 Train Loss:0.68062 Val Loss:0.66742
Epoch:1/10 Batch:8 Train Loss:0.66720 Val Loss:0.65294
Epoch:1/10 Batch:9 Train Loss:0.65303 Val Loss:0.63726
Epoch:1/10 Batch:10 Train Loss:0.63729 Val Loss:0.62019
Epoch:1/10 Batch:11 Train Loss:0.62016 Val Loss:0.60158
Epoch:1/10 Batch:12 Train Loss:0.60151 Val Loss:0.58126
Epoch:1/10 Batch:13 Train Loss:0.58103 Val Loss:0.55914
Epoch:1/10 Batch:14 Train Loss:0.55898 Val Loss:0.53531
Epoch:1/10 Batch:15 Train Loss:0.53575 Val Loss:0.50998
Epoch:1/10 Batch:16 Train Loss:0.50976 Val Loss:0.48367
Epoch:1/10 Batch:17 Train Loss:0.48330 Val Loss:0.45716
Epoch:1/10 Batch:18 Train Loss:0.45714 Val Loss:0.43151
E