## HW2 - Part 3: Q5 (RNN and GRU)

In [1]:
'''
Importing all libraries
'''
from copy import deepcopy
from numpy import argmax
import contractions
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import nltk
import torch
import gensim
import warnings
from sklearn.metrics import accuracy_score
from numpy import vstack
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
import gensim.downloader as api
from sklearn.svm import LinearSVC as SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix as cm
from sklearn.linear_model import Perceptron
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('wordnet')
warnings.filterwarnings('ignore')
CUDA_LAUNCH_BLOCKING = 1

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/darkghost/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
class DataTranformation(object):

    def __init__(self, filename, preprocess):
        self.filename = filename
        self.random_state = 10
        self.n = 50000
        self.preprocess = preprocess
        print("Preproces: " + str(preprocess))

    def read_file(self, error_bad_lines=False, warn_bad_lines=False, sep="\t"):
        df = pd.read_csv(self.filename, sep=sep,
                         error_bad_lines=error_bad_lines, warn_bad_lines=warn_bad_lines)
        df = df.dropna()
        return df

    def formation(self, row1='review_body', row2='star_rating', ):
        df = self.read_file()
        df = df[[row1, row2]]
        df = df.dropna()

        dataset = pd.concat([df[df['star_rating'] == 1].sample(n=50000, random_state=10),
                             df[df['star_rating'] == 2].sample(
                                 n=50000, random_state=10),
                             df[df['star_rating'] == 3].sample(
                                 n=50000, random_state=10),
                             df[df['star_rating'] == 4].sample(
                                 n=50000, random_state=10),
                             df[df['star_rating'] == 5].sample(n=50000, random_state=10)])

        dataset = dataset.reset_index(drop=True)

        return dataset

    def label(self, rows):
        if rows.star_rating > 3:
            return 1
        elif rows.star_rating < 3:
            return 2
        else:
            return 3

    def apply_label(self):
        dataset = self.formation()
        dataset['label'] = dataset.apply(lambda row: self.label(row), axis=1)

        return dataset

    def remove_html_and_url(self, s):
        s = re.sub(
            r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)', '', s, flags=re.MULTILINE)
        soup = BeautifulSoup(s, 'html.parser')
        s = soup.get_text()
        return s

    def tokenize(self, s):
        text_tokens = word_tokenize(s)
        return text_tokens

    def without_preprocess(self):
        dataset = self.apply_label()
        dataset.review_body = dataset.review_body.apply(self.tokenize)
        return dataset

    def with_preprocess(self):
        dataset = self.apply_label()
        dataset.review_body = dataset.review_body.str.lower()

        dataset.review_body = dataset.review_body.apply(
            lambda s: self.remove_html_and_url(s))
        dataset.review_body = dataset.review_body.apply(
            lambda s: re.sub("[^a-zA-Z']+", " ", s))
        dataset.review_body = dataset.review_body.apply(
            lambda s: re.sub(' +', ' ', s))

        dataset.review_body = dataset.review_body.apply(self.tokenize)

        dataset.dropna()
        return dataset

    def train_test_split(self):

        if self.preprocess:
            dataset = self.with_preprocess()
        else:
            dataset = self.without_preprocess()

        train = dataset.sample(frac=0.8, random_state=200)
        test = dataset.drop(train.index)
        train = train.reset_index(drop=True)
        test = test.reset_index(drop=True)

        return train, test

In [3]:
class Vectorization(object):

    def __init__(self, model, dataset, model_type="model", classification="binary", mode="mean", pad=False):
        self.model = model
        self.dataset = dataset
        self.model_type = model_type 
        self.classification = classification  
        if self.model_type == "pretrained":
            self.vocab = self.model
        if self.model_type == "model":
            self.vocab = self.model.wv

        self.mode = mode
        self.pad = pad

    def get_mean_vector(self, data_review_body, data_label):

        if self.classification == "binary":
            if data_label != 3:
                if self.model_type == "model":
                    words = [
                        word for word in data_review_body if word in self.vocab.index_to_key]
                    if len(words) >= 1:
                        rev = []
                        for word in words:
                            rev.append(np.array(self.vocab[word]))

                        if type(data_label) is not int:
                            print("Found")
                        return rev, data_label
                else:
                    words = [
                        word for word in data_review_body if word in self.vocab]
                    if len(words) >= 1:
                        rev = []
                        for word in words:
                            rev.append(np.array(self.vocab[word]))

                        if type(data_label) is not int:
                            print("Found")
                        return rev, data_label

        else:
            if self.model_type == "mode":
                words = [
                    word for word in data_review_body if word in self.vocab.index_to_key]
                if len(words) >= 1:
                    rev = []
                    for word in words:
                        rev.append(np.array(self.vocab[word]))
                    return rev, data_label
            else:
                words = [word for word in data_review_body if word in self.vocab]
                if len(words) >= 1:
                    rev = []
                    for word in words:
                        rev.append(np.array(self.vocab[word]))
                    return rev, data_label

    def feature_extraction(self):
        feature = []
        y_label = []
        for data_review_body, data_label in zip(self.dataset.review_body, self.dataset.label):
            try:
                x, y = self.get_mean_vector(data_review_body, data_label)
                if self.pad:
                    if len(x) >= 50:
                        feature.append(x[:50])
                        y_label.append(y)
                    else:
                        feature.append(x)
                        y_label.append(y)
                else:
                    if self.mode == "vec":
                        if len(x) >= 10:
                            feature.append(x[:10])
                            y_label.append(y)
                    else:
                        feature.append(np.mean(x, axis=0))
                        y_label.append(y)
            except:
                pass
        print("Vectorization Completed")
        return feature, y_label

    def pad_review(self, review, seq_len):

        features = np.zeros((seq_len, 300), dtype=float)
        features[-len(review):] = np.array(review)[:seq_len]

        return features

    def join_words(self, x):
        y = ""
        for ele in x:
            y = ' '.join(ele)
        return y

In [4]:
class Sentence(object):
    def __init__(self, dataset):
        self.dataset = dataset

    def __iter__(self):
        for row in self.dataset:
            yield row

In [5]:
class RNN_Data(Dataset):

    def __init__(self, X_data, Y_data):

        self.X_data = X_data
        self.Y_data = Y_data

    def __len__(self):

        return len(self.X_data)

    def __getitem__(self, index):
        pad = np.zeros((50, 300), dtype=float)
        pad[-len(self.X_data[index]):] = np.array(self.X_data[index])[:50]
        X = torch.FloatTensor(pad)
        Y = torch.tensor(self.Y_data[index])
        return X, Y

In [6]:
class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers, model_type="rnn"):
        super(Model, self).__init__()

        # Defining some parameters
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.model_type = model_type

        if self.model_type == "gru":
            self.layer = nn.GRU(input_size, hidden_dim,
                                n_layers, batch_first=True)
        else:
            self.layer = nn.RNN(input_size, hidden_dim,
                                n_layers, batch_first=True)
        # Fully connected layer
        self.fc = nn.Linear(2500, output_size)

    def forward(self, x):

        batch_size = x.size(0)
        # Initializing hidden state for first input using method defined below
        hidden = self.init_hidden(batch_size)
        # Passing in the input and hidden state into the model and obtaining outputs
        out, hidden = self.layer(x, hidden)
        # Reshaping the outputs such that it can be fit into the fully connected layer
        out = out.contiguous().view(-1, out.shape[1] * out.shape[2])
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim).cuda()
        return hidden

In [7]:
filename = "./amazon_reviews_us_Kitchen_v1_00.tsv"
dt = DataTranformation(filename, True)
train, test = dt.train_test_split()
sentences = Sentence(train['review_body'])

Preproces: True


In [8]:
pretrained_model = api.load('word2vec-google-news-300')
model = gensim.models.Word2Vec(
    sentences, vector_size=300, min_count=10, window=11, seed=200)

In [9]:
def my_collate(batch):
    '''
     collate_fn is your callable/function that processes the batch you want to return from your dataloader
    '''
    data = [item[0] for item in batch]
    target = [item[1] for item in batch]
    return data, target


def rnn_train(model, epoch, dataset_x, dataset_y, name):
    
    rnn_train = RNN_Data(dataset_x, dataset_y)
    train_loader_mode = DataLoader(dataset = rnn_train, batch_size=8, shuffle = True, collate_fn=my_collate, drop_last=True)
    
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    
    for ep in range(1, epoch + 1):
        
        for input_data, label in train_loader_mode:
            optimizer.zero_grad()
            input_data = torch.stack(input_data)
            label = torch.stack(label)
            output, hidden = model(input_data.to(device))
            loss = criterion(output, label.to(device))
            loss.backward()
            optimizer.step()
            
        # print('Epoch: {} \tTraining Loss: {:.6f}'.format(ep, loss.item()))
        torch.save(model.state_dict(), name + str(ep) + '.pt')
        
def rnn_test(model, epoch, dataset_x, dataset_y, name):
    
    rnn_test = RNN_Data(dataset_x, dataset_y)
    test_loader_mode = DataLoader(dataset = rnn_test, batch_size=8, collate_fn=my_collate, drop_last=True)
    tmp = 0
    for i in range(1, epoch+1):
        model.load_state_dict(torch.load(name +str(i) + '.pt'))
        model = model.to(device)
        
        predictions, actual = list(), list()
        for test_data, test_label in test_loader_mode:
            test_data = torch.stack(test_data)
            test_label = torch.stack(test_label)
            pred, hid = model(test_data.to('cuda'))
            pred = pred.to('cpu')
            pred = pred.detach().numpy()
            pred = argmax(pred, axis= 1)
            target = test_label.numpy()
            target = target.reshape((len(target), 1))
            pred = pred.reshape((len(pred)), 1)
            pred = pred.round()
            predictions.append(pred)
            actual.append(target)
                
        predictions, actual = vstack(predictions), vstack(actual)
        acc = accuracy_score(actual, predictions)
        print('Accuracy: %.3f' % acc)


In [10]:
device = torch.device("cuda")

RNN and GRU with binary and Self Trained Model

In [11]:
rnn_bin = Model(300, 3, 50, 1)
rnn_bin = rnn_bin.to(device)
gru_model_bin = Model(300, 3, 50, 1, model_type="gru")
gru_model_bin = gru_model_bin.to(device)

vec_rnn_train = Vectorization(model, train, classification = "binary", pad = True)
vec_rnn_test = Vectorization(model, test, classification ="binary", pad = True)

X_rnn_train, Y_rnn_train = vec_rnn_train.feature_extraction()
X_rnn_test, Y_rnn_test = vec_rnn_test.feature_extraction()

rnn_train(rnn_bin, 10, X_rnn_train, Y_rnn_train, name = "rnn_model")
rnn_test(rnn_bin, 10, X_rnn_test, Y_rnn_test, name = "rnn_model")

rnn_train(gru_model_bin, 10, X_rnn_train, Y_rnn_train, name = "gru_model")
rnn_test(gru_model_bin, 10, X_rnn_test, Y_rnn_test, name = "gru_model")

del vec_rnn_train, vec_rnn_test, X_rnn_train, X_rnn_test, Y_rnn_train, Y_rnn_test

Vectorization Completed
Accuracy: 0.757
Accuracy: 0.781


RNN and GRU with multi-classification self trained w2v model

In [12]:
rnn = Model(300, 4, 50, 1)
rnn = rnn.to(device)
vec_rnn_multi_train = Vectorization(model, train, classification = "multi-class", pad = True)
vec_rnn_multi_test = Vectorization(model, test, classification = "multi-class", pad = True)

X_rnn_multi_train, Y_rnn_multi_train = vec_rnn_multi_train.feature_extraction()
X_rnn_multi_test, Y_rnn_multi_test = vec_rnn_multi_test.feature_extraction()
print("RNN: ")
rnn_train(rnn, 10, X_rnn_multi_train, Y_rnn_multi_train, name = "rnn_multi_model")
rnn_test(rnn, 10, X_rnn_multi_test, Y_rnn_multi_test, name = "rnn_multi_model")

gru_model = Model(300, 4, 50, 1, model_type="gru")
gru_model = gru_model.to(device)
print("GRU: ")
rnn_train(gru_model, 10, X_rnn_multi_train, Y_rnn_multi_train, name = "gru_multi_model")
rnn_test(gru_model, 10, X_rnn_multi_test, Y_rnn_multi_test, name = "gru_multi_model")

del vec_rnn_multi_train, vec_rnn_multi_test, Y_rnn_multi_train, X_rnn_multi_test, Y_rnn_multi_test


Vectorization Completed
RNN: 
Accuracy: 0.581
GRU: 
Accuracy: 0.601


RNN and GRU with binary and pre-trained w2v model

In [16]:
vec_rnn_pre_train = Vectorization(model = pretrained_model, dataset = train, model_type="pretrained", classification = "binary", mode = "vec", pad = True)
vec_rnn_pre_test = Vectorization(model = pretrained_model, dataset = test, model_type = "pretrained", classification = "binary", mode = "vec", pad = True)

X_rnn_pre_train, Y_rnn_pre_train = vec_rnn_pre_train.feature_extraction()
X_rnn_pre_test, Y_rnn_pre_test = vec_rnn_pre_test.feature_extraction()

print("RNN:")
rnn_train(rnn_bin, 10, X_rnn_pre_train, Y_rnn_pre_train, name = "rnn_pre_model")
rnn_test(rnn_bin, 10, X_rnn_pre_test, Y_rnn_pre_test, name = "rnn_pre_model")

print("GRU: ")
rnn_train(gru_model_bin, 10, X_rnn_pre_train, Y_rnn_pre_train, name = "gru_pre_model")
rnn_test(gru_model_bin, 10, X_rnn_pre_test, Y_rnn_pre_test, name = "gru_pre_model")

del vec_rnn_pre_train, vec_rnn_pre_test,  X_rnn_pre_train, Y_rnn_pre_train, X_rnn_pre_test, Y_rnn_pre_test




Vectorization Completed
RNN:
Accuracy: 0.822
GRU: 
Accuracy: 0.871


RNN and GRU with multi-class and Pretrained w2v model

In [13]:
vec_rnn_pre_multi_train = Vectorization(model = pretrained_model, dataset = train, model_type = "pretrained", classification = "multi-class", mode = "vec", pad = True)
vec_rnn_pre_multi_test = Vectorization(model = pretrained_model, dataset = test, model_type = "pretrained", classification = "multi-class", mode = "vec", pad = True)

X_rnn_pre_multi_train, Y_rnn_pre_multi_train = vec_rnn_pre_multi_train.feature_extraction()
X_rnn_pre_multi_test, Y_rnn_pre_multi_test = vec_rnn_pre_multi_test.feature_extraction()
print("RNN:")
rnn_train(rnn, 10, X_rnn_pre_multi_train, Y_rnn_pre_multi_train, name = "rnn_pre_model_multi")
rnn_test(rnn, 10, X_rnn_pre_multi_test, Y_rnn_pre_multi_test, name = "rnn_pre_model_multi")
print("GRU: ")
rnn_train(gru_model, 10, X_rnn_pre_multi_train, Y_rnn_pre_multi_train, name = "gru_pre_model_multi")
rnn_test(gru_model, 10, X_rnn_pre_multi_test, Y_rnn_pre_multi_test, name = "gru_pre_model_multi")

del vec_rnn_pre_multi_train, vec_rnn_pre_multi_test,  X_rnn_pre_multi_train, Y_rnn_pre_multi_train, X_rnn_pre_multi_test, Y_rnn_pre_multi_test


Vectorization Completed
RNN:
Accuracy: 0.702
GRU: 
Accuracy: 0.738


## Observation:

GRU gives better accuracy compare to RNN in all cases with this data.
