### HW2 - Part 1

In [2]:
'''
Importing all libraries
'''
from copy import deepcopy
from numpy import argmax
import contractions
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import nltk
import torch
import gensim
import warnings
from sklearn.metrics import accuracy_score
from numpy import vstack
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
import gensim.downloader as api
from sklearn.svm import LinearSVC as SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix as cm
from sklearn.linear_model import Perceptron
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the WordNet corpus for nltk (reported as already up-to-date when present).
nltk.download('wordnet')
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# NOTE(review): this only binds a Python variable named CUDA_LAUNCH_BLOCKING.
# If the intent was to configure the CUDA runtime, that presumably requires the
# environment variable of the same name to be set instead - TODO confirm.
CUDA_LAUNCH_BLOCKING = 1


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/darkghost/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Creating a class "DataTransformation" to manage preprocessing of data

Usage of functions:

1. read_file(): reads the tsv file and returns the dataframe
2. df_formation(): reads the dataframe and picks 50k reviews of each star rating and returns the final combined df
3. label() and apply_label(): assign each review a label: 1 (rating above 3), 2 (rating below 3) or 3 (rating of exactly 3)
4. remove_html_and_url(): removes HTML and URLs from the reviews 
5. tokenize(): tokenizes the reviews
6. without_preprocess(): returns df without doing all preprocessing, just tokenized
7. with_preprocess(): returns preprocessed and tokenized reviews
8. train_test_split(): splits the df into 80%-20% train-test split

In [4]:
class DataTranformation(object):
    """Load the review TSV and produce labelled, tokenized train/test DataFrames.

    The methods form a pipeline in which each step calls the previous one:
    read_file -> df_formation -> apply_label ->
    with_preprocess / without_preprocess -> train_test_split.

    Args:
        filename: path of the tab-separated review file.
        preprocess: when truthy, train_test_split() cleans the text
            (lower-casing, HTML/URL removal, letters only) before tokenizing;
            otherwise the raw text is tokenized as-is.
    """

    # Star ratings sampled (in this order) to build the balanced dataset.
    STAR_RATINGS = (1, 2, 3, 4, 5)

    def __init__(self, filename, preprocess):
        self.filename = filename
        self.random_state = 10  # seed used when sampling reviews per rating
        self.n = 50000          # number of reviews sampled per star rating
        self.preprocess = preprocess
        print("Preproces: " + str(preprocess))

    def read_file(self, error_bad_lines=False, warn_bad_lines=False, sep="\t"):
        """Read the review file and drop every row containing a missing value.

        Args:
            error_bad_lines: raise on malformed lines instead of skipping them.
            warn_bad_lines: when malformed lines are skipped, warn about each one.
            sep: field separator of the file.

        Returns:
            The parsed DataFrame without rows that contain NaN.
        """
        # Newer pandas releases replaced the two legacy flags with the single
        # `on_bad_lines` option; translate the flags so both APIs behave alike.
        if error_bad_lines:
            on_bad_lines = "error"
        elif warn_bad_lines:
            on_bad_lines = "warn"
        else:
            on_bad_lines = "skip"

        try:
            df = pd.read_csv(self.filename, sep=sep, on_bad_lines=on_bad_lines)
        except TypeError:
            # Older pandas does not know `on_bad_lines`; fall back to the
            # legacy keyword arguments.
            df = pd.read_csv(self.filename, sep=sep,
                             error_bad_lines=error_bad_lines,
                             warn_bad_lines=warn_bad_lines)
        return df.dropna()

    def df_formation(self, row1='review_body', row2='star_rating'):
        """Build a class-balanced dataset with `self.n` reviews per star rating.

        Args:
            row1: name of the review text column.
            row2: name of the star rating column (also used for the sampling).
                Later pipeline steps read the columns `review_body` and
                `star_rating` by name, so the defaults are what they expect.

        Returns:
            DataFrame with the two columns, ratings 1..5 stacked in order and
            a fresh 0..N-1 index.

        Raises:
            ValueError: (from pandas) if a rating has fewer than `self.n` rows.
        """
        df = self.read_file()
        df = df[[row1, row2]].dropna()

        samples = [
            df[df[row2] == stars].sample(n=self.n, random_state=self.random_state)
            for stars in self.STAR_RATINGS
        ]
        return pd.concat(samples).reset_index(drop=True)

    def label(self, rows):
        """Map one row to its class: 1 (rating > 3), 2 (rating < 3), 3 otherwise."""
        if rows.star_rating > 3:
            return 1
        elif rows.star_rating < 3:
            return 2
        else:
            return 3

    def apply_label(self):
        """Return the balanced dataset with an added `label` column."""
        dataset = self.df_formation()
        dataset['label'] = dataset.apply(self.label, axis=1)
        return dataset

    def remove_html_and_url(self, s):
        """Strip URL-like spans and HTML markup from the string `s`."""
        # NOTE(review): the scheme is optional in this pattern, so any
        # "token.token" span (e.g. a missing space after a full stop) matches
        # as well and is removed together with the text the last group
        # consumes. Left unchanged to keep results stable - confirm intent.
        s = re.sub(
            r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)', '', s, flags=re.MULTILINE)
        return BeautifulSoup(s, 'html.parser').get_text()

    def tokenize(self, s):
        """Split `s` into tokens with nltk's word_tokenize."""
        return word_tokenize(s)

    def without_preprocess(self):
        """Return the labelled dataset with reviews tokenized but not cleaned."""
        dataset = self.apply_label()
        dataset.review_body = dataset.review_body.apply(self.tokenize)
        return dataset

    def with_preprocess(self):
        """Return the labelled dataset with reviews cleaned and tokenized."""
        dataset = self.apply_label()
        dataset.review_body = dataset.review_body.str.lower()
        dataset.review_body = dataset.review_body.apply(self.remove_html_and_url)

        # Every run of characters other than letters/apostrophes (spaces
        # included) becomes exactly one space, so no separate pass for
        # collapsing repeated spaces is needed afterwards.
        dataset.review_body = dataset.review_body.apply(
            lambda s: re.sub("[^a-zA-Z']+", " ", s))

        dataset.review_body = dataset.review_body.apply(self.tokenize)
        # The former `dataset.dropna()` here discarded its result and was a
        # no-op; rows with missing values are already removed upstream.
        return dataset

    def train_test_split(self):
        """Split the prepared dataset 80%/20% into train and test frames.

        Returns:
            (train, test) DataFrames, each with a fresh 0..N-1 index.
        """
        if self.preprocess:
            dataset = self.with_preprocess()
        else:
            dataset = self.without_preprocess()

        train = dataset.sample(frac=0.8, random_state=200)
        test = dataset.drop(train.index)
        train = train.reset_index(drop=True)
        test = test.reset_index(drop=True)

        return train, test

### Creating class Vectorization to generate feature vectors of the words based on the requirements

Functions are as follows:

1. get_mean_vector(): returns the feature vector values for every in-vocabulary word in the review
2. feature_extraction(): keeps the leading word vectors of a review (up to 50 when padding is requested, exactly the first 10 in "vec" mode) or calculates the mean vector of the full review
3. pad_review(): pads the reviews to the desired length
4. join_words(): converts a list of words back into one sentence

In [5]:
class Vectorization(object):
    """Turn tokenized reviews into Word2Vec based feature vectors.

    Args:
        model: a trained gensim Word2Vec model (model_type="model") or a
            word -> vector lookup such as pretrained KeyedVectors
            (model_type="pretrained").
        dataset: object exposing `review_body` (token lists) and `label`
            as parallel iterables, e.g. a DataFrame.
        model_type: "model" for our own model, "pretrained" otherwise.
        classification: "binary" skips reviews labelled 3; any other value
            keeps all labels.
        mode: "vec" keeps the first VEC_LEN word vectors of a review; any
            other value averages all word vectors. Ignored when `pad` is set.
        pad: when truthy, keep up to PAD_LEN word vectors per review (the
            padding itself is done separately with pad_review()).

    Raises:
        ValueError: if `model_type` is neither "model" nor "pretrained".
    """

    PAD_LEN = 50  # maximum number of word vectors kept per review when pad=True
    VEC_LEN = 10  # exact number of word vectors kept per review when mode="vec"

    def __init__(self, model, dataset, model_type="model", classification="binary", mode="mean", pad=False):
        self.model = model
        self.dataset = dataset
        self.model_type = model_type  # our own model or pretrained
        self.classification = classification  # binary or multi-class
        if self.model_type == "pretrained":
            self.vocab = self.model
        elif self.model_type == "model":
            self.vocab = self.model.wv
        else:
            # Fail early: without a vocabulary every later lookup would fail.
            raise ValueError(
                "model_type must be 'model' or 'pretrained', got %r" % (model_type,))

        self.mode = mode
        self.pad = pad

        print("Vectorizing training dataset....")
        print("Model Type: " + self.model_type)
        print("Classification: " + self.classification)

    def get_mean_vector(self, data_review_body, data_label):
        """Look up the vector of every in-vocabulary word of one review.

        Despite its name this returns the individual word vectors; averaging
        happens in feature_extraction().

        Args:
            data_review_body: iterable of tokens of one review.
            data_label: label of that review.

        Returns:
            (list of word vectors, data_label), or None when the review is
            skipped: it has label 3 in binary mode, or none of its words is in
            the vocabulary.
        """
        if self.classification == "binary" and data_label == 3:
            return None

        # Membership is tested on the vocabulary object itself rather than on
        # its `index_to_key` list, which would be a linear scan per word.
        rev = [np.array(self.vocab[word])
               for word in data_review_body if word in self.vocab]
        if not rev:
            return None
        return rev, data_label

    def feature_extraction(self):
        """Build the feature list and the matching label list for the dataset.

        Returns:
            (feature, y_label): parallel lists. Each feature is a list of up
            to PAD_LEN vectors (pad=True), a list of exactly VEC_LEN vectors
            (mode="vec"; shorter reviews are dropped) or one mean vector.
        """
        feature = []
        y_label = []
        for data_review_body, data_label in zip(self.dataset.review_body, self.dataset.label):
            result = self.get_mean_vector(data_review_body, data_label)
            if result is None:
                # Review skipped on purpose (see get_mean_vector); genuine
                # errors are no longer swallowed silently.
                continue
            x, y = result

            if self.pad:
                feature.append(x[:self.PAD_LEN])
                y_label.append(y)
            elif self.mode == "vec":
                if len(x) >= self.VEC_LEN:
                    feature.append(x[:self.VEC_LEN])
                    y_label.append(y)
            else:
                feature.append(np.mean(x, axis=0))
                y_label.append(y)
        print("Vectorization Completed")
        return feature, y_label

    def pad_review(self, review, seq_len, dim=300):
        """Left-pad (or truncate) a review to exactly `seq_len` word vectors.

        Args:
            review: sequence of word vectors, each of length `dim`.
            seq_len: number of rows of the result.
            dim: dimensionality of the word vectors.

        Returns:
            Float array of shape (seq_len, dim): the first `seq_len` vectors
            of the review placed at the end, preceded by zero rows. An empty
            review yields an all-zero array.
        """
        features = np.zeros((seq_len, dim), dtype=float)
        vectors = np.array(review)[:seq_len]
        if len(vectors) > 0:
            features[seq_len - len(vectors):] = vectors

        return features

    def join_words(self, x):
        """Join the list of words `x` back into one space-separated sentence."""
        return ' '.join(x)

The `Sentence` class returns one review at a time from the dataset through the use of `__iter__`.

In [6]:
class Sentence(object):
    """Iterable wrapper that hands out the wrapped dataset one row at a time."""

    def __init__(self, dataset):
        # Keep a reference only; nothing is copied or consumed here.
        self.dataset = dataset

    def __iter__(self):
        # A fresh generator per call, so the object can be iterated repeatedly.
        yield from self.dataset

### Class to train and evaluate the Perceptron

In [23]:
class Percept(object):
    """Fit a scikit-learn Perceptron and report binary classification scores."""

    def __init__(self, X_train, Y_train, X_test, Y_test, max_iter=100, random_state=20, eta0=0.01, verbose=0):
        # Data splits.
        self.X_train, self.Y_train = X_train, Y_train
        self.X_test, self.Y_test = X_test, Y_test
        # Hyper-parameters forwarded unchanged to the Perceptron.
        self.max_iter, self.random_state = max_iter, random_state
        self.eta0, self.verbose = eta0, verbose

    def metrics(self, true, pred):
        """Return [accuracy, precision, recall, f1] from the 2x2 confusion matrix.

        NOTE(review): which class counts as "positive" follows the cell order
        of the confusion matrix; presumably that is the larger of the two
        labels - confirm against the sklearn confusion_matrix documentation.
        """
        tn, fp, fn, tp = cm(true, pred).ravel()
        total = tn + fp + fn + tp
        accuracy = (tp + tn) / total
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1_score = 2 * (recall * precision) / (recall + precision)
        return [accuracy, precision, recall, f1_score]

    def print_seq(self, score_list):
        """Print the first four scores with six decimals, two spaces apart."""
        print("%.6f  %.6f  %.6f  %.6f" % (
            score_list[0], score_list[1], score_list[2], score_list[3]))

    def perceptron_model(self):
        """Create the Perceptron, fit it on the training split and return it."""
        settings = dict(max_iter=self.max_iter, random_state=self.random_state,
                        eta0=self.eta0, verbose=self.verbose)
        classifier = Perceptron(**settings)

        print("Fitting the Model...")
        classifier.fit(self.X_train, self.Y_train)
        return classifier

    def evaluation(self):
        """Fit the model, print train/test scores and return the test scores."""
        classifier = self.perceptron_model()
        train_score = self.metrics(self.Y_train, classifier.predict(self.X_train))
        test_score = self.metrics(self.Y_test, classifier.predict(self.X_test))

        for heading, scores in (("Training Score", train_score),
                                ("Testing Score", test_score)):
            print(heading)
            self.print_seq(scores)

        return test_score

### Class to train and evaluate the SVM

In [22]:
class SVM(object):
    """Fit a linear SVM (scikit-learn LinearSVC) and report binary scores."""

    def __init__(self, X_train, Y_train, X_test, Y_test, max_iter=500):
        # Data splits.
        self.X_train, self.Y_train = X_train, Y_train
        self.X_test, self.Y_test = X_test, Y_test
        # Iteration cap forwarded to the classifier.
        self.max_iter = max_iter

    def intitalize_model(self):
        """Create the linear SVM, fit it on the training split and return it."""
        classifier = SVC(max_iter=self.max_iter)

        print("Fitting the SVM")
        return classifier.fit(self.X_train, self.Y_train)

    def print_seq(self, score_list):
        """Print the first four scores with six decimals, two spaces apart."""
        print("%.6f  %.6f  %.6f  %.6f" % (
            score_list[0], score_list[1], score_list[2], score_list[3]))

    def metrics(self, true, pred):
        """Return [accuracy, precision, recall, f1] from the 2x2 confusion matrix.

        NOTE(review): which class counts as "positive" follows the cell order
        of the confusion matrix; presumably that is the larger of the two
        labels - confirm against the sklearn confusion_matrix documentation.
        """
        tn, fp, fn, tp = cm(true, pred).ravel()
        total = tn + fp + fn + tp
        accuracy = (tp + tn) / total
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1_score = 2 * (recall * precision) / (recall + precision)
        return [accuracy, precision, recall, f1_score]

    def evaluation(self):
        """Fit the model, print train/test scores and return the test scores."""
        classifier = self.intitalize_model()
        train_score = self.metrics(self.Y_train, classifier.predict(self.X_train))
        test_score = self.metrics(self.Y_test, classifier.predict(self.X_test))

        for heading, scores in (("Training Score", train_score),
                                ("Testing Score", test_score)):
            print(heading)
            self.print_seq(scores)

        return test_score

### Reading the file and carrying out preprocessing

In [11]:
# Path of the review TSV, relative to the notebook's working directory.
filename = "./amazon_reviews_us_Kitchen_v1_00.tsv"
# Second argument is `preprocess`: True selects the cleaning pipeline
# (with_preprocess) when train_test_split() is called later.
dt = DataTranformation(filename, True)

Preproces: True


### Splitting data and generating pretrained and self-trained word2vec models

In [12]:
# Runs the whole preprocessing pipeline and returns the 80%/20% split.
train, test = dt.train_test_split()

# Wrap the tokenized training reviews so they can be iterated more than once.
sentences = Sentence(train['review_body'])

# Pretrained vectors fetched through gensim's downloader API.
pretrained_model = api.load('word2vec-google-news-300')
# Self-trained Word2Vec on the training reviews only (300-dim vectors, words
# seen fewer than 10 times ignored, context window of 11).
# NOTE(review): a fixed `seed` alone presumably does not make training fully
# reproducible when several worker threads are used - confirm in gensim docs.
model = gensim.models.Word2Vec(sentences, vector_size = 300, min_count = 10, window = 11, seed = 200)

#### Semantic similarities in pretrained model

In [13]:

# Analogy query: top-1 word closest to 'woman' + 'king' - 'man'.
print(pretrained_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))
# Similarity score between two near-synonyms.
print(pretrained_model.similarity('excellent', 'outstanding'))

[('queen', 0.7118193507194519)]
0.5567486


#### Semantic Similarities in Self-Trained Model

In [14]:
# Same two queries as for the pretrained model, run on the self-trained
# vectors (`model.wv`) so the outputs can be compared directly.
print(model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))
print(model.wv.similarity('excellent', 'outstanding'))

[('arthur', 0.536632239818573)]
0.7561389


From the observation, it looks like finding the most similar word works better with the pretrained model, but the similarity between two words is in some cases better in our self-trained model.

### Self-trained model feature extraction

In [15]:
# Both vectorizers use the defaults: model_type="model", classification="binary"
# (reviews labelled 3 are skipped) and mode="mean" (one averaged vector per review).
vec_train = Vectorization(model = model, dataset = train)
vec_test = Vectorization(model, test)

# Features and labels are returned as parallel lists.
X_train_model, Y_train_model = vec_train.feature_extraction()
X_test_model, Y_test_model = vec_test.feature_extraction()

Vectorizing training dataset....
Model Type: model
Classification: binary
Vectorizing training dataset....
Model Type: model
Classification: binary
Vectorization Completed
Vectorization Completed


### Pre-trained model feature extraction

In [16]:
# Same setup as for the self-trained model, but word vectors are looked up in
# the pretrained vectors (model_type="pretrained").
vec2_train = Vectorization(model = pretrained_model, dataset = train, model_type = "pretrained")
vec2_test = Vectorization(model = pretrained_model, dataset = test, model_type = "pretrained")

# Features and labels are returned as parallel lists.
X_train_pre, Y_train_pre = vec2_train.feature_extraction()
X_test_pre, Y_test_pre = vec2_test.feature_extraction()

Vectorizing training dataset....
Model Type: pretrained
Classification: binary
Vectorizing training dataset....
Model Type: pretrained
Classification: binary
Vectorization Completed
Vectorization Completed


### TF-IDF feature extraction

In [17]:
def get_tfidf(train, test):
    """Build dense TF-IDF feature frames for tokenized train and test reviews.

    The vectorizer is fitted on the training reviews only and then applied to
    the test reviews, so both frames share the same columns.

    Args:
        train: pandas Series of token lists (training reviews).
        test: pandas Series of token lists (test reviews).

    Returns:
        (train_x_vectors, test_x_vectors): DataFrames with one column per
        vocabulary term and a default 0..N-1 index.
    """
    # TfidfVectorizer expects raw strings, so join the tokens back together.
    train_x = train.apply(lambda tokens: " ".join(tokens))
    test_x = test.apply(lambda tokens: " ".join(tokens))

    # min_df=0.001: ignore terms that occur in fewer than 0.1% of the documents.
    tfidf_vect = TfidfVectorizer(min_df = 0.001)
    train_matrix = tfidf_vect.fit_transform(train_x)
    test_matrix = tfidf_vect.transform(test_x)

    # `get_feature_names` was replaced by `get_feature_names_out` in newer
    # scikit-learn releases; support both and compute the names only once.
    if hasattr(tfidf_vect, "get_feature_names_out"):
        feature_names = tfidf_vect.get_feature_names_out()
    else:
        feature_names = tfidf_vect.get_feature_names()

    train_x_vectors = pd.DataFrame(train_matrix.toarray(), columns = feature_names)
    test_x_vectors = pd.DataFrame(test_matrix.toarray(), columns = feature_names)
    return train_x_vectors, test_x_vectors

In [18]:
# Binary task: drop the reviews labelled 3 and renumber the rows so features
# and labels stay aligned by position.
train_tfidf = train[train.label != 3].reset_index(drop = True)
test_tfidf = test[test.label != 3].reset_index(drop = True)
X_train_tfidf, X_test_tfidf = get_tfidf(train_tfidf.review_body, test_tfidf.review_body)
Y_train_tfidf = train_tfidf['label']
Y_test_tfidf = test_tfidf['label']

### Training Perceptron on all three types of feature vectors

In [24]:
# Perceptron on self-trained Word2Vec features; evaluation() prints train and
# test scores (accuracy, precision, recall, F1) and returns the test scores.
per = Percept(X_train = X_train_model, Y_train = Y_train_model, X_test = X_test_model, Y_test = Y_test_model)
model_test_score = per.evaluation()

# Perceptron on pretrained Word2Vec features.
per2 = Percept(X_train = X_train_pre, Y_train = Y_train_pre, X_test = X_test_pre, Y_test = Y_test_pre)
model_pre_test_score = per2.evaluation()

# Perceptron on TF-IDF features.
per3 = Percept(X_train = X_train_tfidf, Y_train = Y_train_tfidf, X_test = X_test_tfidf, Y_test = Y_test_tfidf)
model_tfidf_test_score = per3.evaluation()

Fitting the Model...
Training Score
0.828443  0.793716  0.886345  0.837477
Testing Score
0.829329  0.798938  0.885727  0.840097
Fitting the Model...
Training Score
0.763225  0.690646  0.951428  0.800329
Testing Score
0.767361  0.698484  0.950958  0.805398
Fitting the Model...
Training Score
0.783854  0.956979  0.593071  0.732307
Testing Score
0.778406  0.954277  0.590347  0.729439


From the observation, the perceptron with self-trained Word2Vec features performed best in terms of accuracy on the current dataset, whereas the pretrained and TF-IDF features performed similarly on the basis of accuracy.

The results may vary according to the number of iterations and learning rate. 

### Training SVM on three types of Feature-vectors

In [25]:
# Linear SVM on self-trained Word2Vec features; evaluation() prints train and
# test scores (accuracy, precision, recall, F1) and returns the test scores.
svm = SVM(X_train = X_train_model, Y_train = Y_train_model, X_test = X_test_model, Y_test = Y_test_model)
svm_model_test_score = svm.evaluation()

# Linear SVM on pretrained Word2Vec features.
svm2 = SVM(X_train = X_train_pre, Y_train = Y_train_pre, X_test = X_test_pre, Y_test = Y_test_pre)
svm_pre_test_score = svm2.evaluation()

# Linear SVM on TF-IDF features.
svm3 = SVM(X_train = X_train_tfidf, Y_train = Y_train_tfidf, X_test = X_test_tfidf, Y_test = Y_test_tfidf)
svm_tfidf_test_score = svm3.evaluation()

Fitting the SVM
Training Score
0.865810  0.852399  0.883987  0.867906
Testing Score
0.867006  0.858566  0.882659  0.870446
Fitting the SVM
Training Score
0.829827  0.812739  0.856039  0.833827
Testing Score
0.830619  0.818565  0.854902  0.836339
Fitting the SVM
Training Score
0.892099  0.888155  0.896438  0.892277
Testing Score
0.886275  0.885772  0.890021  0.887891


From the observation, the TF-IDF SVM performs best in terms of accuracy, followed by the self-trained and pretrained SVMs.
Results may vary according to the type of kernel used and the number of iterations.

### In conclusion, self-trained Word2Vec gives decent performance on average compared to the other two feature extraction models for Perceptron and SVM. 