In [1]:
# Allow multiple print statements in a cell in Jupyter Notebook
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"


In [2]:
import pandas as pd
import numpy as np
import nltk

nltk.download("wordnet")
import re


[nltk_data] Downloading package wordnet to /Users/aditya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Install some dependencies
! pip install emot contractions



In [4]:
####################################################
### Data Files
####################################################

DATA_PATH = "/Volumes/dataTwo/usc/CSCI_544/assignment_02/data"
MODEL_PATH = "/Volumes/dataTwo/usc/CSCI_544/assignment_02/model"

ORIGINAL_DATA_FILE = "amazon_reviews_us_Jewelry_v1_00.tsv"
SAMPLED_DATA_FILE = "data_sampled.csv"
CLEANED_DATA_FILE = "data_cleaned.csv"

# Files after clean and split
DATA_FILE = "data.pkl"

# Files after preprocessing the splitted dataset
PREPROCESSED_DATA_FILE = "data_preprocessed.csv"

# Files containing the tfidf data
TFIDF_DATA_FILE = "data_tfidf.csv"

# custom created word vectors for the review dataset
CUSTOM_WORD_VECTORS_MODEL_FILE = "gensim_w2v_amazon_reviews_model"

# train and test data for word2vec avg. word vectors approach
AVG_WORD_VECTORS_DATA_FILE = "data_avg_word_vectors.pkl"

# train and test data for word2vec contatenate top 10 vectors appraoch
TOP_10_WORD_VECTORS_DATA_FILE = "data_avg_word_vectors.pkl"

# train and test data for word2vec contatenate top 10 vectors appraoch
WORDS_20_WORD_VECTORS_DATA_FILE = "data_words_20_word_vectors.pkl"


####################################################
### Model Files
####################################################

PERCEPTRON_TFIDF_MODEL_FILE = "perceptron_tfidf.model"
PERCEPTRON_AVG_WV_MODEL_FILE = "perceptron_avg_wv.model"

SVM_TFIDF_MODEL_FILE = "svm_tfidf.model"
SVM_AVG_WV_MODEL_FILE = "svh_avg_wv.model"

FNN_AVG_WV_MODEL_FILE = "fnn_avg_wv.pth"
FNN_TOP_10_WV_MODEL_FILE = "fnn_top_10_wv.pth"

RNN_TOP_20_WV_MODEL_FILE = "rnn_top_20_wv.pth"
GRU_TOP_20_WV_MODEL_FILE = "gru_top_20_wv.pth"


DATA_COL = "review_body"
TARGET_COL = "star_rating"

N_SAMPLES = 20000

RANDOM_SEED = 42


In [5]:
np.random.seed(RANDOM_SEED)
rng = np.random.default_rng(seed=RANDOM_SEED)


# Data Generation


In [6]:
# Load the tab separated data file, and print the first 5 rows for confirmation
data = pd.read_csv(f"{DATA_PATH}/{ORIGINAL_DATA_FILE}", sep="\t", usecols=[TARGET_COL, DATA_COL], low_memory=True)

# Drop NA values
data.dropna(inplace=True)

# Drop the outlier which is star_rating = "2012-12-21"
data = data[data.star_rating != "2012-12-21"]

# Convert all star rating to integer
data[TARGET_COL] = data.star_rating.astype(int)

# Make target col in range of 0-4
data[TARGET_COL] = data[TARGET_COL] - 1

# Remove nan valued rows
data = data[data.review_body.notnull()]

data.head()


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,star_rating,review_body
0,4,so beautiful even tho clearly not high end ......
1,4,"Great product.. I got this set for my mother, ..."
2,4,Exactly as pictured and my daughter's friend l...
3,4,Love it. Fits great. Super comfortable and nea...
4,4,Got this as a Mother's Day gift for my Mom and...


In [7]:
sampled_data = data.groupby(TARGET_COL, group_keys=False).apply(lambda x: x.sample(N_SAMPLES, random_state=RANDOM_SEED))
sampled_data.reset_index(inplace=True, drop=True)


In [8]:
# save data without cleaning
sampled_data.to_csv(f"{DATA_PATH}/{SAMPLED_DATA_FILE}", header=True, index=False)


In [9]:
# Free Some Memory
del data, sampled_data


## Data Cleaning


### Cleaning Functions

In [10]:
# Convert all reviews to lower case (optional according to study)
def to_lower(data: pd.Series):
    return data.str.lower()


In [11]:
def remove_accented_characters(data: pd.Series):
    import unicodedata

    """Removes accented characters from the Series

    Args:
        data (pd.Series): Series of string

    Returns:
        _type_: pd.Series
    """
    import unicodedata

    return data.apply(lambda x: unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8", "ignore"))


In [12]:
def remove_html_encodings(data: pd.Series):
    return data.str.replace(r"&#\d+;", " ", regex=True)


In [13]:
def remove_html_tags(data: pd.Series):
    return data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)


In [14]:
def remove_url(data: pd.Series):
    return data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)


In [15]:
def remove_html_and_url(data: pd.Series):
    """Function to remove
             1. HTML encodings
             2. HTML tags (both closed and open)
             3. URLs

    Args:
        data (pd.Series): A Pandas series of type string

    Returns:
        _type_: pd.Series
    """
    # Remove HTML encodings
    data.str.replace(r"&#\d+;", " ", regex=True)

    # Remove HTML tags (both open and closed)
    data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)

    # Remove URLs
    data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)

    return data


In [16]:
# Handle emoji
def convert_emoji_to_txt(data: pd.Series):
    from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO

    EMO_TO_TXT_DICT = dict()
    for emot in UNICODE_EMOJI:
        EMO_TO_TXT_DICT[emot] = f" {re.sub(r',|:|_', '', UNICODE_EMOJI[emot])} "

    for emo in EMOTICONS_EMO:
        EMO_TO_TXT_DICT[emot] = f" {re.sub(r',| ', '', EMOTICONS_EMO[emo])} "

    def convert_emojis(text, emo_to_txt_dict):
        for emot in emo_to_txt_dict:
            text = text.replace(emot, emo_to_txt_dict[emot])
        return text

    return data.apply(lambda x: convert_emojis(x, EMO_TO_TXT_DICT))


In [17]:
# Remove non-alphabetical characters
def remove_non_alpha_characters(data: pd.Series):
    return data.str.replace(r"_+|\\|[^a-zA-Z0-9\s]", " ", regex=True)


In [18]:
# Remove extra spaces
def remove_extra_spaces(data: pd.Series):
    return data.str.replace(r"^\s*|\s\s*", " ", regex=True)


In [19]:
# Expanding contractions
def fix_contractions(data: pd.Series):
    import contractions

    def contraction_fixer(txt: str):
        return " ".join([contractions.fix(word) for word in txt.split()])

    return data.apply(contraction_fixer)


### Perform Cleaning

In [20]:
# Read Sample Data
sampled_data = pd.read_csv(f"{DATA_PATH}/{SAMPLED_DATA_FILE}", sep=",")
sampled_data.head()

Unnamed: 0,star_rating,review_body
0,0,Too small even for the knuckles.
1,0,Did not fit right
2,0,This stupid kit has 16 gauge needles not 14gauge.
3,0,I would not suggest this item I bought the one...
4,0,I am sure that it will be lovely once I get it...


In [21]:
# A dictionary containing the columns and a list of functions to perform on it in order
data_cleaning_pipeline = {
    DATA_COL: [
        convert_emoji_to_txt,
        to_lower,
        remove_accented_characters,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

cleaned_data = sampled_data.copy()

# Process all the cleaning instructions
for col, pipeline in data_cleaning_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = cleaned_data[col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    cleaned_data[col] = temp_data.copy()


# Remove empty reviews
cleaned_data = cleaned_data[cleaned_data[DATA_COL].str.len() != 0]


Starting: convert_emoji_to_txt
Ended: convert_emoji_to_txt
Starting: to_lower
Ended: to_lower
Starting: remove_accented_characters
Ended: remove_accented_characters
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


In [22]:
# Store data file
cleaned_data.to_csv(f"{DATA_PATH}/{CLEANED_DATA_FILE}", sep=",", index=False)


In [23]:
# Free Some Memory
del sampled_data, cleaned_data


## Data Preprocessing 
(used by TF-IDF Models)


### Preprocessing Functions

In [24]:
def tokenize(data: pd.Series):
    from nltk.tokenize import word_tokenize

    nltk.download("punkt")

    return data.apply(word_tokenize)


In [25]:
from typing import List, Set


def remove_stopwords(data: pd.Series):
    """Remove stop words using the NLTK stopwords dictionary

    Args:
        string (str): a document

    Returns:
        str: a document with stopwords removed
    """
    from nltk.corpus import stopwords

    nltk.download("stopwords")

    stopwords = set(stopwords.words())

    def remover(word_list: List[str], stopwords: Set[str]):
        return [word for word in word_list if not word in stopwords]

    return data.apply(lambda word_list: remover(word_list, stopwords))


In [26]:
def lemmatize(data: pd.Series, consider_pos_tag: bool = True):
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer

    nltk.download("omw-1.4")

    # POS tagging
    def perform_nltk_pos_tag(data: pd.Series):
        from nltk import pos_tag

        nltk.download("averaged_perceptron_tagger")

        return data.apply(pos_tag)

    # Convert POS tag to wordnet pos tags
    def wordnet_pos_tagger(tag: str):
        if tag.startswith("J"):
            return wordnet.ADJ
        elif tag.startswith("V"):
            return wordnet.VERB
        elif tag.startswith("N"):
            return wordnet.NOUN
        elif tag.startswith("R"):
            return wordnet.ADV
        else:
            return None

    lemmatizer = WordNetLemmatizer()
    lemmatized = list()

    if consider_pos_tag:
        pos_tagged_data = data.copy()
        pos_tagged_data = perform_nltk_pos_tag(data)

        for row in pos_tagged_data:

            lemmatized_row = list()

            if consider_pos_tag:
                for word, tag in row:
                    wordnet_pos_tag = wordnet_pos_tagger(tag)

                    if wordnet_pos_tag is None:
                        lemmatized_row.append(word)
                    else:
                        result = lemmatizer.lemmatize(word, wordnet_pos_tag)
                        lemmatized_row.append(lemmatizer.lemmatize(word, wordnet_pos_tag))

            lemmatized.append(lemmatized_row)
    else:
        for row in data:
            lemmatized_row = list()

            for word in row:
                lemmatized_row.append(lemmatizer.lemmatize(word))

            lemmatized.append(lemmatized_row)

    return pd.Series(lemmatized)


In [27]:
# Concatenate lemmatized sentences back into one sentence
def concatenate(data: pd.Series):
    return data.apply(lambda words: " ".join(words))


## Data Split


In [28]:
# Split the data 80-20 split
import pickle as pkl
from sklearn.model_selection import train_test_split

# load clean data
cleaned_data = pd.read_csv(f"{DATA_PATH}/{CLEANED_DATA_FILE}")

# split cleaned data into train and test
train, test = train_test_split(cleaned_data, test_size=0.2, stratify=cleaned_data[TARGET_COL], random_state=RANDOM_SEED)
train.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)

# save the split
with open(f"{DATA_PATH}/{DATA_FILE}", mode="wb") as file:
    pkl.dump((train, test), file)


In [29]:
# Free Some Memory
del cleaned_data


### Do Preprocessing


In [30]:
import pickle as pkl

train, test = None, None
with open(f"{DATA_PATH}/{DATA_FILE}", mode="rb") as file:
    train, test = pkl.load(file)


In [31]:
import pickle as pkl

preprocessing_pipeline = {DATA_COL: [tokenize, lemmatize, concatenate]}

# Run the pipeline
preprocessed_train_data = train.copy()
preprocessed_test_data = test.copy()

# Process all the cleaning instructions
for col, pipeline in preprocessing_pipeline.items():
    # Get the column to perform cleaning on
    temp_data_train = preprocessed_train_data[col].copy()
    temp_data_test = preprocessed_test_data[col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")

        if func.__name__ == "lemmatize":
            temp_data_train = func(temp_data_train, consider_pos_tag=True)
            temp_data_test = func(temp_data_test, consider_pos_tag=True)
        else:
            temp_data_train = func(temp_data_train)
            temp_data_test = func(temp_data_test)

        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    preprocessed_train_data[col] = temp_data_train
    preprocessed_test_data[col] = temp_data_test

# Remove empty reviews
preprocessed_train_data = preprocessed_train_data[preprocessed_train_data[DATA_COL].str.len() != 0]
preprocessed_test_data = preprocessed_test_data[preprocessed_test_data[DATA_COL].str.len() != 0]

# Remove NaN
preprocessed_train_data.dropna(inplace=True)
preprocessed_test_data.dropna(inplace=True)


Starting: tokenize


[nltk_data] Downloading package punkt to /Users/aditya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /Users/aditya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Ended: tokenize
Starting: lemmatize


[nltk_data] Downloading package omw-1.4 to /Users/aditya/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/aditya/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /Users/aditya/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/aditya/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Ended: lemmatize
Starting: concatenate
Ended: concatenate


In [32]:
# Save the preprocessed data
with open(f"{DATA_PATH}/{PREPROCESSED_DATA_FILE}", mode="wb") as file:
    pkl.dump((preprocessed_train_data, preprocessed_test_data), file)


In [33]:
preprocessed_train_data.head() #come_here

Unnamed: 0,star_rating,review_body
0,4,purchase this for my daughter she be very happ...
1,3,buy it to replace a broken chain and it be ver...
2,0,i love the design but unfortunately it be so c...
3,1,this item turn out to be much chunky than i ha...
4,1,i order dark palette and i get spring pastel t...


In [34]:
# Free Some Memory
del train, test, preprocessed_train_data, preprocessed_test_data


## Word Embedding


### Load `word2vec-google-news-300` Model

Learn how to extract word embeddings for your dataset. Try to check semantic similarities of the generated vectors using three examples of your own, e.g., King − Man + Woman = Queen or excellent ∼ outstanding.


In [35]:
! pip install gensim



In [148]:
import gensim.downloader as api

w2v_google = api.load("word2vec-google-news-300")


In [149]:
w2v_google.most_similar(positive=["king", "woman"], negative=["man"], topn=1)


[('queen', 0.7118192911148071)]

In [150]:
w2v_google.similarity("excellent", "outstanding")


0.5567486

Three Semantic Similarity Examples


In [151]:
w2v_google.similarity("worst", "terrible")


0.55750686

In [152]:
w2v_google.similarity("cheap", "best")


0.18926445

In [153]:
w2v_google.most_similar(positive=["cheap", "damaged"], negative=["worst"], topn=1)


[('inexpensive', 0.46950358152389526)]

### Train Word2Vec on own Dataset


In [154]:
from gensim.test.utils import datapath
from gensim import utils


class AmazonReviewCorpus:
    def __iter__(self):
        data = pd.read_csv(f"{DATA_PATH}/{CLEANED_DATA_FILE}", sep=",")

        for review_body in data["review_body"]:
            yield utils.simple_preprocess(review_body)


In [155]:
from gensim.models import Word2Vec
from gensim import utils

w2v_custom = Word2Vec(vector_size=300, min_count=10, window=11, workers=4)


In [156]:
# Build the vocabulary
reviews = AmazonReviewCorpus()
w2v_custom.build_vocab(reviews, progress_per=1000)


In [157]:
# Train the model
w2v_custom.train(reviews, total_examples=w2v_custom.corpus_count, epochs=w2v_custom.epochs)


(11712504, 16910110)

In [158]:
# Save the trained model
w2v_custom.save(f"{MODEL_PATH}/{CUSTOM_WORD_VECTORS_MODEL_FILE}")


#### Using trained model


In [159]:
# Load the trained model
w2v_custom = Word2Vec.load(f"{MODEL_PATH}/{CUSTOM_WORD_VECTORS_MODEL_FILE}")


In [160]:
w2v_custom.wv.most_similar(positive=["king", "woman"], negative=["man"], topn=1)


[('dealer', 0.5524013042449951)]

In [161]:
w2v_custom.wv.similarity("excellent", "outstanding")


0.81633675

Three Semantic Similarity Examples


In [162]:
w2v_custom.wv.similarity("worst", "terrible")


0.37887666

In [163]:
w2v_custom.wv.similarity("cheap", "best")


0.14116523

In [164]:
w2v_custom.wv.most_similar(positive=["cheap", "damaged"], negative=["worst"], topn=1)


[('dirty', 0.4841519594192505)]

In [165]:
# Free Some Memory
del w2v_custom


What do you conclude from comparing vectors generated by yourself and the pretrained model? Which of the Word2Vec models seems to encode semantic similarities between words better?


TODO: Answer


## Avg. Word Vectors


Use the average Word2Vec vectors for each review as the input feature (x = N1 􏰀Ni=1 Wi for a review with N words)


In [54]:
def calculate_avg_word_vector(words: List[str], w2v_model):
    result_vector = np.ndarray(shape=(300,), buffer=np.zeros((300,)), dtype=float)
    removed_word_count = 0
    for word in words:
        try:
            result_vector += w2v_model[word]
        except KeyError:
            removed_word_count += 1
    return result_vector / (len(words) - removed_word_count)


##### Perform Word2Vec conversion


In [55]:
# Load the cleaned but not preprocessed data
import pickle as pkl

train_wv, test_wv = None, None
with open(f"{DATA_PATH}/{DATA_FILE}", mode="rb") as file:
    train_wv, test_wv = pkl.load(file)


In [56]:
from functools import partial
from gensim.utils import simple_preprocess

# Preprocess using gensim'simple_preprocess
train_wv[DATA_COL] = train_wv[DATA_COL].apply(simple_preprocess)
test_wv[DATA_COL] = test_wv[DATA_COL].apply(simple_preprocess)

TEMP_COL = "avg_word_vector"
VECTOR_COLS = [f"vector_{i}" for i in range(300)]

# Train Data
train_wv[TEMP_COL] = train_wv[DATA_COL].apply(partial(calculate_avg_word_vector, w2v_model=w2v_google))

wv_df = pd.DataFrame(train_wv[TEMP_COL].to_list(), index=train_wv[TEMP_COL].index, columns=VECTOR_COLS)

train_wv = pd.concat([train_wv, wv_df], axis=1)
train_wv.dropna(inplace=True)

X_wv_train = train_wv.drop([DATA_COL, TEMP_COL, TARGET_COL], axis=1)
y_wv_train = train_wv[TARGET_COL]


# Test Data
test_wv[TEMP_COL] = test_wv[DATA_COL].apply(partial(calculate_avg_word_vector, w2v_model=w2v_google))

test_wv_df = pd.DataFrame(test_wv[TEMP_COL].to_list(), index=test_wv[TEMP_COL].index, columns=VECTOR_COLS)

test_wv = pd.concat([test_wv, test_wv_df], axis=1)
test_wv.dropna(inplace=True)

X_wv_test = test_wv.drop([DATA_COL, TEMP_COL, TARGET_COL], axis=1)
y_wv_test = test_wv[TARGET_COL]


In [57]:
X_wv_train.head()

Unnamed: 0,vector_0,vector_1,vector_2,vector_3,vector_4,vector_5,vector_6,vector_7,vector_8,vector_9,...,vector_290,vector_291,vector_292,vector_293,vector_294,vector_295,vector_296,vector_297,vector_298,vector_299
0,0.042153,0.014164,-0.031451,0.093822,-0.019848,-0.023349,0.06069,-0.084643,0.057523,0.043411,...,-0.065152,0.063943,-0.116344,0.008554,-0.038616,-0.005599,0.00415,-0.079447,0.062683,-0.045749
1,0.062426,0.022175,0.032739,0.080762,-0.113143,-0.01377,0.066288,-0.067008,0.025391,0.072654,...,-0.062382,0.053597,-0.101337,0.092712,-0.013452,0.061283,0.063713,-0.04164,0.076851,-0.067822
2,0.01635,0.009404,0.029967,0.091029,-0.042893,-0.05003,0.069718,-0.08467,0.088954,0.07595,...,-0.089612,0.045468,-0.122181,0.0462,-0.016519,0.012339,0.000575,-0.070973,0.06871,-0.078934
3,-0.01446,0.011441,0.032575,0.13002,-0.073035,-0.010424,0.089496,-0.063508,0.024117,0.094315,...,-0.102476,0.020297,-0.061961,0.107353,0.00371,0.016532,0.031951,-0.066222,0.059737,-0.078636
4,0.069726,0.051299,0.026449,0.081157,-0.036284,-0.056623,0.053578,-0.101608,0.029151,0.094448,...,-0.099021,0.018674,-0.094229,0.028641,0.015122,-0.019486,0.021722,-0.040834,0.069369,-0.000185


In [58]:
# Save the avg'ed word vectors dataset
import pickle as pkl

with open(f"{DATA_PATH}/{AVG_WORD_VECTORS_DATA_FILE}", mode="wb") as file:
    pkl.dump((X_wv_train, y_wv_train, X_wv_test, y_wv_test), file)


In [59]:
# Free Some Memory
del train_wv, test_wv, X_wv_train, y_wv_train, X_wv_test, y_wv_test


## Simple Models


Report your accuracy values on the testing split for these models similar to HW1, i.e., for each of perceptron and SVM models, report two accuracy values Word2Vec and TF-IDF features.


### TF-IDF Vectorization


In [60]:
# Load train and test data
import pickle as pkl

train_preprocessed, test_preprocessed = None, None
with open(f"{DATA_PATH}/{PREPROCESSED_DATA_FILE}", mode="rb") as file:
    train_preprocessed, test_preprocessed = pkl.load(file)


In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

nltk.download("punkt")

vectorizer = TfidfVectorizer(tokenizer=word_tokenize)

# Using entire data to fit as the dataset is small and as using entire dataset is not needed for homework requirement
all_data = pd.concat([train_preprocessed, test_preprocessed], axis=0)
vectorizer.fit(all_data[DATA_COL])

X_tfidf_train = vectorizer.transform(train_preprocessed[DATA_COL])
X_tfidf_test = vectorizer.transform(test_preprocessed[DATA_COL])
y_tfidf_train = train_preprocessed[TARGET_COL]
y_tfidf_test = test_preprocessed[TARGET_COL]


[nltk_data] Downloading package punkt to /Users/aditya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True



In [62]:
# Save train and test TFIDF vectors
import pickle as pkl

with open(f"{DATA_PATH}/{TFIDF_DATA_FILE}", mode="wb") as file:
    pkl.dump((X_tfidf_train, y_tfidf_train, X_tfidf_test, y_tfidf_test), file)


In [63]:
# Free Some Memory
del train_preprocessed, test_preprocessed, all_data, X_tfidf_train, y_tfidf_train, X_tfidf_test, y_tfidf_test


### Perceptron


> #### TF-IDF Based Approach


In [64]:
# Load TF-IDF data
import pickle as pkl

X_tfidf_train, y_tfidf_train, X_tfidf_test, y_tfidf_test = None, None, None, None
with open(f"{DATA_PATH}/{TFIDF_DATA_FILE}", mode="rb") as file:
    X_tfidf_train, y_tfidf_train, X_tfidf_test, y_tfidf_test = pkl.load(file)


In [65]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report

perceptron_tfidf_clf = Perceptron(
    max_iter=8000, alpha=0.012, random_state=RANDOM_SEED, tol=1e-4, early_stopping=True, class_weight="balanced"
)

perceptron_tfidf_clf.fit(X_tfidf_train, y_tfidf_train)

y_tfidf_pred = perceptron_tfidf_clf.predict(X_tfidf_test)


In [66]:
# Print Metrics
from sklearn.metrics import classification_report, accuracy_score

print(
    classification_report(y_tfidf_test, y_tfidf_pred), "\nAccuracy Score: ", accuracy_score(y_tfidf_test, y_tfidf_pred)
)


              precision    recall  f1-score   support

           0       0.57      0.33      0.42      3999
           1       0.30      0.59      0.40      4000
           2       0.34      0.24      0.28      4000
           3       0.39      0.36      0.38      3999
           4       0.61      0.52      0.56      3998

    accuracy                           0.41     19996
   macro avg       0.44      0.41      0.41     19996
weighted avg       0.44      0.41      0.41     19996
 
Accuracy Score:  0.4077815563112622


In [67]:
# Save the model
with open(f"{MODEL_PATH}/{PERCEPTRON_TFIDF_MODEL_FILE}", mode="wb") as file:
    import joblib

    joblib.dump(perceptron_tfidf_clf, file)


In [68]:
# Delete Model and related variables
del X_tfidf_train, y_tfidf_train, X_tfidf_test, y_tfidf_test, perceptron_tfidf_clf, y_tfidf_pred


> #### Word2Vec Based Approach - Avg. Word Vectors


In [69]:
# Load Avg. Word2Vec Data
import pickle as pkl

X_wv_train, y_wv_train, X_wv_test, y_wv_test = None, None, None, None
with open(f"{DATA_PATH}/{AVG_WORD_VECTORS_DATA_FILE}", mode="rb") as file:
    X_wv_train, y_wv_train, X_wv_test, y_wv_test = pkl.load(file)


In [70]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report

perceptron_wv_clf = Perceptron(
    max_iter=8000, alpha=0.012, random_state=RANDOM_SEED, tol=1e-4, early_stopping=True, class_weight="balanced"
)

perceptron_wv_clf = Perceptron(max_iter=8000, alpha=0.012, random_state=RANDOM_SEED, tol=1e-4, early_stopping=True)

perceptron_wv_clf.fit(X_wv_train, y_wv_train)

y_wv_pred = perceptron_wv_clf.predict(X_wv_test)


In [71]:
# Print Metrics
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(y_wv_test, y_wv_pred), "\nAccuracy Score: ", accuracy_score(y_wv_test, y_wv_pred))


              precision    recall  f1-score   support

           0       0.60      0.50      0.55      3996
           1       0.35      0.31      0.33      3998
           2       0.38      0.09      0.14      3999
           3       0.31      0.58      0.41      3994
           4       0.53      0.64      0.58      3992

    accuracy                           0.42     19979
   macro avg       0.43      0.42      0.40     19979
weighted avg       0.43      0.42      0.40     19979
 
Accuracy Score:  0.4237449321787877


In [72]:
# Save the model
with open(f"{MODEL_PATH}/{PERCEPTRON_AVG_WV_MODEL_FILE}", mode="wb") as file:
    import joblib

    joblib.dump(perceptron_wv_clf, file)


In [73]:
# Delete model and related variables
del X_wv_train, y_wv_train, X_wv_test, y_wv_test, perceptron_wv_clf, y_wv_pred


### SVM


> #### TF-IDF Based


In [74]:
# Load TF-IDF data
import pickle as pkl

X_tfidf_train, y_tfidf_train, X_tfidf_test, y_tfidf_test = None, None, None, None
with open(f"{DATA_PATH}/{TFIDF_DATA_FILE}", mode="rb") as file:
    X_tfidf_train, y_tfidf_train, X_tfidf_test, y_tfidf_test = pkl.load(file)


In [75]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

class_weight = {0: 0.9525, 1: 1.99825, 2: 1.9225, 3: 0.625, 4: 0.8585}

svm_tfidf_clf = LinearSVC(dual=False, C=0.1, max_iter=1000, class_weight=class_weight, random_state=RANDOM_SEED)

svm_tfidf_clf.fit(X_tfidf_train, y_tfidf_train)

y_tfidf_pred = svm_tfidf_clf.predict(X_tfidf_test)

print(classification_report(y_tfidf_test, y_tfidf_pred))


              precision    recall  f1-score   support

           0       0.66      0.50      0.57      3999
           1       0.38      0.51      0.44      4000
           2       0.38      0.57      0.46      4000
           3       0.61      0.14      0.23      3999
           4       0.66      0.78      0.72      3998

    accuracy                           0.50     19996
   macro avg       0.54      0.50      0.48     19996
weighted avg       0.54      0.50      0.48     19996



In [76]:
# Print Metrics
from sklearn.metrics import classification_report, accuracy_score

print(
    classification_report(y_tfidf_test, y_tfidf_pred), "\nAccuracy Score: ", accuracy_score(y_tfidf_test, y_tfidf_pred)
)


              precision    recall  f1-score   support

           0       0.66      0.50      0.57      3999
           1       0.38      0.51      0.44      4000
           2       0.38      0.57      0.46      4000
           3       0.61      0.14      0.23      3999
           4       0.66      0.78      0.72      3998

    accuracy                           0.50     19996
   macro avg       0.54      0.50      0.48     19996
weighted avg       0.54      0.50      0.48     19996
 
Accuracy Score:  0.5004500900180036


In [77]:
# Save the model
with open(f"{MODEL_PATH}/{SVM_TFIDF_MODEL_FILE}", mode="wb") as file:
    import joblib

    joblib.dump(svm_tfidf_clf, file)


In [78]:
# Delete model and variables
del X_tfidf_train, y_tfidf_train, X_tfidf_test, y_tfidf_test, svm_tfidf_clf, y_tfidf_pred


> #### Word2Vec Based Approach - Avg. Word Vectors


In [79]:
# Load Avg. Word2Vec Data
import pickle as pkl

X_wv_train, y_wv_train, X_wv_test, y_wv_test = None, None, None, None
with open(f"{DATA_PATH}/{AVG_WORD_VECTORS_DATA_FILE}", mode="rb") as file:
    X_wv_train, y_wv_train, X_wv_test, y_wv_test = pkl.load(file)


In [80]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

class_weight = {0: 0.9525, 1: 1.99825, 2: 1.9225, 3: 0.625, 4: 0.8585}

svm_wv_clf = LinearSVC(dual=False, C=0.1, max_iter=1000, class_weight=class_weight, random_state=RANDOM_SEED)

svm_wv_clf.fit(X_wv_train, y_wv_train)

y_wv_avged_pred = svm_wv_clf.predict(X_wv_test)


In [81]:
# Print Metrics
from sklearn.metrics import classification_report, accuracy_score

print(
    classification_report(y_wv_test, y_wv_avged_pred), "\nAccuracy Score: ", accuracy_score(y_wv_test, y_wv_avged_pred)
)


              precision    recall  f1-score   support

           0       0.66      0.37      0.48      3996
           1       0.35      0.57      0.43      3998
           2       0.33      0.56      0.42      3999
           3       0.51      0.02      0.04      3994
           4       0.64      0.69      0.67      3992

    accuracy                           0.44     19979
   macro avg       0.50      0.44      0.41     19979
weighted avg       0.50      0.44      0.41     19979
 
Accuracy Score:  0.4421142199309275


In [82]:
# Save the model
with open(f"{MODEL_PATH}/{SVM_AVG_WV_MODEL_FILE}", mode="wb") as file:
    import joblib

    joblib.dump(svm_wv_clf, file)


In [83]:
# Delete model and variables
del X_wv_train, y_wv_train, X_wv_test, y_wv_test, svm_wv_clf, y_wv_avged_pred


What do you conclude from comparing performances for the models trained using the two different feature types (TF-IDF and your trained Word2Vec features)?


TODO: Answer


> ## PyTorch Setup


In [84]:
import torch
import torch.nn as nn
import torch.nn.functional as func
from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler

device = None
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

## TODO: REMOVE THIS LINE
device = torch.device("cpu")

print(f"Using Device: {device}")


Using Device: cpu


#### Helper Functions


In [85]:
def train_model(data_loader, model, n_epochs, optimizer, criterion):
    for epoch in range(n_epochs):
        train_loss = 0.0

        ###################
        # train the model #
        ###################
        model.train()  # prep model for training
        for inputs, target in data_loader:
            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(inputs)
            # calculate the loss
            loss = criterion(output.squeeze(1), target)
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            # update running training loss
            train_loss += loss.item() * inputs.size(0)

        # calculate average loss over an epoch
        train_loss = train_loss / len(data_loader.dataset)

        print(f"Epoch: {epoch + 1} \tTraining Loss: {train_loss:.6f}")

    return model


In [86]:
def predict(model, data_loader):
    model.eval()
    with torch.no_grad():
        y_preds, y_true = list(), list()
        for input, target in data_loader:
            output = model(input)
            _, predictions = torch.max(output, 1)
            predictions = predictions.cpu().numpy().squeeze()
            targets = target.cpu().numpy().squeeze()
            y_preds.append(predictions)
            y_true.append(targets)
    return np.ravel(np.array(y_true)), np.ravel(np.array(y_preds))


## Feedforward Neural Network


Using the Word2Vec features, train a feedforward multilayer perceptron net- work for classification. Consider a network with two hidden layers, each with 50 and 10 nodes, respectively. You can use cross entropy loss and your own choice for other hyperparamters, e.g., nonlinearity, number of epochs, etc. Part of getting good results is to select good values for these hyperparamters.


### Create FNN


In [87]:
class FNN(nn.Module):
    def __init__(self, n_input, n_output, dropout_rate) -> None:
        super(FNN, self).__init__()

        self.n_input = n_input
        self.n_hidden_1 = 50
        self.n_hidden_2 = 10
        self.n_output = n_output
        self.dropout_rate = dropout_rate

        self.fc1 = nn.Linear(self.n_input, self.n_hidden_1)
        self.fc2 = nn.Linear(self.n_hidden_1, self.n_hidden_2)
        self.fc3 = nn.Linear(self.n_hidden_2, self.n_output)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = func.relu(self.fc1(x))
        x = self.dropout(x)
        x = func.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x


> ### Using Avg. Word2Vec Vectors


In [88]:
# Load Avg. Word2Vec Data
import pickle as pkl

X_wv_train, y_wv_train, X_wv_test, y_wv_test = None, None, None, None
with open(f"{DATA_PATH}/{AVG_WORD_VECTORS_DATA_FILE}", mode="rb") as file:
    X_wv_train, y_wv_train, X_wv_test, y_wv_test = pkl.load(file)


In [89]:
fnn_wv_model = FNN(n_input=300, n_output=5, dropout_rate=0.2).to(device)
fnn_wv_model


FNN(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=5, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [90]:
FNN_AVG_WV_BATCH_SIZE = 16


In [91]:
X_wv_train_tensor = torch.FloatTensor(X_wv_train.values).to(device)
y_wv_train_tensor = torch.LongTensor(y_wv_train.values).to(device)
X_wv_test_tensor = torch.FloatTensor(X_wv_test.values).to(device)
y_wv_test_tensor = torch.LongTensor(y_wv_test.values).to(device)

fnn_wv_train = TensorDataset(X_wv_train_tensor, y_wv_train_tensor)
fnn_wv_train_loader = DataLoader(fnn_wv_train, batch_size=FNN_AVG_WV_BATCH_SIZE, drop_last=True, shuffle=True)

fnn_wv_test = TensorDataset(X_wv_test_tensor, y_wv_test_tensor)
fnn_wv_test_loader = DataLoader(fnn_wv_test, batch_size=FNN_AVG_WV_BATCH_SIZE, drop_last=True, shuffle=True)


In [92]:
from sklearn.utils import class_weight

class_weights = torch.FloatTensor(
    class_weight.compute_class_weight("balanced", classes=np.unique(y_wv_train.values), y=y_wv_train.values)
).to(device)


In [93]:
# Loss Function and Optimizer
criterion_wv = nn.CrossEntropyLoss(weight=class_weights).to(device)

optimizer_wv = torch.optim.Adam(fnn_wv_model.parameters(), lr=32e-5)


In [94]:
# Free Some Memory
del X_wv_train, y_wv_train, X_wv_test, y_wv_test


In [95]:
fnn_wv_model = train_model(
    fnn_wv_train_loader, fnn_wv_model, n_epochs=32, optimizer=optimizer_wv, criterion=criterion_wv
)


Epoch: 1 	Training Loss: 1.334452
Epoch: 2 	Training Loss: 1.240569
Epoch: 3 	Training Loss: 1.222275
Epoch: 4 	Training Loss: 1.209150
Epoch: 5 	Training Loss: 1.200559
Epoch: 6 	Training Loss: 1.193135
Epoch: 7 	Training Loss: 1.189160
Epoch: 8 	Training Loss: 1.185600
Epoch: 9 	Training Loss: 1.175386
Epoch: 10 	Training Loss: 1.172929
Epoch: 11 	Training Loss: 1.169147
Epoch: 12 	Training Loss: 1.167287
Epoch: 13 	Training Loss: 1.162369
Epoch: 14 	Training Loss: 1.159795
Epoch: 15 	Training Loss: 1.157287
Epoch: 16 	Training Loss: 1.154580
Epoch: 17 	Training Loss: 1.151378
Epoch: 18 	Training Loss: 1.151889
Epoch: 19 	Training Loss: 1.148454
Epoch: 20 	Training Loss: 1.146562
Epoch: 21 	Training Loss: 1.142714
Epoch: 22 	Training Loss: 1.143547
Epoch: 23 	Training Loss: 1.140720
Epoch: 24 	Training Loss: 1.139199
Epoch: 25 	Training Loss: 1.138398
Epoch: 26 	Training Loss: 1.137973
Epoch: 27 	Training Loss: 1.136690
Epoch: 28 	Training Loss: 1.134158
Epoch: 29 	Training Loss: 1.1

In [96]:
y_wv_true, y_wv_pred = predict(fnn_wv_model, fnn_wv_test_loader)


Report accuracy on the testing split.


In [97]:
# Print Metrics
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(y_wv_true, y_wv_pred), "\nAccuracy Score: ", accuracy_score(y_wv_true, y_wv_pred))


              precision    recall  f1-score   support

           0       0.57      0.69      0.63      3994
           1       0.39      0.32      0.35      3996
           2       0.42      0.38      0.40      3998
           3       0.46      0.39      0.42      3991
           4       0.62      0.76      0.68      3989

    accuracy                           0.51     19968
   macro avg       0.49      0.51      0.50     19968
weighted avg       0.49      0.51      0.50     19968
 
Accuracy Score:  0.5084635416666666


In [98]:
# Save Model
torch.save(fnn_wv_model.state_dict(), f"{MODEL_PATH}/{FNN_AVG_WV_MODEL_FILE}")


In [99]:
del fnn_wv_model, criterion_wv, optimizer_wv, y_wv_true, y_wv_pred


> ### Concatenate First 10 Word2Vec Vectors


In [100]:
def concatenate_top_n_wv(words_in_sentence: str, w2v_model, n=10):
    top_n_wvs = np.zeros(300 * n, dtype=np.float64)
    count = 0

    for word in words_in_sentence:
        if count == n or count == words_in_sentence:
            break
        count += 1
        try:
            top_n_wvs[(count - 1) * 300 : count * 300] = w2v_model[word]
        except KeyError:
            count -= 1
            continue

    if np.all((top_n_wvs == 0)):
        return np.nan

    return top_n_wvs


##### Apply Top 10 Word Vectors


In [101]:
# Load the cleaned but not preprocessed data
import pickle as pkl

train_wv_top_10, test_wv_top_10 = None, None
with open(f"{DATA_PATH}/{DATA_FILE}", mode="rb") as file:
    train_wv_top_10, test_wv_top_10 = pkl.load(file)


In [102]:
from functools import partial
from gensim.utils import simple_preprocess

# Preprocess using gensim'simple_preprocess
train_wv_top_10[DATA_COL] = train_wv_top_10[DATA_COL].apply(simple_preprocess)
test_wv_top_10[DATA_COL] = test_wv_top_10[DATA_COL].apply(simple_preprocess)

TEMP_COL = "top_10_wv"
VECTOR_COLS = [f"vector_{i}" for i in range(3000)]

# Train
train_wv_top_10[TEMP_COL] = train_wv_top_10[DATA_COL].apply(partial(concatenate_top_n_wv, w2v_model=w2v_google, n=10))
train_wv_top_10.dropna(inplace=True)

wv_df = pd.DataFrame(train_wv_top_10[TEMP_COL].to_list(), index=train_wv_top_10[TEMP_COL].index, columns=VECTOR_COLS)

train_wv_top_10 = pd.concat([train_wv_top_10, wv_df], axis=1)
train_wv_top_10.dropna(inplace=True)

X_wv_top_10_train = train_wv_top_10.drop([DATA_COL, TEMP_COL, TARGET_COL], axis=1)
y_wv_top_10_train = train_wv_top_10[TARGET_COL]


# Test
test_wv_top_10[TEMP_COL] = test_wv_top_10[DATA_COL].apply(partial(concatenate_top_n_wv, w2v_model=w2v_google, n=10))
test_wv_top_10.dropna(inplace=True)

wv_df = pd.DataFrame(test_wv_top_10[TEMP_COL].to_list(), index=test_wv_top_10[TEMP_COL].index, columns=VECTOR_COLS)

test_wv_top_10 = pd.concat([test_wv_top_10, wv_df], axis=1)
test_wv_top_10.dropna(inplace=True)

X_wv_top_10_test = test_wv_top_10.drop([DATA_COL, TEMP_COL, TARGET_COL], axis=1)
y_wv_top_10_test = test_wv_top_10[TARGET_COL]


In [103]:
# Save the avg'ed word vectors dataset
import pickle as pkl

with open(f"{DATA_PATH}/{TOP_10_WORD_VECTORS_DATA_FILE}", mode="wb") as file:
    pkl.dump((X_wv_top_10_train, y_wv_top_10_train, X_wv_top_10_test, y_wv_top_10_test), file)


In [104]:
# Free Some Memory
del train_wv_top_10, test_wv_top_10, X_wv_top_10_train, y_wv_top_10_train, X_wv_top_10_test, y_wv_top_10_test


> ### Using Top 10 Word2Vec Vectors


In [105]:
# Load Avg. Word2Vec Data
import pickle as pkl

X_wv_top_10_train, y_wv_top_10_train, X_wv_top_10_test, y_wv_top_10_test = None, None, None, None
with open(f"{DATA_PATH}/{AVG_WORD_VECTORS_DATA_FILE}", mode="rb") as file:
    X_wv_top_10_train, y_wv_top_10_train, X_wv_top_10_test, y_wv_top_10_test = pkl.load(file)


In [106]:
FNN_TOP_10_WV_BATCH_SIZE = 16

In [107]:
X_wv_top_10_train_tensor = torch.FloatTensor(X_wv_top_10_train.values).to(device)
y_wv_top_10_train_tensor = torch.LongTensor(y_wv_top_10_train.values).to(device)
X_wv_top_10_test_tensor = torch.FloatTensor(X_wv_top_10_test.values).to(device)
y_wv_top_10_test_tensor = torch.LongTensor(y_wv_top_10_test.values).to(device)

fnn_wv_top_10_train = TensorDataset(X_wv_top_10_train_tensor, y_wv_top_10_train_tensor)
fnn_wv_top_10_train_loader = DataLoader(fnn_wv_top_10_train, batch_size=FNN_TOP_10_WV_BATCH_SIZE, drop_last=True, shuffle=True)

fnn_wv_top_10_test = TensorDataset(X_wv_top_10_test_tensor, y_wv_top_10_test_tensor)
fnn_wv_top_10_test_loader = DataLoader(fnn_wv_top_10_test, batch_size=FNN_TOP_10_WV_BATCH_SIZE, drop_last=True, shuffle=True)


In [108]:
fnn_wv_top_10_model = FNN(n_input=3000, n_output=5, dropout_rate=0.2).to(device)
fnn_wv_top_10_model


FNN(
  (fc1): Linear(in_features=3000, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=5, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [109]:
from sklearn.utils import class_weight

class_weights = torch.FloatTensor(
    class_weight.compute_class_weight(
        "balanced", classes=np.unique(y_wv_top_10_train.values), y=y_wv_top_10_train.values
    )
).to(device)


In [110]:
# Loss Function and Optimizer
criterion_wv_top_10 = nn.CrossEntropyLoss(weight=class_weights).to(device)

optimizer_wv_top_10 = torch.optim.Adam(fnn_wv_top_10_model.parameters(), lr=32e-4)


In [111]:
del (X_wv_top_10_train, y_wv_top_10_train, X_wv_top_10_test, y_wv_top_10_test)


In [112]:
fnn_wv_top_10_model = train_model(
    fnn_wv_top_10_train_loader,
    fnn_wv_top_10_model,
    n_epochs=32,
    optimizer=optimizer_wv_top_10,
    criterion=criterion_wv_top_10,
)


Epoch: 1 	Training Loss: 1.395733
Epoch: 2 	Training Loss: 1.338381
Epoch: 3 	Training Loss: 1.314909
Epoch: 4 	Training Loss: 1.293625
Epoch: 5 	Training Loss: 1.278479
Epoch: 6 	Training Loss: 1.260963
Epoch: 7 	Training Loss: 1.249624
Epoch: 8 	Training Loss: 1.237256
Epoch: 9 	Training Loss: 1.226008
Epoch: 10 	Training Loss: 1.213949
Epoch: 11 	Training Loss: 1.203274
Epoch: 12 	Training Loss: 1.195260
Epoch: 13 	Training Loss: 1.185953
Epoch: 14 	Training Loss: 1.176826
Epoch: 15 	Training Loss: 1.172334
Epoch: 16 	Training Loss: 1.160815
Epoch: 17 	Training Loss: 1.152731
Epoch: 18 	Training Loss: 1.147118
Epoch: 19 	Training Loss: 1.141121
Epoch: 20 	Training Loss: 1.135564
Epoch: 21 	Training Loss: 1.128876
Epoch: 22 	Training Loss: 1.123837
Epoch: 23 	Training Loss: 1.117985
Epoch: 24 	Training Loss: 1.108776
Epoch: 25 	Training Loss: 1.106559
Epoch: 26 	Training Loss: 1.098990
Epoch: 27 	Training Loss: 1.096045
Epoch: 28 	Training Loss: 1.089394
Epoch: 29 	Training Loss: 1.0

In [113]:
y_wv_top_10_true, y_wv_top_10_pred = predict(fnn_wv_top_10_model, fnn_wv_top_10_test_loader)


Report the accuracy value on the testing split for your MLP model.


In [114]:
# Print Metrics
from sklearn.metrics import classification_report, accuracy_score

print(
    classification_report(y_wv_top_10_true, y_wv_top_10_pred),
    "\nAccuracy Score: ",
    accuracy_score(y_wv_top_10_true, y_wv_top_10_pred),
)


              precision    recall  f1-score   support

           0       0.55      0.37      0.44      3995
           1       0.31      0.38      0.34      3995
           2       0.35      0.34      0.35      3998
           3       0.35      0.45      0.39      3992
           4       0.54      0.45      0.49      3988

    accuracy                           0.40     19968
   macro avg       0.42      0.40      0.40     19968
weighted avg       0.42      0.40      0.40     19968
 
Accuracy Score:  0.39758613782051283


In [115]:
# Save Model
torch.save(fnn_wv_top_10_model.state_dict(), f"{MODEL_PATH}/{FNN_TOP_10_WV_MODEL_FILE}")


In [116]:
del (
    fnn_wv_top_10_model,
    criterion_wv_top_10,
    optimizer_wv_top_10,
    y_wv_top_10_true,
    y_wv_top_10_pred,
)


What do you conclude by comparing accuracy values you obtain with those obtained in the “’Simple Models” section?


TODO: Answer


## Recurrent Neural Networks


### Building Dataset with Review Length of 20 words

In [117]:
def get_top_n_words(words_in_sentence: List[str], w2v_model, n=20):
    top_n_words = np.zeros((20, 300), dtype=np.float32)
    idx = 0
    for word in words_in_sentence:
        if idx == n or idx == len(words_in_sentence):
            break
        try:
            top_n_words[idx] = w2v_model[word]
            idx += 1
        except KeyError:
            continue
    if idx == 0:
        return np.nan
    return top_n_words


In [118]:
# Load the cleaned but not preprocessed data
import pickle as pkl

train_wv_20_words, test_wv_20_words = None, None
with open(f"{DATA_PATH}/{DATA_FILE}", mode="rb") as file:
    train_wv_20_words, test_wv_20_words = pkl.load(file)


In [119]:
from functools import partial
from gensim.utils import simple_preprocess

# Preprocess using gensim'simple_preprocess
train_wv_20_words[DATA_COL] = train_wv_20_words[DATA_COL].apply(simple_preprocess)
test_wv_20_words[DATA_COL] = test_wv_20_words[DATA_COL].apply(simple_preprocess)

TEMP_COL = "top_n_words"
VECTOR_COLS = [f"vector_{i}" for i in range(300)]

# Train Data
train_wv_20_words[TEMP_COL] = train_wv_20_words[DATA_COL].apply(partial(get_top_n_words, w2v_model=w2v_google))
train_wv_20_words.dropna(inplace=True)

X_wv_20_words_train = train_wv_20_words[TEMP_COL].values
X_wv_20_words_train = np.array([data for data in X_wv_20_words_train], dtype=np.float64)
y_wv_20_words_train = train_wv_20_words[TARGET_COL].values


# Test Data
test_wv_20_words[TEMP_COL] = test_wv_20_words[DATA_COL].apply(partial(get_top_n_words, w2v_model=w2v_google))
test_wv_20_words.dropna(inplace=True)

X_wv_20_words_test = test_wv_20_words[TEMP_COL].values
X_wv_20_words_test = np.array([data for data in X_wv_20_words_test], dtype=np.float64)
y_wv_20_words_test = test_wv_20_words[TARGET_COL].values


In [120]:
# Save the top 20 words word vectors dataset
import pickle as pkl

with open(f"{DATA_PATH}/{WORDS_20_WORD_VECTORS_DATA_FILE}", mode="wb") as file:
    pkl.dump((X_wv_20_words_train, y_wv_20_words_train, X_wv_20_words_test, y_wv_20_words_test), file)


In [121]:
# Free Some Memory
del (
    train_wv_20_words,
    test_wv_20_words,
    X_wv_20_words_train,
    y_wv_20_words_train,
    X_wv_20_words_test,
    y_wv_20_words_test,
)


### Working on RNN

In [122]:
RNN_BATCH_SIZE = 32


In [123]:
# Load the top 20 word vectors dataset
import pickle as pkl

X_wv_20_words_train, y_wv_20_words_train, X_wv_20_words_test, y_rnn_test = None, None, None, None
with open(f"{DATA_PATH}/{WORDS_20_WORD_VECTORS_DATA_FILE}", mode="rb") as file:
    X_wv_20_words_train, y_wv_20_words_train, X_wv_20_words_test, y_rnn_test = pkl.load(file)


In [124]:
X_wv_20_words_train_tensor = torch.FloatTensor(X_wv_20_words_train).to(device)
y_wv_20_words_train_tensor = torch.LongTensor(y_wv_20_words_train).to(device)
X_wv_20_words_test_tensor = torch.FloatTensor(X_wv_20_words_test).to(device)
y_wv_20_words_test_tensor = torch.LongTensor(y_rnn_test).to(device)

rnn_train = TensorDataset(X_wv_20_words_train_tensor, y_wv_20_words_train_tensor)
rnn_train_loader = DataLoader(rnn_train, batch_size=RNN_BATCH_SIZE, drop_last=True, shuffle=True)

rnn_test = TensorDataset(X_wv_20_words_test_tensor, y_wv_20_words_test_tensor)
rnn_test_loader = DataLoader(rnn_test, batch_size=RNN_BATCH_SIZE, drop_last=True, shuffle=True)


In [125]:
from sklearn.utils import class_weight

class_weights = torch.FloatTensor(
    class_weight.compute_class_weight(
        "balanced", classes=np.unique(y_wv_20_words_train), y=y_wv_20_words_train
    )
).to(device)


In [126]:
# Free Some Memory
del X_wv_20_words_train, y_wv_20_words_train, X_wv_20_words_test, y_rnn_test


### Simple RNN

In [127]:
class SimpleRNN(nn.Module):
    def __init__(self, n_inputs, n_outputs, n_hidden_states):
        super(SimpleRNN, self).__init__()
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        self.n_hidden_size = n_hidden_states
        self.n_layers = 1
        self.rnn = torch.nn.RNN(self.n_inputs, self.n_hidden_size, self.n_layers, batch_first=True)
        self.fc = torch.nn.Linear(self.n_hidden_size, n_outputs)

    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)
        out, hidden = self.rnn(x, hidden)
        out = self.fc(out[:, -1, :])
        return out

    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.n_hidden_size)
        return hidden


#### Training Simple RNN

In [128]:
simple_rnn_model = SimpleRNN(n_inputs=300, n_outputs=5, n_hidden_states=20)
simple_rnn_model


SimpleRNN(
  (rnn): RNN(300, 20, batch_first=True)
  (fc): Linear(in_features=20, out_features=5, bias=True)
)

In [129]:
# Loss Function and Optimizer
criterion_simple_rnn = nn.CrossEntropyLoss(weight=class_weights).to(device)

optimizer_simple_rnn = torch.optim.Adam(simple_rnn_model.parameters(), lr=128e-5)


In [130]:
simple_rnn = train_model(
    rnn_train_loader, simple_rnn_model, n_epochs=32, optimizer=optimizer_simple_rnn, criterion=criterion_simple_rnn
)


Epoch: 1 	Training Loss: 1.401129
Epoch: 2 	Training Loss: 1.290815
Epoch: 3 	Training Loss: 1.262726
Epoch: 4 	Training Loss: 1.249236
Epoch: 5 	Training Loss: 1.236278
Epoch: 6 	Training Loss: 1.245102
Epoch: 7 	Training Loss: 1.225701
Epoch: 8 	Training Loss: 1.219416
Epoch: 9 	Training Loss: 1.217325
Epoch: 10 	Training Loss: 1.211375
Epoch: 11 	Training Loss: 1.211957
Epoch: 12 	Training Loss: 1.208538
Epoch: 13 	Training Loss: 1.205574
Epoch: 14 	Training Loss: 1.212909
Epoch: 15 	Training Loss: 1.203798
Epoch: 16 	Training Loss: 1.203189
Epoch: 17 	Training Loss: 1.206479
Epoch: 18 	Training Loss: 1.204533
Epoch: 19 	Training Loss: 1.198724
Epoch: 20 	Training Loss: 1.207651
Epoch: 21 	Training Loss: 1.205054
Epoch: 22 	Training Loss: 1.199495
Epoch: 23 	Training Loss: 1.204661
Epoch: 24 	Training Loss: 1.199945
Epoch: 25 	Training Loss: 1.195570
Epoch: 26 	Training Loss: 1.197134
Epoch: 27 	Training Loss: 1.195752
Epoch: 28 	Training Loss: 1.205070
Epoch: 29 	Training Loss: 1.1

In [131]:
y_rnn_true, y_rnn_pred = predict(simple_rnn, rnn_test_loader)


In [132]:
# Print the Metrics
from sklearn.metrics import accuracy_score, classification_report

print(classification_report(y_rnn_true, y_rnn_pred), "\nAccuracy Score: ", accuracy_score(y_rnn_true, y_rnn_pred))


              precision    recall  f1-score   support

           0       0.54      0.60      0.57      3992
           1       0.35      0.34      0.35      3996
           2       0.38      0.37      0.38      3996
           3       0.42      0.41      0.41      3992
           4       0.62      0.60      0.61      3992

    accuracy                           0.47     19968
   macro avg       0.46      0.47      0.46     19968
weighted avg       0.46      0.47      0.46     19968
 
Accuracy Score:  0.46509415064102566


In [133]:
# Save Model
torch.save(simple_rnn.state_dict(), f"{MODEL_PATH}/{RNN_TOP_20_WV_MODEL_FILE}")


In [134]:
del simple_rnn, criterion_simple_rnn, optimizer_simple_rnn


Report accuracy values on the testing split for your RNN model.


What do you conclude by comparing accuracy values you obtain with those obtained with feedforward neural network models?


### Gated Recurrent Unit


In [135]:
class GRU(nn.Module):
    def __init__(self, n_inputs, n_outputs, n_hidden_states, n_layers=1):
        super(GRU, self).__init__()
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        self.n_hidden_states = n_hidden_states
        self.n_layers = n_layers
        self.rnn = torch.nn.GRU(self.n_inputs, self.n_hidden_states, self.n_layers, batch_first=True)
        self.fc = torch.nn.Linear(self.n_hidden_states, self.n_outputs)

    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)
        out, hidden = self.rnn(x, hidden)
        out = self.fc(out[:, -1, :])
        return out

    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.n_hidden_states).to(device)
        return hidden


### Working on GRU

In [136]:
# Load the top 20 word vectors dataset
import pickle as pkl

X_wv_20_words_train, y_wv_20_words_train, X_wv_20_words_test, y_rnn_test = None, None, None, None
with open(f"{DATA_PATH}/{WORDS_20_WORD_VECTORS_DATA_FILE}", mode="rb") as file:
    X_wv_20_words_train, y_wv_20_words_train, X_wv_20_words_test, y_rnn_test = pkl.load(file)


In [137]:
GRU_BATCH_SIZE = 16


In [138]:
X_wv_20_words_train_tensor = torch.FloatTensor(X_wv_20_words_train).to(device)
y_wv_20_words_train_tensor = torch.LongTensor(y_wv_20_words_train).to(device)
X_wv_20_words_test_tensor = torch.FloatTensor(X_wv_20_words_test).to(device)
y_wv_20_words_test_tensor = torch.LongTensor(y_rnn_test).to(device)

gru_train = TensorDataset(X_wv_20_words_train_tensor, y_wv_20_words_train_tensor)
gru_train_loader = DataLoader(gru_train, batch_size=GRU_BATCH_SIZE, drop_last=True, shuffle=True)

gru_test = TensorDataset(X_wv_20_words_test_tensor, y_wv_20_words_test_tensor)
gru_test_loader = DataLoader(gru_test, batch_size=GRU_BATCH_SIZE, drop_last=True, shuffle=True)


In [139]:
gru_model = GRU(n_inputs=300, n_outputs=5, n_hidden_states=20)
gru_model


GRU(
  (rnn): GRU(300, 20, batch_first=True)
  (fc): Linear(in_features=20, out_features=5, bias=True)
)

In [140]:
from sklearn.utils import class_weight

class_weights = torch.FloatTensor(
    class_weight.compute_class_weight(
        "balanced", classes=np.unique(y_wv_20_words_train), y=y_wv_20_words_train
    )
).to(device)


In [141]:
# Loss Function and Optimizer
criterion_gru = nn.CrossEntropyLoss(weight=class_weights).to(device)

optimizer_gru = torch.optim.Adam(gru_model.parameters(), lr=16e-5)


In [142]:
# Free Some Memory
del X_wv_20_words_train, y_wv_20_words_train, X_wv_20_words_test, y_rnn_test


In [143]:
gru_model = train_model(gru_train_loader, gru_model, n_epochs=32, optimizer=optimizer_gru, criterion=criterion_gru)


Epoch: 1 	Training Loss: 1.423860
Epoch: 2 	Training Loss: 1.250046
Epoch: 3 	Training Loss: 1.206539
Epoch: 4 	Training Loss: 1.184161
Epoch: 5 	Training Loss: 1.167547
Epoch: 6 	Training Loss: 1.155109
Epoch: 7 	Training Loss: 1.144839
Epoch: 8 	Training Loss: 1.136778
Epoch: 9 	Training Loss: 1.129797
Epoch: 10 	Training Loss: 1.123304
Epoch: 11 	Training Loss: 1.117963
Epoch: 12 	Training Loss: 1.113396
Epoch: 13 	Training Loss: 1.108805
Epoch: 14 	Training Loss: 1.104589
Epoch: 15 	Training Loss: 1.100702
Epoch: 16 	Training Loss: 1.097047
Epoch: 17 	Training Loss: 1.093883
Epoch: 18 	Training Loss: 1.090560
Epoch: 19 	Training Loss: 1.087260
Epoch: 20 	Training Loss: 1.085175
Epoch: 21 	Training Loss: 1.082275
Epoch: 22 	Training Loss: 1.079674
Epoch: 23 	Training Loss: 1.077481
Epoch: 24 	Training Loss: 1.075076
Epoch: 25 	Training Loss: 1.072198
Epoch: 26 	Training Loss: 1.070574
Epoch: 27 	Training Loss: 1.067724
Epoch: 28 	Training Loss: 1.066097
Epoch: 29 	Training Loss: 1.0

In [144]:
y_gru_true, y_gru_pred = predict(gru_model, gru_test_loader)


In [145]:
# Print Metrics
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(y_gru_true, y_gru_pred), "\nAccuracy Score: ", accuracy_score(y_gru_true, y_gru_pred))


              precision    recall  f1-score   support

           0       0.61      0.58      0.60      3996
           1       0.42      0.36      0.39      3994
           2       0.43      0.46      0.45      3997
           3       0.46      0.45      0.45      3992
           4       0.63      0.73      0.68      3989

    accuracy                           0.52     19968
   macro avg       0.51      0.52      0.51     19968
weighted avg       0.51      0.52      0.51     19968
 
Accuracy Score:  0.5161258012820513


In [146]:
# Save Model
torch.save(gru_model.state_dict(), f"{MODEL_PATH}/{GRU_TOP_20_WV_MODEL_FILE}")


In [147]:
del (
    X_wv_20_words_train_tensor,
    y_wv_20_words_train_tensor,
    X_wv_20_words_test_tensor,
    y_wv_20_words_test_tensor,
    gru_train,
    gru_train_loader,
    gru_test,
    gru_test_loader,
    gru_model,
    optimizer_gru,
    criterion_gru,
    y_gru_true,
    y_gru_pred,
)


What do you conclude by comparing accuracy values you obtain with those obtained using simple RNN?


TODO: Answer
