In [None]:
# Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"


In [None]:
import pandas as pd
import numpy as np
import nltk

nltk.download("wordnet")
import re


In [None]:
# Install some dependencies
! pip install emot contractions

In [None]:
DATA_PATH = "/Volumes/dataTwo/usc/CSCI_544/assignment_02/data"
MODEL_PATH = "/Volumes/dataTwo/usc/CSCI_544/assignment_02/model"

ORIGINAL_DATA_FILE = "amazon_reviews_us_Jewelry_v1_00.tsv"
SAMPLED_DATA_FILE = "data_sampled.csv"
CLEANED_DATA_FILE = "data_cleaned.csv"

# Train/test split of the cleaned data (written with pickle, despite the .csv suffix)
DATA_FILE = "data.csv"

# Train/test split after preprocessing (written with pickle, despite the .csv suffix)
PREPROCESSED_DATA_FILE = "data_preprocessed.csv"

# TF-IDF features and labels (written with pickle, despite the .csv suffix)
TFIDF_DATA_FILE = "data_tfidf.csv"

# Custom word vectors trained on the review dataset
CUSTOM_WORD_VECTORS_MODEL_FILE = "gensim_w2v_amazon_reviews_model"

# Train and test data for the Word2Vec "average word vectors" approach
AVG_WORD_VECTORS_DATA_FILE = "data_avg_word_vectors.pkl"

# Train and test data for the Word2Vec "concatenate top 10 vectors" approach.
# Must differ from AVG_WORD_VECTORS_DATA_FILE, otherwise saving one feature set
# silently overwrites the other.
TOP_10_WORD_VECTORS_DATA_FILE = "data_top_10_word_vectors.pkl"

DATA_COL = "review_body"
TARGET_COL = "star_rating"

# Number of reviews sampled per star rating
N_SAMPLES = 20000

RANDOM_SEED = 42


In [None]:
# Seed NumPy's legacy global RNG and also create a dedicated Generator with the same seed
np.random.seed(RANDOM_SEED)
rng = np.random.default_rng(seed=RANDOM_SEED)


# Data Generation


In [None]:
# Read only the rating and review-text columns from the tab-separated dump, then preview it
data = pd.read_csv(
    f"{DATA_PATH}/{ORIGINAL_DATA_FILE}",
    sep="\t",
    usecols=[TARGET_COL, DATA_COL],
    low_memory=True,
)
data.head()

# Discard rows with missing values
data = data.dropna()

# Discard the malformed row whose star_rating holds the string "2012-12-21"
data = data[data.star_rating != "2012-12-21"]

# Cast the remaining star ratings to integers
data[TARGET_COL] = data.star_rating.astype(int)

# Keep only rows that still have a review body
data = data[data.review_body.notnull()]


In [None]:
# Preview the filtered frame (ratings are integers at this point)
data.head()


In [None]:
# Draw N_SAMPLES reviews from every star rating and renumber the rows from 0
sampled_data = (
    data.groupby(TARGET_COL, group_keys=False)
    .apply(lambda rating_group: rating_group.sample(N_SAMPLES, random_state=RANDOM_SEED))
    .reset_index(drop=True)
)


In [None]:
# Save the sampled (still uncleaned) data; the row index is written as the first, unnamed column
sampled_data.to_csv(f"{DATA_PATH}/{SAMPLED_DATA_FILE}", header=True)


## Data Cleaning


In [None]:
def to_lower(data: pd.Series) -> pd.Series:
    """Lower-case every review (an optional step according to the study).

    Args:
        data (pd.Series): Series of strings

    Returns:
        pd.Series: the same strings in lower case
    """
    lowered = data.str.lower()
    return lowered


In [None]:
def remove_accented_characters(data: pd.Series):
    """Removes accented characters from the Series

    Each string is decomposed with Unicode NFKD normalization and then encoded
    to ASCII, so accents are stripped ("é" -> "e") and any character without an
    ASCII decomposition is dropped.

    Args:
        data (pd.Series): Series of string

    Returns:
        pd.Series: Series of ASCII-only strings
    """
    import unicodedata

    def to_ascii(text: str) -> str:
        decomposed = unicodedata.normalize("NFKD", text)
        return decomposed.encode("ascii", "ignore").decode("utf-8", "ignore")

    return data.apply(to_ascii)


In [None]:
def remove_html_encodings(data: pd.Series):
    """Replace numeric HTML character references such as ``&#34;`` with a single space."""
    numeric_entity = r"&#\d+;"
    return data.str.replace(numeric_entity, " ", regex=True)


In [None]:
def remove_html_tags(data: pd.Series):
    """Replace attribute-less opening or self-closing HTML tags (``<p>``, ``<br />``) with a space."""
    simple_tag = r"<[a-zA-Z]+\s?/?>"
    return data.str.replace(simple_tag, " ", regex=True)


In [None]:
def remove_url(data: pd.Series):
    """Replace http/https URLs that contain a path component with a single space."""
    url_with_path = r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+"
    return data.str.replace(url_with_path, " ", regex=True)


In [None]:
def remove_html_and_url(data: pd.Series):
    """Function to remove
             1. HTML encodings (numeric character references such as ``&#34;``)
             2. HTML tags (attribute-less opening or self-closing tags)
             3. URLs (http/https URLs that contain a path component)

    Every match is replaced by a single space. The input Series is not modified.

    Args:
        data (pd.Series): A Pandas series of type string

    Returns:
        pd.Series: a new Series with the matches replaced
    """
    # Series.str.replace returns a new Series, so each result has to be carried forward.

    # Remove HTML encodings
    cleaned = data.str.replace(r"&#\d+;", " ", regex=True)

    # Remove HTML tags
    cleaned = cleaned.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)

    # Remove URLs
    cleaned = cleaned.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)

    return cleaned


In [None]:
# Handle emoji
def convert_emoji_to_txt(data: pd.Series):
    """Replace Unicode emoji and text emoticons with a space-padded textual description.

    The descriptions come from the `emot` package. Commas, colons and underscores
    are stripped from emoji descriptions; commas and spaces are stripped from
    emoticon descriptions.

    Args:
        data (pd.Series): Series of strings

    Returns:
        pd.Series: Series of strings with emoji/emoticons replaced by text
    """
    from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO

    emo_to_txt = dict()
    for emoji, description in UNICODE_EMOJI.items():
        emo_to_txt[emoji] = f" {re.sub(r',|:|_', '', description)} "

    # Key each entry by its own emoticon; the loop used to write to a stale key
    # left over from the emoji loop, so no emoticon was ever registered.
    # NOTE(review): some emoticon keys may also occur inside ordinary text such as
    # URLs; confirm this step should run before URL removal.
    for emoticon, description in EMOTICONS_EMO.items():
        emo_to_txt[emoticon] = f" {re.sub(r',| ', '', description)} "

    def convert_emojis(text):
        for symbol, replacement in emo_to_txt.items():
            text = text.replace(symbol, replacement)
        return text

    return data.apply(convert_emojis)


In [None]:
def remove_non_alpha_characters(data: pd.Series):
    """Replace everything except ASCII letters, digits and whitespace with a space.

    A run of underscores collapses into one space; every other removed
    character becomes its own space.
    """
    unwanted = r"_+|\\|[^a-zA-Z0-9\s]"
    return data.str.replace(unwanted, " ", regex=True)


In [None]:
def remove_extra_spaces(data: pd.Series):
    """Collapse every whitespace run into one space.

    The pattern also matches the (possibly empty) whitespace at the start of the
    string, so every result begins with exactly one space.
    """
    whitespace_runs = r"^\s*|\s\s*"
    return data.str.replace(whitespace_runs, " ", regex=True)


In [None]:
def fix_contractions(data: pd.Series):
    """Expand contractions word by word using the `contractions` package.

    Each string is split on whitespace, every token is passed through
    `contractions.fix`, and the results are re-joined with single spaces.
    """
    import contractions

    def contraction_fixer(txt: str):
        expanded_tokens = map(contractions.fix, txt.split())
        return " ".join(expanded_tokens)

    return data.apply(contraction_fixer)


In [None]:
# Maps each column to the ordered list of cleaning functions applied to it
data_cleaning_pipeline = {
    DATA_COL: [
        convert_emoji_to_txt,
        to_lower,
        remove_accented_characters,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

cleaned_data = sampled_data.copy()

# Run every column through its cleaning steps, in order
for col, pipeline in data_cleaning_pipeline.items():
    # Work on a copy of the column so a failing step cannot leave it half-cleaned
    column_values = cleaned_data[col].copy()

    for cleaning_step in pipeline:
        print(f"Starting: {cleaning_step.__name__}")
        column_values = cleaning_step(column_values)
        print(f"Ended: {cleaning_step.__name__}")

    # Swap the cleaned column back into the frame
    cleaned_data[col] = column_values.copy()


In [None]:
# Store the cleaned data as CSV without the row index; later cells reload it from this file
cleaned_data.to_csv(f"{DATA_PATH}/{CLEANED_DATA_FILE}", sep=",", index=False)


## Data Preprocessing (used by TF-IDF Models)


In [None]:
def tokenize(data: pd.Series):
    """Split every document into a list of tokens with NLTK's `word_tokenize`.

    Downloads the NLTK "punkt" resource on each call.
    """
    from nltk.tokenize import word_tokenize

    nltk.download("punkt")

    tokenized = data.apply(word_tokenize)
    return tokenized


In [None]:
from typing import List, Set


def remove_stopwords(data: pd.Series):
    """Remove English stop words using the NLTK stopwords corpus

    Args:
        data (pd.Series): Series whose elements are lists of tokens

    Returns:
        pd.Series: Series of token lists with the stop words filtered out
    """
    from nltk.corpus import stopwords as nltk_stopwords

    nltk.download("stopwords")

    # words() without a language returns the stop words of every bundled language,
    # which would also strip ordinary English words; the reviews are English.
    english_stopwords = set(nltk_stopwords.words("english"))

    def remover(word_list):
        return [word for word in word_list if word not in english_stopwords]

    return data.apply(remover)


In [None]:
def lemmatize(data: pd.Series, consider_pos_tag: bool = True):
    """Lemmatize every token of every document with NLTK's WordNet lemmatizer.

    Args:
        data (pd.Series): Series whose elements are lists of tokens
        consider_pos_tag (bool): when True, tokens are POS-tagged first and the
            tag is passed to the lemmatizer; tokens whose tag does not map to
            adjective, verb, noun or adverb are kept unchanged. When False,
            every token is lemmatized with the lemmatizer's default POS.

    Returns:
        pd.Series: Series of lemmatized token lists that carries the SAME index
        as `data`, so it can be assigned back to the originating DataFrame
        without misaligning rows.
    """
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer

    nltk.download("omw-1.4")

    lemmatizer = WordNetLemmatizer()

    if not consider_pos_tag:
        lemmatized = [[lemmatizer.lemmatize(word) for word in row] for row in data]
        return pd.Series(lemmatized, index=data.index)

    from nltk import pos_tag

    nltk.download("averaged_perceptron_tagger")

    # Convert a POS tag to the matching WordNet POS constant (None when there is none)
    def wordnet_pos_tagger(tag: str):
        if tag.startswith("J"):
            return wordnet.ADJ
        elif tag.startswith("V"):
            return wordnet.VERB
        elif tag.startswith("N"):
            return wordnet.NOUN
        elif tag.startswith("R"):
            return wordnet.ADV
        else:
            return None

    def lemmatize_tagged(word: str, tag: str):
        wordnet_pos_tag = wordnet_pos_tagger(tag)
        if wordnet_pos_tag is None:
            return word
        return lemmatizer.lemmatize(word, wordnet_pos_tag)

    lemmatized = [[lemmatize_tagged(word, tag) for word, tag in pos_tag(row)] for row in data]

    # Keep the caller's index: a fresh RangeIndex would be re-aligned on assignment
    # and pair reviews with the wrong rows (or NaN) in a shuffled train/test frame.
    return pd.Series(lemmatized, index=data.index)


In [None]:
def concatenate(data: pd.Series):
    """Join each list of tokens back into one space-separated string."""
    return data.apply(" ".join)


## Data Split


In [None]:
# Stratified 80/20 train/test split of the cleaned reviews
import pickle as pkl
from sklearn.model_selection import train_test_split

# Reload the cleaned data from disk
cleaned_data = pd.read_csv(f"{DATA_PATH}/{CLEANED_DATA_FILE}")

# Stratify on the star rating so both splits keep the class balance
train, test = train_test_split(
    cleaned_data,
    test_size=0.2,
    stratify=cleaned_data[TARGET_COL],
    random_state=RANDOM_SEED,
)

# Persist both frames together as one pickled tuple
with open(f"{DATA_PATH}/{DATA_FILE}", mode="wb") as file:
    pkl.dump((train, test), file)


### Do Preprocessing


In [None]:
# Reload the pickled (train, test) split of the cleaned data
import pickle as pkl

with open(f"{DATA_PATH}/{DATA_FILE}", mode="rb") as file:
    train, test = pkl.load(file)


In [None]:
import pickle as pkl

# NOTE: the cell that follows starts with these same statements before running the pipeline
preprocessing_pipeline = {DATA_COL: [tokenize, lemmatize, concatenate]}

# Work on copies so `train` and `test` stay untouched
preprocessed_train_data = train.copy()
preprocessed_test_data = test.copy()

In [None]:
import pickle as pkl

# Ordered preprocessing steps per column
preprocessing_pipeline = {DATA_COL: [tokenize, lemmatize, concatenate]}

# Work on copies so `train` and `test` stay untouched
preprocessed_train_data = train.copy()
preprocessed_test_data = test.copy()

# Run every column through its preprocessing steps, in order
for col, pipeline in preprocessing_pipeline.items():
    temp_data_train = preprocessed_train_data[col].copy()
    temp_data_test = preprocessed_test_data[col].copy()

    # The loop variable is deliberately not called `func`: that name is the alias of
    # torch.nn.functional later in the notebook and must not be clobbered on a re-run.
    for step in pipeline:
        print(f"Starting: {step.__name__}")

        step_kwargs = {"consider_pos_tag": True} if step.__name__ == "lemmatize" else {}
        temp_data_train = step(temp_data_train, **step_kwargs)
        temp_data_test = step(temp_data_test, **step_kwargs)

        print(f"Ended: {step.__name__}")

    # Assign by position rather than by index label. Every step keeps row order and
    # length, but a step may return a Series with a fresh 0..n-1 index; label-based
    # alignment would then pair reviews with the wrong ratings in the shuffled split.
    assert len(temp_data_train) == len(preprocessed_train_data)
    assert len(temp_data_test) == len(preprocessed_test_data)
    preprocessed_train_data[col] = temp_data_train.to_numpy()
    preprocessed_test_data[col] = temp_data_test.to_numpy()

# Remove empty reviews, then rows with missing values
preprocessed_train_data = preprocessed_train_data[preprocessed_train_data[DATA_COL].str.len() != 0].dropna()
preprocessed_test_data = preprocessed_test_data[preprocessed_test_data[DATA_COL].str.len() != 0].dropna()

# Save the preprocessed data
with open(f"{DATA_PATH}/{PREPROCESSED_DATA_FILE}", mode="wb") as file:
    pkl.dump((preprocessed_train_data, preprocessed_test_data), file)

## Helper Functions


In [None]:
def calc_metrics(y_true, y_pred, avg_type="macro"):
    """Compute precision, recall, F1 and accuracy for a set of predictions.

    Args:
        y_true: ground-truth labels
        y_pred: predicted labels
        avg_type (str): `average` argument passed to precision, recall and F1

    Returns:
        tuple: (precision, recall, f1, accuracy)
    """
    from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

    averaged_scores = tuple(
        scorer(y_true, y_pred, average=avg_type) for scorer in (precision_score, recall_score, f1_score)
    )

    return (*averaged_scores, accuracy_score(y_true, y_pred))


In [None]:
def print_metics(precision, recall, f1, accuracy):
    """Print the four metrics returned by `calc_metrics`, one per line."""
    report_lines = [
        f"Avg. Precision: {precision}",
        f"Avg. Recall: {recall}",
        f"Avg. F1: {f1}",
        f"Accuray Score: {accuracy}",
    ]
    print("\n".join(report_lines))


## Word Embedding


### Load `word2vec-google-news-300` Model

Learn how to extract word embeddings for your dataset. Try to check semantic similarities of the generated vectors using three examples of your own, e.g., King − Man + Woman = Queen or excellent ∼ outstanding.


In [None]:
! pip install gensim

In [None]:
import gensim.downloader as api

# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader API
w2v_google = api.load("word2vec-google-news-300")


In [None]:
# Analogy check on the pretrained vectors: king - man + woman
w2v_google.most_similar(positive=["king", "woman"], negative=["man"], topn=1)


In [None]:
# Similarity check on the pretrained vectors: excellent ~ outstanding
w2v_google.similarity("excellent", "outstanding")


Three Semantic Similarity Examples


In [None]:
# Own example 1 (pretrained vectors): worst ~ terrible
w2v_google.similarity("worst", "terrible")


In [None]:
# Own example 2 (pretrained vectors): cheap ~ disappointed
w2v_google.similarity("cheap", "disappointed")


In [None]:
# Own example 3 (pretrained vectors): cheap + worst - costly
w2v_google.most_similar(positive=["cheap", "worst"], negative=["costly"], topn=1)


### Train Word2Vec on own Dataset


In [None]:
from gensim.test.utils import datapath
from gensim import utils


class AmazonReviewCorpus:
    """Re-iterable corpus of tokenized reviews for gensim.

    Every iteration re-reads the cleaned CSV and yields one token list per
    review, produced by `gensim.utils.simple_preprocess`.
    """

    def __iter__(self):
        review_bodies = pd.read_csv(f"{DATA_PATH}/{CLEANED_DATA_FILE}", sep=",")["review_body"]
        yield from map(utils.simple_preprocess, review_bodies)


In [None]:
from gensim.models import Word2Vec
from gensim import utils

# Untrained model: 300-dimensional vectors, context window of 11, words seen fewer than
# 10 times are ignored, 4 worker threads
w2v_custom = Word2Vec(vector_size=300, min_count=10, window=11, workers=4)


In [None]:
# Build the vocabulary from the streaming corpus (the cleaned CSV is re-read on every pass)
reviews = AmazonReviewCorpus()

w2v_custom.build_vocab(reviews, progress_per=1000)


In [None]:
# Train the model, reusing the example count recorded by build_vocab and the model's own epoch setting
w2v_custom.train(reviews, total_examples=w2v_custom.corpus_count, epochs=w2v_custom.epochs)


In [None]:
# Save the trained model under MODEL_PATH so the sections below can reload it
w2v_custom.save(f"{MODEL_PATH}/{CUSTOM_WORD_VECTORS_MODEL_FILE}")


#### Using trained model


In [None]:
# Load the trained model from disk; its word vectors are accessed through `w2v_custom.wv`
w2v_custom = Word2Vec.load(f"{MODEL_PATH}/{CUSTOM_WORD_VECTORS_MODEL_FILE}")


In [None]:
# Analogy check on the custom vectors: king - man + woman
w2v_custom.wv.most_similar(positive=["king", "woman"], negative=["man"], topn=1)


In [None]:
# Similarity check on the custom vectors: excellent ~ outstanding
w2v_custom.wv.similarity("excellent", "outstanding")


Three Semantic Similarity Examples


In [None]:
# Own example 1 on the custom-trained vectors (this section evaluates w2v_custom, not the pretrained model)
w2v_custom.wv.similarity("worst", "terrible")


In [None]:
# Own example 2 on the custom-trained vectors (this section evaluates w2v_custom, not the pretrained model)
w2v_custom.wv.similarity("cheap", "disappointed")


In [None]:
# Own example 3 on the custom-trained vectors (this section evaluates w2v_custom, not the pretrained model)
w2v_custom.wv.most_similar(positive=["cheap", "worst"], negative=["costly"], topn=1)


Check semantic similarities from words used in earlier part.


What do you conclude from comparing vectors generated by yourself and the pretrained model? Which of the Word2Vec models seems to encode semantic similarities between words better?


TODO: Answer


## Avg. Word Vectors


Use the average Word2Vec vectors for each review as the input feature ($x = \frac{1}{N}\sum_{i=1}^{N} W_i$ for a review with $N$ words).


In [None]:
def calculate_avg_word_vector(words: List[str], w2v_model, vector_size: int = 300):
    """Average the word vectors of all in-vocabulary words of one review.

    Args:
        words (List[str]): tokens of a single review
        w2v_model: mapping-like object where `w2v_model[word]` returns the word's
            vector and raises KeyError for unknown words
        vector_size (int): dimensionality of the vectors (300 by default)

    Returns:
        np.ndarray: float array of shape (vector_size,). When no token is in the
        vocabulary (including an empty token list) every entry is NaN, so the
        review can be removed later with `dropna`.
    """
    vector_sum = np.zeros(vector_size, dtype=float)
    found_words = 0

    for word in words:
        try:
            vector_sum += w2v_model[word]
        except KeyError:
            # Out-of-vocabulary words do not contribute to the average
            continue
        found_words += 1

    if found_words == 0:
        # Return the NaN vector explicitly instead of relying on a 0/0 division
        return np.full(vector_size, np.nan)

    return vector_sum / found_words


##### Perform Word2Vec conversion


In [None]:
# Reload the cleaned (not yet preprocessed) train/test split for the Word2Vec features
import pickle as pkl

with open(f"{DATA_PATH}/{DATA_FILE}", mode="rb") as file:
    train_wv, test_wv = pkl.load(file)


In [None]:
from functools import partial
from gensim.utils import simple_preprocess

TEMP_COL = "avg_word_vector"
VECTOR_COLS = [f"vector_{i}" for i in range(300)]


def build_avg_word_vector_features(reviews: pd.DataFrame, w2v_model):
    """Turn a frame of reviews into averaged Word2Vec features and labels.

    The input frame is copied, so the cell can be re-run without tokenizing
    already tokenized reviews.

    Args:
        reviews (pd.DataFrame): frame with the DATA_COL (raw review text) and
            TARGET_COL (star rating) columns
        w2v_model: word-vector model passed on to `calculate_avg_word_vector`

    Returns:
        tuple: (X, y) where X holds the VECTOR_COLS feature columns and y the
        star ratings; rows containing NaN (e.g. reviews without any
        in-vocabulary word) are dropped from both.
    """
    reviews = reviews.copy()

    # Tokenize with gensim's simple_preprocess
    reviews[DATA_COL] = reviews[DATA_COL].apply(simple_preprocess)
    reviews[TEMP_COL] = reviews[DATA_COL].apply(partial(calculate_avg_word_vector, w2v_model=w2v_model))

    # One column per vector component, aligned on the review index
    wv_df = pd.DataFrame(reviews[TEMP_COL].to_list(), index=reviews.index, columns=VECTOR_COLS)
    reviews = pd.concat([reviews, wv_df], axis=1).dropna()

    return reviews.drop([DATA_COL, TEMP_COL, TARGET_COL], axis=1), reviews[TARGET_COL]


# Train Data
X_wv_train, y_wv_train = build_avg_word_vector_features(train_wv, w2v_google)

# Test Data: built from the test split itself (not from `test` or `train_wv`)
X_wv_test, y_wv_test = build_avg_word_vector_features(test_wv, w2v_google)

# Guard against evaluating on the training rows
assert len(X_wv_train) == len(y_wv_train) and len(X_wv_test) == len(y_wv_test)
assert X_wv_train.index.intersection(X_wv_test.index).empty


In [None]:
# Save the averaged word-vector features and labels (train and test) as one pickled tuple
import pickle as pkl

with open(f"{DATA_PATH}/{AVG_WORD_VECTORS_DATA_FILE}", mode="wb") as file:
    pkl.dump((X_wv_train, y_wv_train, X_wv_test, y_wv_test), file)


## Simple Models


Report your accuracy values on the testing split for these models, similar to HW1: for each of the perceptron and SVM models, report two accuracy values, one for Word2Vec features and one for TF-IDF features.


### TF-IDF Vectorization


In [None]:
# Reload the preprocessed train/test split
import pickle as pkl

with open(f"{DATA_PATH}/{PREPROCESSED_DATA_FILE}", mode="rb") as file:
    train_preprocessed, test_preprocessed = pkl.load(file)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

nltk.download("punkt")

# TF-IDF over NLTK word tokens
vectorizer = TfidfVectorizer(tokenizer=word_tokenize)

# The vocabulary and IDF weights are fitted on train + test together: the dataset is small
# and a train-only fit is not required by the homework
all_data = pd.concat([train_preprocessed, test_preprocessed], axis=0)
vectorizer.fit(all_data[DATA_COL])

X_tfidf_train, X_tfidf_test = (
    vectorizer.transform(split_frame[DATA_COL]) for split_frame in (train_preprocessed, test_preprocessed)
)
y_tfidf_train, y_tfidf_test = train_preprocessed[TARGET_COL], test_preprocessed[TARGET_COL]


In [None]:
# Save the TF-IDF features and labels (train and test) as one pickled tuple
import pickle as pkl

with open(f"{DATA_PATH}/{TFIDF_DATA_FILE}", mode="wb") as file:
    pkl.dump((X_tfidf_train, y_tfidf_train, X_tfidf_test, y_tfidf_test), file)


### Perceptron


> #### TF-IDF Based Approach


In [None]:
# Reload the TF-IDF features and labels
import pickle as pkl

with open(f"{DATA_PATH}/{TFIDF_DATA_FILE}", mode="rb") as file:
    X_tfidf_train, y_tfidf_train, X_tfidf_test, y_tfidf_test = pkl.load(file)


In [None]:
from sklearn.linear_model import Perceptron

# Class-balanced perceptron with early stopping; fit() returns the estimator itself
perceptron_tfidf_clf = Perceptron(
    class_weight="balanced",
    early_stopping=True,
    tol=1e-4,
    random_state=RANDOM_SEED,
    alpha=0.012,
    max_iter=8000,
).fit(X_tfidf_train, y_tfidf_train)

# Evaluate on the held-out split
y_tfidf_pred = perceptron_tfidf_clf.predict(X_tfidf_test)
print_metics(*calc_metrics(y_tfidf_test, y_tfidf_pred))


In [None]:
# Release the fitted perceptron and its predictions so the names cannot leak into later sections
del perceptron_tfidf_clf, y_tfidf_pred


> #### Word2Vec Based Approach - Avg. Word Vectors


In [None]:
# Reload the averaged Word2Vec features and labels
import pickle as pkl

with open(f"{DATA_PATH}/{AVG_WORD_VECTORS_DATA_FILE}", mode="rb") as file:
    X_wv_train, y_wv_train, X_wv_test, y_wv_test = pkl.load(file)


In [None]:
from sklearn.linear_model import Perceptron

# A single classifier definition. The cell used to build a class-balanced perceptron and
# immediately overwrite it with this unweighted one, so the first was dead code; the
# effective (unweighted) configuration is kept.
perceptron_wv_clf = Perceptron(max_iter=8000, alpha=0.012, random_state=RANDOM_SEED, tol=1e-4, early_stopping=True)

perceptron_wv_clf.fit(X_wv_train, y_wv_train)

y_wv_pred = perceptron_wv_clf.predict(X_wv_test)

print_metics(*calc_metrics(y_wv_test, y_wv_pred))


In [None]:
# Release the fitted perceptron and its predictions so the names cannot leak into later sections
del perceptron_wv_clf, y_wv_pred


### SVM


> #### TF-IDF Based


In [None]:
# Reload the TF-IDF features and labels
import pickle as pkl

with open(f"{DATA_PATH}/{TFIDF_DATA_FILE}", mode="rb") as file:
    X_tfidf_train, y_tfidf_train, X_tfidf_test, y_tfidf_test = pkl.load(file)


In [None]:
from sklearn.svm import LinearSVC

# Class-balanced linear SVM solved in the primal (dual=False); fit() returns the estimator itself
svm_tfidf_clf = LinearSVC(
    class_weight="balanced",
    C=0.1,
    dual=False,
    max_iter=1000,
    random_state=RANDOM_SEED,
).fit(X_tfidf_train, y_tfidf_train)

# Evaluate on the held-out split
y_tfidf_pred = svm_tfidf_clf.predict(X_tfidf_test)
print_metics(*calc_metrics(y_tfidf_test, y_tfidf_pred))


In [None]:
# Release the fitted SVM and its predictions so the names cannot leak into later sections
del svm_tfidf_clf, y_tfidf_pred


> #### Word2Vec Based Approach - Avg. Word Vectors


In [None]:
# Reload the averaged Word2Vec features and labels
import pickle as pkl

with open(f"{DATA_PATH}/{AVG_WORD_VECTORS_DATA_FILE}", mode="rb") as file:
    X_wv_train, y_wv_train, X_wv_test, y_wv_test = pkl.load(file)


In [None]:
from sklearn.svm import LinearSVC

# Hand-tuned weight per star rating (keys are the ratings 1..5)
class_weight = {1: 0.9525, 2: 1.99825, 3: 1.9225, 4: 0.625, 5: 0.8585}

# Linear SVM solved in the primal (dual=False); fit() returns the estimator itself
svm_wv_clf = LinearSVC(
    class_weight=class_weight,
    C=0.1,
    dual=False,
    max_iter=1000,
    random_state=RANDOM_SEED,
).fit(X_wv_train, y_wv_train)

# Evaluate on the held-out split
y_wv_avged_pred = svm_wv_clf.predict(X_wv_test)
print_metics(*calc_metrics(y_wv_test, y_wv_avged_pred))


In [None]:
# Release the fitted SVM and its predictions so the names cannot leak into later sections
del svm_wv_clf, y_wv_avged_pred


What do you conclude from comparing performances for the models trained using the two different feature types (TF-IDF and your trained Word2Vec features)?


TODO: Answer


> ## PyTorch Setup


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as func
from torch.utils.data import TensorDataset, DataLoader

# Pick the best available backend: CUDA GPU first, then Apple MPS, otherwise the CPU.
# torch.backends.mps does not exist on older PyTorch builds, so probe for it before use.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using Device: {device}")


#### Helper Functions


In [None]:
def train_model(data_loader, train_model, n_epochs, optimizer, criterion):
    """Train a model for a fixed number of epochs and print the mean loss per epoch.

    Args:
        data_loader: iterable yielding (inputs, targets) batches
        train_model (torch.nn.Module): the model to optimize (updated in place)
        n_epochs (int): number of passes over `data_loader`
        optimizer: optimizer bound to the parameters of `train_model`
        criterion: loss function called as `criterion(output, target)`

    Returns:
        torch.nn.Module: the same `train_model` instance after training
    """
    for epoch in range(n_epochs):
        train_loss = 0.0
        n_seen = 0

        # Enable training-mode behaviour (e.g. dropout)
        train_model.train()
        for data, target in data_loader:
            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            # forward pass: compute predicted outputs by passing inputs to the model
            output = train_model(data)
            # calculate the loss
            loss = criterion(output, target)
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            # accumulate the batch loss weighted by the batch size
            batch_size = data.size(0)
            train_loss += loss.item() * batch_size
            n_seen += batch_size

        # Average over the samples actually processed: with drop_last=True the loader
        # skips the final partial batch, so the dataset size would be the wrong divisor.
        train_loss = train_loss / max(n_seen, 1)

        print(f"Epoch: {epoch + 1} \tTraining Loss: {train_loss:.6f}")

    return train_model


In [None]:
def predict(model, dataloader):
    """Predict the class index of every sample served by a data loader.

    Args:
        model (torch.nn.Module): trained classifier returning one score per class
        dataloader: iterable yielding either input batches or (inputs, labels)
            batches; labels, when present, are ignored

    Returns:
        list: one CPU tensor of predicted class indices per batch, in loader order
    """
    was_training = model.training
    # Evaluation mode disables dropout; no_grad skips building the autograd graph
    model.eval()

    prediction_list = []
    try:
        with torch.no_grad():
            for batch in dataloader:
                # A loader over (input, label) pairs yields a list/tuple per batch;
                # only the inputs go through the model.
                inputs = batch[0] if isinstance(batch, (list, tuple)) else batch
                outputs = model(inputs)
                _, predicted = torch.max(outputs, 1)
                prediction_list.append(predicted.cpu())
    finally:
        # Leave the model in the mode the caller had it in
        model.train(was_training)

    return prediction_list


## Feedforward Neural Network


Using the Word2Vec features, train a feedforward multilayer perceptron network for classification. Consider a network with two hidden layers, with 50 and 10 nodes, respectively. You can use cross entropy loss and your own choice for other hyperparameters, e.g., nonlinearity, number of epochs, etc. Part of getting good results is to select good values for these hyperparameters.


In [None]:
# Mini-batch size used by the FNN data loaders
FNN_BATCH_SIZE = 200


### Create FNN


In [None]:
class FNN(nn.Module):
    """Feedforward classifier with two hidden layers of 50 and 10 units.

    ReLU and dropout follow each hidden layer; the output layer returns raw
    scores (no softmax).
    """

    def __init__(self, n_input, n_output, dropout_rate) -> None:
        super().__init__()

        # Layer sizes and dropout probability are kept as attributes
        self.n_input, self.n_hidden_1, self.n_hidden_2, self.n_output = n_input, 50, 10, n_output
        self.dropout_rate = dropout_rate

        self.fc1 = nn.Linear(n_input, self.n_hidden_1)
        self.fc2 = nn.Linear(self.n_hidden_1, self.n_hidden_2)
        self.fc3 = nn.Linear(self.n_hidden_2, n_output)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        hidden_1 = self.dropout(func.relu(self.fc1(x)))
        hidden_2 = self.dropout(func.relu(self.fc2(hidden_1)))
        return self.fc3(hidden_2)


In [None]:
# 300 input features, 5 output classes, 20% dropout; moved to the selected device
fnn_model = FNN(n_input=300, n_output=5, dropout_rate=0.2).to(device)
fnn_model


In [None]:
# Loss function: cross entropy over the raw scores returned by the network
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam over the parameters of the current `fnn_model`, learning rate 0.01
optimizer = torch.optim.Adam(fnn_model.parameters(), lr=1e-2)


> ### Using Avg. Word2Vec Vectors


In [None]:
# Reload the averaged Word2Vec features and labels
import pickle as pkl

with open(f"{DATA_PATH}/{AVG_WORD_VECTORS_DATA_FILE}", mode="rb") as file:
    X_wv_train, y_wv_train, X_wv_test, y_wv_test = pkl.load(file)


In [None]:
import torch


class AmazonReviewsDataset(torch.utils.data.Dataset):
    """PyTorch dataset over an indexable collection of (input, label) pairs."""

    def __init__(self, inputs, transform=None):
        """Store the underlying collection; `transform` is accepted but not used."""
        self.data = inputs

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the (input, label) pair stored at `index`."""
        # Tensor indices are converted to plain Python values first
        position = index.tolist() if torch.is_tensor(index) else index
        sample = self.data[position]
        return sample[0], sample[1]


In [None]:
X_wv_train_tensor = torch.from_numpy(X_wv_train.values).float().to(device)
X_wv_test_tensor = torch.from_numpy(X_wv_test.values).float().to(device)

# nn.CrossEntropyLoss expects integer class indices in [0, n_classes). The star ratings
# are 1..5, so shift them to 0..4 and store them as int64 instead of float.
y_wv_train_tensor = (torch.from_numpy(y_wv_train.values) - 1).long().to(device)
y_wv_test_tensor = (torch.from_numpy(y_wv_test.values) - 1).long().to(device)

fnn_train = AmazonReviewsDataset(TensorDataset(X_wv_train_tensor, y_wv_train_tensor))
fnn_train_loader = DataLoader(fnn_train, batch_size=FNN_BATCH_SIZE, drop_last=True, shuffle=True)

# Evaluation must see every test sample exactly once and in the original order,
# so the test loader neither shuffles nor drops the last partial batch.
fnn_test = AmazonReviewsDataset(TensorDataset(X_wv_test_tensor, y_wv_test_tensor))
fnn_test_loader = DataLoader(fnn_test, batch_size=FNN_BATCH_SIZE, drop_last=False, shuffle=False)


In [None]:
# Train for a single epoch (the third argument is n_epochs)
fnn_model = train_model(fnn_train_loader, fnn_model, 1, optimizer, criterion)


In [None]:
# Batch the test features directly, unshuffled, so predictions stay aligned with y_wv_test.
# (A DataLoader must wrap a dataset, not another DataLoader.)
test_loader = DataLoader(X_wv_test_tensor, batch_size=FNN_BATCH_SIZE, shuffle=False)

# predict() returns one tensor of class indices per batch; join them into one array.
# The network's classes are 0..4, so add 1 to get back to 1..5 star ratings
# (assumes the model was trained on star_rating - 1).
y_wv_pred = torch.cat(predict(fnn_model, test_loader)).numpy() + 1

print_metics(*calc_metrics(y_wv_test, y_wv_pred))


Report accuracy on the testing split.


> ### Concatenate First 10 Word2Vec Vectors


In [None]:
def concatenate_top_n_wv(sentence, w2v_model, n=10, vector_size=300):
    """Concatenate the vectors of the first `n` in-vocabulary words of a review.

    Args:
        sentence: either a whitespace-separated string or an already tokenized
            list of words
        w2v_model: mapping-like object where `w2v_model[word]` returns the word's
            vector and raises KeyError for unknown words
        n (int): number of word vectors to concatenate
        vector_size (int): dimensionality of a single word vector

    Returns:
        np.ndarray: float64 array of length `n * vector_size`. Slot i holds the
        vector of the i-th in-vocabulary word; unused trailing slots stay zero.
        When no word is in the vocabulary every entry is NaN, so the review can
        be removed later with `dropna` (a bare scalar NaN cannot be expanded
        into feature columns).
    """
    words = sentence.split() if isinstance(sentence, str) else list(sentence)

    top_n_wvs = np.zeros(vector_size * n, dtype=np.float64)
    filled = 0

    for word in words:
        if filled == n:
            break
        try:
            vector = w2v_model[word]
        except KeyError:
            # Out-of-vocabulary words are skipped and do not use up a slot
            continue
        top_n_wvs[filled * vector_size : (filled + 1) * vector_size] = vector
        filled += 1

    if filled == 0:
        return np.full(vector_size * n, np.nan)

    return top_n_wvs


##### Apply Top 10 Word Vectors


In [None]:
# Reload the cleaned (not yet preprocessed) train/test split for the top-10 word-vector features
import pickle as pkl

with open(f"{DATA_PATH}/{DATA_FILE}", mode="rb") as file:
    train_wv_top_10, test_wv_top_10 = pkl.load(file)


In [None]:
from functools import partial
from gensim.utils import simple_preprocess

TEMP_COL = "top_10_wv"
VECTOR_COLS = ["vector_" + f"_{i}" for i in range(3000)]


def build_top_10_word_vector_features(reviews: pd.DataFrame, w2v_model):
    """Turn a frame of reviews into concatenated top-10 Word2Vec features and labels.

    The input frame is copied, so the cell can be re-run without tokenizing
    already tokenized reviews.

    Args:
        reviews (pd.DataFrame): frame with the DATA_COL (raw review text) and
            TARGET_COL (star rating) columns
        w2v_model: word-vector model passed on to `concatenate_top_n_wv`

    Returns:
        tuple: (X, y) where X holds the VECTOR_COLS feature columns and y the
        star ratings; rows containing NaN are dropped from both.
    """
    reviews = reviews.copy()

    # Tokenize with gensim's simple_preprocess
    reviews[DATA_COL] = reviews[DATA_COL].apply(simple_preprocess)
    reviews[TEMP_COL] = reviews[DATA_COL].apply(partial(concatenate_top_n_wv, w2v_model=w2v_model, n=10))

    # One column per vector component, aligned on the review index
    wv_df = pd.DataFrame(reviews[TEMP_COL].to_list(), index=reviews.index, columns=VECTOR_COLS)
    reviews = pd.concat([reviews, wv_df], axis=1).dropna()

    return reviews.drop([DATA_COL, TEMP_COL, TARGET_COL], axis=1), reviews[TARGET_COL]


# Train
X_wv_train, y_wv_train = build_top_10_word_vector_features(train_wv_top_10, w2v_google)

# Test: built from this section's own test split (not from the unrelated `test` frame)
X_wv_test, y_wv_test = build_top_10_word_vector_features(test_wv_top_10, w2v_google)


In [None]:
# Save the concatenated top-10 word-vector features and labels (train and test) as one pickled tuple
import pickle as pkl

with open(f"{DATA_PATH}/{TOP_10_WORD_VECTORS_DATA_FILE}", mode="wb") as file:
    pkl.dump((X_wv_train, y_wv_train, X_wv_test, y_wv_test), file)


Report the accuracy value on the testing split for your MLP model.


What do you conclude by comparing accuracy values you obtain with those obtained in the “Simple Models” section?


TODO: Answer


## Recurrent Neural Networks


### Simple RNN


Report accuracy values on the testing split for your RNN model.


What do you conclude by comparing accuracy values you obtain with those obtained with feedforward neural network models?


### Gated Recurrent Unit


What do you conclude by comparing accuracy values you obtain with those obtained using simple RNN?


TODO: Answer
