In [1]:
# Allow multiple print statements in a cell in Jupyter Notebook
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"


In [2]:
import pandas as pd
import numpy as np
import nltk

nltk.download("wordnet")
import re


[nltk_data] Downloading package wordnet to /Users/aditya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Install some dependencies
! pip install emot contractions



In [4]:
DATA_PATH = "/Volumes/dataTwo/usc/CSCI_544/assignment_02/data"
MODEL_PATH = "/Volumes/dataTwo/usc/CSCI_544/assignment_02/model"

ORIGINAL_DATA_FILE = "amazon_reviews_us_Jewelry_v1_00.tsv"
CLEANED_DATA_FILE = "data_cleaned.csv"
PREPROCESSED_DATA_FILE = "data_preprocessed.csv"

WORD_EMBEDDINGS_FILE = "gensim_w2v_amazon_reviews"
WORD_VECTORS_DATA_FILE = "data_word_vectors.csv"

DATA_COL = "review_body"
TARGET_COL = "star_rating"

N_SAMPLES = 20000

RANDOM_SEED = 42


In [5]:
np.random.seed(RANDOM_SEED)
rng = np.random.default_rng(seed=RANDOM_SEED)


# Data Generation


In [6]:
# Load the tab separated data file, and print the first 5 rows for confirmation
data = pd.read_csv(f"{DATA_PATH}/{ORIGINAL_DATA_FILE}", sep="\t", usecols=[TARGET_COL, DATA_COL], low_memory=True)
data.head()

# Drop NA values
data.dropna(inplace=True)

# Drop the outlier which is star_rating = "2012-12-21"
data = data[data.star_rating != "2012-12-21"]

# Convert all star rating to integer
data[TARGET_COL] = data.star_rating.astype(int)

# Remove nan valued rows
data = data[data.review_body.notnull()]


  data = pd.read_csv(f"{DATA_PATH}/{ORIGINAL_DATA_FILE}", sep="\t", usecols=[TARGET_COL, DATA_COL], low_memory=True)


Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...


In [7]:
data.head()


Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...


In [8]:
sampled_data = data.groupby(TARGET_COL, group_keys=False).apply(lambda x: x.sample(N_SAMPLES, random_state=RANDOM_SEED))
sampled_data.reset_index(inplace=True)
sampled_data.drop(columns=["index"], inplace=True)


In [9]:
# save data without cleaning
sampled_data.to_csv(f"{DATA_PATH}/{CLEANED_DATA_FILE}", header=True)


## Data Cleaning


In [10]:
# Convert all reviews to lower case (optional according to study)
def to_lower(data: pd.Series):
    return data.str.lower()


In [11]:
def remove_accented_characters(data: pd.Series):
    import unicodedata

    """Removes accented characters from the Series

    Args:
        data (pd.Series): Series of string

    Returns:
        _type_: pd.Series
    """
    import unicodedata

    return data.apply(lambda x: unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8", "ignore"))


In [12]:
def remove_html_encodings(data: pd.Series):
    return data.str.replace(r"&#\d+;", " ", regex=True)


In [13]:
def remove_html_tags(data: pd.Series):
    return data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)


In [14]:
def remove_url(data: pd.Series):
    return data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)


In [15]:
def remove_html_and_url(data: pd.Series):
    """Function to remove
             1. HTML encodings
             2. HTML tags (both closed and open)
             3. URLs

    Args:
        data (pd.Series): A Pandas series of type string

    Returns:
        _type_: pd.Series
    """
    # Remove HTML encodings
    data.str.replace(r"&#\d+;", " ", regex=True)

    # Remove HTML tags (both open and closed)
    data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)

    # Remove URLs
    data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)

    return data


In [16]:
# Handle emoji
def convert_emoji_to_txt(data: pd.Series):
    from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO

    EMO_TO_TXT_DICT = dict()
    for emot in UNICODE_EMOJI:
        EMO_TO_TXT_DICT[emot] = f" {re.sub(r',|:|_', '', UNICODE_EMOJI[emot])} "

    for emo in EMOTICONS_EMO:
        EMO_TO_TXT_DICT[emot] = f" {re.sub(r',| ', '', EMOTICONS_EMO[emo])} "

    def convert_emojis(text, emo_to_txt_dict):
        for emot in emo_to_txt_dict:
            text = text.replace(emot, emo_to_txt_dict[emot])
        return text

    return data.apply(lambda x: convert_emojis(x, EMO_TO_TXT_DICT))


In [17]:
# Remove non-alphabetical characters
def remove_non_alpha_characters(data: pd.Series):
    return data.str.replace(r"_+|\\|[^a-zA-Z0-9\s]", " ", regex=True)


In [18]:
# Remove extra spaces
def remove_extra_spaces(data: pd.Series):
    return data.str.replace(r"^\s*|\s\s*", " ", regex=True)


In [19]:
# Expanding contractions
def fix_contractions(data: pd.Series):
    import contractions

    def contraction_fixer(txt: str):
        return " ".join([contractions.fix(word) for word in txt.split()])

    return data.apply(contraction_fixer)


In [20]:
# A dictionary containing the columns and a list of functions to perform on it in order
data_cleaning_pipeline = {
    DATA_COL: [
        convert_emoji_to_txt,
        to_lower,
        remove_accented_characters,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

cleaned_data = sampled_data.copy()

# Process all the cleaning instructions
for col, pipeline in data_cleaning_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = cleaned_data[col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    cleaned_data[col] = temp_data.copy()


Starting: convert_emoji_to_txt
Ended: convert_emoji_to_txt
Starting: to_lower
Ended: to_lower
Starting: remove_accented_characters
Ended: remove_accented_characters
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


In [21]:
# Store data file
cleaned_data.to_csv(f"{DATA_PATH}/{CLEANED_DATA_FILE}", sep=",", index=False)


## Data Preprocessing (used by TF-IDF Models)


In [22]:
def tokenize(data: pd.Series):
    from nltk.tokenize import word_tokenize

    nltk.download("punkt")

    return data.apply(word_tokenize)


In [23]:
from typing import List, Set


def remove_stopwords(data: pd.Series):
    """Remove stop words using the NLTK stopwords dictionary

    Args:
        string (str): a document

    Returns:
        str: a document with stopwords removed
    """
    from nltk.corpus import stopwords

    nltk.download("stopwords")

    stopwords = set(stopwords.words())

    def remover(word_list: List[str], stopwords: Set[str]):
        return [word for word in word_list if not word in stopwords]

    return data.apply(lambda word_list: remover(word_list, stopwords))


In [24]:
def lemmatize(data: pd.Series, consider_pos_tag: bool = True):
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer

    nltk.download("omw-1.4")

    # POS tagging
    def perform_nltk_pos_tag(data: pd.Series):
        from nltk import pos_tag

        nltk.download("averaged_perceptron_tagger")

        return data.apply(pos_tag)

    # Convert POS tag to wordnet pos tags
    def wordnet_pos_tagger(tag: str):
        if tag.startswith("J"):
            return wordnet.ADJ
        elif tag.startswith("V"):
            return wordnet.VERB
        elif tag.startswith("N"):
            return wordnet.NOUN
        elif tag.startswith("R"):
            return wordnet.ADV
        else:
            return None

    lemmatizer = WordNetLemmatizer()
    lemmatized = list()

    if consider_pos_tag:
        pos_tagged_data = data.copy()
        pos_tagged_data = perform_nltk_pos_tag(data)

        for row in pos_tagged_data:

            lemmatized_row = list()

            if consider_pos_tag:
                for word, tag in row:
                    wordnet_pos_tag = wordnet_pos_tagger(tag)

                    if wordnet_pos_tag is None:
                        lemmatized_row.append(word)
                    else:
                        result = lemmatizer.lemmatize(word, wordnet_pos_tag)
                        lemmatized_row.append(lemmatizer.lemmatize(word, wordnet_pos_tag))

            lemmatized.append(lemmatized_row)
    else:
        for row in data:
            lemmatized_row = list()

            for word in row:
                lemmatized_row.append(lemmatizer.lemmatize(word))

            lemmatized.append(lemmatized_row)

    return pd.Series(lemmatized)


In [25]:
# Concatenate lemmatized sentences back into one sentence
def concatenate(data: pd.Series):
    return data.apply(lambda words: " ".join(words))


### Do Preprocessing


In [26]:
cleaned_data = pd.read_csv(f"{DATA_PATH}/{CLEANED_DATA_FILE}")


In [27]:
preprocessing_pipeline = {DATA_COL: [tokenize, lemmatize, concatenate]}

# Run the pipeline
preprocessed_data = cleaned_data.copy()

# Process all the cleaning instructions
for col, pipeline in preprocessing_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = preprocessed_data[col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")

        if func.__name__ == "lemmatize":
            temp_data = func(temp_data, consider_pos_tag=True)
        else:
            temp_data = func(temp_data)

        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    preprocessed_data[col] = temp_data.copy()

# Remove empty reviews
preprocessed_data = preprocessed_data[preprocessed_data[DATA_COL].str.len() != 0]


Starting: tokenize


[nltk_data] Downloading package punkt to /Users/aditya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Ended: tokenize
Starting: lemmatize


[nltk_data] Downloading package omw-1.4 to /Users/aditya/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/aditya/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Ended: lemmatize
Starting: concatenate
Ended: concatenate


In [28]:
# Save the preprocessed data
preprocessed_data.to_csv(f"{DATA_PATH}/{PREPROCESSED_DATA_FILE}", index=False)


## Data Split


In [29]:
# Load the preprocessed data
preprocessed_data = pd.read_csv(f"{DATA_PATH}/{PREPROCESSED_DATA_FILE}")


In [30]:
# Split the data 80-20 split
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    preprocessed_data, test_size=0.2, stratify=preprocessed_data[TARGET_COL], random_state=RANDOM_SEED
)


## Helper Functions


In [31]:
def calc_metrics(y_true, y_pred, avg_type="macro"):
    from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

    precision = precision_score(y_true, y_pred, average=avg_type)
    recall = recall_score(y_true, y_pred, average=avg_type)
    f1 = f1_score(y_true, y_pred, average=avg_type)
    accuracy = accuracy_score(y_true, y_pred)

    return precision, recall, f1, accuracy


In [32]:
def print_metics(precision, recall, f1, accuracy):
    print(f"Avg. Precision: {precision}")
    print(f"Avg. Recall: {recall}")
    print(f"Avg. F1: {f1}")
    print(f"Accuray Score: {accuracy}")


## Word Embedding


### Load `word2vec-google-news-300` Model

Learn how to extract word embeddings for your dataset. Try to check semantic similarities of the generated vectors using three examples of your own, e.g., King − Man + Woman = Queen or excellent ∼ outstanding.


In [33]:
! pip install gensim



In [34]:
import gensim.downloader as api

w2v_google = api.load("word2vec-google-news-300")


In [35]:
w2v_google.most_similar(positive=["king", "woman"], negative=["man"], topn=1)


[('queen', 0.7118192911148071)]

In [36]:
w2v_google.similarity("excellent", "outstanding")


0.5567486

Three Semantic Similarity Examples


In [37]:
w2v_google.similarity("worst", "terrible")


0.55750686

In [38]:
w2v_google.similarity("cheap", "disappointed")


0.05468019

In [39]:
w2v_google.most_similar(positive=["cheap", "worst"], negative=["costly"], topn=1)


[('crappiest', 0.46163424849510193)]

### Train Word2Vec on own Dataset


In [40]:
data = pd.read_csv(f"{DATA_PATH}/{CLEANED_DATA_FILE}", sep=",")
data.head()


Unnamed: 0,star_rating,review_body
0,1,too small even for the knuckles
1,1,did not fit right
2,1,this stupid kit has 16 gauge needles not 14ga...
3,1,i would not suggest this item i bought the on...
4,1,i am sure that it will be lovely once i get i...


In [41]:
from gensim.test.utils import datapath
from gensim import utils


class AmznReviewCorpus:
    def __iter__(self):
        corpus_path = datapath(f"{DATA_PATH}/{PREPROCESSED_DATA_FILE}")

        data = pd.read_csv(corpus_path)
        for review_body in data["review_body"]:
            yield utils.simple_preprocess(review_body)


In [42]:
import gensim.models

reviews = AmznReviewCorpus()
w2v_model = gensim.models.Word2Vec(sentences=reviews, vector_size=300, min_count=10, window=11)


In [43]:
w2v_model.save(f"{MODEL_PATH}/{WORD_EMBEDDINGS_FILE}")


In [44]:
w2v_model.wv.most_similar(positive=["king", "woman"], negative=["man"], topn=1)


[('familiar', 0.5208674073219299)]

In [45]:
w2v_model.wv.similarity("excellent", "outstanding")


0.7532696

Three Semantic Similarity Examples


In [46]:
w2v_google.similarity("worst", "terrible")


0.55750686

In [47]:
w2v_google.similarity("cheap", "disappointed")


0.05468019

In [48]:
w2v_google.most_similar(positive=["cheap", "worst"], negative=["costly"], topn=1)


[('crappiest', 0.46163424849510193)]

Check semantic similarities from words used in earlier part.


What do you conclude from comparing vectors generated by yourself and the pretrained model? Which of the Word2Vec models seems to encode semantic similarities between words better?


TODO: Answer


## Avg. Word Vectors


Use the average Word2Vec vectors for each review as the input feature (x = N1 􏰀Ni=1 Wi for a review with N words)


In [49]:
def calculate_avg_word_vector(sentence: str, w2v_model):
    result_vector = np.ndarray(shape=(300,), buffer=np.zeros((300,)), dtype=float)
    words = sentence.split(" ")
    removed_word_count = 0
    for word in words:
        try:
            result_vector += w2v_model[word]
        except KeyError:
            removed_word_count += 1
    return result_vector / (len(words) - removed_word_count)


In [50]:
from functools import partial

VECTOR_COLS = ["vector_" + f"_{i}" for i in range(300)]

train_wv = train.copy()
train_wv["avg_word_vector"] = train_wv[DATA_COL].apply(partial(calculate_avg_word_vector, w2v_model=w2v_google))
# google = [google[x] for x in range(len(google)) if np.isnan(google[x][0]).any() == False]
train_wv[VECTOR_COLS] = pd.DataFrame(
    train_wv["avg_word_vector"].to_list(), index=train_wv["avg_word_vector"].index
)
train_wv.dropna(inplace=True)

X_wv_train = train_wv.drop([DATA_COL, "avg_word_vector", TARGET_COL], axis=1)
y_wv_train = train_wv[TARGET_COL]

test_wv = test.copy()
test_wv["avg_word_vector"] = test[DATA_COL].apply(partial(calculate_avg_word_vector, w2v_model=w2v_google))
# google = [google[x] for x in range(len(google)) if np.isnan(google[x][0]).any() == False]
test_wv[VECTOR_COLS] = pd.DataFrame(
    test_wv["avg_word_vector"].to_list(), index=test_wv["avg_word_vector"].index
)
test_wv.dropna(inplace=True)

X_wv_test = train_wv.drop([DATA_COL, "avg_word_vector", TARGET_COL], axis=1)
y_wv_test = train_wv[TARGET_COL]


  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(
  train_wv[VECTOR_COLS] = pd.DataFrame(


In [51]:
# Save the avg'ed word vectors dataset
train_wv.to_csv(f"{DATA_PATH}/{WORD_VECTORS_DATA_FILE}", index=False)
test_wv.to_csv(f"{DATA_PATH}/{WORD_VECTORS_DATA_FILE}", index=False, mode="a")

## Simple Models


Report your accuracy values on the testing split for these models similar to HW1, i.e., for each of perceptron and SVM models, report two accuracy values Word2Vec and TF-IDF features.


### TF-IDF Vectorization


In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

nltk.download("punkt")

vectorizer = TfidfVectorizer(tokenizer=word_tokenize)

# Using entire data to fit as the dataset is small and as using entire dataset is not needed for homework requirement
vectorizer.fit(preprocessed_data[DATA_COL])

X_tfidf_train = vectorizer.transform(train[DATA_COL])
X_tfidf_test = vectorizer.transform(test[DATA_COL])
y_tfidf_train = train[TARGET_COL]
y_tfidf_test = test[TARGET_COL]


[nltk_data] Downloading package punkt to /Users/aditya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True



### Perceptron


In [53]:
from sklearn.linear_model import Perceptron

perceptron_clf = Perceptron(
    max_iter=8000, alpha=0.012, random_state=RANDOM_SEED, tol=1e-4, early_stopping=True, class_weight="balanced"
)


> #### TF-IDF Based Approach


In [54]:
perceptron_clf.fit(X_tfidf_train, y_tfidf_train)

y_tfidf_pred = perceptron_clf.predict(X_tfidf_test)

print_metics(*calc_metrics(y_tfidf_test, y_tfidf_pred))


Avg. Precision: 0.43148402487338205
Avg. Recall: 0.4158747559200956
Avg. F1: 0.4205659226973907
Accuray Score: 0.4158623793569035


> #### Word2Vec Based Approach


In [55]:
perceptron_clf = Perceptron(
    max_iter=8000, alpha=0.012, random_state=RANDOM_SEED, tol=1e-4, early_stopping=True
)

perceptron_clf.fit(X_wv_train, y_wv_train)

y_wv_pred = perceptron_clf.predict(X_wv_test)

print_metics(*calc_metrics(y_wv_test, y_wv_pred))


Avg. Precision: 0.4570699124118292
Avg. Recall: 0.30238016051844246
Avg. F1: 0.23453637060498034
Accuray Score: 0.3023898898898899


### SVM


In [56]:
from sklearn.svm import LinearSVC

class_weight = {1: 0.9525, 2: 1.99825, 3: 1.9225, 4: 0.625, 5: 0.8585}

svm_clf = LinearSVC(dual=False, C=0.1, max_iter=1000, class_weight=class_weight, random_state=RANDOM_SEED)


> #### TF-IDF Based


In [57]:
svm_clf.fit(X_tfidf_train, y_tfidf_train)

y_tfidf_pred = svm_clf.predict(X_tfidf_test)

print_metics(*calc_metrics(y_tfidf_test, y_tfidf_pred))


Avg. Precision: 0.53990986930853
Avg. Recall: 0.4997515446958287
Avg. F1: 0.4822014955371706
Accuray Score: 0.49972495874381156


> #### Word2Vec Based Approach


In [58]:
svm_clf.fit(X_wv_train, y_wv_train)

y_wv_avged_pred = svm_clf.predict(X_wv_test)

print_metics(*calc_metrics(y_wv_test, y_wv_pred))


Avg. Precision: 0.4570699124118292
Avg. Recall: 0.30238016051844246
Avg. F1: 0.23453637060498034
Accuray Score: 0.3023898898898899


What do you conclude from comparing performances for the models trained using the two different feature types (TF-IDF and your trained Word2Vec features)?


TODO: Answer


> ## PyTorch Setup

In [59]:
import torch
import torch.nn as nn
import torch.nn.functional as func
from torch.utils.data import TensorDataset, DataLoader

device = None
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device('cpu')

print(f"Using Device: {device}")

Using Device: mps


#### Helper Functions

In [60]:
def train_model(data_loader, train_model, n_epochs, optimizer, criterion):
    valid_loss_min = np.Inf

    for epoch in range(n_epochs):
        train_loss = 0.0

        ###################
        # train the model #
        ###################
        train_model.train() # prep model for training
        for data, target in data_loader:
            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            # forward pass: compute predicted outputs by passing inputs to the model
            output = train_model(data)
            # calculate the loss
            loss = criterion(output, target)
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            # update running training loss
            train_loss += loss.item()*data.size(0)
        
        # print training/validation statistics 
        # calculate average loss over an epoch
        train_loss = train_loss/len(data_loader.dataset)
        
        print(f"Epoch: {epoch + 1} \tTraining Loss: {train_loss:.6f}")

    return train_model

In [61]:
def predict(model, dataloader):
    prediction_list = []
    for i, batch in enumerate(dataloader):
        outputs = model(batch)
        _, predicted = torch.max(outputs.data, 1) 
        prediction_list.append(predicted.cpu())
    return prediction_list

## Feedforward Neural Network


Using the Word2Vec features, train a feedforward multilayer perceptron net- work for classification. Consider a network with two hidden layers, each with 50 and 10 nodes, respectively. You can use cross entropy loss and your own choice for other hyperparamters, e.g., nonlinearity, number of epochs, etc. Part of getting good results is to select good values for these hyperparamters.

In [62]:
FNN_BATCH_SIZE=200

### Create FNN

In [63]:
class FNN(nn.Module):
    def __init__(self, n_input, n_output, dropout_rate) -> None:
        super().__init__()

        self.n_input = n_input
        self.n_hidden_1 = 50
        self.n_hidden_2 = 10
        self.n_output = n_output
        self.dropout_rate = dropout_rate

        self.fc1 = nn.Linear(self.n_input, self.n_hidden_1)
        self.fc2 = nn.Linear(self.n_hidden_1, self.n_hidden_2)
        self.fc3 = nn.Linear(self.n_hidden_2, self.n_output)
        self.dropout = nn.Dropout(self.dropout_rate)

    def forward(self, x):
        x = func.relu(self.fc1(x))
        x = self.dropout(x)
        x = func.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [64]:
fnn_model = FNN(n_input=300, n_output=5, dropout_rate=0.2).to(device)
fnn_model

FNN(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=5, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [65]:
# Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(fnn_model.parameters(), lr=1e-2)

> ### Using Avg. Word2Vec Vectors

In [67]:
X_wv_train_tensor = torch.from_numpy(X_wv_train.values).float().to(device)
y_wv_train_tensor = torch.from_numpy(y_wv_train.values).float().to(device)
X_wv_test_tensor = torch.from_numpy(X_wv_test.values).float().to(device)
y_wv_test_tensor = torch.from_numpy(y_wv_test.values).float().to(device)

fnn_train = TensorDataset(X_wv_train_tensor, y_wv_train_tensor)
fnn_train_loader = DataLoader(fnn_train, batch_size=FNN_BATCH_SIZE, shuffle=True)

fnn_test = TensorDataset(X_wv_test_tensor, y_wv_test_tensor)
fnn_test_loader = DataLoader(fnn_test, batch_size=FNN_BATCH_SIZE, shuffle=True)

In [68]:
fnn_model = train_model(fnn_train_loader, fnn_model, 1, optimizer, criterion)

: 

: 

In [None]:
test_loader = torch.utils.data.DataLoader(fnn_test_loader, batch_size=FNN_BATCH_SIZE)

y_wv_pred = predict(fnn_model,test_loader)

print(*calc_metrics(y_wv_test, y_wv_pred))

Report accuracy on the testing split.


> ### Concatenate First 10 Word2Vec Vectors


In [None]:
def concatenate_top_n_wv(sentence: str, w2v_model, n = 10):
    result = list()
    for _ in range(n):
        pass

    return result

Report the accuracy value on the testing split for your MLP model.


What do you conclude by comparing accuracy values you obtain with those obtained in the “’Simple Models” section?


TODO: Answer

## Recurrent Neural Networks


### Simple RNN


Report accuracy values on the testing split for your RNN model.


What do you conclude by comparing accuracy values you obtain with those obtained with feedforward neural network models?


### Gated Recurrent Unit


What do you conclude by comparing accuracy values you obtain with those obtained using simple RNN?


TODO: Answer