In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from bs4 import BeautifulSoup
import warnings 
warnings.filterwarnings("ignore")
from gensim.test.utils import datapath
from gensim import utils
import gensim.models
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer



## 1) Dataset Generation (5 points)
We will use the Amazon reviews dataset used in HW1. Load the dataset
and build a balanced dataset of 250K reviews along with their ratings (50K
instances per each rating score) through random selection. Create ternary
labels using the ratings. We assume that ratings more than 3 denote positive
1
sentiment (class 1) and rating less than 3 denote negative sentiment (class
2). Reviews with rating 3 are considered to have neutral sentiment (class 3).
You can store your dataset after generation and reuse it to reduce the computational load. For your experiments consider a 80%/20% training/testing
split.

In [2]:
df = pd.read_csv("data/amazon_reviews_us_Office_Products_v1_00.tsv", sep='\t', on_bad_lines='skip')#, usecols=['review_body','star_rating']) #lineterminator='\r'
df.drop(df.columns[0], axis=1, inplace=True)
df = df[['review_body', 'star_rating']]
df.columns

Index(['review_body', 'star_rating'], dtype='object')

### Preprocess/Cleaning take 15 minutes

In [3]:
contraction_mapping = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}


pattern_contractions = re.compile('(%s)' % '|'.join(contraction_mapping.keys()))
lemmatizer = WordNetLemmatizer()
nltk.download('stopwords', 'punkt')
stop_words = set(stopwords.words('english'))


def expand_contractions(text, contraction_map=contraction_mapping):
    return pattern_contractions.sub(lambda occurrence: contraction_map[occurrence.group(0)], text)


def rem_stopwords(review,stp):
    words = review.split()
    filtered_words = [word for word in words if word not in stp]
    filtered_sentence = ' '.join(filtered_words)
    return filtered_sentence


def lemmazation(review):
    words = review.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    lemmatized_review = ' '.join(lemmatized_words)
    return lemmatized_review


def clean_preproc_reviews(reviews, stp):
    ### CLEANING
    reviews = reviews.str.lower()
    reviews = reviews.apply(lambda x: BeautifulSoup(x, "html.parser").get_text())
    reviews = reviews.replace(r'http\S+', '', regex=True)
    reviews = reviews.replace("[^a-zA-Z]", " ", regex=True)
    reviews = reviews.replace('\s+', ' ', regex=True).str.strip()
    reviews = reviews.apply(lambda x: expand_contractions(x))

    ### PREPROCESSING
    reviews = reviews.apply(lambda x : rem_stopwords(x, stp))
    reviews = reviews.apply(lemmazation)

    return reviews

# Clean the reviews
df.dropna(subset=['review_body'], inplace=True)
df['review_body'] = clean_preproc_reviews(df['review_body'].astype(str), stop_words)
df.dropna(subset=['review_body'], inplace=True)


[nltk_data] Downloading package stopwords to punkt...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df['label'] = df['star_rating'].apply(lambda x: 1 if x in [4, 5] else (0 if x in [1, 2] else 3))


star_ratings = [5, 4, 3, 2, 1]
samples = [ df[df['star_rating'] == rating].sample(n=50000, random_state=42) for rating in star_ratings]
merged_dataset = pd.concat(samples)


## 2) Word Embedding (30 points)
In this part the of the assignment, you will learn how to generate two sets
of Word2Vec features for the dataset you generated. You can use Gensim
library for this purpose. A helpful tutorial is available in the following link:
https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.
html


In [5]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

### (a) (10 points)
Load the pretrained “word2vec-google-news-300” Word2Vec model and learn
how to extract word embeddings for your dataset. Try to check semantic
similarities of the generated vectors using two examples of your own, e.g.,
King − M an + W oman = Queen or excellent ∼ outstanding.


In [26]:
# # wv['buger'] - wv['fries'] + wv['fish'] ?= wv['chips']
# test_relationship = wv['burger'] - wv['fries'] + wv['fish']
# print(test_relationship, wv['chips']) 


In [7]:
import gensim.downloader as api

# Load the pretrained 'word2vec-google-news-300' model
model = api.load('word2vec-google-news-300')

# Example 1: Solving an analogy: "King - Man + Woman = ?"
# Note: You may need to handle cases where words are not in the vocabulary
result = my_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print("King - Man + Woman =", result[0][0])

# Example 2: Checking similarity between 'excellent' and 'outstanding'
similarity = model.similarity('excellent', 'outstanding')
print("Similarity between 'excellent' and 'outstanding':", similarity)


King - Man + Woman = queen
Similarity between 'excellent' and 'outstanding': 0.5567486


In [8]:
import numpy as np
from gensim.downloader import load

# Load the model
model = load('word2vec-google-news-300')

# Function to find the most similar word
def most_similar_vector(vector):
    return model.similar_by_vector(vector, topn=1)[0][0]

# Vector arithmetic: "King - Man + Woman"
result_vector = model['king'] - model['man'] + model['woman']
analogy_result = most_similar_vector(result_vector)
print(f"King - Man + Woman = {analogy_result}")

# Function to compute cosine similarity
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Compute similarity between 'excellent' and 'outstanding'
similarity_score = cosine_similarity(model['excellent'], model['outstanding'])
print(f"Similarity between 'excellent' and 'outstanding': {similarity_score}")


King - Man + Woman = king
Similarity between 'excellent' and 'outstanding': 0.556748628616333


### (b) (20 points)
Train a Word2Vec model using your own dataset. Set the embedding size
to be 300 and the window size to be 11. You can also consider a minimum
word count of 10. Check the semantic similarities for the same two examples
in part (a). What do you conclude from comparing vectors generated by
yourself and the pretrained model? Which of the Word2Vec models seems
to encode semantic similarities between words better?

In [32]:
merged_dataset.columns

Index(['review_body', 'star_rating', 'label'], dtype='object')

In [39]:
class MyCorpus:
    def __init__(self, df, col):
        self.df = df
        self.col = col
        self.count = 0

    def __iter__(self):
        for line in self.df[self.col]:
            self.count += 1 
            yield utils.simple_preprocess(line)
    def print_count(self):
        print(self.count)


sentences = MyCorpus(merged_dataset, 'review_body')
my_model = gensim.models.Word2Vec(sentences=sentences, vector_size=300, window=11, min_count=10, workers=4)
sentences.print_count()


1500000


In [40]:
merged_dataset

Unnamed: 0,review_body,star_rating,label
2031422,child teacher wish list bought box shipped dir...,5,1
1390030,definitely going start ordering often much che...,5,1
1285394,work great,5,1
1452754,product tear envelope packaging ease serrated ...,5,1
1129854,good product fast shipping,5,1
...,...,...,...
1718122,utter junk buy gave star b c allow star glad p...,1,0
1498869,scanner completely worthless page scan straigh...,1,0
117396,garage poorly made fell apart week later mostl...,1,0
632610,bought returned week definitely good projector...,1,0


In [34]:
### NEW Dataframe

word_vectors = my_model.wv

# Using the most_similar method
result = word_vectors.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print("King - Man + Woman =", result[0][0])

# Calculate similarity between words
similarity = word_vectors.similarity('excellent', 'outstanding')
print("Similarity between 'excellent' and 'outstanding':", similarity)

King - Man + Woman = republican
Similarity between 'excellent' and 'outstanding': 0.79697263


In [36]:
len(word_vectors)

14435

In [29]:
#### MODEL TRAINED ON UNREFINED DATAFRAME
word_vectors = my_model.wv

# Using the most_similar method
result = word_vectors.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print("King - Man + Woman =", result[0][0])

# Calculate similarity between words
similarity = word_vectors.similarity('excellent', 'outstanding')
print("Similarity between 'excellent' and 'outstanding':", similarity)

King - Man + Woman = queen
Similarity between 'excellent' and 'outstanding': 0.84469146


In [None]:
Y = merged_dataset['label']
X = merged_dataset['review_body']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


## 3) Simple models (20 points)
Using the Word2Vec features that you can generate using the two models
you prepared in the Word Embedding section, train a perceptron and an
SVM model similar to HW1 for class 1 and class 2 (binary models). For this
purpose, you can just use the average Word2Vec vectors for each review as
the input feature (x = 1N PNi=1 Wi for a review with N words). To improve 2
your performance, use the data cleaning and preprocessing steps of HW1
to include only important words from each review when you compute the
average x = 1 N PN i=1 Wi.
Report your accuracy values on the testing split for
these models for each feature type along with values you reported in your
HW1 submission, i.e., for each of perceptron and SVM, you need to report
three accuracy values for “word2vec-google-news-300”, your own Word2Vec,
and TF-IDF features.
What do you conclude from comparing performances for the models
trained using the three different feature types (TF-IDF, pretrained Word2Vec,
your trained Word2Vec)?


In [None]:
#### PERCEPTRON 
##USE AVERAGE WORD VECTOR

from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def evaulate(y_label, y_predicted):
    accuracy = accuracy_score(y_label, y_predicted)
    precision = precision_score(y_label, y_predicted, average='binary')
    recall = recall_score(y_label, y_predicted, average='binary')
    f1 = f1_score(y_label, y_predicted, average='binary')
    return accuracy, precision, recall,f1

clf = Perceptron(tol=1e-5, random_state=0)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

tr_acc, tr_prec, tr_rec, tr_f1 = evaulate(y_train, y_pred_train)
te_acc, te_prec, te_rec, te_f1 = evaulate(y_test, y_pred_test)

print("Training: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-Score: {:.4f}".format(tr_acc, tr_prec, tr_rec, tr_f1))
print(" Testing: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-Score: {:.4f}".format(te_acc, te_prec, te_rec, te_f1))

In [None]:
#### SVM 

## 4) Feedforward Neural Networks (25 points)
Using the features that you can generate using the models you prepared in
the Word “Embedding section”, train a feedforward multilayer perceptron
network for sentiment analysis classification. Consider a network with two
hidden layers, each with 50 and 10 nodes, respectively. You can use cross
entropy loss and your own choice for other hyperparamters, e.g., nonlinearity,
number of epochs, etc. Part of getting good results is to select good values
for these hyperparamters.
You can also refer to the following tutorial to familiarize yourself:
https://www.kaggle.com/mishra1993/pytorch-multi-layer-perceptron-mnist
Although the above tutorial is for image data but the concept of training
an MLP is very similar to what we want to do.

### (a) (10 points)
To generate the input features, use the average Word2Vec vectors similar to
the “Simple models” section and train the neural network. Train a network
for binary classification using class 1 and class 2 and also a ternary model for
the three classes. Report accuracy values on the testing split for your MLP
model for each of the binary and ternary classification cases.


(b) (15 points)
To generate the input features, concatenate the first 10 Word2Vec vectors
for each review as the input feature (x = [WT
1, ..., WT [10]) and train the neural 3
network. Report the accuracy value on the testing split for your MLP model
for each of the binary and ternary classification cases.
What do you conclude by comparing accuracy values you obtain with
those obtained in the “’Simple Models” section (note you can compare the
accuracy values for binary classification).

## 5) Convolutional Neural Networks (20 points)
Using the vectors you prepared in the “Word Embedding” section, train a
convolutional neural network (CNN) for sentiment analysis classification.
Train a simple CNN for sentiment analysis. You can consider an two-layer
CNN with the output channel sizes of 50 and 10. To feed your data into the
CNN, limit the maximum review length to 50 by truncating longer reviews
and padding shorter reviews with a null value (0). You can use cross entropy
loss and your own choice for other hyperparamters, e.g., nonlinearity, number
of epochs, etc. Train the CNN network for binary classification using class 1
and class 2 and also a ternary model for the three classes. Report accuracy
values on the testing split for your CNN model.