# CSCI-544 Homework Assignment No. 1
### Name : Ashwin Chafale
### USC ID : 1990624801

## Sentiment Analysis on Amazon reviews dataset

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Installation before running the notebook
! pip install bs4
! pip install contractions
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ashwin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ashwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/ashwin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ashwin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Read Data
1. [Amazon reviews dataset](https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Jewelry_v1_00.tsv.gz)
2. Our goal is to train sentiment analysis classifiers that can predict the rating value for a given review.

In [3]:
df = pd.read_csv("amazon_reviews_us_Jewelry_v1_00.tsv", sep='\t', header=0, on_bad_lines='skip')

## Keep Reviews and Ratings

In [4]:
df = df[['review_body','star_rating']]
df.head()

Unnamed: 0,review_body,star_rating
0,so beautiful even tho clearly not high end ......,5
1,"Great product.. I got this set for my mother, ...",5
2,Exactly as pictured and my daughter's friend l...,5
3,Love it. Fits great. Super comfortable and nea...,5
4,Got this as a Mother's Day gift for my Mom and...,5


Removing `Null` and missing values from the dataset

In [5]:
df = df.dropna()
df = df.reset_index(drop=True)
df.shape

(1766748, 2)

## We select 20000 reviews randomly from each rating class.

In [6]:
df['star_rating'] = df['star_rating'].astype(int)

sample_size  = 20000
# five_star =  df.loc[ df['star_rating'] == 5].sample(sample_size)
# four_star =  df.loc[ df['star_rating'] == 4].sample(sample_size)
# three_star =  df.loc[ df['star_rating'] == 3].sample(sample_size)
# two_star =  df.loc[ df['star_rating'] == 2].sample(sample_size)
# one_star =  df.loc[ df['star_rating'] == 1].sample(sample_size)
five_star = df.loc[df['star_rating'] == 5][:sample_size]
four_star = df.loc[df['star_rating'] == 4][:sample_size]
three_star = df.loc[df['star_rating'] == 3][:sample_size]
two_star = df.loc[df['star_rating'] == 2][:sample_size]
one_star = df.loc[df['star_rating'] == 1][:sample_size]

data = pd.concat([five_star, four_star, three_star, two_star, one_star], axis=0)

In [7]:
print("Average Length of reviews before Data Cleaning step = ", data['review_body'].str.len().mean())

Average Length of reviews before Data Cleaning step =  130.81458


# Data Cleaning

### 1. Converting all reviews to lower case

In [8]:
# convert all reviews to lower case
data["pre_processed_reviews"] = data['review_body'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))

### 2. Removing the HTML and URLs from the reviews

In [9]:
# remove HTML tags as well as URLs from reviews.
data["pre_processed_reviews"] = data["pre_processed_reviews"].apply(lambda x: BeautifulSoup(x).get_text())
data["pre_processed_reviews"] = data["pre_processed_reviews"].apply(lambda x: re.sub(r"http\S+", "", x))

### 3. Perform "Contractions" on reviews

In [10]:
# contractions
import contractions
data["pre_processed_reviews"] = data["pre_processed_reviews"].apply(lambda x:contractions.fix(x))

### 4. Remove the non-alpha characters

In [11]:
# remove the non-alpha characters
data["pre_processed_reviews"] = data["pre_processed_reviews"].apply(lambda x: " ".join([re.sub("[^A-Za-z]+","", x) for x in nltk.word_tokenize(x)]))

### 5. Remove extra spaces among the words

In [12]:
# remove extra spaces among the words
data['pre_processed_reviews'] = data['pre_processed_reviews'].apply(lambda x: re.sub(' +', ' ', x))

In [13]:
print("Average Length of reviews after Data Cleaning step = ", data['pre_processed_reviews'].str.len().mean())

Average Length of reviews after Data Cleaning step =  126.72483


# Pre-processing

In [14]:
print("Average Length of reviews before Data Pre-processing step = ", data['pre_processed_reviews'].str.len().mean())

Average Length of reviews before Data Pre-processing step =  126.72483


### 1. Remove stop words

Note: Just for the purpose of pre-processing I have shown the stop-words removal.
However, the stop-word removed pre-processed data is not used to train the model.

Reason for not performing stop-word removing step:
*I have noticed that after stop-words are not removed it leads to increase in average precision of all the ML models by 10%.*

In [15]:
data_copy = data.copy(deep=True)

In [16]:
# remove stop words using a NLTK package
from nltk.corpus import stopwords
sw_nltk = stopwords.words('english')
sw_nltk.remove("not")
sw_nltk.remove("don")
sw_nltk.remove("don't")
sw_nltk.remove("aren't")
sw_nltk.remove("couldn't")
sw_nltk.remove("couldn")
sw_nltk.remove("didn")
sw_nltk.remove("didn't")
sw_nltk.remove("doesn")
sw_nltk.remove("doesn't")
sw_nltk.remove("won")
sw_nltk.remove("won't")
data_copy['pre_processed_reviews'] = data_copy['pre_processed_reviews'].apply(lambda x: " ".join([x for x in x.split() if x not in sw_nltk]))

In [17]:
print("Average Length of reviews after Data Pre-processing step = ", data_copy['pre_processed_reviews'].str.len().mean())

Average Length of reviews after Data Pre-processing step =  78.67438


### 2. Perform Lemmatization

In [18]:
# lemmatization using wordnet lemmatizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
data['pre_processed_reviews'] = data['pre_processed_reviews'].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(x)]))

# TF-IDF Feature Extraction

In [19]:
# Train - test split
from sklearn.model_selection import train_test_split

five_star_X_train, five_star_X_test, five_star_Y_train, five_star_Y_test = \
train_test_split(data[data["star_rating"] == 5]["pre_processed_reviews"],
                 data[data["star_rating"] == 5]["star_rating"], test_size=0.2, random_state=30)

four_star_X_train, four_star_X_test, four_star_Y_train, four_star_Y_test = \
train_test_split(data[data["star_rating"] == 4]["pre_processed_reviews"],
                 data[data["star_rating"] == 4]["star_rating"], test_size=0.2, random_state=30)

three_star_X_train, three_star_X_test, three_star_Y_train, three_star_Y_test = \
train_test_split(data[data["star_rating"] == 3]["pre_processed_reviews"],
                 data[data["star_rating"] == 3]["star_rating"], test_size=0.2, random_state=30)

two_star_X_train, two_star_X_test, two_star_Y_train, two_star_Y_test = \
train_test_split(data[data["star_rating"] == 2]["pre_processed_reviews"],
                 data[data["star_rating"] == 2]["star_rating"], test_size=0.2, random_state=30)

one_star_X_train, one_star_X_test, one_star_Y_train, one_star_Y_test = \
train_test_split(data[data["star_rating"] == 1]["pre_processed_reviews"],
                 data[data["star_rating"] == 1]["star_rating"], test_size=0.2, random_state=30)

X_train = pd.concat([five_star_X_train, four_star_X_train, three_star_X_train, two_star_X_train, one_star_X_train])
X_test = pd.concat([five_star_X_test, four_star_X_test, three_star_X_test, two_star_X_test, one_star_X_test])
Y_train = pd.concat([five_star_Y_train, four_star_Y_train, three_star_Y_train, two_star_Y_train, one_star_Y_train])
Y_test = pd.concat([five_star_Y_test, four_star_Y_test, three_star_Y_test, two_star_Y_test, one_star_Y_test])

print("Train: ", X_train.shape, Y_train.shape, "Test: ", (X_test.shape, Y_test.shape))

Train:  (80000,) (80000,) Test:  ((20000,), (20000,))


In [20]:
# TF-IDF step
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vector =  TfidfVectorizer()
tf_x_train = tf_idf_vector.fit_transform(X_train)
tf_x_test = tf_idf_vector.transform(X_test)

# Perceptron

In [47]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report
perceptron = Perceptron(max_iter=1000, random_state=0)
perceptron.fit(tf_x_train,Y_train)
y_test_predicted = perceptron.predict(tf_x_test)

In [48]:
report = classification_report(Y_test, y_test_predicted, output_dict=True)
output = pd.DataFrame.from_dict(report)
output.to_csv("perceptron.csv")

# SVM

In [49]:
from sklearn.svm import LinearSVC
svm = LinearSVC(multi_class="ovr", random_state=0)
svm.fit(tf_x_train,Y_train)
y_test_predicted = svm.predict(tf_x_test)

In [50]:
report = classification_report(Y_test, y_test_predicted, output_dict=True)
output = pd.DataFrame.from_dict(report)
output.to_csv("svm.csv")

# Logistic Regression

#### 1. Simple Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=1000, solver='saga')
lr.fit(tf_x_train,Y_train)
y_test_predicted = lr.predict(tf_x_test)

Unnamed: 0,precision,recall,f1-score
1,0.601127,0.6665,0.632128
2,0.42815,0.39925,0.413195
3,0.442865,0.42825,0.435435
4,0.495541,0.4445,0.468635
5,0.665905,0.729,0.696026
weighted avg,0.526718,0.5335,0.529084


In [31]:
report = classification_report(Y_test, y_test_predicted, output_dict=True)
pd.DataFrame.from_dict(report)

Unnamed: 0,1,2,3,4,5,accuracy,macro avg,weighted avg
precision,0.592847,0.384177,0.411308,0.45814,0.657394,0.5019,0.500773,0.500773
recall,0.605,0.38725,0.42375,0.42,0.6735,0.5019,0.5019,0.5019
f1-score,0.598862,0.385707,0.417436,0.438242,0.665349,0.5019,0.501119,0.501119
support,4000.0,4000.0,4000.0,4000.0,4000.0,0.5019,20000.0,20000.0


### 2. Hyper-parameter tuning for Logistic Regression

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

# define models and parameters
model = LogisticRegression(max_iter=10000)

solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(solver=solvers, penalty=penalty, C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
grid_result = grid_search.fit(tf_x_train, Y_train)

# summarize results
print("Best Tuning parameters : " , grid_result.best_params_)

In [25]:
grid = dict(solver=["lbfgs"], penalty=["l2"], C=[1.0])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
grid_result = grid_search.fit(tf_x_train, Y_train)
y_test_pred = grid_search.predict(tf_x_test)

report = classification_report(Y_test, y_test_pred, output_dict=True)
pd.DataFrame.from_dict(report)[["1", "2", "3", "4", "5", "weighted avg"]][:3].transpose()

# Naive Bayes

### 1. Multinomial Naive Bayes

In [26]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(tf_x_train,Y_train)
y_test_predicted = nb.predict(tf_x_test)

report = classification_report(Y_test, y_test_predicted, output_dict=True)
pd.DataFrame.from_dict(report)[["1", "2", "3", "4", "5", "weighted avg"]][:3].transpose()

Unnamed: 0,precision,recall,f1-score
1,0.592847,0.605,0.598862
2,0.384177,0.38725,0.385707
3,0.411308,0.42375,0.417436
4,0.45814,0.42,0.438242
5,0.657394,0.6735,0.665349
weighted avg,0.500773,0.5019,0.501119


### 2. Hyper-parameter tuning for MultinomialNB

In [27]:
# Hyper-parameter tuning for MultinomialNB
cv_method = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=999)
grid_params = {
    'alpha': np.linspace(0.5, 1.5, 6),
    'fit_prior': [True, False]
}

mul_nom_NB = GridSearchCV(estimator=MultinomialNB(),
                          param_grid=grid_params,
                          cv=cv_method,
                          verbose=1,
                          scoring='accuracy')
mul_nom_NB.fit(tf_x_train, Y_train)
print("Best Tuning parameters : ", mul_nom_NB.best_params_)

In [28]:
grid_params = { 'alpha': [1.5], 'fit_prior': [True] }

mul_nom_NB = GridSearchCV(estimator=MultinomialNB(),
                          param_grid=grid_params,
                          cv=cv_method,
                          verbose=1,
                          scoring='accuracy')
mul_nom_NB.fit(tf_x_train, Y_train)
y_test_predicted = mul_nom_NB.predict(tf_x_test)

report = classification_report(Y_test, y_test_predicted, output_dict=True)
pd.DataFrame.from_dict(report)[["1", "2", "3", "4", "5", "weighted avg"]][:3].transpose()