In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import contractions
import seaborn as sns
#nltk.download('wordnet')

#@author: alexwu.tech@gmail.com

In [2]:
#dependencies (may have more)
#! pip install contractions
# Dataset: https://web.archive.org/web/20201127142707if_/https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Office_Products_v1_00.tsv.gz


## Read Data

In [3]:
data = pd.read_csv('amazon_reviews_us_Office_Products_v1_00.tsv',sep='\t',on_bad_lines='skip',dtype=str)  
data.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,43081963,R18RVCKGH1SSI9,B001BM2MAC,307809868,"Scotch Cushion Wrap 7961, 12 Inches x 100 Feet",Office Products,5,0,0,N,Y,Five Stars,Great product.,2015-08-31
1,US,10951564,R3L4L6LW1PUOFY,B00DZYEXPQ,75004341,"Dust-Off Compressed Gas Duster, Pack of 4",Office Products,5,0,1,N,Y,"Phffffffft, Phfffffft. Lots of air, and it's C...",What's to say about this commodity item except...,2015-08-31
2,US,21143145,R2J8AWXWTDX2TF,B00RTMUHDW,529689027,Amram Tagger Standard Tag Attaching Tagging Gu...,Office Products,5,0,0,N,Y,but I am sure I will like it.,"Haven't used yet, but I am sure I will like it.",2015-08-31
3,US,52782374,R1PR37BR7G3M6A,B00D7H8XB6,868449945,AmazonBasics 12-Sheet High-Security Micro-Cut ...,Office Products,1,2,3,N,Y,and the shredder was dirty and the bin was par...,Although this was labeled as &#34;new&#34; the...,2015-08-31
4,US,24045652,R3BDDDZMZBZDPU,B001XCWP34,33521401,"Derwent Colored Pencils, Inktense Ink Pencils,...",Office Products,4,0,0,N,Y,Four Stars,Gorgeous colors and easy to use,2015-08-31


## Keep Reviews and Ratings

In [4]:
data = data[['star_rating','review_body']].copy()


 ## We form two classes and select 50000 reviews randomly from each class.



In [5]:
#form two classes, ratings 1-3 form class 1, and ratings 4 and 5 form class 2 as a new column 'label'


data.dropna(how='any',inplace=True) #drop rows with missing values
data['label'] = np.where(pd.to_numeric(data['star_rating']) < 4, 1, 2)
#data['label'] = data['star_rating'].apply(lambda x: 1 if pd.to_numeric(x) < 4 else 2)

#via. timeit
#using apply.(lambda func.):
# 4.43 s ± 17.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) 
#using np.where():
# 533 ms ± 2.48 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) wayyy faster

#randomizing to select 50000 samples for each label 
sampled_data = data.groupby('label').apply(lambda x: x.sample(n=50000,random_state=1)).reset_index(drop=True)


#obtain average length of reviews (before cleaned and pre-proccessed data)
sampled_data['review_body'].str.len().mean()


316.30319

# Data Cleaning



In [6]:
#fix contractions first
sampled_data['review_body'] = sampled_data['review_body'].apply(contractions.fix)

#remove punctuations, numbers, and special characters
sampled_data['review_body'] = sampled_data['review_body'].str.lower().str.replace(r'(http[s]?://\S+)|(www\.\S+)|(<[^>]+>)|[^a-zA-Z]+',' ',regex =True).str.replace(r'\s+',' ',regex = True) #remove url and html

sampled_data['review_body'].str.len().mean() 



304.1103

# Pre-processing

## remove the stop words 

In [7]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english')) 
sampled_data['review_body'] = sampled_data['review_body'].apply(lambda x: ' '.join([word for word in nltk.word_tokenize(x) if word not in stop_words]))

sampled_data['review_body'].str.len().mean() 

190.13981

## perform lemmatization  

In [8]:
#nltk.download('omw-1.4') Open Multilingual Wordnet dependency (extension of nltk.corpus.wordnet)

In [9]:
from nltk.stem import WordNetLemmatizer
l = WordNetLemmatizer()
sampled_data['review_body'] = sampled_data['review_body'].apply(lambda x: ' '.join([l.lemmatize(word) for word in nltk.word_tokenize(x)]))

sampled_data['review_body'].str.len().mean() 

186.94236

# TF-IDF and BoW Feature Extraction

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

Y = sampled_data['label']
#BoW --------------------
vectorizer = CountVectorizer()
X_BoW = vectorizer.fit_transform(sampled_data['review_body'])
#X.shape = (100000, 40261) due to more sophisticated vectorization processes, single-character words may have been ommitted 

#manual method done out of curiosity (less-optimal)
'''
vocab = set(word for review in sampled_data['review_body'] for word in review.split()) #build a vocab 
vocab = {word: i for i, word in enumerate(vocab)} #assign index to each word in vocab 
X_BoW = np.zeros((len(sampled_data['review_body']),len(vocab))) #initialize matrix, #rows as number reviews, #columns as each word in vocab
for i, reviews in enumerate(sampled_data['review_body']):
    review = reviews.split()
    for word in review:
        if word in vocab:
            X_BoW[i,vocab[word]] += 1
#X_BoW.shape = (100000, 40285)
'''
X_train_BoW,X_test_BoW,Y_train,Y_test = train_test_split(X_BoW,Y,train_size=0.8,random_state=42)
#-------------------------

#TF-IDF ------------------
X_TFIDF = TfidfTransformer().fit_transform(X_BoW) #turn previous vectorized BoW count matrix to TFIDF matrix
X_train_TFIDF,X_test_TFIDF,Y_train,Y_test = train_test_split(X_TFIDF,Y,train_size=0.8,random_state=42) 
#-------------------------


In [11]:
#imports for models below
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn import svm, naive_bayes

# Perceptron Using Both Features

In [12]:
perceptron_BoW = Perceptron()
perceptron_TFIDF = Perceptron()

perceptron_BoW.fit(X_train_BoW,Y_train)
perceptron_BoW_pred = perceptron_BoW.predict(X_test_BoW)

perceptron_TFIDF.fit(X_train_TFIDF,Y_train)
percentron_TFIDF_pred = perceptron_TFIDF.predict(X_test_TFIDF)

#accuracy 
print('BoW Accuracy: ',accuracy_score(Y_test,perceptron_BoW_pred))
print('TFIDF Accuracy: ',accuracy_score(Y_test,percentron_TFIDF_pred))
print('-----------------------')

#precision
print('BoW Precision: ',precision_score(Y_test,perceptron_BoW_pred))
print('TFIDF Precision: ',precision_score(Y_test,percentron_TFIDF_pred))
print('-----------------------')

#recall
print('BoW Recall: ',recall_score(Y_test,perceptron_BoW_pred))
print('TFIDF Recall: ',recall_score(Y_test,percentron_TFIDF_pred))
print('-----------------------')

#f1
print('BoW F1: ',f1_score(Y_test,perceptron_BoW_pred))
print('TFIDF F1: ',f1_score(Y_test,percentron_TFIDF_pred))
print('-----------------------')

BoW Accuracy:  0.78695
TFIDF Accuracy:  0.78505
-----------------------
BoW Precision:  0.7912043574742788
TFIDF Precision:  0.8183129855715872
-----------------------
BoW Recall:  0.7816641753861485
TFIDF Recall:  0.7347284504235176
-----------------------
BoW F1:  0.7864053336006817
TFIDF F1:  0.7742714623260698
-----------------------


# SVM Using Both Features

In [13]:
#note, optimized for linear decision bounds. Will follow up by testing on further unseen data.
#if data seems to poorly generalize, will try to use a non-linear kernel follwing gridsearchCV
svm_BoW = svm.LinearSVC(max_iter=100000)
svm_TFIDF = svm.LinearSVC(max_iter=100000)

svm_BoW.fit(X_train_BoW,Y_train)
svm_BoW_pred = svm_BoW.predict(X_test_BoW)

svm_TFIDF.fit(X_train_TFIDF,Y_train)
svm_TFIDF_pred = svm_TFIDF.predict(X_test_TFIDF)

#accuracy
print('BoW Accuracy: ',accuracy_score(Y_test,svm_BoW_pred))
print('TFIDF Accuracy: ',accuracy_score(Y_test,svm_TFIDF_pred))
print('-----------------------')

#precision
print('BoW Precision: ',precision_score(Y_test,svm_BoW_pred))
print('TFIDF Precision: ',precision_score(Y_test,svm_TFIDF_pred))
print('-----------------------')

#recall
print('BoW Recall: ',recall_score(Y_test,svm_BoW_pred))
print('TFIDF Recall: ',recall_score(Y_test,svm_TFIDF_pred))
print('-----------------------')

#f1
print('BoW F1: ',f1_score(Y_test,svm_BoW_pred))
print('TFIDF F1: ',f1_score(Y_test,svm_TFIDF_pred))
print('-----------------------')

BoW Accuracy:  0.82405
TFIDF Accuracy:  0.83915
-----------------------
BoW Precision:  0.8386694386694387
TFIDF Precision:  0.837190900098912
-----------------------
BoW Recall:  0.8039860488290982
TFIDF Recall:  0.8434479322371699
-----------------------
BoW F1:  0.8209615873823455
TFIDF F1:  0.8403077686770909
-----------------------


# Logistic Regression Using Both Features

In [14]:
logistic_BoW = LogisticRegression(max_iter=100000)
logistic_TFIDF = LogisticRegression(max_iter=100000)

logistic_BoW.fit(X_train_BoW,Y_train)
logistic_BoW_pred = logistic_BoW.predict(X_test_BoW)

logistic_TFIDF.fit(X_train_TFIDF,Y_train)
logistic_TFIDF_pred = logistic_TFIDF.predict(X_test_TFIDF)

#accuracy
print('BoW Accuracy: ',accuracy_score(Y_test,logistic_BoW_pred))
print('TFIDF Accuracy: ',accuracy_score(Y_test,logistic_TFIDF_pred))

#precision
print('BoW Precision: ',precision_score(Y_test,logistic_BoW_pred))
print('TFIDF Precision: ',precision_score(Y_test,logistic_TFIDF_pred))

#recall
print('BoW Recall: ',recall_score(Y_test,logistic_BoW_pred))
print('TFIDF Recall: ',recall_score(Y_test,logistic_TFIDF_pred))

#f1
print('BoW F1: ',f1_score(Y_test,logistic_BoW_pred))
print('TFIDF F1: ',f1_score(Y_test,logistic_TFIDF_pred))

BoW Accuracy:  0.8385
TFIDF Accuracy:  0.8488
BoW Precision:  0.8520434557682359
TFIDF Precision:  0.8441826215022091
BoW Recall:  0.820627802690583
TFIDF Recall:  0.8568011958146488
BoW F1:  0.8360406091370558
TFIDF F1:  0.8504451038575668


# Naive Bayes Using Both Features

In [15]:
naive_bayes_BoW = naive_bayes.MultinomialNB()
naive_bayes_TFIDF = naive_bayes.MultinomialNB()

naive_bayes_BoW.fit(X_train_BoW,Y_train)
naive_bayes_BoW_pred = naive_bayes_BoW.predict(X_test_BoW)

naive_bayes_TFIDF.fit(X_train_TFIDF,Y_train)
naive_bayes_TFIDF_pred = naive_bayes_TFIDF.predict(X_test_TFIDF)

#accuracy
print('BoW Accuracy: ',accuracy_score(Y_test,naive_bayes_BoW_pred))
print('TFIDF Accuracy: ',accuracy_score(Y_test,naive_bayes_TFIDF_pred))

#precision
print('BoW Precision: ',precision_score(Y_test,naive_bayes_BoW_pred))
print('TFIDF Precision: ',precision_score(Y_test,naive_bayes_TFIDF_pred))

#recall
print('BoW Recall: ',recall_score(Y_test,naive_bayes_BoW_pred))
print('TFIDF Recall: ',recall_score(Y_test,naive_bayes_TFIDF_pred))

#f1
print('BoW F1: ',f1_score(Y_test,naive_bayes_BoW_pred))
print('TFIDF F1: ',f1_score(Y_test,naive_bayes_TFIDF_pred))



BoW Accuracy:  0.80365
TFIDF Accuracy:  0.81915
BoW Precision:  0.8436093609360936
TFIDF Precision:  0.8131953933242241
BoW Recall:  0.7471848530144495
TFIDF Recall:  0.830293971101146
BoW F1:  0.7924747661575859
TFIDF F1:  0.8216557368966027
