In [23]:
import pandas as pd
import numpy as np
import nltk
import contractions
import re
from bs4 import BeautifulSoup

In [24]:
#! pip install bs4 
# in case you don't have it installed
nltk.download('wordnet')
# Dataset: https://web.archive.org/web/20201127142707if_/https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Office_Products_v1_00.tsv.gz

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Read Data
Read the data through pandas packages, using "usecols" attribute to filter the data we need

In [25]:
df = pd.read_table('./data/amazon_reviews_us_Office_Products_v1_00.tsv', on_bad_lines='skip', usecols=['star_rating', 'review_body'], dtype={'star_rating':'str', 'review_body':'str'})
df.dropna(inplace=True)

## Add Class Label
Use the star_rating to get our class label

In [26]:
#df['class_label'] = df['star_rating'].astype(int) > 3
class_label = []
for index, rate in df.iterrows():
    if int(rate['star_rating']) > 3:
        class_label.append(2)
    else:
        class_label.append(1)
df['class_label'] = class_label
df.head(10)

Unnamed: 0,star_rating,review_body,class_label
0,5,Great product.,2
1,5,What's to say about this commodity item except...,2
2,5,"Haven't used yet, but I am sure I will like it.",2
3,1,Although this was labeled as &#34;new&#34; the...,1
4,4,Gorgeous colors and easy to use,2
5,5,Perfect for planning weekly meals. Removrd the...,2
6,5,Gold plated fusers are the best! It will never...,2
7,5,I have used these highlighters for my bible fo...,2
8,5,Heavy pen that writes very well. I've had it ...,2
9,5,Not sure if they work but sent quickly and fit...,2


 ## We form two classes and select 50000 reviews randomly from each class.
For each class, use "groupby" to selection random 50000 rows.


In [27]:
df2 = df.groupby("class_label").sample(n = 50000)
df2.shape

(100000, 3)

# Data Cleaning



# Pre-processing
Define a cleaning function by regex method and import labrary package to clean the review_body text

In [28]:
def cleaning(s):
    # Contractions
    s = contractions.fix(s)
    # Convert to lower case
    s = s.lower()
    # Remove URLs
    s = re.sub(r'(http|www)\S+', '', s)
    # Remove HTML
    s = BeautifulSoup(s, 'html.parser').get_text()
    # Remove non-alphabetical chracters
    s = re.sub(r'[^a-zA-Z\s]+', '', s)
    # Remove extra space
    s = re.sub(r'\s+', ' ', s)
    
    return s

df2['cleaned'] = df2['review_body'].apply(lambda context: cleaning(context))
df2['before_clean_count'] = df2['review_body'].apply(lambda x : len(x))
df2['after_clean_count'] = df2['cleaned'].apply(lambda x : len(x))

print('Before cleaning avg : ', df2['before_clean_count'].mean(), ', After cleaning avg : ', df2['after_clean_count'].mean())

  s = BeautifulSoup(s, 'html.parser').get_text()


Before cleaning avg :  312.83406 , After cleaning avg :  297.40285


## remove the stop words 
Use nltk to remove the stopwords

In [29]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(s):
    words = s.split(' ')
    res = [w for w in words if not w in stop_words]
    
    return ' '.join(res)

df2['preprocessing'] = df2['cleaned'].apply(lambda x: remove_stopwords(x))
df2.head()

Unnamed: 0,star_rating,review_body,class_label,cleaned,before_clean_count,after_clean_count,preprocessing
1404872,1,The quality of these is horrible. They fell of...,1,the quality of these is horrible they fell off...,172,168,quality horrible fell boxes within hours going...
27285,3,work as described,1,work as described,17,17,work described
1493466,2,VERY SMALL FOR THE PRICE!!!! I DIDN'T HAVE TIM...,1,very small for the price i did not have time t...,101,97,small price time send back gift cancer patient
1710841,1,I brought this product and barely used.. i mig...,1,i brought this product and barely used i might...,306,301,brought product barely used might used times e...
846563,1,CHEAP - THIN - either plan on doubling the pac...,1,cheap thin either plan on doubling the package...,134,125,cheap thin either plan doubling package ship f...


## perform lemmatization  
Use nltk to perform lemmatization

In [30]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(s):
    words = s.split(' ')
    res = [wordnet_lemmatizer.lemmatize(w) for w in words]
    return ' '.join(res)

df2['preprocessing'] = df2['preprocessing'].apply(lambda x: lemmatize(x))
df2['after_pre_count'] = df2['preprocessing'].apply(lambda x : len(x))

print('Before preprocessing avg : ', df2['after_clean_count'].mean(), ', After preprocessing avg : ', df2['after_pre_count'].mean())

df2.head()

Before preprocessing avg :  297.40285 , After preprocessing avg :  184.62694


Unnamed: 0,star_rating,review_body,class_label,cleaned,before_clean_count,after_clean_count,preprocessing,after_pre_count
1404872,1,The quality of these is horrible. They fell of...,1,the quality of these is horrible they fell off...,172,168,quality horrible fell box within hour going wo...,77
27285,3,work as described,1,work as described,17,17,work described,14
1493466,2,VERY SMALL FOR THE PRICE!!!! I DIDN'T HAVE TIM...,1,very small for the price i did not have time t...,101,97,small price time send back gift cancer patient,46
1710841,1,I brought this product and barely used.. i mig...,1,i brought this product and barely used i might...,306,301,brought product barely used might used time ev...,171
846563,1,CHEAP - THIN - either plan on doubling the pac...,1,cheap thin either plan on doubling the package...,134,125,cheap thin either plan doubling package ship f...,78


# TF-IDF and BoW Feature Extraction
Since the dataset is too large, limited the BoW and TF-IDF features to certain size. In order the avoid the memory issue.

In [31]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
df3 = df2['preprocessing']
bow_vectorizer = CountVectorizer(max_features=1000)
bow_V = bow_vectorizer.fit_transform(df3).toarray()
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_V = tfidf_vectorizer.fit_transform(df3).toarray()

In [32]:
label_V = df2['class_label'].to_numpy()

## Split Dataset
Using sklearn to split the dataset, set the random_state to make sure I have the same dataset for better evaluation

In [33]:
from sklearn.model_selection import train_test_split
bow_train, bow_test, bow_label_train, bow_label_test = train_test_split(bow_V, label_V, test_size=0.2, random_state=0)
tfidf_train, tfidf_test, tfidf_label_train, tfidf_label_test = train_test_split(tfidf_V, label_V, test_size=0.2, random_state=0)

# Perceptron Using Both Features
Define a evaluation function for calculating the scores

In [34]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import confusion_matrix
def calculate_result(tn, fp, fn, tp):
    acc = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 / ((1 / precision) + (1 / recall))
    return acc, precision, recall, f1
    
p_bow = Perceptron(random_state=0)
p_bow.fit(bow_train, bow_label_train)
predictions_test = p_bow.predict(bow_test)
tn, fp, fn, tp = confusion_matrix(bow_label_test, predictions_test).ravel()
acc, precision, recall, f1 = calculate_result(tn, fp, fn, tp)
print("BoW Perceptron: Precision = " + str(precision) + ", Recall = " + str(recall) + ", F1 = " + str(f1))

BoW Perceptron: Precision = 0.7080066524114992, Recall = 0.8946262383668568, F1 = 0.7904509283819628


In [35]:
p_tfidf = Perceptron(random_state=0)
p_tfidf.fit(tfidf_train, tfidf_label_train)
predictions_test = p_tfidf.predict(tfidf_test)
tn, fp, fn, tp = confusion_matrix(tfidf_label_test, predictions_test).ravel()
acc, precision, recall, f1 = calculate_result(tn, fp, fn, tp)
print("TF-IDF Perceptron: Precision = " + str(precision) + ", Recall = " + str(recall) + ", F1 = " + str(f1))

TF-IDF Perceptron: Precision = 0.8708240534521158, Recall = 0.6260382267587311, F1 = 0.7284159049892297


# SVM Using Both Features
Set the iteration times to avoid wasting too long for training

In [36]:
from sklearn.svm import LinearSVC
bow_svm = LinearSVC(max_iter=1000)
bow_svm.fit(bow_train, bow_label_train)
predictions_test = bow_svm.predict(bow_test)
tn, fp, fn, tp = confusion_matrix(bow_label_test, predictions_test).ravel()
acc, precision, recall, f1 = calculate_result(tn, fp, fn, tp)
print("BoW SVM: Precision = " + str(precision) + ", Recall = " + str(recall) + ", F1 = " + str(f1))

BoW SVM: Precision = 0.8138993893573714, Recall = 0.8402882017412189, F1 = 0.8268833087149189




In [37]:
tfidf_svm = LinearSVC(max_iter=1000)
tfidf_svm.fit(tfidf_train, tfidf_label_train)
predictions_test = tfidf_svm.predict(tfidf_test)
tn, fp, fn, tp = confusion_matrix(tfidf_label_test, predictions_test).ravel()
acc, precision, recall, f1 = calculate_result(tn, fp, fn, tp)
print("TF-IDF SVM: Precision = " + str(precision) + ", Recall = " + str(recall) + ", F1 = " + str(f1))

TF-IDF SVM: Precision = 0.8323863636363636, Recall = 0.8209746822775943, F1 = 0.8266411406116179


# Logistic Regression Using Both Features
Set the iteration times to avoid wasting too long for training

In [38]:
from sklearn.linear_model import LogisticRegression
bow_lr = LogisticRegression(max_iter=1000)
bow_lr.fit(bow_train, bow_label_train)
predictions_test = bow_lr.predict(bow_test)
tn, fp, fn, tp = confusion_matrix(bow_label_test, predictions_test).ravel()
acc, precision, recall, f1 = calculate_result(tn, fp, fn, tp)
print("BoW Logistic Regression: Precision = " + str(precision) + ", Recall = " + str(recall) + ", F1 = " + str(f1))

BoW Logistic Regression: Precision = 0.816364345621221, Recall = 0.8376863804663264, F1 = 0.8268879340149157


In [39]:
tfidf_lr = LogisticRegression(max_iter=1000)
tfidf_lr.fit(tfidf_train, tfidf_label_train)
predictions_test = tfidf_lr.predict(tfidf_test)
tn, fp, fn, tp = confusion_matrix(tfidf_label_test, predictions_test).ravel()
acc, precision, recall, f1 = calculate_result(tn, fp, fn, tp)
print("TF-IDF Logistic Regression: Precision = " + str(precision) + ", Recall = " + str(recall) + ", F1 = " + str(f1))

TF-IDF Logistic Regression: Precision = 0.834335814048323, Recall = 0.8189732812969078, F1 = 0.8265831734168266


# Naive Bayes Using Both Features

In [40]:
from sklearn.naive_bayes import GaussianNB
bow_nb = GaussianNB()
bow_nb.fit(bow_train, bow_label_train)
predictions_test = bow_nb.predict(bow_test)
tn, fp, fn, tp = confusion_matrix(bow_label_test, predictions_test).ravel()
acc, precision, recall, f1 = calculate_result(tn, fp, fn, tp)
print("BoW Naive Bayes: Precision = " + str(precision) + ", Recall = " + str(recall) + ", F1 = " + str(f1))

BoW Naive Bayes: Precision = 0.6378685980341439, Recall = 0.8637045932152507, F1 = 0.7338037748682197


In [41]:
tfidf_nb = GaussianNB()
tfidf_nb.fit(tfidf_train, tfidf_label_train)
predictions_test = tfidf_nb.predict(tfidf_test)
tn, fp, fn, tp = confusion_matrix(tfidf_label_test, predictions_test).ravel()
acc, precision, recall, f1 = calculate_result(tn, fp, fn, tp)
print("TF-IDF Naive Bayes: Precision = " + str(precision) + ", Recall = " + str(recall) + ", F1 = " + str(f1))

TF-IDF Naive Bayes: Precision = 0.7638942868247183, Recall = 0.7867507255078555, F1 = 0.7751540547202367
