In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

## 1. Dataset Preparation

In [2]:
start = time.time()
# Read data
#df = pd.read_csv('data/amazon_reviews_us_Kitchen_v1_00.tsv', sep='\t', on_bad_lines='skip')
df = pd.read_csv('https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz', sep='\t', on_bad_lines='skip')

In [3]:
df = df[['star_rating', 'review_body']].rename(columns={'star_rating':'ratings', 'review_body':'reviews'})
df.head()

Unnamed: 0,ratings,reviews
0,5.0,Beautiful. Looks great on counter.
1,5.0,I personally have 5 days sets and have also bo...
2,5.0,Fabulous and worth every penny. Used for clean...
3,5.0,A must if you love garlic on tomato marinara s...
4,5.0,Worth every penny! Buy one now and be a pizza ...


In [4]:
df.groupby('ratings').describe()

Unnamed: 0_level_0,reviews,reviews,reviews,reviews
Unnamed: 0_level_1,count,unique,top,freq
ratings,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1.0,426870,419692,Too small,110
2.0,241939,239368,Too small,105
3.0,349539,342257,ok,550
4.0,731701,703142,good,1163
5.0,3124595,2876141,Great,5604


In [5]:
# Create binary labels: ratings > 3 --> 1, ratings < 3 --> 0, ratings == 3 --> discard
# Discard ratings equal to 3
df = df[(df['ratings']>3)|(df['ratings']<3)]
print(df['ratings'].unique())
# Map ratings > 3 to 1, and ratings < 3 to 0
df['label'] = df['ratings'].map(lambda x: 1 if x > 3 else 0)
df.drop('ratings', axis=1, inplace=True)
print(df.head())

[5. 1. 4. 2.]
                                             reviews  label
0                Beautiful.  Looks great on counter.      1
1  I personally have 5 days sets and have also bo...      1
2  Fabulous and worth every penny. Used for clean...      1
3  A must if you love garlic on tomato marinara s...      1
4  Worth every penny! Buy one now and be a pizza ...      1


In [6]:
# Downsize the dataframe with 100,000 positive reviews and 100,000 negative ones.
df_p = df[df['label']==1].sample(n=100000, random_state=1)
df_n = df[df['label']==0].sample(n=100000, random_state=1)
print(df_p.head())
print(df_n.head())
df = pd.concat([df_p, df_n]).sample(frac=1).reset_index(drop=True)
print(df.head())

                                                   reviews  label
3490548  very good product and met every expectations. ...      1
2590127  These are pretty and fun.  The only problem I ...      1
3283260  LOVE THIS!!! It is sturdy, elegant and is easy...      1
2859885  I didn't have a pot for our town house, and ki...      1
3201281  I love it, it looks so nice under my Keurig an...      1
                                                   reviews  label
2655570  We liked the looks of this timer, and the fact...      0
3261497  i bought a swell bottle. LOVED it. kept hot th...      0
763143             Worse thing I have very bought on line.      0
549574             Product did not meet my need. Too small      0
4405080  We were happy enough with this toaster -- for ...      0
                                             reviews  label
0  We love veggies and salads, but I've always ha...      1
1  I recently purchased 2 Grazia Premium silicone...      1
2                           

## 2. Preprocessing

In [7]:
X, y = df['reviews'].fillna('').tolist(), df['label'].tolist()

In [8]:
# convert reviews to lower case
X = list(map(lambda x: str(x).lower(), X))
# remove HTML and URLs from reviews
X = list(map(lambda x: re.sub('<.*>', '', x), X))
X = list(map(lambda x: re.sub(r'https?://\S+', '', x), X))
# remove non-alphabetical characters
X = list(map(lambda x: re.sub('[^a-z ]', '', x), X))
# remove extra spaces
X = list(map(lambda x: re.sub(' +', ' ', x), X))

In [9]:
# expand contractions
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}
def decontraction(s):
    for word in s.split(' '):
        if word in contractions.keys():
            s = re.sub(word, contractions[word], s)
    return s
X = list(map(decontraction, X))

In [10]:
# remove stop words
stopWords =set(stopwords.words('english'))
def rmstopWords(s):
    wordlist = s.split(' ')
    newlist = []
    for word in wordlist:
        if word not in stopWords:
            newlist.append(word)
    s = ' '.join(newlist)
    return s

X = list(map(rmstopWords, X))

# perform lemmatization
wnl = WordNetLemmatizer()
X = list(map(lambda x: ' '.join(map(wnl.lemmatize, x.split(' '))), X))

In [11]:
# Split the downsized dataset into 80% training dataset and 20% testing dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

## 3. Feature Extraction

In [12]:
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(X_train)
tfidf_test = vectorizer.transform(X_test)

## 4. Perceptron

In [13]:
perceptron = Perceptron(random_state=1)
perceptron.fit(tfidf, y_train)
y_train_predict, y_test_predict = perceptron.predict(tfidf), perceptron.predict(tfidf_test)

# report accuracy, precision, recall, and f1-score on both the training and testing split
train_stats = precision_recall_fscore_support(y_train, y_train_predict, average='binary')
precision_train, recall_train, fscore_train = train_stats[0], train_stats[1], train_stats[2]

test_stats = precision_recall_fscore_support(y_test, y_test_predict, average='binary')
precision_test, recall_test, fscore_test = test_stats[0], test_stats[1], test_stats[2]

print('The accuracy of training dataset: {:2.1%}'.format(perceptron.score(tfidf, y_train)))
print('The precision of training dataset: {:2.1%}'.format(precision_train))
print('The recall of training dataset: {:2.1%}'.format(recall_train))
print('The fscore of training dataset: {:2.1%}\n'.format(fscore_train))

print('The accuracy of testing dataset: {:2.1%}'.format(perceptron.score(tfidf_test, y_test)))
print('The precision of testing dataset: {:2.1%}'.format(precision_test))
print('The recall of testing dataset: {:2.1%}'.format(recall_test))
print('The fscore of testing dataset: {:2.1%}'.format(fscore_test))

The accuracy of training dataset: 91.4%
The precision of training dataset: 93.5%
The recall of training dataset: 89.0%
The fscore of training dataset: 91.2%

The accuracy of testing dataset: 84.8%
The precision of testing dataset: 87.0%
The recall of testing dataset: 81.8%
The fscore of testing dataset: 84.3%


## 5. SVM

In [14]:
# refer to https://stackoverflow.com/questions/52008548/python-running-into-x-test-y-test-fit-errors
# for why with_mean should be set to False
svm = LinearSVC(random_state=1)
svm.fit(tfidf, y_train)

y_train_predict, y_test_predict = svm.predict(tfidf), svm.predict(tfidf_test)

# report accuracy, precision, recall, and f1-score on both the training and testing split
train_stats = precision_recall_fscore_support(y_train, y_train_predict, average='binary')
precision_train, recall_train, fscore_train = train_stats[0], train_stats[1], train_stats[2]

test_stats = precision_recall_fscore_support(y_test, y_test_predict, average='binary')
precision_test, recall_test, fscore_test = test_stats[0], test_stats[1], test_stats[2]

print('The accuracy of training dataset: {:2.1%}'.format(svm.score(tfidf, y_train)))
print('The precision of training dataset: {:2.1%}'.format(precision_train))
print('The recall of training dataset: {:2.1%}'.format(recall_train))
print('The fscore of training dataset: {:2.1%}\n'.format(fscore_train))

print('The accuracy of testing dataset: {:2.1%}'.format(svm.score(tfidf_test, y_test)))
print('The precision of testing dataset: {:2.1%}'.format(precision_test))
print('The recall of testing dataset: {:2.1%}'.format(recall_test))
print('The fscore of testing dataset: {:2.1%}'.format(fscore_test))

The accuracy of training dataset: 93.9%
The precision of training dataset: 94.0%
The recall of training dataset: 93.8%
The fscore of training dataset: 93.9%

The accuracy of testing dataset: 89.2%
The precision of testing dataset: 89.0%
The recall of testing dataset: 89.4%
The fscore of testing dataset: 89.2%


## 6. Logistic Regression

In [15]:
logistic = LogisticRegression(random_state=1, max_iter=200)
logistic.fit(tfidf, y_train)

y_train_predict, y_test_predict = logistic.predict(tfidf), logistic.predict(tfidf_test)

# report accuracy, precision, recall, and f1-score on both the training and testing split
train_stats = precision_recall_fscore_support(y_train, y_train_predict, average='binary')
precision_train, recall_train, fscore_train = train_stats[0], train_stats[1], train_stats[2]

test_stats = precision_recall_fscore_support(y_test, y_test_predict, average='binary')
precision_test, recall_test, fscore_test = test_stats[0], test_stats[1], test_stats[2]

print('The accuracy of training dataset: {:2.1%}'.format(logistic.score(tfidf, y_train)))
print('The precision of training dataset: {:2.1%}'.format(precision_train))
print('The recall of training dataset: {:2.1%}'.format(recall_train))
print('The fscore of training dataset: {:2.1%}\n'.format(fscore_train))

print('The accuracy of testing dataset: {:2.1%}'.format(logistic.score(tfidf_test, y_test)))
print('The precision of testing dataset: {:2.1%}'.format(precision_test))
print('The recall of testing dataset: {:2.1%}'.format(recall_test))
print('The fscore of testing dataset: {:2.1%}'.format(fscore_test))

The accuracy of training dataset: 91.1%
The precision of training dataset: 91.4%
The recall of training dataset: 90.8%
The fscore of training dataset: 91.1%

The accuracy of testing dataset: 89.6%
The precision of testing dataset: 89.7%
The recall of testing dataset: 89.5%
The fscore of testing dataset: 89.6%


## 7. Multinomial Naive Bayes

In [16]:
multiNB = MultinomialNB()
multiNB.fit(tfidf, y_train)
y_train_predict, y_test_predict = multiNB.predict(tfidf), multiNB.predict(tfidf_test)

# report accuracy, precision, recall, and f1-score on both the training and testing split
train_stats = precision_recall_fscore_support(y_train, y_train_predict, average='binary')
precision_train, recall_train, fscore_train = train_stats[0], train_stats[1], train_stats[2]

test_stats = precision_recall_fscore_support(y_test, y_test_predict, average='binary')
precision_test, recall_test, fscore_test = test_stats[0], test_stats[1], test_stats[2]

print('The accuracy of training dataset: {:2.1%}'.format(multiNB.score(tfidf, y_train)))
print('The precision of training dataset: {:2.1%}'.format(precision_train))
print('The recall of training dataset: {:2.1%}'.format(recall_train))
print('The fscore of training dataset: {:2.1%}\n'.format(fscore_train))

print('The accuracy of testing dataset: {:2.1%}'.format(multiNB.score(tfidf_test, y_test)))
print('The precision of testing dataset: {:2.1%}'.format(precision_test))
print('The recall of testing dataset: {:2.1%}'.format(recall_test))
print('The fscore of testing dataset: {:2.1%}'.format(fscore_test))

The accuracy of training dataset: 89.0%
The precision of training dataset: 89.8%
The recall of training dataset: 88.0%
The fscore of training dataset: 88.9%

The accuracy of testing dataset: 87.2%
The precision of testing dataset: 88.0%
The recall of testing dataset: 86.0%
The fscore of testing dataset: 87.0%


In [17]:
end = time.time()
print('time elapsed: {:.2f} s'.format(end-start))

time elapsed: 90.12 s
