In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics

In [2]:
# Load only the columns needed for sentiment modelling from the tab-separated
# review dump. Rows pandas cannot parse are skipped (on_bad_lines='skip'),
# so the loaded row count may be lower than the file's line count.
wanted_columns = ['marketplace', 'product_category', 'star_rating', 'review_headline', 'review_body']
mobile = pd.read_csv(
    "gs://devdutt_bucket/amazon_reviews_us_Mobile_Electronics_v1.tsv",
    sep='\t',
    on_bad_lines='skip',
    usecols=wanted_columns,
)

In [3]:
# Preview the last rows of the loaded frame to sanity-check the parse.
mobile.tail()

Unnamed: 0,marketplace,product_category,star_rating,review_headline,review_body
104849,US,Mobile_Electronics,5.0,The Cat Barf is Gone!,"I've been looking for a while for the \\""purr\..."
104850,US,Mobile_Electronics,5.0,Well worth [it],We live in an apartment with hardwood floors a...
104851,US,Mobile_Electronics,5.0,Its a Time saver!!!,I received this for a birthday present and Its...
104852,US,Mobile_Electronics,5.0,Got Stains?,I got the Dirt Devil Spot Scrubber for a gift ...
104853,US,Mobile_Electronics,4.0,no smell has to mean it is getting the job don...,I purchased this about three weeks ago along w...


In [4]:
# Inspect the dtypes pandas inferred for the selected columns.
mobile.dtypes

marketplace          object
product_category     object
star_rating         float64
review_headline      object
review_body          object
dtype: object

In [5]:
# Rating distribution; dropna=False keeps missing ratings visible in the tally.
mobile['star_rating'].value_counts(dropna=False)

5.0    52199
4.0    18063
1.0    17572
3.0     9720
2.0     7298
NaN        2
Name: star_rating, dtype: int64

In [6]:
# Discard rows missing the rating or either text field; the integer cast and
# the headline/body concatenation later in the notebook both need them present.
required_columns = ['star_rating', 'review_headline', 'review_body']
mobile = mobile.dropna(subset=required_columns)

In [7]:
# Tally marketplaces (NaN included) to see whether the column carries any signal.
mobile['marketplace'].value_counts(dropna=False)

US    104849
Name: marketplace, dtype: int64

In [8]:
# Tally product categories (NaN included) to see whether the column carries any signal.
mobile['product_category'].value_counts(dropna=False)

Mobile_Electronics    104849
Name: product_category, dtype: int64

In [9]:
# Ratings were parsed as floats; with the missing values already dropped they
# can be cast to integers.
mobile = mobile.astype({'star_rating': int})

In [10]:
# Re-check the rating distribution after the integer cast (NaN still counted if present).
mobile['star_rating'].value_counts(dropna=False)

5    52197
4    18063
1    17572
3     9719
2     7298
Name: star_rating, dtype: int64

In [11]:
# Keep only the two extremes of the scale: 1-star and 5-star reviews.
mobile = mobile[mobile['star_rating'].isin([1, 5])]

In [12]:
# Verify that only 1- and 5-star reviews remain after filtering.
mobile['star_rating'].value_counts(dropna=False)

5    52197
1    17572
Name: star_rating, dtype: int64

In [13]:
# Binary target: 1-star -> 0 (negative), 5-star -> 1 (positive).
rating_to_sentiment = {1: 0, 5: 1}
mobile['sentiment'] = mobile['star_rating'].map(rating_to_sentiment)

In [14]:
# Class balance as proportions (normalize=True) rather than raw counts.
mobile['sentiment'].value_counts(normalize = True)

1    0.74814
0    0.25186
Name: sentiment, dtype: float64

In [15]:
# Combine headline and body into a single document per review, separated by a space.
headline, body = mobile['review_headline'], mobile['review_body']
mobile['text'] = headline + " " + body

In [16]:
# Reduce the frame to the model input (text) and the target (sentiment).
mobile = mobile.loc[:, ['text', 'sentiment']]

In [17]:
# Number of rows left in the modelling frame.
mobile.shape[0]

69769

In [18]:
# After keeping only 1- and 5-star reviews: ~69k rows

# positive (5-star) ~52k (75%)
# negative (1-star) ~17k (25%)

In [19]:
# Preview the last rows of the reduced text/sentiment frame.
mobile.tail()

Unnamed: 0,text,sentiment
104847,A wonder Since I have two dogs in a small apar...,1
104849,The Cat Barf is Gone! I've been looking for a ...,1
104850,Well worth [it] We live in an apartment with h...,1
104851,Its a Time saver!!! I received this for a birt...,1
104852,Got Stains? I got the Dirt Devil Spot Scrubber...,1


In [20]:
# Save the text/sentiment frame to a CSV in the working directory;
# index=False leaves the row index out of the file.
mobile.to_csv('mobile_sent.csv', index=False)

In [21]:
# Features are the combined review text; the target is the binary sentiment flag.
X, y = mobile['text'], mobile['sentiment']

# Both should report the same number of rows.
print(f"X Shape: {X.shape}")
print(f"y Shape: {y.shape}")

X Shape: (69769,)
y Shape: (69769,)


In [22]:
# Hold out a test set using scikit-learn's default split proportions; the fixed
# random_state keeps the partition reproducible across runs.
# NOTE(review): the classes are imbalanced (~75/25) — consider passing stratify=y.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

print(f"Training records: \n X_train: {X_train.shape} \n y_train: {y_train.shape}")
print(f"Testing records: \n  X_test: {X_test.shape} \n y_test: {y_test.shape}")

Training records: 
 X_train: (52326,) 
 y_train: (52326,)
Testing records: 
  X_test: (17443,) 
 y_test: (17443,)


In [23]:
# Bag-of-words vectorizer with scikit-learn's built-in English stop-word list removed.
vect = CountVectorizer(stop_words='english')

### Base Model (Naive Bayes)

In [24]:
# Instantiate a Multinomial Naive Bayes classifier with scikit-learn's default settings.
nb = MultinomialNB()

In [25]:
%time

nb.fit(vect.fit_transform(X_train), y_train)

CPU times: user 6 µs, sys: 1e+03 ns, total: 7 µs
Wall time: 14.1 µs


MultinomialNB()

In [26]:
# Encode the held-out text with the already-fitted vocabulary (transform, not
# fit_transform), then predict a class label for each review.
X_test_dtm = vect.transform(X_test)
y_pred = nb.predict(X_test_dtm)

In [27]:
# Share of test reviews whose predicted label matches the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy * 100:.1f}%")

Test Accuracy: 94.2%


In [28]:
# Per-class precision, recall and F1; more informative than accuracy alone
# because the two classes are not equally represented.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.90      0.87      0.88      4439
           1       0.96      0.97      0.96     13004

    accuracy                           0.94     17443
   macro avg       0.93      0.92      0.92     17443
weighted avg       0.94      0.94      0.94     17443



In [29]:
# Confusion matrix: rows are true labels, columns are predicted labels.
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

[[ 3871   568]
 [  446 12558]]
