# User reviews classifier to predict if a product review will be useful for other users. (Dataset: Amazon)


**Use Case**: As a user prepares and submits a review, how can companies proactively identify reviews that will not be helpful to other users, so that they are not posted on the item's page?

**Target Variable**: Helpful response from other reviewers (this target variable is constructed from the `vote` field, i.e. the number of helpful votes that other users gave to the review)

**Data Source** https://nijianmo.github.io/amazon/index.html



In [34]:
%%time

import os
import re
import json
import gzip
import wget
import pandas as pd
import numpy as np
from urllib.request import urlopen
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from prettytable import PrettyTable
import time
import nltk
from imp import reload

#cleaning textfiles libraries
from collections import defaultdict # For accumlating values
from nltk.corpus import stopwords # To remove stopwords
from gensim import corpora # To create corpus and dictionary for the LDA model
from gensim.models import LdaModel # To use the LDA model

CPU times: total: 0 ns
Wall time: 998 µs


# tested links
- http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Arts_Crafts_and_Sewing_5.json.gz - works
- http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Grocery_and_Gourmet_Food_5.json.gz - works


In [35]:
##download data from url
### randomly selected file to model
url = 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Grocery_and_Gourmet_Food_5.json.gz'
#filename = wget.download(url)


In [36]:
# load the review records: the archive holds one JSON object per line
with gzip.open('Grocery_and_Gourmet_Food_5.json.gz') as f:
    data = [json.loads(raw_line.strip()) for raw_line in f]

# total length of the list, i.e. the number of records (reviews) loaded
print(len(data))

# first record of the list
print(data[0])

1143860
{'overall': 5.0, 'verified': True, 'reviewTime': '11 19, 2014', 'reviewerID': 'A1QVBUH9E1V6I8', 'asin': '4639725183', 'reviewerName': 'Jamshed Mathur', 'reviewText': 'No adverse comment.', 'summary': 'Five Stars', 'unixReviewTime': 1416355200}


In [37]:
# convert list into pandas dataframe

df = pd.DataFrame.from_dict(data)

print(len(df))



1143860


In [38]:
#look at dataframe
df.info()
display(df)




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143860 entries, 0 to 1143859
Data columns (total 12 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   overall         1143860 non-null  float64
 1   verified        1143860 non-null  bool   
 2   reviewTime      1143860 non-null  object 
 3   reviewerID      1143860 non-null  object 
 4   asin            1143860 non-null  object 
 5   reviewerName    1143722 non-null  object 
 6   reviewText      1143470 non-null  object 
 7   summary         1143641 non-null  object 
 8   unixReviewTime  1143860 non-null  int64  
 9   vote            158202 non-null   object 
 10  style           592086 non-null   object 
 11  image           9510 non-null     object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 97.1+ MB


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"11 19, 2014",A1QVBUH9E1V6I8,4639725183,Jamshed Mathur,No adverse comment.,Five Stars,1416355200,,,
1,5.0,True,"10 13, 2016",A3GEOILWLK86XM,4639725183,itsjustme,Gift for college student.,Great product.,1476316800,,,
2,5.0,True,"11 21, 2015",A32RD6L701BIGP,4639725183,Krystal Clifton,"If you like strong tea, this is for you. It mi...",Strong,1448064000,,,
3,5.0,True,"08 12, 2015",A2UY1O1FBGKIE6,4639725183,U. Kane,Love the tea. The flavor is way better than th...,Great tea,1439337600,,,
4,5.0,True,"05 28, 2015",A3QHVBQYDV7Z6U,4639725183,The Nana,I have searched everywhere until I browsed Ama...,This is the tea I remembered!,1432771200,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1143855,5.0,True,"09 8, 2017",A223YRQH2Z5T1D,B01HJF6FRA,flint5292,"As a new vegan, it is sometimes difficult to r...","As a new vegan, it is sometimes difficult to ...",1504828800,4,,
1143856,5.0,True,"08 4, 2017",A38GDA4TB9EILT,B01HJF6FRA,Moriah Bolyard,The best thing ever is ordering a product you ...,The best thing ever is ordering a product you ...,1501804800,3,,
1143857,5.0,True,"07 4, 2017",A2025PN7HDC5BO,B01HJF6FRA,M.C,I used to love ranch before I became vegan. It...,Just what the vegan ordered!,1499126400,5,,
1143858,5.0,True,"06 7, 2017",A1NY7XWC7EPQOA,B01HJF6FRA,Greensboro,I cannot have dairy nor gluten. This is as cl...,This is as close to Ranch as I will ever be ab...,1496793600,2,,


### Data catalogue

- __overall:__- Rating of the Product
- __reviewTime:__- Time of the review (raw)
- __reviewerID:__- ID of the reviewer, e.g. A2SUAM1J3GNN3B
- __asin:__- ID of the product, e.g. 0000013714
- __style:__- A dictionary of the product metadata, e.g., "Format" is "Hardcover"
- __reviewerName:__- Name of the reviewer
- __reviewerText:__- Text of the review
- __summary:__- Summary of the review
- __vote:__- Helpful votes of the review
- __unixReviewTime:__- Time of the review (unix time)
- __reviewText:__- Text of the review
- __image:__- Images that users post after they have received the product

### Initial data clean up

#### a) Check for NAs


In [39]:
# Any missing values in the dataset
def plot_missingness(df: pd.DataFrame=df) -> None:
    """Show a bar chart of the missing and complete share of every column of ``df``.

    For each column the NaN count and NaN percentage are computed
    ("Missingness" rows), and a matching "Completeness" row carries the
    non-NaN count and ``100 - missing %``. The completeness percentage is
    kept as a float, so the two percentages of a feature sum to 100.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect. NOTE: the default is the module-level ``df`` as it
        existed when this function was defined, not when it is called; pass
        the frame explicitly to inspect the current data.
    """
    n_rows = df.shape[0]
    nan_counts = df.isna().sum()

    # one "Missingness" row per column, least-missing columns first
    missing_df = pd.DataFrame({
        'Feature': nan_counts.index,
        'NaN_Count': nan_counts.to_numpy().astype('int'),
    })
    missing_df['Missing %'] = (missing_df['NaN_Count'] / n_rows * 100).round(4)
    missing_df['Type'] = 'Missingness'
    missing_df = missing_df.sort_values('Missing %')

    # matching "Completeness" rows, built in one step instead of one concat per column
    complete_df = missing_df.copy()
    complete_df['NaN_Count'] = n_rows - missing_df['NaN_Count']
    complete_df['Missing %'] = (100 - missing_df['Missing %']).round(4)
    complete_df['Type'] = 'Completeness'

    nan_df = pd.concat([missing_df, complete_df], ignore_index=True)

    # Missingness Plot
    fig = px.bar(nan_df,
                 x='Feature',
                 y='Missing %',
                 title=f"Missingness Plot (N={df.shape[0]})",
                 color='Type',
                 opacity = 0.6,
                 color_discrete_sequence=['red','#808080'],
                 width=800,
                 height=800)
    fig.show()

plot_missingness(df)


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



#### b) Remove columns and change type

In [40]:
#drop columns that don't impact
df = df.drop(['style','summary','image'], axis=1)


In [41]:
df['vote'].value_counts()

2      59308
3      29733
4      17002
5      10985
6       7730
       ...  
266        1
612        1
247        1
817        1
414        1
Name: vote, Length: 333, dtype: int64

In [42]:
# convert vote column to float: drop the thousands separators, count missing votes as 0
df['vote'] = df['vote'].str.replace(',','').fillna(0).astype(float)


In [43]:
#convert column to string; missing reviews become empty strings
# (astype(str) on its own would turn NaN into the literal text 'nan')
df["reviewText"] = df["reviewText"].fillna("").astype(str)

#### c) Add Columns

In [44]:
## inserting helpful flag to be used in EDA and Models
df['helpful_flag'] = np.where(df['vote'] > 0, 1, 0)



In [45]:
#Determine Average Review Length (in characters)

# vectorised character count per review; replaces a per-row Python loop that
# looked every review up by label and only worked with a 0..n-1 index
x = df['reviewText'].str.len()
print('average length of review: {:.3f}'.format(x.mean()))


average length of review: 208.585


In [46]:
import nltk
import string
nltk.download('stopwords')
from nltk.corpus import stopwords


#stop_words=['not', 'for','in','of', 'to']
stop_words = stopwords.words('english')
stop_words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Triston\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [47]:
# creating function to remove spaces from review so that we will only get characters.
def char_counts(x):
    """Return the number of characters in ``x`` that are not whitespace.

    ``x`` is split on whitespace and the lengths of the resulting tokens
    are added up.
    """
    return sum(len(token) for token in x.split())

In [48]:
# adding columns for NLP EDAS
review_text = df['reviewText'].map(str)
# number of whitespace-separated tokens
df['totalWords'] = review_text.map(lambda txt: len(txt.split()))
# number of distinct tokens
df['vocab_size'] = review_text.map(lambda txt: len(set(txt.split())))
# number of non-whitespace characters
df['char_counts'] = review_text.map(char_counts)
df['avg_word_size'] = df['char_counts'].div(df['totalWords'])

In [49]:
len(stop_words)

179

In [50]:
# a set gives constant-time membership tests; the stop word list would be scanned for every token
stop_word_set = set(stop_words)
df['stopword_count'] = df['reviewText'].apply(lambda x: sum(t in stop_word_set for t in x.split()))

In [51]:
# number of tokens made up of digits only
df['numeric_count'] = df['reviewText'].apply(lambda review: sum(1 for tok in review.split() if tok.isdigit()))

In [52]:
# number of tokens written entirely in upper case
df['UpperCase_word_count'] = df['reviewText'].apply(lambda review: sum(1 for tok in review.split() if tok.isupper()))

In [53]:
# number of tokens whose first character is upper case
df['Capitalized_word_count'] = df['reviewText'].apply(lambda review: sum(1 for tok in review.split() if tok[0].isupper()))

In [57]:
nltk.download('averaged_perceptron_tagger')

tags = nltk.pos_tag("because he said".translate
    (str.maketrans('', '', string.punctuation)).split())
print(tags)
noun_preceders = [a for (a, b) in tags if b in ('NN', 'NNS', 'NNP', 'NNPS') ]
noun_preceders
len(noun_preceders)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Triston\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


[('because', 'IN'), ('he', 'PRP'), ('said', 'VBD')]


0

In [58]:
# count the tokens of each review that nltk tags as NN, NNS, NNP or NNPS
_noun_punct_table = str.maketrans('', '', string.punctuation)
_noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')


def _count_nouns(review):
    """Return how many tokens of ``review`` carry one of the noun tags."""
    tagged = nltk.pos_tag(review.translate(_noun_punct_table).split())
    return sum(1 for _, tag in tagged if tag in _noun_tags)


df['Nouns_count'] = df['reviewText'].apply(_count_nouns)

KeyboardInterrupt: 

In [None]:
# count the tokens of each review that nltk tags as VB, VBG, VBD, VBN, VBP or VBZ
_verb_punct_table = str.maketrans('', '', string.punctuation)
_verb_tags = ('VB', 'VBG', 'VBD', 'VBN', 'VBP', 'VBZ')


def _count_verbs(review):
    """Return how many tokens of ``review`` carry one of the verb tags."""
    tagged = nltk.pos_tag(review.translate(_verb_punct_table).split())
    return sum(1 for _, tag in tagged if tag in _verb_tags)


df['Verbs_count'] = df['reviewText'].apply(_count_verbs)

In [None]:
# count the tokens of each review that nltk tags as JJ, JJR or JJS
_adj_punct_table = str.maketrans('', '', string.punctuation)
_adj_tags = ('JJ', 'JJR', 'JJS')


def _count_adjectives(review):
    """Return how many tokens of ``review`` carry one of the adjective tags."""
    tagged = nltk.pos_tag(review.translate(_adj_punct_table).split())
    return sum(1 for _, tag in tagged if tag in _adj_tags)


df['Adj_count'] = df['reviewText'].apply(_count_adjectives)

In [None]:
# count the tokens of each review that nltk tags as RB, RBR or RBS
_adverb_punct_table = str.maketrans('', '', string.punctuation)
_adverb_tags = ('RB', 'RBR', 'RBS')


def _count_adverbs(review):
    """Return how many tokens of ``review`` carry one of the adverb tags."""
    tagged = nltk.pos_tag(review.translate(_adverb_punct_table).split())
    return sum(1 for _, tag in tagged if tag in _adverb_tags)


df['Adverbs_count'] = df['reviewText'].apply(_count_adverbs)

In [None]:
# count the tokens of each review that nltk tags as CC, IN, MD or RP
_conj_punct_table = str.maketrans('', '', string.punctuation)
_conj_tags = ('CC', 'IN', 'MD', 'RP')


def _count_conj(review):
    """Return how many tokens of ``review`` carry one of the tags in ``_conj_tags``."""
    tagged = nltk.pos_tag(review.translate(_conj_punct_table).split())
    return sum(1 for _, tag in tagged if tag in _conj_tags)


df['Conj_count'] = df['reviewText'].apply(_count_conj)

In [None]:
#Create initial data set for first test
# take an independent snapshot: a plain assignment only creates a second name for the
# same object, so any in-place change to `df` would also show up in `df_initial`
df_initial = df.copy()

In [None]:
#check dataframe after add and changes
df.info()

#### d) Identify and discard duplicative reviews

In [None]:
# Rows that repeat the reviewer ID, review time and review text of an earlier row are
# treated as duplicates; they were assumed to be computer generated and not human generated.

# boolean Series: True for every repeat of an already-seen review (the first occurrence stays False)
duplicated_reviews = df.duplicated(subset=["reviewerID","reviewTime","reviewText"], keep='first')
count_duplicated_reviews = duplicated_reviews.value_counts()

sum_reviews = count_duplicated_reviews.sum()
perc_duplicated_reviews = (count_duplicated_reviews/sum_reviews) * 100

# .get(..., 0) keeps the table working when one of the two outcomes does not occur
# (e.g. no duplicates at all), where a plain [True] lookup raises KeyError
x = PrettyTable()
x.field_names = ["","Count","Percentage of Total"]
x.add_rows([
    ["Duplicate Reviews", count_duplicated_reviews.get(True, 0), perc_duplicated_reviews.get(True, 0)],
    ["Original Reviews", count_duplicated_reviews.get(False, 0), perc_duplicated_reviews.get(False, 0)],
])
print(x)

In [None]:
# sort_values returns a new frame, so the result has to be kept; the bare call had no effect
df_duplicates = df[duplicated_reviews].sort_values(by = ['totalWords'], ascending = [False])
# duplicated reviews that received exactly two votes
df_duplicates[df_duplicates['vote'] == 2.0]

In [None]:
df_new = df[(df['reviewerID'] == 'A2N8B21NWXHIW7') & (df['unixReviewTime'] == 1469145600) ]
df_new

In [None]:
#remove duplicate reviews for df
# .copy() makes the filtered frame independent, so columns added to `df` later on
# are not assignments into a slice of the original frame
df = df[~duplicated_reviews].copy()

print(f"Number of reviews after removel of duplicates : {df.shape[0]}")

### EDA1

In [None]:
#Summary Stats

df.describe().T

In [None]:
# plot the correlation matrix for the numeric and boolean columns
sns.set(style="darkgrid") # one of the many styles to plot using
cmap = sns.diverging_palette(220, 10, as_cmap=True) # one of the many color mappings
f, ax = plt.subplots(figsize=(10, 5))

# select the numeric/bool columns explicitly: the frame also holds text columns
# (reviewerID, reviewText, ...) and pandas >= 2.0 raises on them in corr()
# instead of silently dropping them
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), cmap=cmap, annot=True)
f.tight_layout()

In [None]:
#create visual - duplicate code
#selected_columns = ['overall','verified','vote', 'totalWords', 'helpful_flag']
#df[selected_columns].corr()

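We can see that there is a weak-to-medium positive correlation between the number of words and the helpful flag.
There is only a weak positive correlation between votes and the helpful flag, even though the votes were used to create it (the flag only records whether a review received any vote, while the vote counts themselves are heavily skewed).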


In [None]:
#create graph to check review distribution

fig = px.histogram(df, x="overall", color="overall").update_xaxes(categoryorder='total descending')
fig.update_xaxes(type='category')
fig.update_layout(bargap=0.3)
fig.show()

In [None]:
# Pie chart, where the slices will be ordered and plotted counter-clockwise: Added 11/27 DY
# count the ratings once; the original recomputed value_counts() for every label
rating_counts = df['overall'].value_counts()
labels = [f'{k} ({v} samples)' for k, v in rating_counts.items()]
# plain array of counts (in the same order as the labels) rather than a dict view
sizes = rating_counts.to_numpy()

fig1, ax1 = plt.subplots(figsize=(8,8))
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
ax1.set_title("Distribution of ratings in reviews",pad=40, fontweight='bold', fontsize=15)
plt.show();

In [None]:
bins = [-1, 0, 2, 5, 10, 25, 50, 100,3000]
df['binned'] = pd.cut(df['vote'], bins)

In [None]:
df['binned'].value_counts()

In [None]:
df['helpful_flag'].value_counts()

In [None]:
df_initial[df_initial['vote']==1]

In [None]:
plt.figure(figsize = (12,7))

ax = df['binned'].value_counts().plot(kind='bar')
ax.bar_label(ax.containers[0])
plt.xlabel("Binned Votes", labelpad=14)
plt.ylabel("Count of Reviews", labelpad=14)
plt.title("Histogram for Votes (Binned)", y=1.02);

In [None]:
plt.figure(figsize = (12,7))
ax = df.loc[df['vote']>0,'overall'].value_counts().plot(kind='barh')
ax.bar_label(ax.containers[0])
plt.xlabel("Number of Reviews", labelpad=14)
plt.ylabel("Number of Stars", labelpad=14)
plt.title("Helpful Reviews by Stars", y=1.02);

In [None]:
#Boxplot of Vote Counts by Score

fig = px.box(df, x="vote", color="overall")

fig.show()

In [None]:
#create graph to check review distribution

fig = px.histogram(df, x="helpful_flag", color="overall").update_xaxes(categoryorder='total descending')
fig.update_xaxes(type='category')
fig.update_layout(bargap=0.3)
fig.show()

In [None]:
#Summary Stats

df.describe().T

In [None]:
#Check distribution of non zero votes

df0 = df.loc[df['vote'] > 0]

df0.describe().T

In [None]:
print(df['helpful_flag'].value_counts())

We can see that the dataset is unbalanced

In [None]:
print(df[[ 'helpful_flag','overall']].value_counts())

We noticed that helpful reviews make up a much lower percentage than unhelpful ones, but 5-star reviews account for the largest share of the helpful reviews.

### Create a balanced Dataset

In [None]:
print(df['helpful_flag'].value_counts())

In [None]:
df.shape

In [None]:
# share of helpful reviews, computed from the data instead of typed-in counts
# (about 14.8% when this was written) .. this data set is unbalanced .. we will create a balanced data set
df['helpful_flag'].mean()

In [None]:
# creating helpful dataset
df_helpful = df[df['helpful_flag']==1]
df_helpful.shape

In [None]:
# creating nonhelpful dataset 
df_nothelpful = df[df['helpful_flag']==0]
df_nothelpful.shape

In [None]:
#downsampling nonhelpful
# fixed seed so that the balanced data set is the same on every run
# (random_state=1 is also what the 20000-row samples use)
df_nothelpful_downsampled = df_nothelpful.sample(df_helpful.shape[0], random_state=1)
df_nothelpful_downsampled.shape

In [None]:
# combining for balanced data set

df_nodups_balanced = pd.concat([df_nothelpful_downsampled, df_helpful])
df_nodups_balanced.shape

In [None]:
print(df_initial['helpful_flag'].value_counts())

In [None]:
df_initial.shape

In [None]:
# share of helpful reviews in df_initial, computed from the data: the typed-in
# 143572/969400 does not match the 1143860 rows of the initial frame
# .. this data set is unbalanced .. we will create a balanced data set
df_initial['helpful_flag'].mean()

In [None]:
# creating helpful dataset
df_helpful = df_initial[df_initial['helpful_flag']==1]
df_helpful.shape

In [None]:
# creating nonhelpful dataset 
df_nothelpful = df_initial[df_initial['helpful_flag']==0]
df_nothelpful.shape

In [None]:
#downsampling nonhelpful
# fixed seed so that the balanced data set is the same on every run
# (random_state=1 is also what the 20000-row samples use)
df_nothelpful_downsampled = df_nothelpful.sample(df_helpful.shape[0], random_state=1)
df_nothelpful_downsampled.shape

In [None]:
# combining for balanced data set

df_balanced = pd.concat([df_nothelpful_downsampled, df_helpful])
df_balanced.shape

### Creating random samples for NB and BERT Models

We will create a sample for: 
- the initial df
- the balanced df 
- the no_dups_balanced df

In [None]:
df_initial_sample = df_initial.sample(n = 20000, random_state = 1)
df_balanced_sample = df_balanced.sample(n = 20000, random_state = 1)
df_nodups_balanced_sample =df_nodups_balanced.sample(n = 20000, random_state = 1)


### initial Sample run

#### NB model

In [None]:
#Import Naive Bayes

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import classification_report

In [None]:
dfNB = df_initial_sample[['reviewText', 'helpful_flag']]

In [None]:
#Create Train-Test Split

#https://towardsdatascience.com/how-to-split-a-dataset-into-training-and-testing-sets-b146b1649830

from sklearn.model_selection import train_test_split

training_data, testing_data = train_test_split(dfNB, test_size=0.2, random_state=25)

print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

In [None]:
the_count = TfidfVectorizer()

Xtrain = the_count.fit_transform(training_data['reviewText'])

In [None]:
# Xtrain stays the sparse matrix returned by TfidfVectorizer: MultinomialNB accepts
# sparse input, and a dense copy needs (rows x vocabulary size) floats of memory

# index -> term lookup (inverse of the vectorizer's term -> index vocabulary)
reverse = {idx: term for term, idx in the_count.vocabulary_.items()}

In [None]:
#Fit Multinomial Naive Bayes model for initial dataframe sample
nb = MultinomialNB()
nb.fit(Xtrain, training_data['helpful_flag'])

In [None]:
Xtest = the_count.transform(testing_data['reviewText'])
# predict straight from the sparse matrix; no dense copy is needed
preds = nb.predict(Xtest)
preds.shape

In [None]:
print(classification_report(testing_data['helpful_flag'],preds))

In [None]:
# plot the confusion matrix

cnf_matrix = confusion_matrix(testing_data['helpful_flag'],preds)

print(cnf_matrix)

In [None]:
sns.heatmap(cnf_matrix, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.title("Confusion Matrix (Initial Dataset) NB Model", y=1.02);

#### Bert Model

In [None]:
# import tensorflow for creating Bert Models
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
# encoder processing urls
encoder_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [None]:
# using the initial sample for BERT
Bertdf = df_initial_sample

In [None]:
# fixed seed so that the stratified split (and therefore the reported metrics) is
# reproducible, as for the Naive Bayes split, which also uses random_state=25
X_train, X_test, y_train, y_test = train_test_split(Bertdf['reviewText'],Bertdf['helpful_flag'], stratify=Bertdf['helpful_flag'], random_state=25)

In [None]:
X_train.head(4)

In [None]:
bert_preprocessor = hub.KerasLayer(preprocess_url)
bert_encoder = hub.KerasLayer(encoder_url)

In [None]:
def get_sentence_embeding(sentences):
    """Return the ``'pooled_output'`` of the BERT encoder for ``sentences``.

    ``sentences`` is passed through ``bert_preprocessor`` and the result is
    fed to ``bert_encoder``.
    """
    encoder_inputs = bert_preprocessor(sentences)
    encoder_outputs = bert_encoder(encoder_inputs)
    return encoder_outputs['pooled_output']

In [None]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocessor(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
intermediate_layer = tf.keras.layers.Dense(64, activation='relu', name='intermediate_layer')(l)
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(intermediate_layer)

# Use inputs and outputs to construct a final model
BERT_model = tf.keras.Model(inputs=[text_input], outputs = [output_layer])

In [None]:
BERT_model.summary()

In [None]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]
optim = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
BERT_model.compile(optimizer=optim,
              loss='binary_crossentropy',
              metrics=METRICS)

In [None]:
BERT_model.fit(X_train, y_train, epochs=10)

In [None]:
BERT_model.evaluate(X_test, y_test)

In [None]:
y_predicted = BERT_model.predict(X_test)
y_predicted = y_predicted.flatten()

In [None]:
y_predicted = np.where(y_predicted > 0.5, 1, 0)
y_predicted

In [None]:
cm = confusion_matrix(y_test, y_predicted)
cm 

In [None]:
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.title("Confusion Matrix (Initial Dataset) BERT Model", y=1.02);

In [None]:
print(classification_report(y_test, y_predicted))

### Balanced Dataset

#### Naive Bayes

In [None]:
dfNB = df_balanced_sample[['reviewText', 'helpful_flag']]

In [None]:
training_data, testing_data = train_test_split(dfNB, test_size=0.2, random_state=25)

print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

In [None]:
the_count = TfidfVectorizer()

Xtrain = the_count.fit_transform(training_data['reviewText'])

In [None]:
# Xtrain stays the sparse matrix returned by TfidfVectorizer: MultinomialNB accepts
# sparse input, and a dense copy needs (rows x vocabulary size) floats of memory

# index -> term lookup (inverse of the vectorizer's term -> index vocabulary)
reverse = {idx: term for term, idx in the_count.vocabulary_.items()}

In [None]:
#Fit Multinomial Naive Bayes model on the training split of the balanced (with duplicates) sample
nb = MultinomialNB()
nb.fit(Xtrain, training_data['helpful_flag'])

In [None]:
Xtest = the_count.transform(testing_data['reviewText'])
# predict straight from the sparse matrix; no dense copy is needed
preds = nb.predict(Xtest)
preds.shape

In [None]:
print(classification_report(testing_data['helpful_flag'],preds))

In [None]:
# plot the confusion matrix

cnf_matrix = confusion_matrix(testing_data['helpful_flag'],preds)

print(cnf_matrix)

In [None]:
sns.heatmap(cnf_matrix, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.title("Confusion Matrix (Balanced with Dups Dataset) NB Model", y=1.02);

#### Bert Model  - Balanced (Dups) Dataset

In [None]:
# encoder processing urls
encoder_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [None]:
# using the balanced (with duplicates) sample for BERT
Bertdf = df_balanced_sample

In [None]:
# fixed seed so that the stratified split (and therefore the reported metrics) is
# reproducible, as for the Naive Bayes split, which also uses random_state=25
X_train, X_test, y_train, y_test = train_test_split(Bertdf['reviewText'],Bertdf['helpful_flag'], stratify=Bertdf['helpful_flag'], random_state=25)

In [None]:
X_train.head(4)

In [None]:
bert_preprocessor = hub.KerasLayer(preprocess_url)
bert_encoder = hub.KerasLayer(encoder_url)

In [None]:
def get_sentence_embeding(sentences):
    """Return the ``'pooled_output'`` of the BERT encoder for ``sentences``.

    ``sentences`` is passed through ``bert_preprocessor`` and the result is
    fed to ``bert_encoder``.
    """
    encoder_inputs = bert_preprocessor(sentences)
    encoder_outputs = bert_encoder(encoder_inputs)
    return encoder_outputs['pooled_output']

In [None]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocessor(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
intermediate_layer = tf.keras.layers.Dense(64, activation='relu', name='intermediate_layer')(l)
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(intermediate_layer)

# Use inputs and outputs to construct a final model
BERT_model = tf.keras.Model(inputs=[text_input], outputs = [output_layer])

In [None]:
BERT_model.summary()

In [None]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]
optim = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
BERT_model.compile(optimizer=optim,
              loss='binary_crossentropy',
              metrics=METRICS)

In [None]:
BERT_model.fit(X_train, y_train, epochs=10)

In [None]:
BERT_model.evaluate(X_test, y_test)

In [None]:
y_predicted = BERT_model.predict(X_test)
y_predicted = y_predicted.flatten()

In [None]:
y_predicted = np.where(y_predicted > 0.5, 1, 0)
y_predicted

In [None]:
cm = confusion_matrix(y_test, y_predicted)
cm 

In [None]:
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.title("Confusion Matrix (Balanced Dups Dataset) BERT Model", y=1.02);

In [None]:
print(classification_report(y_test, y_predicted))

###  Balanced no Dups Dataset

#### Naive Bayes 

In [None]:
dfNB = df_nodups_balanced_sample[['reviewText', 'helpful_flag']]

In [None]:
training_data, testing_data = train_test_split(dfNB, test_size=0.2, random_state=25)

print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

In [None]:
the_count = TfidfVectorizer()

Xtrain = the_count.fit_transform(training_data['reviewText'])

In [None]:
# Xtrain stays the sparse matrix returned by TfidfVectorizer: MultinomialNB accepts
# sparse input, and a dense copy needs (rows x vocabulary size) floats of memory

# index -> term lookup (inverse of the vectorizer's term -> index vocabulary)
reverse = {idx: term for term, idx in the_count.vocabulary_.items()}

In [None]:
#Fit Multinomial Naive Bayes model on the training split of the balanced, de-duplicated sample
nb = MultinomialNB()
nb.fit(Xtrain, training_data['helpful_flag'])

In [None]:
Xtest = the_count.transform(testing_data['reviewText'])
# predict straight from the sparse matrix; no dense copy is needed
preds = nb.predict(Xtest)
preds.shape

In [None]:
print(classification_report(testing_data['helpful_flag'],preds))

In [None]:
# plot the confusion matrix

cnf_matrix = confusion_matrix(testing_data['helpful_flag'],preds)

print(cnf_matrix)

In [None]:
sns.heatmap(cnf_matrix, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.title("Confusion Matrix (Balanced No Dups Dataset) NB Model", y=1.02);

#### Bert Model  - Balanced Dataset

In [None]:
# encoder processing urls
encoder_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [None]:
# using the balanced, de-duplicated sample for BERT
Bertdf = df_nodups_balanced_sample

In [None]:
# fixed seed so that the stratified split (and therefore the reported metrics) is
# reproducible, as for the Naive Bayes split, which also uses random_state=25
X_train, X_test, y_train, y_test = train_test_split(Bertdf['reviewText'],Bertdf['helpful_flag'], stratify=Bertdf['helpful_flag'], random_state=25)

In [None]:
X_train.head(4)

In [None]:
bert_preprocessor = hub.KerasLayer(preprocess_url)
bert_encoder = hub.KerasLayer(encoder_url)

In [None]:
def get_sentence_embeding(sentences):
    """Return the ``'pooled_output'`` of the BERT encoder for ``sentences``.

    ``sentences`` is passed through ``bert_preprocessor`` and the result is
    fed to ``bert_encoder``.
    """
    encoder_inputs = bert_preprocessor(sentences)
    encoder_outputs = bert_encoder(encoder_inputs)
    return encoder_outputs['pooled_output']

In [None]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocessor(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
intermediate_layer = tf.keras.layers.Dense(64, activation='relu', name='intermediate_layer')(l)
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(intermediate_layer)

# Use inputs and outputs to construct a final model
BERT_model = tf.keras.Model(inputs=[text_input], outputs = [output_layer])

In [None]:
BERT_model.summary()

In [None]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]
optim = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
BERT_model.compile(optimizer=optim,
              loss='binary_crossentropy',
              metrics=METRICS)

In [None]:
BERT_model.fit(X_train, y_train, epochs=10)

In [None]:
BERT_model.evaluate(X_test, y_test)

In [None]:
y_predicted = BERT_model.predict(X_test)
y_predicted = y_predicted.flatten()

In [None]:
y_predicted = np.where(y_predicted > 0.5, 1, 0)
y_predicted

In [None]:
cm = confusion_matrix(y_test, y_predicted)
cm 

In [None]:
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.title("Confusion Matrix (Balanced No Dups Dataset) BERT Model", y=1.02);

In [None]:
print(classification_report(y_test, y_predicted))

Creating New EDA section for Lexical Approach

In [None]:
## We can see that the balanced data is getting the best scores.
## we will now look at wordcount as it had high correlation to the helpful score

In [None]:
sns.set_theme(style="whitegrid")


sns.boxenplot(x="helpful_flag", y="totalWords",
              color="b", 
              scale="linear", data=df[['totalWords','helpful_flag']])

We can see that reviews with more words are more likely to receive a helpful vote.

In [None]:
## create chart from excel in here 

In [None]:
import tqdm

In [None]:
# TF-IDF vectorizer (default settings) used to turn review text into features
# for the Naive Bayes runs below.
the_count = TfidfVectorizer()


In [None]:
### Reviews with very few words are mostly unhelpful, so for every candidate
### threshold k we force the label to 0 when a review has fewer than k words,
### refit the TF-IDF + Naive Bayes baseline and record its F1 scores.
# NOTE(review): the test labels are relabelled with the same rule, so scores
# for different k are measured against different targets -- confirm this is
# the intended comparison.

# Word-count thresholds to try
min_word = 2
max_word = 33
step_size = 1
word_range = range(min_word, max_word, step_size)

# One row per threshold. Rows are gathered in a list and turned into a frame
# once, because DataFrame.append was removed in pandas 2.0 (and re-copied the
# whole frame on every call before that).
result_columns = ['wordcount', 'fl-score macro avg', 'fl-score helpful',
                  'fl-score not helpful', 'fl-score accuracy']
result_rows = []

# Can take a long time to run
for k in tqdm.tqdm(word_range, leave=True):
    # Overwrites the New_Helpful column of the shared sample on every pass.
    df_nodups_balanced_sample['New_Helpful'] = np.where(
        df_nodups_balanced_sample['totalWords'] < k,
        0,
        df_nodups_balanced_sample['helpful_flag'],
    )
    dfNB = df_nodups_balanced_sample[['reviewText', 'New_Helpful']]
    training_data, testing_data = train_test_split(dfNB, test_size=0.2, random_state=25)

    # The TF-IDF matrices stay sparse: MultinomialNB accepts sparse input and
    # .toarray() would allocate a dense (documents x vocabulary) array.
    Xtrain = the_count.fit_transform(training_data['reviewText'])
    nb = MultinomialNB()
    nb.fit(Xtrain, training_data['New_Helpful'])
    Xtest = the_count.transform(testing_data['reviewText'])
    preds = nb.predict(Xtest)

    df_class_report = pd.DataFrame(
        classification_report(testing_data['New_Helpful'], preds, output_dict=True)
    )
    result_rows.append({
        'wordcount': k,
        'fl-score macro avg': df_class_report.loc['f1-score', 'macro avg'],
        'fl-score helpful': df_class_report.loc['f1-score', '1'],
        'fl-score not helpful': df_class_report.loc['f1-score', '0'],
        'fl-score accuracy': df_class_report.loc['f1-score', 'accuracy'],
    })

results_df = pd.DataFrame(result_rows, columns=result_columns)
    

In [None]:
# Macro-averaged F1 from the most recently built classification report.
df_class_report.loc['f1-score','macro avg']

In [None]:
# Show the most recently built classification report.
df_class_report

In [None]:
# Show the scores recorded for each lower word-count threshold.
results_df

In [None]:
### Counterpart of the low-word-count sweep: for every candidate threshold k
### we force the label to 1 when a review has more than k words, refit the
### TF-IDF + Naive Bayes baseline and record its F1 scores.
# NOTE(review): the test labels are relabelled with the same rule, so scores
# for different k are measured against different targets -- confirm this is
# the intended comparison.

# Word-count thresholds to try
min_word = 88
max_word = 125
step_size = 1
word_range = range(min_word, max_word, step_size)

# One row per threshold. Rows are gathered in a list and turned into a frame
# once, because DataFrame.append was removed in pandas 2.0 (and re-copied the
# whole frame on every call before that).
result_columns = ['wordcount', 'fl-score macro avg', 'fl-score helpful',
                  'fl-score not helpful', 'fl-score accuracy']
result_rows = []

# Can take a long time to run
for k in tqdm.tqdm(word_range, leave=True):
    # Overwrites the New_Helpful column of the shared sample on every pass.
    df_nodups_balanced_sample['New_Helpful'] = np.where(
        df_nodups_balanced_sample['totalWords'] > k,
        1,
        df_nodups_balanced_sample['helpful_flag'],
    )
    dfNB = df_nodups_balanced_sample[['reviewText', 'New_Helpful']]
    training_data, testing_data = train_test_split(dfNB, test_size=0.2, random_state=25)

    # The TF-IDF matrices stay sparse: MultinomialNB accepts sparse input and
    # .toarray() would allocate a dense (documents x vocabulary) array.
    Xtrain = the_count.fit_transform(training_data['reviewText'])
    nb = MultinomialNB()
    nb.fit(Xtrain, training_data['New_Helpful'])
    Xtest = the_count.transform(testing_data['reviewText'])
    preds = nb.predict(Xtest)

    df_class_report = pd.DataFrame(
        classification_report(testing_data['New_Helpful'], preds, output_dict=True)
    )
    result_rows.append({
        'wordcount': k,
        'fl-score macro avg': df_class_report.loc['f1-score', 'macro avg'],
        'fl-score helpful': df_class_report.loc['f1-score', '1'],
        'fl-score not helpful': df_class_report.loc['f1-score', '0'],
        'fl-score accuracy': df_class_report.loc['f1-score', 'accuracy'],
    })

results_top_df = pd.DataFrame(result_rows, columns=result_columns)
    

In [None]:
# Show the scores recorded for each upper word-count threshold.
results_top_df

Based on the two sweeps above, reviews with fewer than 25 words are set to 0 (not helpful) and reviews with more than 125 words are set to 1 (helpful).

In [None]:
# Build the adjusted target: reviews longer than 125 words are forced to 1,
# reviews shorter than 25 words are forced to 0, everything in between keeps
# its original helpful_flag.
word_counts = df_nodups_balanced_sample['totalWords']
long_forced = np.where(word_counts > 125, 1, df_nodups_balanced_sample['helpful_flag'])
df_nodups_balanced_sample['New_Helpful'] = np.where(word_counts < 25, 0, long_forced)





In [None]:
# Refit the TF-IDF + Multinomial Naive Bayes baseline on the adjusted target.
dfNB = df_nodups_balanced_sample[['reviewText', 'New_Helpful']]
training_data, testing_data = train_test_split(dfNB, test_size=0.2, random_state=25)

# The TF-IDF matrices stay sparse: MultinomialNB accepts sparse input and
# .toarray() would allocate a dense (documents x vocabulary) array.
Xtrain = the_count.fit_transform(training_data['reviewText'])
nb = MultinomialNB()
nb.fit(Xtrain, training_data['New_Helpful'])
Xtest = the_count.transform(testing_data['reviewText'])
preds = nb.predict(Xtest)

# Per-class metrics as a DataFrame (columns are class labels and averages).
df_class_report = pd.DataFrame(
    classification_report(testing_data['New_Helpful'], preds, output_dict=True)
)

In [None]:
# Show the Naive Bayes classification report for the adjusted target.
df_class_report

# Same data set using BERT

In [None]:
# TF Hub handles for the BERT encoder (uncased, L-12, H-768, A-12)
# and the uncased text preprocessor that goes with it.
encoder_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [None]:
# Reuse the balanced, de-duplicated sample for BERT.
# Plain assignment: Bertdf refers to the same DataFrame object, not a copy.
Bertdf = df_nodups_balanced_sample

In [None]:
# Stratified train/test split on the adjusted target (test_size left at the
# library default). random_state is fixed, as in the Naive Bayes splits, so
# the BERT results are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    Bertdf['reviewText'],
    Bertdf['New_Helpful'],
    stratify=Bertdf['New_Helpful'],
    random_state=25,
)

In [None]:
# Peek at the first four training reviews.
X_train.head(4)

In [None]:
# Load the BERT preprocessing and encoder models from TF Hub as Keras layers.
# NOTE(review): no `trainable` argument is passed, so whether the encoder
# weights are fine-tuned depends on the hub.KerasLayer default -- confirm.
bert_preprocessor = hub.KerasLayer(preprocess_url)
bert_encoder = hub.KerasLayer(encoder_url)

In [None]:
def get_sentence_embeding(sentences):
    """Return the BERT 'pooled_output' embedding for the given sentences.

    The sentences are run through the module-level ``bert_preprocessor`` and
    the result is fed to the module-level ``bert_encoder``.
    """
    encoder_outputs = bert_encoder(bert_preprocessor(sentences))
    return encoder_outputs['pooled_output']

In [None]:
# BERT front end: raw review strings enter as a scalar string tensor and are
# pushed through the preprocessor and encoder.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocessor(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head: dropout -> 64-unit ReLU layer -> single sigmoid unit.
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")
hidden_layer = tf.keras.layers.Dense(64, activation='relu', name='intermediate_layer')
sigmoid_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")

l = dropout_layer(outputs['pooled_output'])
intermediate_layer = hidden_layer(l)
output_layer = sigmoid_layer(intermediate_layer)

# Tie the string input to the sigmoid output to get the final model.
BERT_model = tf.keras.Model(inputs=[text_input], outputs=[output_layer])

In [None]:
# Print the layer-by-layer summary of the rebuilt model.
BERT_model.summary()

In [None]:
# Metrics reported alongside the loss during training and evaluation.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

# The `decay` keyword is rejected by the non-legacy Keras optimizers
# (TF >= 2.11). InverseTimeDecay with decay_steps=1 reproduces the same
# schedule, lr = 1e-5 / (1 + 1e-6 * step), and also works on older TF 2.x.
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=1e-5,
    decay_steps=1,
    decay_rate=1e-6,
)
optim = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# Binary cross-entropy matches the single sigmoid output unit.
BERT_model.compile(optimizer=optim,
                   loss='binary_crossentropy',
                   metrics=METRICS)

In [None]:
# Train for 10 epochs on the training reviews and the adjusted labels.
BERT_model.fit(X_train, y_train, epochs=10)

In [None]:
# Loss and the compiled metrics on the held-out test split.
BERT_model.evaluate(X_test, y_test)

In [None]:
# Model outputs for the test reviews, collapsed to a 1-D array.
y_predicted = BERT_model.predict(X_test).flatten()

In [None]:
# Binarise the sigmoid outputs: values above 0.5 become class 1, the rest class 0.
above_threshold = y_predicted > 0.5
y_predicted = above_threshold.astype(int)
y_predicted

In [None]:
# Confusion matrix of the true test labels against the thresholded predictions.
cm = confusion_matrix(y_test, y_predicted)
cm 

In [None]:
# Render the confusion matrix as an annotated heat map with integer cell labels.
cm_axes = sns.heatmap(cm, annot=True, fmt='d')
cm_axes.set_xlabel('Predicted')
cm_axes.set_ylabel('Truth')
cm_axes.set_title("Confusion Matrix (New Helpful Column) BERT Model", y=1.02);

In [None]:
# Per-class precision / recall / F1 for the thresholded BERT predictions.
print(classification_report(y_test, y_predicted))