#### Naive Bayes

In [None]:
import os
import re
import json
import gzip
import wget
import pandas as pd
import numpy as np
from urllib.request import urlopen
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from prettytable import PrettyTable
import time
import nltk
from imp import reload

#cleaning textfiles libraries
from collections import defaultdict # For accumlating values
from nltk.corpus import stopwords # To remove stopwords

In [None]:
## Location of the dataset archive to model.
### This category file was picked at random.
url = 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Grocery_and_Gourmet_Food_5.json.gz'
# The download itself is disabled; uncomment the next line to fetch the
# archive into the working directory.
# NOTE(review): the loading step presumably expects the archive to already be
# present locally — confirm before running on a fresh machine.
#filename = wget.download(url)


In [None]:
# Read the gzipped archive line by line; every line holds one JSON record.
with gzip.open('Grocery_and_Gourmet_Food_5.json.gz') as archive:
    data = [json.loads(raw_line.strip()) for raw_line in archive]

# number of records that were loaded
print(len(data))

# peek at the first record
print(data[0])

In [None]:
# Build a pandas DataFrame from the list of parsed records (one row each).
df = pd.DataFrame(data)

# row count of the new frame
print(df.shape[0])

In [None]:
# Inspect the dataframe: info() prints a per-column summary, display() renders
# the frame itself.
# NOTE(review): display is not imported in this file — presumably the
# IPython/Jupyter built-in; confirm before running outside a notebook.
df.info()
display(df)

In [None]:
# Remove the 'style', 'summary' and 'image' columns, which were judged not to
# impact the analysis.
unused_columns = ['style', 'summary', 'image']
df = df.drop(columns=unused_columns)

In [None]:
# Convert the 'vote' column to float: strip the "," separators from the
# text values, treat missing votes as 0, then cast.
df['vote'] = (
    df['vote']
    .str.replace(',', '')
    .fillna(0)
    .astype(float)
)

In [None]:
# Convert the review text column to strings.
# Missing reviews are replaced with an empty string first: a bare astype(str)
# turns NaN into the literal text "nan", which would then be treated as a
# real 3-character, 1-word review by the length and word-count features.
df["reviewText"] = df["reviewText"].fillna("").astype(str)

In [None]:
## helpful flag used in EDA and models: 1 when the review has more than 0 votes, else 0
has_votes = df['vote'] > 0
df['helpful_flag'] = has_votes.astype(int)

In [None]:
# Determine the average review length (in characters) and add a word-count
# column to the dataframe.

# Vectorised character count per review. This replaces a per-row
# df['reviewText'][i] lookup, which is label-based: slow on large frames and
# a KeyError whenever the index is not the default 0..n-1 range.
review_lengths = df['reviewText'].str.len()
x = review_lengths.tolist()  # kept as a plain list under its original name
# mean() yields NaN on an empty frame instead of raising ZeroDivisionError
print('average length of review: {:.3f}'.format(review_lengths.mean()))

# number of whitespace-separated tokens in each review
df['totalWords'] = df['reviewText'].str.split().str.len()

In [None]:
# Keep a handle on the dataframe as it stands at this point, for the first test.
# NOTE(review): this binds a second name to the same DataFrame object rather
# than copying it; use df.copy() if an independent snapshot is intended.
df_initial = df

In [None]:
# Flag duplicate reviews: rows sharing the same reviewer ID, review time and
# review text are assumed to be computer generated rather than human written.
# duplicated() returns a boolean Series; with keep='first' the first
# occurrence in each group is False and every later repeat is True.
duplicated_reviews = df.duplicated(subset=["reviewerID", "reviewTime", "reviewText"], keep='first')

# value_counts() only lists values that actually occur, so reindex to make
# sure both True and False are present (count 0 when absent). Without this,
# the lookups below raise KeyError on data that contains no duplicates.
count_duplicated_reviews = duplicated_reviews.value_counts().reindex([True, False], fill_value=0)

sum_reviews = count_duplicated_reviews.sum()
perc_duplicated_reviews = (count_duplicated_reviews / sum_reviews) * 100

# summary table: duplicate vs. original review counts and their share of the total
x = PrettyTable()
x.field_names = ["", "Count", "Percentage of Total"]
x.add_rows([
    ["Duplicate Reviews", count_duplicated_reviews[True], perc_duplicated_reviews[True]],
    ["Original Reviews", count_duplicated_reviews[False], perc_duplicated_reviews[False]],
])
print(x)

In [None]:
# Inspect the rows flagged as duplicates, longest reviews (by word count) first.
# sort_values() returns a new frame rather than sorting in place, so its
# result has to be assigned; otherwise the ordering is silently discarded.
df_duplicates = df[duplicated_reviews].sort_values(by='totalWords', ascending=False)
# duplicates that received exactly two votes
df_duplicates[df_duplicates['vote'] == 2.0]

In [None]:
# Pull every review posted by one specific reviewer at one specific unix
# review time, to look at those rows side by side.
same_reviewer = df['reviewerID'] == 'A2N8B21NWXHIW7'
same_time = df['unixReviewTime'] == 1469145600
df_new = df[same_reviewer & same_time]
df_new

In [None]:
# Keep only the rows that were not flagged as duplicate reviews.
is_original = ~duplicated_reviews
df = df[is_original]

# report how many reviews remain
print(f"Number of reviews after removel of duplicates : {df.shape[0]}")