In [94]:
import pandas as pd
import json
import numpy as np
import ydata_profiling

In [95]:
#we only need 3 columns in review.json. Since the file is so big (5GB!) we will avoid opening the whole JSON file
#and instead append to a list and then make it into a dataframe

#ADD IN REVIEW JSON

chunk = []
columns = ["user_id", "business_id", "stars"] #We only need these columns

with open("/Users/dariayip/Documents/vs code/yelp_academic_dataset_review.json", "r") as y:
    for line in y:
        doc = json.loads(line)
        lst = [doc["user_id"], doc["business_id"], doc["stars"]]
        chunk.append(lst)

review = pd.DataFrame(data=chunk, columns=columns)

review.rename(columns={"stars": "stars_review"}, inplace=True) #rename stars to stars review to avoid confusion

In [96]:
business_dtypes = {"business_id" : str, "name" : str, "address" : str, "city" : str, "state" : str, "postal_code" : str,
                  "latitude" : float, "longitude" : float, "stars" : float, "review_count" : int, "is_open" : int,
                  "attributes" : str, "categories" : str, "hours" : str}

#Open the Business JSON, with orient as columns --> values in that ordered pair
with open("/Users/dariayip/Documents/vs code/yelp_academic_dataset_business.json", "r") as y:
    business = pd.read_json(y, orient="records", lines = True,  dtype = business_dtypes) #save as a business df

business.rename(columns={"stars": "stars_business"}, inplace=True) #rename stars to stars biz to avoid confusion

In [97]:
#perform a left join to get the businesses with reviews and add the columns in the business 
businesseswithreviews = pd.merge(review, business, on = "business_id", how = "left")
#This is done to merge all relevant information into one dataframe

In [98]:
#Downsampling

businesseswithreviews = businesseswithreviews[businesseswithreviews["is_open"] == 1] #drop any businesses that are no longer operational
businesseswithreviews = businesseswithreviews[businesseswithreviews["categories"].str.contains("Restaurants", na=False)] #Delete any businesses that do not include the word "Restaurants" within the Categories section

In [99]:
#Downsampling Part 2

businesseswithreviews = businesseswithreviews[businesseswithreviews["city"] == "Philadelphia"] #filter out any cities other than Philadelphia (most businesses by city)
print(businesseswithreviews.shape)

#Group by business_id and count the number of reviews per business with the .transform(len)
counts_business_id = businesseswithreviews.groupby("business_id")["business_id"].transform(len)
counter = (counts_business_id > 20)
businesseswithreviews = businesseswithreviews[counter]
businesseswithreviews.shape
#remove businesses with less than 20 reviews

(511138, 16)


(499171, 16)

In [100]:
#Systematic Sampling (removing every 10th row)
businesseswithreviews = businesseswithreviews[businesseswithreviews.index % 10 != 0] 
businesseswithreviews.shape

(448906, 16)

In [101]:
#Random Sampling of 20% of dataset
businesseswithreviews = businesseswithreviews.sample(frac=0.20)
businesseswithreviews.shape

(89781, 16)

In [102]:
#Detect any missing data
missingdata = businesseswithreviews.isnull().sum() #sum up the amount of missing data points in this dataframe
print(missingdata)

user_id           0
business_id       0
stars_review      0
name              0
address           0
city              0
state             0
postal_code       0
latitude          0
longitude         0
stars_business    0
review_count      0
is_open           0
attributes        0
categories        0
hours             0
dtype: int64


In [103]:
#Detect any missing data
missingdata = business.isnull().sum() #sum up the amount of missing data points in this dataframe
print(missingdata)

business_id       0
name              0
address           0
city              0
state             0
postal_code       0
latitude          0
longitude         0
stars_business    0
review_count      0
is_open           0
attributes        0
categories        0
hours             0
dtype: int64


In [104]:
#Detect any missing data
missingdata = review.isnull().sum() #sum up the amount of missing data points in this dataframe
print(missingdata)

user_id         0
business_id     0
stars_review    0
dtype: int64


In [105]:
#Pandas Profile for the businesseswithreviews dataset
profile = ydata_profiling.ProfileReport(businesseswithreviews)
profile.to_file(output_file="businesseswithreviewsEDAprofile.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]



Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [106]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

reader = Reader(line_format='user item rating', rating_scale=(1, 5)) #using the rating scale to ensure it stays as
#1 to 5
dataset = Dataset.load_from_df(businesseswithreviews[["user_id", "name", "stars_review"]], reader)

#Split the dataset into train and test sets before we preprocess the data using Surprise to 80/20
df_train, df_test = train_test_split(dataset, test_size=0.2)

In [107]:
from surprise import SVD
method = SVD()
method.fit(df_train)  #An alternative is to use ALS. Training the model here
recommendations = method.test(df_test) #generate recs by using SVD on the test dataset

In [108]:
from surprise import accuracy

#Using Surprise, we can calculate RMSE and MAE to evaluate our recommendation system

# totalmae = 0
# totalrmse = 0

# for i in range(5): #loop 5 times and take average
MAE = accuracy.mae(recommendations)
totalmae += MAE
RMSE = accuracy.rmse(recommendations)
totalrmse += RMSE
    
# #Arithmetic Average
# totalmae = totalmae/5
# totalrmse = totalrmse/5

# print("\nAverage RMSE:", totalrmse, "\nAverage MAE:", totalmae)

MAE:  0.9478
RMSE: 1.1941


In [109]:
totalmae = [0.9398, 0.9509, 0.9519]
totalrmse = [1.1845, 1.1976, 1.1976]

averagemae = sum(totalmae)/len(totalmae)
averagermse = sum(totalrmse)/len(totalrmse)

print("Average RMSE:", averagermse, "\nAverage MAE:", averagemae)

Average RMSE: 1.1932333333333334 
Average MAE: 0.9475333333333333


In [110]:
#We will try ALS to see if this changes anything
from surprise import SVDpp
from surprise import BaselineOnly #predicts the baseline estimate given a user and item

#Using ALS below to see if there is any difference. Figures are from Surprise documentation
bsl_options = {"method": "als", "n_epochs": 5, "reg_u": 12, "reg_i": 5}
algo = BaselineOnly(bsl_options=bsl_options)

In [111]:
from surprise.model_selection import cross_validate

results = cross_validate(method, dataset, measures = ["RMSE", "MAE"], cv = 5, verbose = True) #verbose = True to print the progress
#and results of each fold

#This is to use k-fold cross validation to evaluate our model as another way to evaluate other than what was shown 
#above with test/train

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1924  1.1956  1.1912  1.1975  1.1861  1.1926  0.0039  
MAE (testset)     0.9512  0.9504  0.9443  0.9516  0.9413  0.9478  0.0042  
Fit time          0.56    0.54    0.56    0.54    0.56    0.55    0.01    
Test time         0.04    0.04    0.04    0.04    0.04    0.04    0.00    


In [113]:
import matplotlib.pyplot as plt
import numpy as np

meanrmse = results["test_rmse"]
meanmae = results["test_mae"]

folds = ("1st Fold", "2nd Fold", "3rd Fold", "4th Fold", "5th Fold")
x_axis = folds #for the x-coordinates for the graph
y_axis = meanrmse.tolist()
y_axis_mae = meanmae.tolist()

plt.bar(x_axis, y_axis, label = "RMSE", color = "pink")
plt.plot(x_axis, y_axis_mae, label = "MAE", marker='o', color = "black") #marker = 'o' used to 
#plt.show() #having trouble showing this so saved as PNG file instead
plt.legend() #add a legend

plt.savefig("rmsemaeresults.png") #mae is represented by the line graph and rmse by the bars