In [8]:
import pandas as pd
import json
import numpy as np
#import pandas_profiling

In [9]:
#we only need 3 columns in review.json. Since the file is so big (5GB!) we will avoid opening the whole JSON file
#and instead append to a list and then make it into a dataframe

#ADD IN REVIEW JSON

chunk = []
columns = ["user_id", "business_id", "stars"] #We only need these columns

with open("/Users/dariayip/Documents/vs code/yelp_academic_dataset_review.json", "r") as y:
    for line in y:
        doc = json.loads(line)
        lst = [doc["user_id"], doc["business_id"], doc["stars"]]
        chunk.append(lst)

review = pd.DataFrame(data=chunk, columns=columns)

review.rename(columns={"stars": "stars_review"}, inplace=True) #rename stars to stars review to avoid confusion

In [10]:
business_dtypes = {"business_id" : str, "name" : str, "address" : str, "city" : str, "state" : str, "postal_code" : str,
                  "latitude" : float, "longitude" : float, "stars" : float, "review_count" : int, "is_open" : int,
                  "attributes" : str, "categories" : str, "hours" : str}

#Open the Business JSON, with orient as columns --> values in that ordered pair
with open("/Users/dariayip/Documents/vs code/yelp_academic_dataset_business.json", "r") as y:
    business = pd.read_json(y, orient="records", lines = True,  dtype = business_dtypes) #save as a business df

business.rename(columns={"stars": "stars_business"}, inplace=True) #rename stars to stars biz to avoid confusion

In [11]:
#perform a left join to get the businesses with reviews and add the columns in the business 
businesseswithreviews = pd.merge(review, business, on = "business_id", how = "left")
#This is done to merge all relevant information into one dataframe

In [12]:
#Downsampling

businesseswithreviews = businesseswithreviews[businesseswithreviews["is_open"] == 1] #drop any businesses that are no longer operational
businesseswithreviews = businesseswithreviews[businesseswithreviews["categories"].str.contains("Restaurants", na=False)] #Delete any businesses that do not include the word "Restaurants" within the Categories section

In [13]:
#Downsampling Part 2

businesseswithreviews = businesseswithreviews[businesseswithreviews["city"] == "Philadelphia"] #filter out any cities other than Philadelphia (most businesses by city)
print(businesseswithreviews.shape)

#Group by business_id and count the number of reviews per business with the .transform(len)
counts_business_id = businesseswithreviews.groupby("business_id")["business_id"].transform(len)
counter = (counts_business_id > 20)
businesseswithreviews = businesseswithreviews[counter]
businesseswithreviews.shape
#remove businesses with less than 20 reviews

(511138, 16)


(499171, 16)

In [14]:
#Systematic Sampling (removing every 10th row)
businesseswithreviews = businesseswithreviews[businesseswithreviews.index % 10 != 0] 
businesseswithreviews.shape

(448906, 16)

In [15]:
#Random Sampling of 20% of dataset
businesseswithreviews = businesseswithreviews.sample(frac=0.20)
businesseswithreviews.shape

(89781, 16)

In [16]:
#Detect any missing data
missingdata = businesseswithreviews.isnull().sum() #sum up the amount of missing data points in this dataframe
print(missingdata)

user_id           0
business_id       0
stars_review      0
name              0
address           0
city              0
state             0
postal_code       0
latitude          0
longitude         0
stars_business    0
review_count      0
is_open           0
attributes        0
categories        0
hours             0
dtype: int64


In [17]:
#Detect any missing data
missingdata = business.isnull().sum() #sum up the amount of missing data points in this dataframe
print(missingdata)

business_id       0
name              0
address           0
city              0
state             0
postal_code       0
latitude          0
longitude         0
stars_business    0
review_count      0
is_open           0
attributes        0
categories        0
hours             0
dtype: int64


In [18]:
#Detect any missing data
missingdata = review.isnull().sum() #sum up the amount of missing data points in this dataframe
print(missingdata)

user_id         0
business_id     0
stars_review    0
dtype: int64


In [19]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

reader = Reader(line_format='user item rating', rating_scale=(1, 5)) #using the rating scale to ensure it stays as
#1 to 5
dataset = Dataset.load_from_df(businesseswithreviews[["user_id", "name", "stars_review"]], reader)

#Split the dataset into train and test sets before we preprocess the data using Surprise to 80/20
df_train, df_test = train_test_split(dataset, test_size=0.2)

In [21]:
from surprise import SVD
method = SVD()
method.fit(df_train)  #An alternative is to use ALS, which is good for sparse datasets. Training the model here
recommendations = method.test(df_test) #generate recs by using SVD on the test dataset

In [22]:
from surprise import accuracy

#Using Surprise, we can calculate RMSE and MAE to evaluate our recommendation system
MAE = accuracy.mae(recommendations)
RMSE = accuracy.rmse(recommendations)

MAE:  0.9398
RMSE: 1.1845
