## Import Libraries

In [1]:
import math
import itertools

import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

sns.set_style("whitegrid")

from collections import defaultdict

from scipy.stats import pearsonr, spearmanr
from scipy.spatial.distance import cosine

## Get Data Needed

In [2]:
file_path = 'yelp_academic_dataset_business.json'

df_business_data = pd.read_json(file_path, lines=True)

In [3]:
file_path = 'yelp_academic_dataset_review.json'

chunk_size = 100000
chunks = []
for chunk in pd.read_json(file_path, lines=True, chunksize=chunk_size):
    chunks.append(chunk)

df_review_data = pd.concat(chunks, ignore_index=True)

In [4]:
file_path = 'yelp_academic_dataset_user.json'

chunk_size = 100000
chunks = []
for chunk in pd.read_json(file_path, lines=True, chunksize=chunk_size):
    chunks.append(chunk)

df_user_data = pd.concat(chunks, ignore_index=True)

# Change Data Types

In [5]:
df_business_data = df_business_data.astype({
    "business_id": str,
    "name": str,
    "address": str,
    "city": str,
    "state": str,
    "postal_code": str,
    "is_open": bool,
    "attributes": str,
    "categories": str,
    "hours": str
})

In [6]:
df_review_data = df_review_data.astype({
    "review_id": str,
    "user_id": str,
    "business_id": str,
    "text": str
})

# Data Preprocessing

In [7]:
df_combined = pd.merge(df_review_data, df_business_data, on = 'business_id', how = 'left')

In [8]:
df_combined = df_combined.rename(columns = {
    'stars_x': 'review_stars',
    'stars_y': 'business_stars'
})

## Drop columns

In [9]:
df_dropped = df_combined.drop(columns = ['useful', 'funny', 'cool', 'text', 'date', 'name', 
                                         'address', 'postal_code', 'latitude', 'longitude',
                                         'is_open', 'hours'])

In [10]:
df = df_dropped.dropna(axis = 0, how = 'any')

## Feature Engineering

### Aggregate average user rating

In [11]:
df_avg_rating = df_review_data.groupby('user_id')['stars'].mean()

In [12]:
df = df.merge(df_avg_rating, on = 'user_id', how = 'left')
df = df.rename(columns = {'stars': 'avg_stars'})

### Aggregate number of friends

In [13]:
df_user_data['num_friends'] = df_user_data['friends'].apply(lambda x: len(x.split(',')) if x and x != 'None' else 0)

In [14]:
df = df.merge(df_user_data[['user_id', 'num_friends']], on = 'user_id', how = 'left')

In [15]:
df = df.dropna(axis = 0, how = 'any')

# Recommend Model

## Train test split

In [16]:
n = len(df)
n_train = int(n * 0.8)
train = df[:n_train]
test = df[n_train:]

## Useful data structures

In [17]:
usersPerBusiness = defaultdict(set)
businessPerUser = defaultdict(set)
reviewsPerUser = defaultdict(list)
reviewsPerBusiness = defaultdict(list)
ratingDict = {}

for user, business, rating in zip(train['user_id'], train['business_id'], train['review_stars']):
    usersPerBusiness[business].add(user)
    businessPerUser[user].add(business)
    reviewsPerUser[user].append((business, rating))
    reviewsPerBusiness[business].append((user, rating))
    ratingDict[(user, business)] = rating

## Similarity measures

In [18]:
def Jaccard(s1, s2):
    """
    Method to calculate Jaccard similarity
    """
    n = len(s1.intersection(s2))
    u = len(s1.union(s2))
    return n/u

In [19]:
def Cosine(s1, s2):
    """
    Method to calculate Cosine similarity
    """
    u = list(s1.union(s2))
    v1 = np.array([1 if x in s1 else 0 for x in u])
    v2 = np.array([1 if x in s2 else 0 for x in u])
    return 1 - cosine(v1, v2)

## Most similar businesses

In [20]:
def mostSimilar(business, N, SIM = Jaccard):
    """
    Method to find N most similar restaurants
    - Using SIM similarity measure with a default of Jaccard
    Input: business, N, SIM
    """
    users = usersPerBusiness[business]
    sim = []
    businesses = set()
    for u in users:
        businesses = businesses.union(businessPerUser[u])
    for j in businesses:
        if business == j:
            continue
        score = SIM(usersPerBusiness[business], usersPerBusiness[j])
        sim.append((score, j))
    sim.sort(reverse = True)
    return sim[:N]

In [21]:
business1 = test['business_id'].iloc[1]
mostSimilar(business1, 5, Jaccard)

[]

In [22]:
mostSimilar(business1, 5, Cosine)

[]

## Highest rated businesses

In [23]:
def topBusinesses(user, R):
    """
    Method to find businesses that are rated >= R stars
    Input: user, R
    """
    top = []
    for b in businessPerUser[user]:
        if ratingDict[(user, b)] >= R:
            top.append(b)
    return top

In [24]:
user1 = test['user_id'].iloc[0]
topBusinesses(user1, 4)

['-wG5DQxGlSog4ynKrRdQMQ']

## Recommend businesses

In [25]:
def topSimilar(user, R, M, N, SIM = Jaccard):
    """
    Method to recommend N businesses
    - Using businesses rated as >= R stars
    - By each finding M similar businesses
    - Using SIM similarity measure with a default of Jaccard
    Input: user, R, M, N, SIM
    """
    top = topBusinesses(user, R)
    rated = businessPerUser[user]
    sims = []
    for b in top:
        sims.append(mostSimilar(b, M, SIM))
    flat_sims = list(itertools.chain(*sims))
    top_sims = [x for x in flat_sims if x[1] not in rated]
    sorted_sims = sorted(top_sims, key = lambda x: x[0], reverse = True)
    return [x[1] for x in sorted_sims][:N]

In [26]:
topSimilar(user1, 4, 3, 5)

['PnT9JSoMirpKzY6TnG1HhQ', 'FbPzY01omRFOAmbiqzBsUg', 'YL50DFkSKhVFyfKZgYQA8Q']

## Evaluate

In [27]:
# test
usersPerBusinessTest = defaultdict(set)
businessPerUserTest = defaultdict(set)
reviewsPerUserTest = defaultdict(list)
reviewsPerBusinessTest = defaultdict(list)
ratingDictTest = {}

for user, business, rating in zip(test['user_id'], test['business_id'], test['review_stars']):
    usersPerBusinessTest[business].add(user)
    businessPerUserTest[user].add(business)
    reviewsPerUserTest[user].append((business, rating))
    reviewsPerBusinessTest[business].append((user, rating))
    ratingDictTest[(user, business)] = rating

In [28]:
def precisionK(user):
    recs = topSimilar(user, 2, 10, 30)
    k = len(recs)
    if k == 0:
        return 0
    rated = [x[0] for x in reviewsPerUserTest[user]]
    precision = sum(1 for rec in recs if rec in rated)
    return precision/k

# Rating Model

## Helper functions

In [29]:
def getMean():
    return sum([d['review_stars'] for d in train])/len(train)

In [30]:
def predictRating(user, business):
    ratings = []
    sims = []
    for d in reviewsPerUser[user]:
        j = d['business_id']
        if j == business: 
            continue
        ratings.append(d['review_stars'])
        score = Jaccard(usersPerBusiness[business], usersPerBusiness[j])
        sims.append(score)
    if (sum(sims) > 0):
        weighted = [(x*y) for x,y in zip(ratings, sims)]
        return sum(weighted)/sum(sims)
    else:
        return getMean()

In [31]:
def RMSE(preds, labs):
    dif = [(x-y)**2 for x,y in zip(preds, labs)]
    return math.sqrt(sum(dif)/len(dif))

## Get train test

In [32]:
Xtrain = ratingDict.keys()
ytrain = ratingDict.values()

testDict = {}
for user, business, rating in zip(test['user_id'], test['business_id'], test['review_stars']):
    testDict[(user, business)] = rating

Xtest = testDict.keys()
ytest = testDict.values()

## Initialize params

In [33]:
globalAverage = sum(ytrain) / len(ytrain)

## Simple model

In [34]:
def simpleModel(n_it, lambda_):
    alpha = globalAverage
    beta_u = defaultdict(float)
    beta_i = defaultdict(float)
    for it in range(n_it):
        ratings_sum = sum(ratingDict[(d[0], d[1])] - (beta_u[d[0]] + beta_i[d[1]]) for d in Xtrain)
        alpha = ratings_sum / len(Xtrain)
        
        for u in businessPerUser:
            ratings_sum = sum(ratingDict[(u, i)] - (alpha + beta_i[i]) for i in businessPerUser[u])
            beta_u[u] = ratings_sum / (lambda_ + len(businessPerUser[u]))
            
        for i in usersPerBusiness:
            ratings_sum = sum(ratingDict[(u, i)] - (alpha + beta_u[u]) for u in usersPerBusiness[i])
            beta_i[i] = ratings_sum / (lambda_ + len(usersPerBusiness[i]))

    return alpha, beta_u, beta_i

In [35]:
def predict(alpha, beta_u, beta_i, data):
    return [alpha + beta_u[d[0]] + beta_i[d[1]] for d in data]

In [36]:
alpha, beta_u, beta_i = simpleModel(2, 1)
pred = predict(alpha, beta_u, beta_i, Xtest)

In [37]:
RMSE(pred, ytest)

1.4389923814633618

### Tune model

In [38]:
n_its = [1]
lambdas = [10]

best_rmse = float("inf")
best_params = None

for n_it, lambda_ in itertools.product(n_its, lambdas):
    alpha, beta_u, beta_i = simpleModel(n_it, lambda_)
    pred = predict(alpha, beta_u, beta_i, Xtest)
    rmse = RMSE(pred, ytest)
    if rmse < best_rmse:
        best_rmse = rmse
        best_params = (n_it, lambda_)

print(best_rmse)
print(best_params)

1.4265691438592876
(1, 10)


### Recommend using predictions

In [39]:
businesses_dict = dict(zip(df_business_data['business_id'], df_business_data['name']))
unique_users = list(set(df_user_data['user_id']))

In [108]:
def recommend(user):
    businesses = topSimilar(user, 4, 3, 5)
    data = {(user, business) for business in businesses}
    
    alpha, beta_u, beta_i = simpleModel(best_params[0], best_params[1])
    pred = predict(alpha, beta_u, beta_i, data)
    
    return list(zip([x[1] for x in data], pred))

In [138]:
def sort_get_names(recs):
    sorted_recs = sorted(recs, key = lambda x: x[1], reverse = True)
    return [businesses_dict[x[0]] for x in sorted_recs][:5]

In [139]:
[(x, sort_get_names(recommend(x))) for x in unique_users[:10]]

[('qSTYyg_UYP1xQKVOy-7yQA',
  ['DiBiase Heating & Cooling Company', "Chef Macjon's", "Arby's"]),
 ('1-fmrFh1OY5U1xV-YK782A', ['Lime Spa', 'Wedge Cheese Shop', 'Soak']),
 ('FGLxZRLF3a1gjM8-UOS_JA',
  ['Hampton Inn & Suites Nashville @ Opryland',
   'Subway',
   'Cafe At Bobby',
   'Michaels',
   'Academy Sports + Outdoors']),
 ('_ywTsR_kZ681T2PyiMLFSA',
  ["Jeni's Splendid Ice Creams",
   'Cochon',
   'Luke',
   'Otaku Ramen',
   'Rosepepper Mexican Grill']),
 ('8Lj6pkah532-jVdtr5Wtxw', []),
 ('YnXgIyOKCmJ-0E5g33swdQ', []),
 ('v6P-wNBuz7_b3DaejKi6iQ',
  ['The Bunker Gun Shop', "Feeney's", 'Philadelphia Federal Credit Union']),
 ('jKWDbCV9G4-FEzjdd3PUgw',
  ["Michael's Bar & Grill",
   'The Gateway Arch',
   'Serendipity Homemade Ice Cream',
   'Red Kitchen and Bar',
   'Strange Donuts']),
 ('sIookM05l3n-i84YcOaCvA', []),
 ('3Ht-0G76gJrDxnoXVPH9FA',
  ["Trader Joe's",
   'Cochon',
   'Acme Oyster House',
   'Burger Monger',
   "Mother's Restaurant"])]

### Take care of empty recommendations

In [140]:
friends_dict = dict(zip(df_user_data['user_id'], df_user_data['friends']))

In [141]:
def friend_recommend(user):
    if len(reviewsPerUser[user]) == 0:
        friends = friends_dict[user].split(", ")
        recs = []
        i = 0
        for friend in friends:
            if len(reviewsPerUser[friend]) != 0 and i < 3:
                i += 1
                recs.append(recommend(friend))
        return [x for sub in recs for x in sub]

In [146]:
final = [(x, sort_get_names(recommend(x))) if len(reviewsPerUser[x]) != 0 else (x, sort_get_names(friend_recommend(x))) for x in unique_users[:100]]

In [151]:
final_recs = pd.DataFrame(final, columns = ['user_id', 'recommended_businesses'])
final_recs['recommended_businesses'] = final_recs['recommended_businesses'].apply(lambda x: ", ".join(x))
final_recs.to_csv("final_recommendations.csv", index = False)