In [2]:
import warnings
warnings.filterwarnings('ignore')
import os
import random
import json
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Apply seaborn's default theme so later matplotlib figures share one style.
sns.set()

In [4]:
!pip install surprise



In [5]:
from surprise import SVD, NMF, NormalPredictor,KNNBasic
from surprise import accuracy, Reader, Dataset, dump 
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split

In [6]:
# Load review data.
# Later cells read the 'user_id', 'business_id' and 'stars' columns of this frame.
# NOTE(review): unpickling can execute arbitrary code; only load a pickle
# that comes from a trusted source.
reviews_df = pd.read_pickle("nash_reviews.pkl")

In [7]:
# Distinct user ids and business ids found in the reviews
# (pandas returns them in order of first appearance).
unique_user_ids = reviews_df['user_id'].unique()
resto = reviews_df['business_id'].unique()

In [8]:
# Map every raw id to a 1-based integer position.
# NOTE(review): no later cell visible here reads these two maps; confirm they are still needed.
user_map = {raw_uid: position for position, raw_uid in enumerate(unique_user_ids, start=1)}
resto_map = {raw_bid: position for position, raw_bid in enumerate(resto, start=1)}

## Preparing data for the surprise library

In [9]:
# Star ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
# Surprise expects exactly three columns in this order: user, item, rating.
data = Dataset.load_from_df(reviews_df[['user_id', 'business_id', 'stars']], reader)

In [11]:
# Evaluating different algorithms: 3-fold cross-validated RMSE for each
# candidate, averaged over the folds together with fit and test time.
performance = []
for algo in [SVD(), NMF(), NormalPredictor()]:
    cv_results = cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)
    # One row per fold -> mean over folds gives one value per metric.
    fold_means = pd.DataFrame.from_dict(cv_results).mean(axis=0)
    # Label the row with the algorithm's class name. `pd.concat` is the public
    # replacement for the private `Series._append`, and `type(...).__name__`
    # yields the same name as parsing the object's repr.
    algorithm_label = pd.Series([type(algo).__name__], index=['Algorithm'])
    performance.append(pd.concat([fold_means, algorithm_label]))

In [12]:
# One row per algorithm, indexed by name, with the fold-averaged metrics as columns.
pd.DataFrame(performance).set_index('Algorithm')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,1.125784,2.87358,0.821902
NMF,1.299851,9.856135,1.033674
NormalPredictor,1.610988,0.344858,0.706753


### Pick SVD, as it seems to perform better

In [13]:
# Grid search for best parameters.
# Two values per hyper-parameter -> 16 combinations, each scored with
# 3-fold cross-validation on RMSE and MAE.
param_grid = {'n_factors': [30, 35], 'n_epochs': [20, 25], 'lr_all': [0.001, 0.003], 'reg_all': [0.08, 0.1]}
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
grid_search.fit(data)


In [14]:
# Algorithm instance configured with the parameters that gave the lowest RMSE.
# It is fitted on the full data in the next cell.
best_algo = grid_search.best_estimator['rmse']
# Best cross-validated RMSE and the parameter combination that achieved it.
print(grid_search.best_score['rmse'])
print(grid_search.best_params['rmse'])

1.1180152539296662
{'n_factors': 30, 'n_epochs': 25, 'lr_all': 0.003, 'reg_all': 0.08}


In [15]:
# Training on the entire dataset: no ratings are held out, so every review
# contributes to the final model.
trainset = data.build_full_trainset()
best_algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x28a20f22b80>

In [16]:
# Saving the trained model.
# The path has no leading '~', so expanduser returns it unchanged and the
# file is written relative to the current working directory.
file_name = os.path.expanduser('best_algo_dump')
dump.dump(file_name, algo=best_algo)

In [17]:
# Loading the saved model.
# dump.load returns a (predictions, algo) pair; only the algorithm was saved
# above, so the first element is discarded.
# NOTE(review): the dump is a pickle; only load files from a trusted source.
_, loaded_best_algo = dump.load(file_name)

In [20]:
# Try to predict recommendation for each user.
# Despite its name this is a dict: 0-based position -> raw user id.
user_list = dict(enumerate(reviews_df['user_id'].unique()))

In [21]:
# Function to get top N recommendations
from collections import defaultdict
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n highest estimates.

    Args:
        predictions: iterable of 5-element records
            (user id, item id, true rating, estimated rating, details).
        n: maximum number of (item id, estimate) pairs kept per user.

    Returns:
        defaultdict(list) mapping each user id to its (item id, estimate)
        pairs, sorted by estimate from highest to lowest and cut to n entries.
        Pairs with equal estimates keep their input order.
    """
    estimates_by_user = defaultdict(list)
    for user_id, item_id, _true_rating, estimate, _details in predictions:
        estimates_by_user[user_id].append((item_id, estimate))

    # Only existing keys are reassigned, so the dict does not change size
    # while it is being iterated.
    for user_id in estimates_by_user:
        ranked = sorted(estimates_by_user[user_id], key=lambda pair: pair[1], reverse=True)
        estimates_by_user[user_id] = ranked[:n]
    return estimates_by_user

In [22]:
# Number of distinct users (one dict entry per user).
len(user_list)

73311

In [23]:
# Build top-5 recommendations for every user, one batch of users at a time,
# and write each batch to its own JSON file (rec_test_<batch number>.json).
batch_size = 10000
total_users = len(user_list)

# Values that do not change between batches are computed once up front.
# The keys of `user_list` are the positions 0..N-1; they are passed to the
# trainset below as inner user ids, and the raw id is recovered from the
# trainset itself rather than from `user_list`.
all_user_keys = list(user_list.keys())
all_inner_iids = list(trainset.all_items())
raw_iid_by_inner = {iid: trainset.to_raw_iid(iid) for iid in all_inner_iids}
# Placeholder "true rating" for pairs the user never rated.
fill_value = trainset.global_mean

for start_idx in range(0, total_users, batch_size):
    batch_number = start_idx // batch_size + 1
    print(start_idx)
    batch_users = all_user_keys[start_idx:start_idx + batch_size]

    # Every (user, item) pair in this batch that has no rating in the trainset.
    anti_testset = []
    for uid in batch_users:
        raw_uid = trainset.to_raw_uid(uid)
        # A set makes each membership test O(1); a list made it O(#rated items).
        rated_iids = {item for (item, _rating) in trainset.ur[uid]}
        anti_testset.extend(
            (raw_uid, raw_iid_by_inner[iid], fill_value)
            for iid in all_inner_iids
            if iid not in rated_iids
        )

    predictions = loaded_best_algo.test(anti_testset)
    top_n_recommendations = get_top_n(predictions, n=5)

    with open('rec_test_{}.json'.format(batch_number), 'w') as fp:
        json.dump(top_n_recommendations, fp)

    print(f"Processing batch {batch_number}: {batch_users[:5]} ... {batch_users[-5:]}")

0
Processing batch 1: [0, 1, 2, 3, 4] ... [9995, 9996, 9997, 9998, 9999]
10000
Processing batch 2: [10000, 10001, 10002, 10003, 10004] ... [19995, 19996, 19997, 19998, 19999]
20000
Processing batch 3: [20000, 20001, 20002, 20003, 20004] ... [29995, 29996, 29997, 29998, 29999]
30000
Processing batch 4: [30000, 30001, 30002, 30003, 30004] ... [39995, 39996, 39997, 39998, 39999]
40000
Processing batch 5: [40000, 40001, 40002, 40003, 40004] ... [49995, 49996, 49997, 49998, 49999]
50000
Processing batch 6: [50000, 50001, 50002, 50003, 50004] ... [59995, 59996, 59997, 59998, 59999]
60000
Processing batch 7: [60000, 60001, 60002, 60003, 60004] ... [69995, 69996, 69997, 69998, 69999]
70000
Processing batch 8: [70000, 70001, 70002, 70003, 70004] ... [73306, 73307, 73308, 73309, 73310]


The top 5 predictions for each user have been saved in batches of 10,000 users to rec_test_1.json through rec_test_8.json.

In [81]:
# Restaurant metadata (one row per business).
# NOTE(review): unpickling can execute arbitrary code; only load a pickle
# that comes from a trusted source.
nash_restaurants = pd.read_pickle("nash_restaurants.pkl")

In [82]:
# Peek at the first rows to see which columns are available.
nash_restaurants.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.76817,1.5,10,1,"{'RestaurantsAttire': ''casual'', 'Restaurants...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-21:0', '..."
24,4iRzR7OaS-QaSXuvYxEGKA,Super Dog,1160 Gallatin Pike S,Nashville,TN,37115,36.24857,-86.719986,4.0,6,0,"{'RestaurantsReservations': 'False', 'Restaura...","Hot Dogs, Restaurants",
27,tMkwHmWFUEXrC9ZduonpTg,The Green Pheasant,215 1st Ave S,Nashville,TN,37201,36.159886,-86.773197,4.0,161,0,"{'RestaurantsGoodForGroups': 'True', 'HappyHou...","Restaurants, Japanese, Seafood","{'Wednesday': '16:0-22:0', 'Thursday': '16:0-2..."
47,lk9IwjZXqUMqqOhM774DtQ,Caviar & Bananas,2031 Broadway,Nashville,TN,37203,36.148371,-86.798895,3.5,159,0,"{'RestaurantsTakeOut': 'True', 'RestaurantsRes...","Coffee & Tea, Restaurants, Wine Bars, Bars, Ni...","{'Monday': '7:0-17:0', 'Tuesday': '7:0-17:0', ..."
89,oaboaRBUgGjbo2kfUIKDLQ,Mike's Ice Cream,129 2nd Ave N,Nashville,TN,37201,36.162649,-86.775973,4.5,593,1,"{'RestaurantsGoodForGroups': 'True', 'GoodForK...","Ice Cream & Frozen Yogurt, Coffee & Tea, Resta...","{'Monday': '8:0-23:0', 'Tuesday': '8:0-23:0', ..."


In [88]:
# Distinct postal codes that occur in the restaurant data, as a plain list.
nash_restaurants['postal_code'].unique().tolist()

['37207',
 '37115',
 '37201',
 '37203',
 '37204',
 '37214',
 '37209',
 '37219',
 '37212',
 '37211',
 '37215',
 '37205',
 '37243',
 '37221',
 '37206',
 '37228',
 '37217',
 '37013',
 '37208',
 '37216',
 '37067',
 '37220',
 '37210',
 '37076',
 '37026',
 '28801',
 '37218',
 '37011',
 '37027',
 '',
 '37213',
 '37246',
 '37232',
 '37189',
 '37229',
 '37138',
 '32709',
 '37135',
 '37024',
 '37238',
 '37072']

In [90]:
# US zip code reference table.
# Postal codes are identifiers, not numbers: reading the column as text keeps
# leading zeros (e.g. '02134'), which numeric parsing would drop, and makes
# the values directly comparable with the string codes in the restaurant data.
zipc = pd.read_csv('us_zip_code.csv', dtype={'postal code': str})

In [97]:
# Compare postal codes as strings, matching the string codes in nash_restaurants.
zipc['postal code'] = zipc['postal code'].astype(str)

In [102]:
# Number of reference rows whose postal code occurs in the restaurant data.
len(zipc[zipc['postal code'].isin(nash_restaurants.postal_code.unique())])

40

In [101]:
# Number of distinct postal codes in the restaurant data.
# NOTE(review): this is one more than the matched-row count; the restaurant
# data contains an empty-string postal code, which presumably accounts for
# the difference - confirm.
len(nash_restaurants.postal_code.unique())

41

In [103]:

# Preview the zip code rows that will be exported for the restaurants.
# `geopoint` is derived locally as [latitude, longitude] so this cell does not
# depend on a column that is only added to `zipc` by a later cell; selecting
# it straight from `zipc` raises KeyError when the notebook runs top to bottom.
nash_zip_rows = zipc[zipc['postal code'].isin(nash_restaurants.postal_code.unique())]
nash_zip_rows.assign(
    geopoint=nash_zip_rows[['latitude', 'longitude']].values.tolist()
)[['postal code', 'geopoint', 'place name']]

Unnamed: 0,country code,postal code,place name,admin name1,admin code1,admin name2,admin code2,latitude,longitude
1554,US,32709,Christmas,Florida,FL,Orange,95.0,28.5462,-81.0116
2744,US,37203,Nashville,Tennessee,TN,Davidson,37.0,36.1504,-86.7916
6984,US,37189,Whites Creek,Tennessee,TN,Davidson,37.0,36.2744,-86.8292
6985,US,37204,Nashville,Tennessee,TN,Davidson,37.0,36.1067,-86.7743
6986,US,37210,Nashville,Tennessee,TN,Davidson,37.0,36.1379,-86.741
8507,US,37232,Nashville,Tennessee,TN,Davidson,37.0,36.1866,-86.7852
9833,US,37026,Bradyville,Tennessee,TN,Cannon,15.0,35.7053,-86.0912
9837,US,37205,Nashville,Tennessee,TN,Davidson,37.0,36.1114,-86.869
9838,US,37220,Nashville,Tennessee,TN,Davidson,37.0,36.0641,-86.7697
11893,US,37214,Nashville,Tennessee,TN,Davidson,37.0,36.1633,-86.6609


In [104]:
# Add a [latitude, longitude] pair per row.
# Converting the two columns in one step avoids a Python-level call per row
# (`apply(axis=1)`) and always yields plain Python floats inside each pair.
zipc['geopoint'] = zipc[['latitude', 'longitude']].values.tolist()


In [106]:
# Export postal code, coordinates and place name for every zip code that
# occurs in the restaurant data.
nash_zip_mask = zipc['postal code'].isin(nash_restaurants.postal_code.unique())
nash_zip_export = zipc[nash_zip_mask][['postal code', 'geopoint', 'place name']]
nash_zip_export.to_csv('nash_zip.csv', index=False)