In [21]:
import pandas as pd
import numpy as np

from os import listdir
from os.path import isfile, join
from datetime import datetime

import pickle
import ast
import json

import turicreate as tc

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [22]:
df = pd.read_pickle('/Users/arianiherrera/Downloads/LVdf_restaurants.pkl')

In [23]:
df.shape

(1053142, 69)

In [24]:
df.columns

Index(['address', 'attributes', 'attributes_AcceptsInsurance',
       'attributes_AgesAllowed', 'attributes_Alcohol', 'attributes_Ambience',
       'attributes_BYOB', 'attributes_BYOBCorkage', 'attributes_BestNights',
       'attributes_BikeParking', 'attributes_BusinessAcceptsBitcoin',
       'attributes_BusinessAcceptsCreditCards', 'attributes_BusinessParking',
       'attributes_ByAppointmentOnly', 'attributes_Caters',
       'attributes_CoatCheck', 'attributes_Corkage',
       'attributes_DietaryRestrictions', 'attributes_DogsAllowed',
       'attributes_DriveThru', 'attributes_GoodForDancing',
       'attributes_GoodForKids', 'attributes_GoodForMeal',
       'attributes_HairSpecializesIn', 'attributes_HappyHour',
       'attributes_HasTV', 'attributes_Music', 'attributes_NoiseLevel',
       'attributes_Open24Hours', 'attributes_OutdoorSeating',
       'attributes_RestaurantsAttire', 'attributes_RestaurantsCounterService',
       'attributes_RestaurantsDelivery', 'attributes_Restau

In [25]:
df.rename(columns={'stars_x':'overall_stars','stars_y':'review_stars'}, inplace=True)

In [26]:
df.head()

Unnamed: 0,address,attributes,attributes_AcceptsInsurance,attributes_AgesAllowed,attributes_Alcohol,attributes_Ambience,attributes_BYOB,attributes_BYOBCorkage,attributes_BestNights,attributes_BikeParking,...,overall_stars,state,cool,date,funny,review_id,review_stars,text,useful,user_id
8,"3940 Martin Luther King Blvd, Ste 101",,,,,,,,,True,...,1.5,NV,1,2015-08-02,0.0,UgwmWy_68S_aKR9wTWKGOg,3.0,I am a huge fan of both locally owned business...,4.0,jSD05fFrAFa5gX3ZQae1tw
9,"3940 Martin Luther King Blvd, Ste 101",,,,,,,,,True,...,1.5,NV,2,2015-06-08,2.0,AywPsODuQbUMhBEjmKiGXw,1.0,Let me start off by saying. If you cant make a...,8.0,oagZh5A2cWJXZBLakS_KpQ
10,"3940 Martin Luther King Blvd, Ste 101",,,,,,,,,True,...,1.5,NV,1,2015-08-08,0.0,6j39TEUBDBTEK37OKACWLw,1.0,Decent product. ..HORRIBLE service. Totally un...,2.0,6BSwuyc7fvcccJgrY4_W5Q
31,2255 N Rampart Blvd,,,,none,"{'romantic': False, 'intimate': False, 'classy...",,,,True,...,2.5,NV,0,2016-07-03,0.0,c6iTbCMMYWnOd79ZiWwobg,1.0,"I ordered a few 12 inch sandwiches , a turkey ...",1.0,ih7Dmu7wZpKVwlBRbakJOQ
32,2255 N Rampart Blvd,,,,none,"{'romantic': False, 'intimate': False, 'classy...",,,,True,...,2.5,NV,0,2010-06-20,2.0,fisvc16rY1Q5NTX_O_QQ2A,1.0,A very dirty place. You can watch the employee...,3.0,zqL43wHcz133AVvEuba67w


In [27]:
df = df.dropna(subset=['user_id'])

split_idx = int(df.shape[0]*0.8)
train = df[['user_id','business_id','review_stars']].iloc[:split_idx,:]
test = df[['user_id','business_id','review_stars']].iloc[split_idx:,:]

In [28]:
print(train.shape)
print(test.shape)

(842512, 3)
(210629, 3)


In [29]:
train_data = tc.SFrame(train)
test_data = tc.SFrame(test)


# Popularity Model

In [30]:
popularity_model = tc.popularity_recommender.create(train_data,\
                                                    user_id='user_id', item_id='business_id', target='review_stars')

In [31]:
train_data.head()

user_id,business_id,review_stars
jSD05fFrAFa5gX3ZQae1tw,vJIuDBdu01vCA8y1fwR1OQ,3.0
oagZh5A2cWJXZBLakS_KpQ,vJIuDBdu01vCA8y1fwR1OQ,1.0
6BSwuyc7fvcccJgrY4_W5Q,vJIuDBdu01vCA8y1fwR1OQ,1.0
ih7Dmu7wZpKVwlBRbakJOQ,kgffcoxT6BQp-gJ-UQ7Czw,1.0
zqL43wHcz133AVvEuba67w,kgffcoxT6BQp-gJ-UQ7Czw,1.0
erWAz4mU0dHuScIMwjwmHg,kgffcoxT6BQp-gJ-UQ7Czw,3.0
m3WBc9bGxn1q1ikAFq8PaA,kgffcoxT6BQp-gJ-UQ7Czw,1.0
uZ4z-4ftg7LbGR_blmZTow,kgffcoxT6BQp-gJ-UQ7Czw,5.0
dLrMDYQ9TwfodJxYLfaKDQ,kgffcoxT6BQp-gJ-UQ7Czw,2.0
Fv0e9RIV9jw5TX3ctA1WbA,kgffcoxT6BQp-gJ-UQ7Czw,4.0


In [32]:
check_users = ['jSD05fFrAFa5gX3ZQae1tw','oagZh5A2cWJXZBLakS_KpQ','6BSwuyc7fvcccJgrY4_W5Q',
              'ih7Dmu7wZpKVwlBRbakJOQ','zqL43wHcz133AVvEuba67w']
popularity_recomm = popularity_model.recommend(users=check_users,k=5)
popularity_recomm.print_rows(num_rows=25)


+------------------------+------------------------+-------+------+
|        user_id         |      business_id       | score | rank |
+------------------------+------------------------+-------+------+
| jSD05fFrAFa5gX3ZQae1tw | ZKgRlphtyvKNl2sa0Mz2LA |  5.0  |  1   |
| jSD05fFrAFa5gX3ZQae1tw | ElM8yyW45TAy4vjApwy7ZQ |  5.0  |  2   |
| jSD05fFrAFa5gX3ZQae1tw | vmJekvxsAV8dCEO4MGO4sg |  5.0  |  3   |
| jSD05fFrAFa5gX3ZQae1tw | PEFcy5D1nXdHS67Ge0h0Kg |  5.0  |  4   |
| jSD05fFrAFa5gX3ZQae1tw | sOkQ2Y1-D8h18oN5NgxuLg |  5.0  |  5   |
| oagZh5A2cWJXZBLakS_KpQ | ZKgRlphtyvKNl2sa0Mz2LA |  5.0  |  1   |
| oagZh5A2cWJXZBLakS_KpQ | ElM8yyW45TAy4vjApwy7ZQ |  5.0  |  2   |
| oagZh5A2cWJXZBLakS_KpQ | vmJekvxsAV8dCEO4MGO4sg |  5.0  |  3   |
| oagZh5A2cWJXZBLakS_KpQ | PEFcy5D1nXdHS67Ge0h0Kg |  5.0  |  4   |
| oagZh5A2cWJXZBLakS_KpQ | sOkQ2Y1-D8h18oN5NgxuLg |  5.0  |  5   |
| 6BSwuyc7fvcccJgrY4_W5Q | ZKgRlphtyvKNl2sa0Mz2LA |  5.0  |  1   |
| 6BSwuyc7fvcccJgrY4_W5Q | ElM8yyW45TAy4vjApwy7ZQ |  5.0  |  2

In [33]:
train.groupby(by='business_id')['review_stars'].mean().sort_values(ascending=False).head(10)

business_id
tg1Y3RZfUcK8_GHYhLLxgw    5.0
ZMcvBaSEtMrieTYpPBIzVQ    5.0
7kO8bObVGFa-T3mj6PVYsg    5.0
DHa-YDvcs_0LeOT9ojUQWw    5.0
e--8ev5DbM_5cbveVTWAEg    5.0
oSDZ60KdSkMDQNaHGqvQBA    5.0
r4iis9DWfwmK1h6JT9PnAw    5.0
ZKgRlphtyvKNl2sa0Mz2LA    5.0
y-O3h7CV6X6P5ey1ZFcQwA    5.0
T1XikHI9k4VtoNzQVpIATA    5.0
Name: review_stars, dtype: float64

# Collaborative Filtering Model

In [34]:
# item-to-item similarity model based on pearson similarity
item_pearson_sim_model = tc.item_similarity_recommender.create(train_data,\
                                                       user_id='user_id', item_id='business_id',\
                                                       target='review_stars', similarity_type='pearson')



In [35]:
#Make Recommendations:
item_pearson_sim_recomm = item_pearson_sim_model.recommend(users=check_users,k=5)
item_pearson_sim_recomm.print_rows(num_rows=25)

+------------------------+------------------------+-------+------+
|        user_id         |      business_id       | score | rank |
+------------------------+------------------------+-------+------+
| jSD05fFrAFa5gX3ZQae1tw | vmJekvxsAV8dCEO4MGO4sg |  5.0  |  1   |
| jSD05fFrAFa5gX3ZQae1tw | PEFcy5D1nXdHS67Ge0h0Kg |  5.0  |  2   |
| jSD05fFrAFa5gX3ZQae1tw | ZKgRlphtyvKNl2sa0Mz2LA |  5.0  |  3   |
| jSD05fFrAFa5gX3ZQae1tw | ElM8yyW45TAy4vjApwy7ZQ |  5.0  |  4   |
| jSD05fFrAFa5gX3ZQae1tw | sOkQ2Y1-D8h18oN5NgxuLg |  5.0  |  5   |
| oagZh5A2cWJXZBLakS_KpQ | vmJekvxsAV8dCEO4MGO4sg |  5.0  |  1   |
| oagZh5A2cWJXZBLakS_KpQ | PEFcy5D1nXdHS67Ge0h0Kg |  5.0  |  2   |
| oagZh5A2cWJXZBLakS_KpQ | ZKgRlphtyvKNl2sa0Mz2LA |  5.0  |  3   |
| oagZh5A2cWJXZBLakS_KpQ | ElM8yyW45TAy4vjApwy7ZQ |  5.0  |  4   |
| oagZh5A2cWJXZBLakS_KpQ | sOkQ2Y1-D8h18oN5NgxuLg |  5.0  |  5   |
| 6BSwuyc7fvcccJgrY4_W5Q | vmJekvxsAV8dCEO4MGO4sg |  5.0  |  1   |
| 6BSwuyc7fvcccJgrY4_W5Q | PEFcy5D1nXdHS67Ge0h0Kg |  5.0  |  2

In [18]:
# item-to-item similarity model based on cosine similarity
item_cos_sim_model = tc.item_similarity_recommender.create(train_data,\
                                                       user_id='user_id', item_id='business_id',\
                                                       target='review_stars', similarity_type='cosine')



In [36]:
#Make Recommendations:
item_cos_sim_recomm = item_cos_sim_model.recommend(users=check_users,k=5)
item_cos_sim_recomm.print_rows(num_rows=25)

+------------------------+------------------------+----------------------+------+
|        user_id         |      business_id       |        score         | rank |
+------------------------+------------------------+----------------------+------+
| jSD05fFrAFa5gX3ZQae1tw | k2b3niokS_tosjah_rzCPw | 0.08606218374692477  |  1   |
| jSD05fFrAFa5gX3ZQae1tw | jBs2HmEOkqCvdHNnw3N-og | 0.07024011245140663  |  2   |
| jSD05fFrAFa5gX3ZQae1tw | m8u3fsbhfdXu7IhXeHBO5A | 0.06099940721805279  |  3   |
| jSD05fFrAFa5gX3ZQae1tw | cRtPP-AKezf0m94W-aisYA | 0.060665786266326904 |  4   |
| jSD05fFrAFa5gX3ZQae1tw | lKH_cqeaeVpHdsO2dK58OA | 0.06059996898357685  |  5   |
| oagZh5A2cWJXZBLakS_KpQ | l_GV0hgEoTUf70uJVT0_hg | 0.08959227800369263  |  1   |
| oagZh5A2cWJXZBLakS_KpQ | rioQ_p2pILNbJ4Xp5jW6-Q | 0.08794295787811279  |  2   |
| oagZh5A2cWJXZBLakS_KpQ | wuQDMDlqM17jQNo0lYQZ7g | 0.08385350874492101  |  3   |
| oagZh5A2cWJXZBLakS_KpQ | zpoZ6WyQUYff18-z4ZU1mA | 0.08046167237418038  |  4   |
| oagZh5A2cWJXZB

# Evaluating Recommendation Models

In [37]:
model_performance = tc.recommender.util.compare_models(test_data, [item_content_model,
                                                                   item_pearson_sim_model,item_cos_sim_model],\
                                                      verbose=True)


NameError: name 'item_content_model' is not defined

In [38]:
model_performance

NameError: name 'model_performance' is not defined

# Item-Content Based Model

In [None]:
# build table for business attributes

In [39]:
df.head()

Unnamed: 0,address,attributes,attributes_AcceptsInsurance,attributes_AgesAllowed,attributes_Alcohol,attributes_Ambience,attributes_BYOB,attributes_BYOBCorkage,attributes_BestNights,attributes_BikeParking,...,overall_stars,state,cool,date,funny,review_id,review_stars,text,useful,user_id
8,"3940 Martin Luther King Blvd, Ste 101",,,,,,,,,True,...,1.5,NV,1,2015-08-02,0.0,UgwmWy_68S_aKR9wTWKGOg,3.0,I am a huge fan of both locally owned business...,4.0,jSD05fFrAFa5gX3ZQae1tw
9,"3940 Martin Luther King Blvd, Ste 101",,,,,,,,,True,...,1.5,NV,2,2015-06-08,2.0,AywPsODuQbUMhBEjmKiGXw,1.0,Let me start off by saying. If you cant make a...,8.0,oagZh5A2cWJXZBLakS_KpQ
10,"3940 Martin Luther King Blvd, Ste 101",,,,,,,,,True,...,1.5,NV,1,2015-08-08,0.0,6j39TEUBDBTEK37OKACWLw,1.0,Decent product. ..HORRIBLE service. Totally un...,2.0,6BSwuyc7fvcccJgrY4_W5Q
31,2255 N Rampart Blvd,,,,none,"{'romantic': False, 'intimate': False, 'classy...",,,,True,...,2.5,NV,0,2016-07-03,0.0,c6iTbCMMYWnOd79ZiWwobg,1.0,"I ordered a few 12 inch sandwiches , a turkey ...",1.0,ih7Dmu7wZpKVwlBRbakJOQ
32,2255 N Rampart Blvd,,,,none,"{'romantic': False, 'intimate': False, 'classy...",,,,True,...,2.5,NV,0,2010-06-20,2.0,fisvc16rY1Q5NTX_O_QQ2A,1.0,A very dirty place. You can watch the employee...,3.0,zqL43wHcz133AVvEuba67w


In [20]:
arr = []
with open('../input/yelp-dataset/yelp_academic_dataset_business.json', 'r') as input_file: 
    for line in input_file:
        arr.append(json.loads(line))


In [21]:
def parse_business(business_obj):
    final_obj = {}
    # get first layer of attributes
    first_layer = ['business_id','stars','review_count','is_open']
    for a in first_layer:
        final_obj[a] = business_obj[a]
    
    if business_obj['attributes']:
        final_obj = {**final_obj, **business_obj['attributes']}
    if business_obj['hours']:
        final_obj = {**final_obj, **business_obj['hours']}
    try:
        if final_obj['BusinessParking']:
            parking = final_obj.pop('BusinessParking')
            parking = ast.literal_eval(parking)
            final_obj = {**final_obj, **parking}
    except:
        pass
    if 'Ambience' in final_obj.keys():
        ambience = final_obj.pop('Ambience')
        ambience = ast.literal_eval(ambience)
        final_obj = {**final_obj, **ambience}
        
    if 'GoodForMeal' in final_obj.keys():
        meal = final_obj.pop('GoodForMeal')
        meal = ast.literal_eval(meal)
        final_obj = {**final_obj, **meal}
    
    return final_obj
    

In [22]:
business_arr = [parse_business(obj) for obj in arr]

In [23]:
business_df = pd.DataFrame(business_arr)

In [24]:
business_df = business_df[business_df.business_id.isin(df.business_id.unique())]

In [25]:
business_df.head()

Unnamed: 0,AcceptsInsurance,AgesAllowed,Alcohol,BYOB,BYOBCorkage,BestNights,BikeParking,BusinessAcceptsBitcoin,BusinessAcceptsCreditCards,ByAppointmentOnly,...,lunch,review_count,romantic,stars,street,touristy,trendy,upscale,valet,validated
19,,,,,,,True,,True,,...,,3,,1.5,,,,,,
32,,,none,,,,True,,True,,...,True,13,False,2.5,False,False,False,False,False,False
33,,,beer_and_wine,,,,True,,True,,...,True,242,False,4.0,False,False,False,False,False,False
61,,,,,,,,,,,...,,4,,2.0,,,,,,
141,,,none,,,,False,,True,,...,False,16,False,1.5,False,False,False,False,False,False


In [26]:
list(business_df.isnull().sum().index)

['AcceptsInsurance',
 'AgesAllowed',
 'Alcohol',
 'BYOB',
 'BYOBCorkage',
 'BestNights',
 'BikeParking',
 'BusinessAcceptsBitcoin',
 'BusinessAcceptsCreditCards',
 'ByAppointmentOnly',
 'Caters',
 'CoatCheck',
 'Corkage',
 'DietaryRestrictions',
 'DogsAllowed',
 'DriveThru',
 'Friday',
 'GoodForDancing',
 'GoodForKids',
 'HairSpecializesIn',
 'HappyHour',
 'HasTV',
 'Monday',
 'Music',
 'NoiseLevel',
 'Open24Hours',
 'OutdoorSeating',
 'RestaurantsAttire',
 'RestaurantsCounterService',
 'RestaurantsDelivery',
 'RestaurantsGoodForGroups',
 'RestaurantsPriceRange2',
 'RestaurantsReservations',
 'RestaurantsTableService',
 'RestaurantsTakeOut',
 'Saturday',
 'Smoking',
 'Sunday',
 'Thursday',
 'Tuesday',
 'Wednesday',
 'WheelchairAccessible',
 'WiFi',
 'breakfast',
 'brunch',
 'business_id',
 'casual',
 'classy',
 'dessert',
 'dinner',
 'divey',
 'garage',
 'hipster',
 'intimate',
 'is_open',
 'latenight',
 'lot',
 'lunch',
 'review_count',
 'romantic',
 'stars',
 'street',
 'touristy',
 

In [27]:
features_to_keep = ['Alcohol','BikeParking','BusinessAcceptsCreditCards','Caters',
                   'GoodForKids','HappyHour','HasTV','NoiseLevel',
                   'OutdoorSeating','RestaurantsAttire','RestaurantsDelivery',
                   'RestaurantsGoodForGroups','RestaurantsPriceRange2',
                    'RestaurantsReservations','RestaurantsTableService',
                    'RestaurantsTakeOut','Smoking','WheelchairAccessible','WiFi','breakfast',
                    'brunch','business_id','casual','classy','dessert','dinner','divey',
                    'garage','hipster','intimate','is_open','latenight','lot','lunch',
                    'review_count','romantic','stars','street','touristy','trendy',
                    'upscale','valet','validated']

hours_fields = ['Saturday','Tuesday','Wednesday','Sunday','Thursday','Monday','Friday']

In [28]:
business_final = pd.get_dummies(business_df[features_to_keep],
                                columns=[col for col in features_to_keep if col not in ['business_id',
                                                                                   'stars','review_count','is_open']])

In [29]:
business_final.shape

(6153, 90)

In [30]:
business_final.columns

Index(['business_id', 'is_open', 'review_count', 'stars',
       'Alcohol_beer_and_wine', 'Alcohol_full_bar', 'Alcohol_none',
       'BikeParking_False', 'BikeParking_True',
       'BusinessAcceptsCreditCards_False', 'BusinessAcceptsCreditCards_True',
       'Caters_False', 'Caters_True', 'GoodForKids_False', 'GoodForKids_True',
       'HappyHour_False', 'HappyHour_True', 'HasTV_False', 'HasTV_True',
       'NoiseLevel_average', 'NoiseLevel_loud', 'NoiseLevel_quiet',
       'NoiseLevel_very_loud', 'OutdoorSeating_False', 'OutdoorSeating_True',
       'RestaurantsAttire_casual', 'RestaurantsAttire_dressy',
       'RestaurantsAttire_formal', 'RestaurantsDelivery_False',
       'RestaurantsDelivery_True', 'RestaurantsGoodForGroups_False',
       'RestaurantsGoodForGroups_True', 'RestaurantsPriceRange2_1',
       'RestaurantsPriceRange2_2', 'RestaurantsPriceRange2_3',
       'RestaurantsPriceRange2_4', 'RestaurantsReservations_False',
       'RestaurantsReservations_True', 'RestaurantsTabl

In [76]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# scale_cols = [col for col in list(business_final) if col != 'business_id']
# business_final[scale_cols] = scaler.fit_transform(business_final[scale_cols])
# item_data = tc.SFrame(business_final)
factorization_model = tc.factorization_recommender.create(item_data=item_data,item_id='business_id',\
                                                    observation_data=train_data,user_id='user_id', \
                                                       target='review_stars',user_data=user_data)


In [79]:
factorization_model.evaluate_rmse(test_data, target='review_stars')

{'rmse_by_user': Columns:
 	user_id	str
 	rmse	float
 	count	int
 
 Rows: 134880
 
 Data:
 +------------------------+---------------------+-------+
 |        user_id         |         rmse        | count |
 +------------------------+---------------------+-------+
 | t-jIQFGZAduBAIDuSfRDew | 0.34496518069709836 |   1   |
 | huVZckXjxfXC8eZbW8OvuA | 0.20152036326944334 |   1   |
 | jM-kmx4D1JC4Bcav7xOgqQ |  0.9589960991961717 |   1   |
 | RaP9rDxFeHn7bLUg47uuIA |  0.1498231174731428 |   1   |
 | Pjk0cNsCF8i7ssrzuc0G5g |  0.6351878878796087 |   2   |
 | hfF66BroJAhPHuLwtH9Etg |  1.2087763172947354 |   2   |
 | O7lpi9OXaH5LVsh4Qa1hag |  0.7041346859121429 |   1   |
 | 2Qg2KPlazMjTmK6mHlC-PQ |  0.613002174150743  |   1   |
 | Z65xcj7xGe_lmH0-8OZ1oA |  0.7815900234104918 |   1   |
 | BYotwilY7fec246G5Lxsmw |  0.4106804814528795 |   1   |
 +------------------------+---------------------+-------+
 [134880 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(

In [41]:
business_final[[col for col in list(business_final) if col != 'business_id']].head()

Unnamed: 0,is_open,review_count,stars,Alcohol_beer_and_wine,Alcohol_full_bar,Alcohol_none,BikeParking_False,BikeParking_True,BusinessAcceptsCreditCards_False,BusinessAcceptsCreditCards_True,...,touristy_False,touristy_True,trendy_False,trendy_True,upscale_False,upscale_True,valet_False,valet_True,validated_False,validated_True
19,0.0,0.0,0.125,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32,1.0,0.001255,0.375,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
33,1.0,0.030006,0.75,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
61,0.0,0.000126,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
141,1.0,0.001632,0.125,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [46]:
business_final.head()

Unnamed: 0,business_id,is_open,review_count,stars,Alcohol_beer_and_wine,Alcohol_full_bar,Alcohol_none,BikeParking_False,BikeParking_True,BusinessAcceptsCreditCards_False,...,touristy_False,touristy_True,trendy_False,trendy_True,upscale_False,upscale_True,valet_False,valet_True,validated_False,validated_True
0,vJIuDBdu01vCA8y1fwR1OQ,0.0,0.0,0.125,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,kgffcoxT6BQp-gJ-UQ7Czw,1.0,0.001255,0.375,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0jtRI7hVMpQHpUVtUy4ITw,1.0,0.030006,0.75,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,JJEx5wIqs9iGGATOagE8Sg,0.0,0.000126,0.25,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,zhxnD7J5_sCrKSw5cwI9dQ,1.0,0.001632,0.125,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [43]:
set(business_final.dtypes)

{dtype('float64'), dtype('O')}

In [44]:
business_final.select_dtypes('O')

Unnamed: 0,business_id
19,vJIuDBdu01vCA8y1fwR1OQ
32,kgffcoxT6BQp-gJ-UQ7Czw
33,0jtRI7hVMpQHpUVtUy4ITw
61,JJEx5wIqs9iGGATOagE8Sg
141,zhxnD7J5_sCrKSw5cwI9dQ
143,2kWrSFkIes_d2BMg4YrRtA
145,6llKs7K_tn8ChXcIM-oTvg
197,YV9GVfmDSDM7HSV0jVdTOA
203,F7OsiFk9aLZtqZczA84xpw
249,XeDLyY2a7nZ3IEY4RYslXA


In [45]:
business_final.reset_index(drop=True,inplace=True)

In [49]:
set(list(business_final.isnull().sum()))

{0}

In [54]:
test_data['user_id'].head()

dtype: str
Rows: 10
['MtE3xl8AUYPbGWQQhY5IVQ', 'eyj4r8be__c7fVtfxeHr8Q', 'hsxx48aGP1UEusdIju55MQ', 'Jmlqb5UYfygtUXugrpywjg', 'NOMT-fdqT31oUGZUGf6_sQ', 'Iw_k5xk0_jCQPOetf60xgA', 'WKnDQ3K1BpBuP6sR2QpLdQ', 'sYVc7LtZjJ5RUSATwMpBzA', '3____Pc_RF_7TnGuQdGJpw', 'l35uoedk9e740vl9fwYRqg']

In [55]:
recs = item_content_model.recommend(test_data['user_id'])

In [58]:
tc.toolkits.recommender.util.precision_recall_by_user(test_data, recs, cutoffs=[5, 10])


user_id,cutoff,precision,recall,count
---1lKK3aKOuomHnwAkAow,5,0.0,0.0,4
---1lKK3aKOuomHnwAkAow,10,0.0,0.0,4
--3WaS23LcIXtxyFULJHTA,5,0.0,0.0,1
--3WaS23LcIXtxyFULJHTA,10,0.0,0.0,1
--41c9Tl0C9OGewIR7Qyzg,5,0.0,0.0,1
--41c9Tl0C9OGewIR7Qyzg,10,0.0,0.0,1
--4q8EyqThydQm-eKZpS-A,5,0.0,0.0,1
--4q8EyqThydQm-eKZpS-A,10,0.0,0.0,1
--4uW4yJiRT2oXMYkCPq1Q,5,0.0,0.0,1
--4uW4yJiRT2oXMYkCPq1Q,10,0.0,0.0,1


In [59]:
arr = []
with open('../input/yelp-dataset/yelp_academic_dataset_user.json', 'r') as input_file: 
    for line in input_file:
        arr.append(json.loads(line))

In [61]:
user_df = pd.DataFrame(arr)

In [63]:
user_df = user_df[user_df.user_id.isin(df.user_id.unique())]

In [64]:
user_df.shape

(384829, 22)

In [73]:
user_df.head()

Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,...,cool,elite,fans,friends,funny,review_count,useful,user_id,yelping_since,yelping_years
4,3.0,0,0,0,0,0,0,0,0,0,...,0,,0,,0,3,0,s4FoIXE_LSGviTHBe8dmcg,2017-06-18,1.227397
12,3.86,0,0,0,0,0,0,0,0,0,...,0,,0,,0,6,0,SgYDjNCecPidsRB_su5-tw,2011-01-25,7.627397
16,1.0,0,0,0,0,0,0,0,0,0,...,0,,0,,0,2,2,NfE1uHFWzzMyXkgBeEuR1A,2015-07-19,3.145205
17,4.0,0,0,0,0,0,0,0,0,0,...,0,,0,,0,4,0,RiBVI6UgLjfpA4EQ1SWDzA,2014-06-15,4.238356
22,3.0,0,0,0,0,0,0,0,0,0,...,0,,0,,0,3,0,vLekYe1hK12NJc8PyxRjIg,2012-10-09,5.920548


In [None]:
user_df['yelping_years'] = (pd.to_datetime(datetime.now()) - pd.to_datetime(user_df['yelping_since'])).dt.days/365

In [74]:
user_df.drop(['yelping_since','name'], axis=1, inplace=True)

In [75]:
user_data = tc.SFrame(user_df)

In [80]:
test.head()

Unnamed: 0,user_id,business_id,review_stars
1494629,MtE3xl8AUYPbGWQQhY5IVQ,KkmPDIWzvwbBpyqOHT6pcQ,4.0
1494630,eyj4r8be__c7fVtfxeHr8Q,KkmPDIWzvwbBpyqOHT6pcQ,2.0
1494631,hsxx48aGP1UEusdIju55MQ,KkmPDIWzvwbBpyqOHT6pcQ,5.0
1494632,Jmlqb5UYfygtUXugrpywjg,KkmPDIWzvwbBpyqOHT6pcQ,4.0
1494633,NOMT-fdqT31oUGZUGf6_sQ,KkmPDIWzvwbBpyqOHT6pcQ,5.0


In [88]:
user_pair = ['MtE3xl8AUYPbGWQQhY5IVQ','eyj4r8be__c7fVtfxeHr8Q']
rec1 = factorization_model.recommend([user_pair[0]],100).to_dataframe()
rec2 = factorization_model.recommend([user_pair[1]],100).to_dataframe()

In [90]:
possible_recs = rec1.merge(rec2,on='business_id')
possible_recs['total_score'] = possible_recs['rank_x'] + possible_recs['rank_y']


In [91]:
possible_recs[possible_recs.total_score == possible_recs.total_score.min()]['business_id'].iloc[0]

'IhNASEZ3XnBHmuuVnWdIwA'

In [92]:
if factorization_model:
    print('yes')

yes
