In [None]:
pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162724 sha256=7a8dd410be330971b11afb85d35861dfb91346af86b5e58986c29c9f14971dff
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [None]:
import os
from google.colab import drive
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
from datetime import datetime
from dateutil import relativedelta
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from surprise import NormalPredictor
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split, KFold, GridSearchCV
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
import json

warnings.filterwarnings( "ignore")
warnings.filterwarnings( "ignore", module = '.*matplot.*' )

drive.mount('/content/drive')

shared_folder_path = '/content/drive/My Drive/DSCI 591'
os.chdir(shared_folder_path)

Mounted at /content/drive


### Read in Data
Here the original Yelp Philadelphia data is read in, to be used later in the subsequent models.

In [None]:
restaurants = pd.read_csv('yelp_Philly_restaurants_nonull.csv',index_col = 0)
restaurants.head(2)

Unnamed: 0,address,business_id,categories,latitude,longitude,name,postal_code,review_count,stars
0,600 Washington Ave,2fJ-WxJlUN6azp3bzrJ0zA,"Chinese, Vietnamese",39.934452,-75.154402,Pho Ha,19147,485,4.0
1,216 S 11th St,3FKIev7ZB_KE6XHL9sUJCg,"Nightlife, Event Planning & Services, Local Fl...",39.948089,-75.159211,Strangelove's,19107,443,4.0


In [None]:
reviews = pd.read_csv('yelp_Philly_reviews_nonull.csv',index_col = 0)
reviews.head(2)

Unnamed: 0,business_id,date,review_id,stars,text,user_id
0,aa0xi7fgFJyA4qWux6vz0A,2012-08-19 11:05:34,RSq3cm26c2BAcBJLJuEs5A,4.0,What can I say..........since bruddah Obama wa...,-9da1xk7zgnnfO1uTVYGkA
1,IkY2ticzHEn4QFn8hQLSWg,2016-07-12 11:49:59,hy27H2LOaiBBqqIgfSPiWA,3.0,"When I was working in Philly a few years ago, ...",-9da1xk7zgnnfO1uTVYGkA


In [None]:
users = pd.read_csv('yelp_Philly_users_nonull.csv',index_col = 0)
users.head(2)

Unnamed: 0,average_stars,friends,review_count,user_id,elite_count,yelping_count,friend_count
0,4.14,"-gSOcP0jp_3qLN_uQqvJpQ, -ekI2PAaTMlb_Q8qqjrz9w...",1438,-9da1xk7zgnnfO1uTVYGkA,10,11,579
1,3.71,"QyeNqz1bJMbsWianX3D09Q, b_btENKt_8bk7dz15xqAHA...",100,-hnBzgVoRoqLrGVSxO0ilg,0,8,12


# ***Content-based Recommendor***
This reccomender model utilizes restaurant categories and reviews, as well as the stars and review count per restaurantas the *content* for the model.

In [None]:
restaur = restaurants[['business_id', 'name', 'address']]
review = reviews[['user_id','business_id','stars', 'text', 'date']]

In [None]:
# Merge the Review and Restaurant dataset on business_id
content_data = pd.merge(review, restaur, on='business_id')
content_data.head(2)

Unnamed: 0,user_id,business_id,stars,text,date,name,address
0,-9da1xk7zgnnfO1uTVYGkA,aa0xi7fgFJyA4qWux6vz0A,4.0,What can I say..........since bruddah Obama wa...,2012-08-19 11:05:34,Carmen's Famous Italian Hoagies,12th St & Arch St
1,kMQpdrn7N5GEAPtmIxL1yw,aa0xi7fgFJyA4qWux6vz0A,5.0,Been coming here for 11 years and have the bes...,2014-01-15 16:07:00,Carmen's Famous Italian Hoagies,12th St & Arch St


### NLTK Process - TFIDF with Stemming
This process takes every review that exists for a restaurant and combines it into a single string. Following this the text is cleaned and TFIDFVectorizor is applied.

In [None]:
# new df for review text
text = content_data[['business_id','text']]
text.head(3)

Unnamed: 0,business_id,text
0,aa0xi7fgFJyA4qWux6vz0A,What can I say..........since bruddah Obama wa...
1,aa0xi7fgFJyA4qWux6vz0A,Been coming here for 11 years and have the bes...
2,aa0xi7fgFJyA4qWux6vz0A,"First time in Phily, first time trying a chees..."


In [None]:
# number of reviews for each restaurant
text['business_id'].value_counts(5)

ytynqOUb3hjKeJfRj5Tshw    0.012604
PP3BBaVxZLcJU54uP_wL6Q    0.011057
IkY2ticzHEn4QFn8hQLSWg    0.008384
sTPueJEwcRDj7ZJmG7okYA    0.007065
9PZxjhTIU7OgPIzuGi89Ew    0.006783
                            ...   
QnFsgq_Ez9D9p_4kDsKiuw    0.000004
YnKBXCCp2bORMz3pKqqMkQ    0.000004
tuA0E2YPAaKBRZL7rhsU-g    0.000004
JnPIjvC0cmooNDfsa9BmXg    0.000004
DIz7ET7IDNsfvYp7p5RZyA    0.000004
Name: business_id, Length: 3410, dtype: float64

In [None]:
# adding all the reviews of each restaurant and grouping in one string
final_text = text.groupby('business_id').agg(lambda x: ' '.join(x)).reset_index()

In [None]:
# extract features from reviews text
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
ENGLISH_STOP_WORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
no_use_words = ['restaurant', 'food', 'table', 'place', 'dish', 'menu', 'order', 'service', 'time', 'experience']

In [None]:
import string
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()

def word_tokenizer(sentence):
    sentence = sentence.replace('\n','')

    # Removing numbers
    digit_list = ['0','1', '2', '3', '4', '5', '6', '7', '8', '9']
    for digit in digit_list:
        sentence = sentence.replace(digit, '')

    # Removing punctuation and setting to lower case
    for punctuations in string.punctuation:
        sentence = sentence.replace(punctuations, '').lower()

    # Split sentence into words
    words_list = sentence.split(' ')
    stemmed_words_list = []

    # Remove stopwords/tokens that are just empty strings
    for word in words_list:
        if (word not in ENGLISH_STOP_WORDS) and (word != '') and (word not in no_use_words):
            stemmed_word = stemmer.stem(word)
            stemmed_words_list.append(stemmed_word)

    return stemmed_words_list

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer with word_tokenizer
tfidf = TfidfVectorizer(min_df=100, max_df=0.7, tokenizer = word_tokenizer)

# Fit and transform the data
text_tfidf = tfidf.fit_transform(final_text['text'])
text_tfidf.shape

(3410, 4006)

Below are the top 10 words found in the reviews

In [None]:
# top 10 words

weights_of_word = np.array(np.sum(text_tfidf, axis=0)).reshape((-1,))

words = np.array(tfidf.get_feature_names_out())
words_df = pd.DataFrame({"word": words,
                         "weight": weights_of_word})

words_df.sort_values(by="weight", ascending=False).head(10)

Unnamed: 0,word,weight
2600,pizz,250.839571
1416,fri,168.827617
2969,sandwich,157.189446
589,chees,151.657588
2980,sauc,135.536872
254,bar,131.500222
1069,drink,130.687167
979,din,123.234051
1348,flav,115.540558
3050,serv,114.695027


The below dataframe are the results of the TFIDFVectorizor

In [None]:
# Changing our original df using the tfidf vectorizer

vector_result = (text_tfidf).toarray()
vector_df = pd.DataFrame(vector_result, columns = tfidf.get_feature_names_out())
text_features = pd.concat([final_text, vector_df], axis=1)

In [None]:
text_features.head(3)

Unnamed: 0,business_id,text,abandon,abl,abrupt,abs,absolv,absorb,absurd,abund,...,yum,yummy,yup,yuppy,zero,zest,zesty,zing,zon,zucchin
0,-0TffRSXXIlBYVbb5AwfTg,This is probably one of my favorite Indian res...,0.0,0.020661,0.0,0.002944,0.023874,0.001445,0.0,0.001169,...,0.004135,0.015868,0.0,0.0,0.00218,0.0,0.003092,0.001573,0.001339,0.001349
1,-1B9pP_CrRBJYPICE5WbRA,I got take out here and it was okay. Dan Dan n...,0.002527,0.016098,0.002591,0.002294,0.020033,0.002252,0.0,0.001822,...,0.064432,0.011826,0.0,0.0,0.005096,0.0,0.0,0.0,0.0,0.008406
2,-3ArWZfDjfab8qVHf3WVtg,We tried Taste Africa for dinner. The restaura...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The below code then takes the categories within the dataset and performs one hot encoding on the top 150 categories

In [None]:
# Now transforming restaurant categories whill will be used with reviews to calculate cosine similarity between restaurants
restaur_categories = restaurants[['business_id','categories']]
restaur_categories = restaur_categories.groupby('business_id').nth([0]).reset_index()

In [None]:
restaur_categories.head()

Unnamed: 0,business_id,categories
0,-0TffRSXXIlBYVbb5AwfTg,"Cocktail Bars, Food Delivery Services, Nightli..."
1,-1B9pP_CrRBJYPICE5WbRA,"Asian Fusion, American (New), Thai, Szechuan, ..."
2,-3ArWZfDjfab8qVHf3WVtg,African
3,-3m_nXlyvdKAVNNmVirpGQ,"Halal, Middle Eastern"
4,-5Rah4ZvWsDu4oilUZxhtw,"Nightlife, Arts & Entertainment, Music Venues,..."


In [None]:
# Total distinct no. of categories
list_categories = ', '.join(list(restaur_categories['categories'].unique()))
list_categories = list_categories.split(', ')
from collections import Counter, defaultdict
c = Counter(list_categories)
print(len(c))

251


In [None]:
#top 150 restaurant categories
cat_150 = c.most_common(150)
cat_150[:10]

[('Nightlife', 596),
 ('Bars', 567),
 ('Sandwiches', 497),
 ('American (Traditional)', 381),
 ('Breakfast & Brunch', 355),
 ('American (New)', 334),
 ('Coffee & Tea', 287),
 ('Pizza', 255),
 ('Seafood', 202),
 ('Italian', 199)]

In [None]:
top_cat = [cat[0] for cat in cat_150]

In [None]:
# Creating and filling top category columns with null values
top_cat = [cat[0] for cat in cat_150]
restaur_categories['list_categ'] = restaur_categories['categories'].apply(lambda x: x.split(', '))
for cat in top_cat:
    restaur_categories[cat] = 0

# Looping over the new column to get the one-hot encoded category values
for idx, row in restaur_categories.iterrows():
    for sub_cat in row['list_categ']:
        if sub_cat in top_cat:
            restaur_categories.at[idx, sub_cat] = 1

The below dataframe are all the one-hot encoded categories

In [None]:
restaur_categories.head(3)

Unnamed: 0,business_id,categories,list_categ,Nightlife,Bars,Sandwiches,American (Traditional),Breakfast & Brunch,American (New),Coffee & Tea,...,Live/Raw Food,Hawaiian,Art Galleries,Whiskey Bars,Pop-Up Restaurants,Eatertainment,Portuguese,Iberian,Party & Event Planning,Chocolatiers & Shops
0,-0TffRSXXIlBYVbb5AwfTg,"Cocktail Bars, Food Delivery Services, Nightli...","[Cocktail Bars, Food Delivery Services, Nightl...",1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-1B9pP_CrRBJYPICE5WbRA,"Asian Fusion, American (New), Thai, Szechuan, ...","[Asian Fusion, American (New), Thai, Szechuan,...",0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,-3ArWZfDjfab8qVHf3WVtg,African,[African],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
restaur_categories.shape

(3410, 153)

Below is the process of merging the transformed reviews, the one-hot encoded categories, stars and review counts into a single final dataframe. This final dataframe will then be used with cosine similarity for the content-based model.

In [None]:
# Merging the transformed reviews and categories df
df_0 = pd.merge(text_features, restaur_categories, on ='business_id')
df_0.shape

(3410, 4160)

In [None]:
df_0.head(2)

Unnamed: 0,business_id,text,abandon,abl,abrupt,abs,absolv,absorb,absurd,abund,...,Live/Raw Food,Hawaiian,Art Galleries,Whiskey Bars,Pop-Up Restaurants,Eatertainment,Portuguese,Iberian,Party & Event Planning,Chocolatiers & Shops
0,-0TffRSXXIlBYVbb5AwfTg,This is probably one of my favorite Indian res...,0.0,0.020661,0.0,0.002944,0.023874,0.001445,0.0,0.001169,...,0,0,0,0,0,0,0,0,0,0
1,-1B9pP_CrRBJYPICE5WbRA,I got take out here and it was okay. Dan Dan n...,0.002527,0.016098,0.002591,0.002294,0.020033,0.002252,0.0,0.001822,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# With categories getting the review_count, stars from the business dataset
df_1 = restaurants[['business_id','review_count', 'stars']]

# Grouping by the name of the restaurant
df_2 = df_1.groupby('business_id').nth([0]).reset_index()

In [None]:
# Merging df_2 with categories pivot table
df_final = pd.merge(df_0, df_2, on = "business_id", how='left')
df_final.head(3)

Unnamed: 0,business_id,text,abandon,abl,abrupt,abs,absolv,absorb,absurd,abund,...,Art Galleries,Whiskey Bars,Pop-Up Restaurants,Eatertainment,Portuguese,Iberian,Party & Event Planning,Chocolatiers & Shops,review_count,stars_y
0,-0TffRSXXIlBYVbb5AwfTg,This is probably one of my favorite Indian res...,0.0,0.020661,0.0,0.002944,0.023874,0.001445,0.0,0.001169,...,0,0,0,0,0,0,0,0,1097,4.5
1,-1B9pP_CrRBJYPICE5WbRA,I got take out here and it was okay. Dan Dan n...,0.002527,0.016098,0.002591,0.002294,0.020033,0.002252,0.0,0.001822,...,0,0,0,0,0,0,0,0,822,4.0
2,-3ArWZfDjfab8qVHf3WVtg,We tried Taste Africa for dinner. The restaura...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,5,4.0


In [None]:
df_final.shape

(3410, 4162)

### Calculating cosine similarity
The below code applies cosine similarity onto the previous dataframe to compute the similarity between every restaurant in the dataset

In [None]:
df_cos = df_final.drop(['business_id', 'categories', 'text', 'list_categ'], axis=1)

In [None]:
content_cosine_score = cosine_similarity(df_cos, df_cos)
content_cosine_score

array([[1.        , 0.99999205, 0.76500298, ..., 0.99988234, 0.99108856,
        0.78856477],
       [0.99999205, 1.        , 0.76546703, ..., 0.99989144, 0.99118058,
        0.78901413],
       [0.76500298, 0.76546703, 1.        , ..., 0.77260492, 0.83103499,
        0.96099601],
       ...,
       [0.99988234, 0.99989144, 0.77260492, ..., 1.        , 0.99251256,
        0.795916  ],
       [0.99108856, 0.99118058, 0.83103499, ..., 0.99251256, 1.        ,
        0.85204334],
       [0.78856477, 0.78901413, 0.96099601, ..., 0.795916  , 0.85204334,
        1.        ]])

In [None]:
#content_cosine_score.shape

Next a dataframe is created where each row and column corresponds to a restaurant name. The values within the dataFrame denote the cosine similarity between the respective restaurants. The diagonal entries, representing the similarity of each restaurant to itself, will be 1. Values will range from 0 (indicating no similarity) to 1 (very similar)

In [None]:
df_restaur_simil = pd.DataFrame(content_cosine_score, columns = df_final['business_id'], index = df_final['business_id'])

Below is the function to generate the Top 10 reccomendations based on the supplied restaurant

In [None]:
def content_model(restaurant):
    '''
    Generates top 10 restaurant recommendations, ordered by cosine similarity

    input: restaurant name

    output: dataframe with the names and similartiy score of the top 10
            restaurants.
    '''

    # Translates the restaurnat name to business_id
    rated_id = restaurants.loc[restaurants['name'] == restaurant, 'business_id'].iloc[0]

    # finds similarities from pivot based on business_id
    sims = list(df_restaur_simil[rated_id].sort_values(ascending=False)[1:11].index)
    # finds the names of the similar restaurants
    sim_names = [restaurants.loc[restaurants['business_id'] == id, 'name'].iloc[0] for id in sims]

    # creates the dataframe
    recommendations_df = pd.DataFrame()
    recommendations_df['Restaurant Id'] = sims
    recommendations_df['Restaurant Name'] = sim_names
    recommendations_df['Similarity Score'] = list(df_restaur_simil[rated_id].sort_values(ascending=False)[1:11].values)


    return recommendations_df

In [None]:
test1 = content_model('Spice 28')
test1

Unnamed: 0,Restaurant Id,Restaurant Name,Similarity Score
0,MMRRS6YhVRx_iN5-JhMRYg,Han Dynasty,0.999997
1,vUkiYPpbkMXA99WneLSGkQ,Banana Leaf,0.999996
2,eaDZlSuVS0EY67Ke6pRP6Q,Penang,0.999996
3,kbXpjBSweedxQ1WrlJ5i7Q,Dan Dan,0.999996
4,mtvT7uRey3F395STFRM1Tg,Vernick Food & Drink,0.999995
5,iksVwRfpWymIUUFqw0tXpw,Chubby Cattle,0.999995
6,rYqmaOIULRouz_1db07OdQ,Green Eggs Cafe,0.999994
7,ueAkLzWFFTzQkq3jzyBlnA,Pho Xe Lua Viet Thai Restaurant,0.999994
8,TunmRrfZb7bt53T6HJi4UQ,Bleu Sushi,0.999994
9,nIAbuktMEzVjT4P9pG89rQ,Buddakan,0.999994


In [None]:
test2 = content_model('1 Stop Pizza')
test2

Unnamed: 0,Restaurant Id,Restaurant Name,Similarity Score
0,Gj5G0asdNZILROHEUEF-iQ,DaFranco Pizzeria,0.969892
1,xtEDyU98DnjqsCuV8apV6A,Lamberti Pizza and Market,0.968599
2,TVe1FyolMt9xjBtY7oMTOQ,Pitruco Pizza Franklin's Table,0.967794
3,vJKxxr2auAsbmLfis-PJnw,Old Nelson Food Company,0.967256
4,tt_n0ZJ-rd-yftr_5RQLhg,Best Deli II,0.966756
5,R7U3lCaFWGA1p3sRkFcYAA,Balducci's,0.96675
6,i021iBZ1SatdR-NzBuR7Cw,Tano's Deli,0.966165
7,kPXsRJKRRE5wmMBMBqkFbQ,Evergreen Deli,0.966057
8,_FUE9ZOqNTiDmgP2LGPGdw,Dolores,0.965922
9,oCWx92FxeWJc5_tkQYGJTw,Pauline's Deli,0.965454


# ***Collaborative filtering recommendor***
Below is the process to develop, train and test the collabrative-filtering model

### Developing train test split

In [None]:
#CF pivot table with names
restaur = restaurants[['business_id', 'name', 'address']]
review = reviews[['user_id','business_id','stars', 'date']]

In [None]:
#Merge the Review and Restaurant dataset on business_id
cf_data = pd.merge(review, restaur, on='business_id')
cf_data.head()

Unnamed: 0,user_id,business_id,stars,date,name,address
0,-9da1xk7zgnnfO1uTVYGkA,aa0xi7fgFJyA4qWux6vz0A,4.0,2012-08-19 11:05:34,Carmen's Famous Italian Hoagies,12th St & Arch St
1,kMQpdrn7N5GEAPtmIxL1yw,aa0xi7fgFJyA4qWux6vz0A,5.0,2014-01-15 16:07:00,Carmen's Famous Italian Hoagies,12th St & Arch St
2,MPhFP4qkuTn_CJpEYR_BCQ,aa0xi7fgFJyA4qWux6vz0A,4.0,2016-02-15 19:29:18,Carmen's Famous Italian Hoagies,12th St & Arch St
3,8TQ4QWrbS6nKvQAjvQMbug,aa0xi7fgFJyA4qWux6vz0A,2.0,2012-12-27 03:35:08,Carmen's Famous Italian Hoagies,12th St & Arch St
4,O2ebBq0SoZuwh7JLUd5LFQ,aa0xi7fgFJyA4qWux6vz0A,5.0,2018-08-06 21:28:22,Carmen's Famous Italian Hoagies,12th St & Arch St


In [None]:
cf_data['date'] = pd.to_datetime(cf_data.date) # so we have data that spans 2005 to 2022
cf_data['date'].dt.year.unique()

array([2012, 2014, 2016, 2018, 2019, 2015, 2013, 2017, 2011, 2010, 2021,
       2009, 2020, 2008, 2022, 2007, 2006, 2005])

This code was adapted from the Drexel Reccomender Systems Course MLDemoSplit notebook to split the data based on the time. The Training data encompasses all data BEFORE Q4 2021. The Test data encompassed everything AFTER the start of Q4 2021 (October to December 2021). The year 2022 was not used due to the size of the current data and because 2022 was an incomplete year in the data.

In [None]:
ev_test = cf_data.query("date >= '2021-10-01' and date < '2022-01-01'") #Q4 2021 oct - dec
ev_train = cf_data.query("date < '2021-10-01'") #everything before the start of Q4 2021
ev_test['date'].describe()

count                    3657
unique                   3657
top       2021-10-05 00:10:53
freq                        1
first     2021-10-01 00:21:56
last      2021-12-31 23:06:13
Name: date, dtype: object

In [None]:
ev_test_users = ev_test.groupby('user_id')['stars'].agg(['count', 'mean'])
ev_train_users = ev_train.groupby('user_id')['stars'].agg(['count', 'mean'])
ev_test_users = ev_test_users.join(ev_train_users, how='left', rsuffix='_train')
ev_test_users['count_train'].fillna(0, inplace=True)
ev_test_users.head()

Unnamed: 0_level_0,count,mean,count_train,mean_train
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1mlvIhZA3l1Gta2zxG3Tw,2,3.0,0.0,
-6GY04bTPM2Zo4z0GN4a1A,19,4.315789,10.0,4.2
-78RRWJBrFlldvllagVVEw,2,5.0,0.0,
-Av5_Ee5eXO1kz9etK_DIA,1,4.0,8.0,4.875
-AwZjQNFpCqc0bVNNyBqwg,4,3.25,32.0,3.71875


In [None]:
ev_test_users['count_train'].describe()

count    1507.000000
mean       22.941606
std        39.772384
min         0.000000
25%         0.000000
50%         6.000000
75%        28.000000
max       437.000000
Name: count_train, dtype: float64

In [None]:
np.sum(ev_test_users['count_train'] == 0) # 27% of the test users are not in the train

421

Filtering out users in the test set that DO NOT exist in the train set

In [None]:
ev_test_filtered = ev_test[ev_test['user_id'].isin(ev_train_users.index)]

In [None]:
ev_train.info() #223,105 rows

<class 'pandas.core.frame.DataFrame'>
Int64Index: 223105 entries, 0 to 227462
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   user_id      223105 non-null  object        
 1   business_id  223105 non-null  object        
 2   stars        223105 non-null  float64       
 3   date         223105 non-null  datetime64[ns]
 4   name         223105 non-null  object        
 5   address      223105 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 11.9+ MB


In [None]:
ev_test_filtered.info() #2,837 rows

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2837 entries, 382 to 227463
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      2837 non-null   object        
 1   business_id  2837 non-null   object        
 2   stars        2837 non-null   float64       
 3   date         2837 non-null   datetime64[ns]
 4   name         2837 non-null   object        
 5   address      2837 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 155.1+ KB


In [None]:
ev_test_filtered.user_id.nunique() # there are 1086 final unique users in the test set

1086

In [None]:
# ev_test_filtered.to_csv('yelp_test_data.csv') save this off to use in GNN

### Grid Search

Now to run Grid Search for SVD on the training data and evaluate on the test data. The below code takes the training and test dataset, loads the dataset into a suprise dataet, then takes the raw ratings in order to load it into a custom training and testing set.

In [None]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(cf_data[['user_id', 'business_id', 'stars']], reader)
train = Dataset.load_from_df(ev_train[['user_id', 'business_id', 'stars']], reader)
test = Dataset.load_from_df(ev_test_filtered[['user_id', 'business_id', 'stars']], reader)

raw_train = train.raw_ratings
raw_test = test.raw_ratings

trainset = data.construct_trainset(raw_train)
testset = data.construct_testset(raw_test)

Below is the Grid Search to find the optimal hyperparameters per metric

In [None]:
print('RUNNING GRID SEARCH...')
param_grid = {'n_epochs': [40, 50, 80, 100], 'lr_all': [0.001, 0.002, 0.005], 'reg_all': [0.02, 0.04, 0.06, 0.08]}
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], refit=True, cv=5, joblib_verbose=1)
grid_search.fit(train)

RUNNING GRID SEARCH...


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  5.6min
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed: 32.1min


In [None]:
# best RMSE score
print('Best RMSE: ', grid_search.best_score["rmse"])
print('Best Parameters: ', grid_search.best_params["rmse"])

Best RMSE:  0.9602837986444002
Best Parameters:  {'n_epochs': 80, 'lr_all': 0.002, 'reg_all': 0.08}


In [None]:
print('Best MAE: ', grid_search.best_score["mae"])
print('Best Parameters: ', grid_search.best_params["mae"])

Best MAE:  0.7446810215599222
Best Parameters:  {'n_epochs': 80, 'lr_all': 0.002, 'reg_all': 0.08}


In [None]:
gpreds = grid_search.test(testset)

In [None]:
gridpreds = pd.DataFrame(gpreds)
gridpreds.drop("details", inplace=True, axis=1)
gridpreds.columns = ['user_id', 'business_id', 'actual', 'cf_predictions']
gridpreds.head(2)

Unnamed: 0,user_id,business_id,actual,cf_predictions
0,IWusDJZl16-pO2LXSWX8aw,IkY2ticzHEn4QFn8hQLSWg,1.0,2.195147
1,u1Bd6O9duBOp3SUHN2vjPA,IkY2ticzHEn4QFn8hQLSWg,2.0,2.731745


In [None]:
print('RSME: ', accuracy.rmse(gpreds, verbose=False))
print('MAE: ', accuracy.mae(gpreds, verbose = False))

RSME:  0.9575060999551038
MAE:  0.7342946237163045


In [None]:
#grid_search.cv_results

In [None]:
#grid_search.predict('0sxiTd-rg8IICNUuDEvptQ','bgxDswHIdFP0Go0pNfyAAw')

Below is a simple function to get the Top 10 recomendations independent of model. This means that this function can take in any surpise SVD model and generate recomendations.

In [None]:
def collab_model(user_id, restaurant, model):
    '''
    Generates top 10 restaurant recommendations, ordered by predicted rating

    input: user_id, restaurant name

    output: dataframe with the names and predicted score of the top 10
            restaurants from the content model
    '''
    # Gets the top n restaurants from content model

    reccomended = content_model(restaurant)
    to_pred = reccomended['Restaurant Id'].tolist()

    preds = [model.predict(user_id, rid).est for rid in to_pred]

    reccomended['Predicted Scores'] = preds

    return reccomended

This is an example of the collab_model function with the grid_search model

In [None]:
collab_model('0sxiTd-rg8IICNUuDEvptQ','Spice 28', grid_search)

Unnamed: 0,Restaurant Id,Restaurant Name,Similarity Score,Predicted Scores
0,MMRRS6YhVRx_iN5-JhMRYg,Han Dynasty,0.999997,3.815995
1,vUkiYPpbkMXA99WneLSGkQ,Banana Leaf,0.999996,3.584649
2,eaDZlSuVS0EY67Ke6pRP6Q,Penang,0.999996,3.732655
3,kbXpjBSweedxQ1WrlJ5i7Q,Dan Dan,0.999996,3.617378
4,mtvT7uRey3F395STFRM1Tg,Vernick Food & Drink,0.999995,4.163467
5,iksVwRfpWymIUUFqw0tXpw,Chubby Cattle,0.999995,3.818815
6,rYqmaOIULRouz_1db07OdQ,Green Eggs Cafe,0.999994,3.853286
7,ueAkLzWFFTzQkq3jzyBlnA,Pho Xe Lua Viet Thai Restaurant,0.999994,3.477725
8,TunmRrfZb7bt53T6HJi4UQ,Bleu Sushi,0.999994,3.573807
9,nIAbuktMEzVjT4P9pG89rQ,Buddakan,0.999994,3.734852


In [None]:
collab_model('0sxiTd-rg8IICNUuDEvptQ','1 Stop Pizza', grid_search)

Unnamed: 0,Restaurant Id,Restaurant Name,Similarity Score,Predicted Scores
0,Gj5G0asdNZILROHEUEF-iQ,DaFranco Pizzeria,0.969892,3.471421
1,xtEDyU98DnjqsCuV8apV6A,Lamberti Pizza and Market,0.968599,3.95036
2,TVe1FyolMt9xjBtY7oMTOQ,Pitruco Pizza Franklin's Table,0.967794,3.914968
3,vJKxxr2auAsbmLfis-PJnw,Old Nelson Food Company,0.967256,3.882616
4,tt_n0ZJ-rd-yftr_5RQLhg,Best Deli II,0.966756,3.192632
5,R7U3lCaFWGA1p3sRkFcYAA,Balducci's,0.96675,3.634817
6,i021iBZ1SatdR-NzBuR7Cw,Tano's Deli,0.966165,3.790339
7,kPXsRJKRRE5wmMBMBqkFbQ,Evergreen Deli,0.966057,3.363725
8,_FUE9ZOqNTiDmgP2LGPGdw,Dolores,0.965922,3.735647
9,oCWx92FxeWJc5_tkQYGJTw,Pauline's Deli,0.965454,3.893929


### Training SVD model
This model is trained off of the best parameters found in the grid search

**Tested:**
*   n_epochs=50, lr=0.005, reg_all=0.02 (RMSE = 0.99, MAE = 0.76)
*   n_epochs=50, lr_all=0.002, reg_all=0.01 (RMSE = 0.982, MAE = 0.7637)
*   n_epochs=20, lr_all=0.005, reg_all=0.06 (RMSE = 0.9630, MAE = 0.7465)
*   n_epochs=80, lr_all=0.002, reg_all=0.06 (RMSE = 0.9687, MAE = 0.7432)
*   n_epochs=100, lr_all=0.002, reg_all=0.08 (RMSE = 0.9570, MAE = 0.7310)

In [None]:
algo = SVD(n_epochs=100, lr_all=0.002, reg_all=0.08)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7d5e6ea1cc10>

In [None]:
predictions = algo.test(testset)

In [None]:
preds = pd.DataFrame(predictions)
preds.drop("details", inplace=True, axis=1)
preds.columns = ['user_id', 'business_id', 'actual', 'cf_predictions']

In [None]:
# RMSE from suprise
print('RMSE: ', accuracy.rmse(predictions, verbose=False))
# MAE from suprise
print('MAE: ', accuracy.mae(predictions, verbose=False))

RMSE:  0.9568072746455937
MAE:  0.7323205166529431


# **Hybrid Recommendor System**

1.   Input a user
2.   Take that users top 10 reccomendations
3.   Go to the cosine similarity matrix and retrieve the similarity scores
4.   Multiply the similarity scores by 0.6 (60%) and the predicted rating returned by the cf model by 0.4 (40%) to get the hybrid prediction <br/>
     **(W = 0.6(content similarity score) + 0.4(predicted rating for user))**
5. Order by weighted score

This function mimics the functionality of the collab_model function but also incorporates the content-based reccomendations.

In [None]:
def hybrid_model(user_id, restaurant, model, weights=0.6):
    '''
    Generates top 10 restaurant recommendations, ordered by predicted rating

    input: user_id, restaurant name

    output: dataframe with the names and predicted score of the top 10
            restaurants from the content model
    '''
    # Gets the top n restaurants from content model

    reccomended = content_model(restaurant)
    to_pred = reccomended['Restaurant Id'].tolist()

    preds = [model.predict(user_id, rid).est for rid in to_pred]
    reccomended['Predicted Scores'] = preds

    weight = weights
    reccomended['Weighted Scores'] = (weight)*reccomended['Similarity Score'] + (1-weight)*reccomended['Predicted Scores']
    reccomended = reccomended.sort_values(by=['Weighted Scores'], ascending=False)

    return reccomended

This is an implementation of the hybrid_model function using the retrained SVD model _algo_

In [None]:
hybrid_model('0sxiTd-rg8IICNUuDEvptQ','Spice 28', algo)

Unnamed: 0,Restaurant Id,Restaurant Name,Similarity Score,Predicted Scores,Weighted Scores
4,mtvT7uRey3F395STFRM1Tg,Vernick Food & Drink,0.999995,4.067796,2.227115
6,rYqmaOIULRouz_1db07OdQ,Green Eggs Cafe,0.999994,3.9412,2.176477
5,iksVwRfpWymIUUFqw0tXpw,Chubby Cattle,0.999995,3.893151,2.157257
0,MMRRS6YhVRx_iN5-JhMRYg,Han Dynasty,0.999997,3.682463,2.072984
3,kbXpjBSweedxQ1WrlJ5i7Q,Dan Dan,0.999996,3.661081,2.06443
8,TunmRrfZb7bt53T6HJi4UQ,Bleu Sushi,0.999994,3.612295,2.044915
7,ueAkLzWFFTzQkq3jzyBlnA,Pho Xe Lua Viet Thai Restaurant,0.999994,3.600567,2.040224
9,nIAbuktMEzVjT4P9pG89rQ,Buddakan,0.999994,3.578802,2.031518
1,vUkiYPpbkMXA99WneLSGkQ,Banana Leaf,0.999996,3.560639,2.024253
2,eaDZlSuVS0EY67Ke6pRP6Q,Penang,0.999996,3.393728,1.957489


In [None]:
hybrid_model('0sxiTd-rg8IICNUuDEvptQ','1 Stop Pizza', algo)

Unnamed: 0,Restaurant Id,Restaurant Name,Similarity Score,Predicted Scores,Weighted Scores
9,oCWx92FxeWJc5_tkQYGJTw,Pauline's Deli,0.965454,3.916709,2.145956
2,TVe1FyolMt9xjBtY7oMTOQ,Pitruco Pizza Franklin's Table,0.967794,3.896254,2.139178
6,i021iBZ1SatdR-NzBuR7Cw,Tano's Deli,0.966165,3.888715,2.135185
3,vJKxxr2auAsbmLfis-PJnw,Old Nelson Food Company,0.967256,3.76937,2.088101
1,xtEDyU98DnjqsCuV8apV6A,Lamberti Pizza and Market,0.968599,3.726224,2.071649
8,_FUE9ZOqNTiDmgP2LGPGdw,Dolores,0.965922,3.54662,1.998201
0,Gj5G0asdNZILROHEUEF-iQ,DaFranco Pizzeria,0.969892,3.508514,1.985341
5,R7U3lCaFWGA1p3sRkFcYAA,Balducci's,0.96675,3.465693,1.966327
4,tt_n0ZJ-rd-yftr_5RQLhg,Best Deli II,0.966756,3.251761,1.880758
7,kPXsRJKRRE5wmMBMBqkFbQ,Evergreen Deli,0.966057,3.227275,1.870544


In [None]:
hybrid_model('-9da1xk7zgnnfO1uTVYGkA','1 Stop Pizza', algo)

Unnamed: 0,Restaurant Id,Restaurant Name,Similarity Score,Predicted Scores,Weighted Scores
8,_FUE9ZOqNTiDmgP2LGPGdw,Dolores,0.965922,4.512644,2.384611
1,xtEDyU98DnjqsCuV8apV6A,Lamberti Pizza and Market,0.968599,4.439483,2.356953
6,i021iBZ1SatdR-NzBuR7Cw,Tano's Deli,0.966165,4.440012,2.355704
2,TVe1FyolMt9xjBtY7oMTOQ,Pitruco Pizza Franklin's Table,0.967794,4.430233,2.352769
3,vJKxxr2auAsbmLfis-PJnw,Old Nelson Food Company,0.967256,4.346303,2.318874
9,oCWx92FxeWJc5_tkQYGJTw,Pauline's Deli,0.965454,4.324465,2.309058
0,Gj5G0asdNZILROHEUEF-iQ,DaFranco Pizzeria,0.969892,4.022265,2.190841
5,R7U3lCaFWGA1p3sRkFcYAA,Balducci's,0.96675,3.955167,2.162117
4,tt_n0ZJ-rd-yftr_5RQLhg,Best Deli II,0.966756,3.822241,2.10895
7,kPXsRJKRRE5wmMBMBqkFbQ,Evergreen Deli,0.966057,3.760591,2.083871


In [None]:
hybrid_model('-9da1xk7zgnnfO1uTVYGkA','Spice 28', algo)

Unnamed: 0,Restaurant Id,Restaurant Name,Similarity Score,Predicted Scores,Weighted Scores
4,mtvT7uRey3F395STFRM1Tg,Vernick Food & Drink,0.999995,4.70491,2.481961
5,iksVwRfpWymIUUFqw0tXpw,Chubby Cattle,0.999995,4.543979,2.417588
6,rYqmaOIULRouz_1db07OdQ,Green Eggs Cafe,0.999994,4.286744,2.314694
7,ueAkLzWFFTzQkq3jzyBlnA,Pho Xe Lua Viet Thai Restaurant,0.999994,4.284912,2.313962
3,kbXpjBSweedxQ1WrlJ5i7Q,Dan Dan,0.999996,4.271657,2.30866
0,MMRRS6YhVRx_iN5-JhMRYg,Han Dynasty,0.999997,4.257932,2.303171
2,eaDZlSuVS0EY67Ke6pRP6Q,Penang,0.999996,4.146751,2.258698
8,TunmRrfZb7bt53T6HJi4UQ,Bleu Sushi,0.999994,4.130697,2.252276
9,nIAbuktMEzVjT4P9pG89rQ,Buddakan,0.999994,4.116583,2.24663
1,vUkiYPpbkMXA99WneLSGkQ,Banana Leaf,0.999996,3.940448,2.176177


## Calculating Coverage of the Recommendations

The below cells calculate the Coverage metric for the sample users using the hybrid model method. Total time to complete takes roughly 2 hours

In [None]:
sample_users = pd.read_csv('sample_users.csv', index_col=0)
sample = sample_users['User Id'].values.tolist()

In [None]:
def calculate_coverage_hybrid(hybrid_model_func, model, sample_users):

    # all the possible restaurants to a list
    rest_test = restaurants['name'].tolist()

    # Iterate over the sample users and restaurants  pairs
    for usr in sample_users:
      all_recommendations_df = pd.DataFrame(columns=['Restaurant Id'])
      for rest in rest_test:

        # Generate hybrid recommendations for the users and the current restaurant
        recommendations_df = hybrid_model_func(usr, rest, model, weights=0.6)

        # Concatenate recommendations for the current restaurant to the aggregated DataFrame
        all_recommendations_df = pd.concat([all_recommendations_df, recommendations_df[['Restaurant Id']]], ignore_index=True)

    # Calculate coverage using the unique restaurants suggested and total possible restaurants
    unique_items = all_recommendations_df['Restaurant Id'].nunique()
    coverage = unique_items / len(rest_test)

    return coverage

# Example usage
cov = calculate_coverage_hybrid(hybrid_model, algo, sample[:341])
print(f"Coverage Score for Hybrid Recommendations: {cov}")


Coverage Score for Hybrid Recommendations: 0.8568914956011731
