# Yelp Data Challenge - Restaurant Recommender
Here I build a restaurant recommender system based on Yelp data from the past two years. I first clean the data by selecting the relevant columns, then create the utility matrix. Finally, I build an Item-Item Collaborative Filtering recommender and Matrix Factorization recommenders (NMF and SVD).

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.style.use("ggplot")

In [7]:
# Load the restaurant review table from a CSV file in the working directory.
df = pd.read_csv('last_2_years_restaurant_reviews.csv')

In [8]:
# Preview the first two rows to check which columns are available.
df.head(2)

Unnamed: 0,business_id,name,categories,avg_stars,review_id,user_id,stars,date,text,useful,funny,cool
0,--9e1ONYQuAa-CB_Rrw7Tw,"""Delmonico Steakhouse""",Cajun/Creole;Steakhouses;Restaurants,4.0,6SgvNWJltnZhW7duJgZ42w,oFyOUOeGTRZhFPF9uTqrTQ,5,2016-03-31,This is mine and my fiancé's favorite steakhou...,0,0,0
1,--9e1ONYQuAa-CB_Rrw7Tw,"""Delmonico Steakhouse""",Cajun/Creole;Steakhouses;Restaurants,4.0,UxFpgng8dPMWOj99653k5Q,aVOGlN9fZ-BXcbtj6dbf0g,5,2016-02-10,Truly Fantastic! Best Steak ever. Service was...,0,0,0


## Data Cleaning

#### Select relevant columns

In [9]:
# Keep only what the recommenders consume: the rated business, the rating
# user, and the star rating itself.
rating_columns = ['business_id', 'user_id', 'stars']
recommender_df = df[rating_columns]
recommender_df.head(3)

Unnamed: 0,business_id,user_id,stars
0,--9e1ONYQuAa-CB_Rrw7Tw,oFyOUOeGTRZhFPF9uTqrTQ,5
1,--9e1ONYQuAa-CB_Rrw7Tw,aVOGlN9fZ-BXcbtj6dbf0g,5
2,--9e1ONYQuAa-CB_Rrw7Tw,KC8H7qTZVPIEnanw9fG43g,5


Many users have written only a few reviews; I will exclude these users from the item-item similarity recommender.

In [10]:
# Count the star ratings recorded for each user (a Series indexed by user_id).
stars_by_user = recommender_df['stars'].groupby(recommender_df['user_id'])
reviews_count_df = stars_by_user.count()
reviews_count_df.head(5)

user_id
---1lKK3aKOuomHnwAkAow    1
---udAKDsn0yQXmzbWQNSw    2
--0sXNBv6IizZXuV-nl0Aw    1
--2bpE5vyR-2hAP7sZZ4lA    1
--2vR0DIsmQ6WfcSzKWigw    2
Name: stars, dtype: int64

In [11]:
# Summarise how many reviews each user wrote, then count distinct businesses.
extremes = (max(reviews_count_df), min(reviews_count_df))
print('Max reviews: %s, Min reviews: %s' % extremes)
centre = (np.median(reviews_count_df), round(np.mean(reviews_count_df), 2))
print('Median reviews: %s, Mean reviews: %s' % centre)
quartiles = (np.percentile(reviews_count_df, 25), np.percentile(reviews_count_df, 75))
print('25%% reviews: %d,  75%% reviews: %d' % quartiles)
unique_businesses = set(recommender_df['business_id'])
print('Number of unique business: %d' % len(unique_businesses))

Max reviews: 389, Min reviews: 1
Median reviews: 1.0, Mean reviews: 2.08
25% reviews: 1,  75% reviews: 2
Number of unique business: 4484


In [12]:
# Users with at least five reviews are treated as "active".
active_user = list(reviews_count_df[reviews_count_df >= 5].index)
# Vectorised membership test. The previous per-row `user in active_user`
# rescanned the whole list for every review (rows x users comparisons).
mask = recommender_df['user_id'].isin(active_user)
# Keep only the reviews written by active users.
active_user_df = recommender_df[mask]
active_user_df.head(5)

Unnamed: 0,business_id,user_id,stars
0,--9e1ONYQuAa-CB_Rrw7Tw,oFyOUOeGTRZhFPF9uTqrTQ,5
3,--9e1ONYQuAa-CB_Rrw7Tw,3RTesI_MAwct13LWm4rhLw,4
5,--9e1ONYQuAa-CB_Rrw7Tw,C6kw0Rny7jZAGjTj0MWA3Q,5
16,--9e1ONYQuAa-CB_Rrw7Tw,JaqcCU3nxReTW2cBLHounA,5
19,--9e1ONYQuAa-CB_Rrw7Tw,581NruH_Ns8_FH7jr9wLiQ,5


#### Create utility matrix from records

In [17]:
from scipy import sparse
# NOTE: despite their names, these two values are the number of distinct users
# and distinct businesses (i.e. the matrix dimensions), not maximum ids, and
# "movie" here refers to a business.
highest_user_id = len(set(active_user_df['user_id']))
highest_movie_id = len(set(active_user_df['business_id']))
# Empty users x businesses matrix in LIL format, to be filled entry by entry.
ratings_mat = sparse.lil_matrix((highest_user_id, highest_movie_id))
ratings_mat

<12478x4185 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in LInked List format>

In [18]:
# Row / column labels of the utility matrix. Sorting makes the id -> position
# assignment reproducible; plain set iteration order can change between runs.
user_id = sorted(set(active_user_df['user_id']))
business_id = sorted(set(active_user_df['business_id']))
# O(1) position lookups. list.index() rescans the label list for every review,
# which made this loop quadratic.
user_position = {uid: pos for pos, uid in enumerate(user_id)}
business_position = {bid: pos for pos, bid in enumerate(business_id)}
# If a user reviewed the same business more than once, the last review wins.
for review in active_user_df.itertuples(index=False):
    ratings_mat[user_position[review.user_id],
                business_position[review.business_id]] = review.stars
ratings_mat

<12478x4185 sparse matrix of type '<class 'numpy.float64'>'
	with 126350 stored elements in LInked List format>

## Item - Item Collaborative Filter Recommender

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
from time import time
class ItemItemRecommender(object):
    """Item-item collaborative filter over a users x items rating matrix.

    A rating is predicted for every item as the similarity-weighted average of
    the ratings the user gave to that item's most similar items. A zero entry
    in the rating matrix means "not rated".
    """

    def __init__(self, neighborhood_size):
        """Store how many most-similar items form each item's neighborhood."""
        self.neighborhood_size = neighborhood_size

    def fit(self, ratings_mat):
        """Compute item-item cosine similarities and each item's neighborhood.

        ratings_mat: users x items matrix; rows are indexed with
        ``ratings_mat[user]`` and the result must support ``.nonzero()``
        returning (rows, cols), as scipy sparse matrices do.
        """
        self.ratings_mat = ratings_mat
        self.n_users = ratings_mat.shape[0]
        self.n_items = ratings_mat.shape[1]
        # Transpose so that similarities are computed between item columns.
        self.item_sim_mat = cosine_similarity(self.ratings_mat.T)
        self._set_neighborhoods()

    def _set_neighborhoods(self):
        """Keep, per item, the indexes of its ``neighborhood_size`` most similar items."""
        least_to_most_sim_indexes = np.argsort(self.item_sim_mat, 1)
        n_candidates = least_to_most_sim_indexes.shape[1]
        # Slice from an explicit start position: "[:, -k:]" silently returns
        # ALL columns when k == 0, whereas this yields an empty neighborhood.
        first_kept = max(n_candidates - max(self.neighborhood_size, 0), 0)
        self.neighborhoods = least_to_most_sim_indexes[:, first_kept:]

    def pred_one_user(self, user_id, report_run_time=False):
        """Return an array of predicted ratings (one per item) for one user.

        user_id: row index of the user in the rating matrix.
        report_run_time: when true, print how long the prediction took.
        Items with no rated neighbor (or zero total similarity) get 0.
        """
        start_time = time()
        user_row = self.ratings_mat[user_id]
        if hasattr(user_row, "toarray"):
            # Sparse row -> dense so it can be fancy-indexed below.
            user_row = user_row.toarray()
        user_ratings = np.asarray(user_row, dtype=float).ravel()

        # (n_items, k) arrays: similarity to, and the user's rating of, every
        # neighbor of every item. Computed for all items at once instead of
        # looping over items in Python.
        item_rows = np.arange(self.n_items)[:, np.newaxis]
        neighbor_sims = self.item_sim_mat[item_rows, self.neighborhoods]
        neighbor_ratings = user_ratings[self.neighborhoods]

        # Only neighbors the user actually rated contribute to the average.
        weights = np.where(neighbor_ratings != 0, neighbor_sims, 0.0)
        weight_totals = weights.sum(axis=1)
        weighted_ratings = (weights * neighbor_ratings).sum(axis=1)

        out = np.zeros(self.n_items)
        # Divide only where the denominator is non-zero; the rest stays 0
        # rather than going through a 0/0 -> NaN round trip.
        np.divide(weighted_ratings, weight_totals, out=out,
                  where=weight_totals != 0)
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        # Still scrub NaN/inf that could originate from the inputs themselves.
        cleaned_out = np.nan_to_num(out)
        return cleaned_out

    def pred_all_users(self, report_run_time=False):
        """Return a dense (n_users, n_items) array of predicted ratings."""
        start_time = time()
        all_ratings = [
            self.pred_one_user(user_id) for user_id in range(self.n_users)]
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        return np.array(all_ratings)

    def top_n_recs(self, user_id, n):
        """Return ``(recommended_items, items_rated_by_this_user)``.

        recommended_items is a list of at most ``n`` item indexes the user has
        not rated, ordered from lowest to highest predicted rating (the best
        recommendation is last). ``n <= 0`` yields an empty list.
        """
        pred_ratings = self.pred_one_user(user_id)
        items_rated_by_this_user = self.ratings_mat[user_id].nonzero()[1]
        already_rated = set(items_rated_by_this_user.tolist())
        unrated_items_by_pred_rating = [
            item for item in np.argsort(pred_ratings)
            if item not in already_rated]
        # Guard n == 0 explicitly: "[-0:]" would return every unrated item.
        top_items = unrated_items_by_pred_rating[-n:] if n > 0 else []
        return top_items, items_rated_by_this_user

In [20]:
# Fit the item-item recommender, using each item's 80 most similar items as
# its neighborhood.
my_rec_engine = ItemItemRecommender(neighborhood_size=80)
my_rec_engine.fit(ratings_mat)

Let me try the recommender system with a lucky user.

In [21]:
# Draw one user at random. NOTE: the draw is over review rows (not unique
# ids), so users with more reviews are proportionally more likely to be
# picked, and no seed is set, so the chosen user changes between runs.
lucky_user = np.random.choice(active_user_df['user_id'], 1)[0]
# Translate the user id into its row position in the utility matrix.
lucky_user_index = user_id.index(lucky_user)
# Ten unrated items for this user, plus the items the user already rated
# (both as column positions of the utility matrix).
lucky_user_recommend, items_rated_by_this_user = my_rec_engine.top_n_recs(user_id=lucky_user_index, n = 10)



In [22]:
# Announce the user, then resolve every recommended matrix column to its
# restaurant name and print the names on one line.
print("The top ten recommendation for user %s are: " % (lucky_user))
recommended_names = [
    list(set(df.loc[df['business_id'] == business_id[col], 'name']))[0]
    for col in lucky_user_recommend
]
print('%s' % (', '.join(recommended_names)))

The top ten recommendation for user JADdo9NEeO5Az9aOYbyvZA are: 
"Grand Lux Cafe", "Panevino Italian Grille", "Binion's Deli", "Canyon Ranch Grill", "Vic & Anthony's Steakhouse", "Zuma Las Vegas", "Jean Georges Steakhouse", "Blondies Sports Bar & Grill", "Jaburritos", "Avenue Cafe"


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
# Names of the restaurants this user has already rated.
original_rated_restaurants = [list(set(df['name'][df['business_id'] == business_id[i]]))[0]
                              for i in items_rated_by_this_user]
# Vectorised membership test instead of scanning the name list once per review row.
mask = df['name'].isin(original_rated_restaurants)
original_category = df['categories'][mask]
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', lowercase=True)
original_category_vec = vectorizer.fit_transform(original_category).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
list_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
original_word = [str(word) for word in list_feature_names()]
print('Categories from user rated restaurants: \n%s' % (','.join(original_word)))

Categories from user rated restaurants: 
american,arts,asian,bars,beer,breakfast,brunch,burgers,cafe,casinos,chinese,clubs,dance,dim,entertainment,event,food,fusion,gastropubs,hong,hotels,italian,japanese,kong,lounges,new,nightlife,noodles,planning,restaurants,services,soup,southern,spirits,sports,steakhouses,style,sum,sushi,traditional,travel,wine


In [26]:
# Names of the restaurants the engine just recommended.
recommend_res = [list(set(df['name'][df['business_id'] == business_id[i]]))[0]
                 for i in lucky_user_recommend]
# Vectorised membership test instead of scanning the name list once per review row.
mask = df['name'].isin(recommend_res)
recommend_category = df['categories'][mask]
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', lowercase=True)
recommend_category_vec = vectorizer.fit_transform(recommend_category).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
list_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
recommend_word = [str(word) for word in list_feature_names()]
print('Categories from recommend restaurants: \n%s' % (','.join(recommend_word)))

Categories from recommend restaurants: 
american,asian,barbeque,bars,breakfast,brunch,delis,desserts,dogs,food,fusion,hot,italian,japanese,mexican,new,nightlife,restaurants,salad,sandwiches,seafood,sports,stands,steakhouses,sushi,traditional,vegan


In [27]:
# Category words shared by the rated and the recommended restaurants,
# listed in the order they appear among the recommended ones.
rated_vocabulary = set(original_word)
common_labels = [word for word in recommend_word if word in rated_vocabulary]
print("Common labels are: \n%s" % (', '.join(common_labels)))

Common labels are: 
american, asian, bars, breakfast, brunch, food, fusion, italian, japanese, new, nightlife, restaurants, sports, steakhouses, sushi, traditional


## Matrix Factorization recommender (NMF)

In [28]:
from sklearn.decomposition import NMF
class NMF_Recommender(object):
    """Recommender based on a non-negative factorization of the rating matrix.

    The users x items matrix is approximated as ``W.dot(H)``; a row of that
    product is used as the predicted ratings of the corresponding user.
    """

    def __init__(self, n_components):
        """Store the number of latent components used by the factorization."""
        self.n_components = n_components

    def fit(self, ratings_mat):
        """Factorize ``ratings_mat`` and cache the reconstructed ratings.

        ratings_mat: users x items matrix; rows are indexed with
        ``ratings_mat[user]`` and the result must support ``.nonzero()``
        returning (rows, cols), as scipy sparse matrices do.
        """
        self.ratings_mat = ratings_mat
        self.n_users = ratings_mat.shape[0]
        self.n_items = ratings_mat.shape[1]
        # Honour the configured size; this was previously hard-coded to 200,
        # silently ignoring the constructor argument.
        nmf = NMF(n_components=self.n_components)
        # fit_transform returns the W learned during fitting, so the matrix is
        # factorized once and W matches reconstruction_err_.
        self.W = nmf.fit_transform(ratings_mat)
        self.H = nmf.components_
        self.error = nmf.reconstruction_err_
        self.ratings_mat_fitted = self.W.dot(self.H)

    def get_error(self):
        """Return the reconstruction error recorded by ``fit``."""
        return self.error

    def pred_one_user(self, user_id, report_run_time=False):
        """Return the reconstructed rating row for the user at row ``user_id``.

        report_run_time: when true, print how long the lookup took.
        """
        start_time = time()
        cleaned_out = self.ratings_mat_fitted[user_id, :]
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        return cleaned_out

    def pred_all_users(self, report_run_time=False):
        """Return a (n_users, n_items) array of predicted ratings."""
        start_time = time()
        all_ratings = [
            self.pred_one_user(user_id) for user_id in range(self.n_users)]
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        return np.array(all_ratings)

    def top_n_recs(self, user_id, n):
        """Return ``(recommended_items, items_rated_by_this_user)``.

        recommended_items is a list of at most ``n`` item indexes the user has
        not rated, ordered from lowest to highest predicted rating (the best
        recommendation is last). ``n <= 0`` yields an empty list.
        """
        pred_ratings = self.pred_one_user(user_id)
        items_rated_by_this_user = self.ratings_mat[user_id].nonzero()[1]
        already_rated = set(items_rated_by_this_user.tolist())
        unrated_items_by_pred_rating = [
            item for item in np.argsort(pred_ratings)
            if item not in already_rated]
        # Guard n == 0 explicitly: "[-0:]" would return every unrated item.
        top_items = unrated_items_by_pred_rating[-n:] if n > 0 else []
        return top_items, items_rated_by_this_user

In [29]:
# Refit with the NMF engine and ask for ten recommendations for the same
# lucky user as before.
my_rec_engine = NMF_Recommender(n_components=200)
my_rec_engine.fit(ratings_mat)
nmf_result = my_rec_engine.top_n_recs(user_id=lucky_user_index, n=10)
lucky_user_recommend, items_rated_by_this_user = nmf_result
print("The top ten recommendation for user %s are: " % (lucky_user))
# Resolve every recommended matrix column to its restaurant name.
recommended_names = [
    list(set(df.loc[df['business_id'] == business_id[col], 'name']))[0]
    for col in lucky_user_recommend
]
print('%s' % (', '.join(recommended_names)))

The top ten recommendation for user JADdo9NEeO5Az9aOYbyvZA are: 
"Aureole", "Michael Mina Pub 1842", "Chef Marc's Trattoria", "Chengdu Taste", "STK Las Vegas", "Bazaar Meat by José Andrés", "SUSHISAMBA - Las Vegas", "Fiamma Italian Kitchen", "Wolfgang Puck Bar & Grill Las Vegas", "MGM Grand Hotel"


In [30]:
# Resolve every item the user already rated to its restaurant name.
rated_names = [
    list(set(df.loc[df['business_id'] == business_id[col], 'name']))[0]
    for col in items_rated_by_this_user
]
print("The users original rated resturants are :\n %s" % (','.join(rated_names)))

The users original rated resturants are :
 "Carnevino","Foundation Room","Public House","MOzen Bistro","Hong Kong Cafe","Yardbird Southern Table & Bar","Morimoto"


In [31]:
# Names of the restaurants this user has already rated.
original_rated_restaurants = [list(set(df['name'][df['business_id'] == business_id[i]]))[0]
                              for i in items_rated_by_this_user]
# Vectorised membership test instead of scanning the name list once per review row.
mask = df['name'].isin(original_rated_restaurants)
original_category = df['categories'][mask]
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', lowercase=True)
original_category_vec = vectorizer.fit_transform(original_category).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
list_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
original_word = [str(word) for word in list_feature_names()]
print('Categories from user rated restaurants: \n%s' % (','.join(original_word)))

Categories from user rated restaurants: 
american,arts,asian,bars,beer,breakfast,brunch,burgers,cafe,casinos,chinese,clubs,dance,dim,entertainment,event,food,fusion,gastropubs,hong,hotels,italian,japanese,kong,lounges,new,nightlife,noodles,planning,restaurants,services,soup,southern,spirits,sports,steakhouses,style,sum,sushi,traditional,travel,wine


In [32]:
# Names of the restaurants the engine just recommended.
recommend_res = [list(set(df['name'][df['business_id'] == business_id[i]]))[0]
                 for i in lucky_user_recommend]
# Vectorised membership test instead of scanning the name list once per review row.
mask = df['name'].isin(recommend_res)
recommend_category = df['categories'][mask]
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', lowercase=True)
recommend_category_vec = vectorizer.fit_transform(recommend_category).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
list_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
recommend_word = [str(word) for word in list_feature_names()]
print('Categories from recommend restaurants: \n%s' % (','.join(recommend_word)))

Categories from recommend restaurants: 
american,arts,asian,bars,breakfast,brunch,casinos,chinese,cocktail,dim,entertainment,event,fusion,gastropubs,hotels,irish,italian,new,nightlife,pizza,planning,plates,pubs,resorts,restaurants,salad,sandwiches,seafood,services,small,spanish,steakhouses,sum,szechuan,tapas,traditional,travel,wine


In [33]:
# Category words shared by the rated and the recommended restaurants,
# listed in the order they appear among the recommended ones.
rated_vocabulary = set(original_word)
common_labels = [word for word in recommend_word if word in rated_vocabulary]
print("Common labels are: \n%s" % (', '.join(common_labels)))

Common labels are: 
american, arts, asian, bars, breakfast, brunch, casinos, chinese, dim, entertainment, event, fusion, gastropubs, hotels, italian, new, nightlife, planning, restaurants, services, steakhouses, sum, traditional, travel, wine


Based on the user's previous ratings, the NMF recommender shows better performance.

## Matrix Factorization recommender (SVD) with restaurants' labels.

Each business has its own category labels. Suppose we have a table of business_id against category labels, where each element represents how strongly a restaurant matches a label. Additionally, we can build another table of user_id against category labels, where each element stands for the user's preference/taste for that label. By multiplying the two tables, we get the utility table. The two sub-tables can contain negative numbers, since a preference can be either a like or a dislike.

In [34]:
# Count the distinct category words used by the businesses in the utility matrix.
# Vectorised membership test: the previous per-row `business in business_id`
# scanned the whole id list for every review row.
mask = df['business_id'].isin(business_id)
category = df['categories'][mask]
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', lowercase=True)
category_vec = vectorizer.fit_transform(category).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
list_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
words = [str(word) for word in list_feature_names()]
#This is the number of unique categories
print('The total number of restaurant labels is %d' % (len(words)))

The total number of restaurant labels is 423


In [35]:
from sklearn.decomposition import TruncatedSVD
class SVD_Recommender(object):
    """Recommender based on a truncated SVD of the rating matrix.

    The users x items matrix is approximated as ``U.dot(V)``; a row of that
    product is used as the predicted ratings of the corresponding user.
    """

    def __init__(self, n_components=361):
        """Store the number of latent components.

        n_components: defaults to 361, the value that used to be hard-coded
        with the comment "the number of labels".
        NOTE(review): the label-count cell in this notebook prints 423, so
        confirm which value is actually intended.
        """
        self.n_components = n_components

    def fit(self, ratings_mat):
        """Factorize ``ratings_mat`` and cache the reconstructed ratings.

        ratings_mat: users x items matrix; rows are indexed with
        ``ratings_mat[user]`` and the result must support ``.nonzero()``
        returning (rows, cols), as scipy sparse matrices do.
        """
        self.ratings_mat = ratings_mat
        self.n_users = ratings_mat.shape[0]
        self.n_items = ratings_mat.shape[1]
        svd = TruncatedSVD(n_components=self.n_components, n_iter=7, random_state=1)
        # One call fits the model and returns the projected users.
        self.U = svd.fit_transform(ratings_mat)
        self.V = svd.components_
        self.ratings_mat_fitted = self.U.dot(self.V)

    def get_error(self):
        """Return the mean squared difference between fitted and given ratings.

        Unrated (zero) entries are included in the mean.
        """
        observed = self.ratings_mat
        if hasattr(observed, "toarray"):
            # dense - sparse yields np.matrix, on which "** 2" is a matrix
            # power (and fails for non-square input); densify to an ndarray
            # so the square is element-wise.
            observed = observed.toarray()
        residual = np.asarray(self.ratings_mat_fitted) - np.asarray(observed)
        return (residual ** 2).mean(axis=None)

    def pred_one_user(self, user_id, report_run_time=False):
        """Return the reconstructed rating row for the user at row ``user_id``.

        report_run_time: when true, print how long the lookup took.
        """
        start_time = time()
        cleaned_out = self.ratings_mat_fitted[user_id, :]
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        return cleaned_out

    def pred_all_users(self, report_run_time=False):
        """Return a (n_users, n_items) array of predicted ratings."""
        start_time = time()
        all_ratings = [
            self.pred_one_user(user_id) for user_id in range(self.n_users)]
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        return np.array(all_ratings)

    def top_n_recs(self, user_id, n):
        """Return ``(recommended_items, items_rated_by_this_user)``.

        recommended_items is a list of at most ``n`` item indexes the user has
        not rated, ordered from lowest to highest predicted rating (the best
        recommendation is last). ``n <= 0`` yields an empty list.
        """
        pred_ratings = self.pred_one_user(user_id)
        items_rated_by_this_user = self.ratings_mat[user_id].nonzero()[1]
        already_rated = set(items_rated_by_this_user.tolist())
        unrated_items_by_pred_rating = [
            item for item in np.argsort(pred_ratings)
            if item not in already_rated]
        # Guard n == 0 explicitly: "[-0:]" would return every unrated item.
        top_items = unrated_items_by_pred_rating[-n:] if n > 0 else []
        return top_items, items_rated_by_this_user

In [36]:
# Refit with the SVD engine and ask for ten recommendations for the same
# lucky user as before.
my_rec_engine = SVD_Recommender()
my_rec_engine.fit(ratings_mat)
svd_result = my_rec_engine.top_n_recs(user_id=lucky_user_index, n=10)
lucky_user_recommend, items_rated_by_this_user = svd_result
print("The top ten recommendation for user %s are: " % (lucky_user))
# Resolve every recommended matrix column to its restaurant name.
recommended_names = [
    list(set(df.loc[df['business_id'] == business_id[col], 'name']))[0]
    for col in lucky_user_recommend
]
print('%s' % (', '.join(recommended_names)))

The top ten recommendation for user JADdo9NEeO5Az9aOYbyvZA are: 
"Charlie Palmer Steak", "Fiamma Italian Kitchen", "Joël  Robuchon", "Chengdu Taste", "Top of the World", "Delmonico Steakhouse", "Habaneros Taco Grill", "Table 10", "CUT by Wolfgang Puck", "Bazaar Meat by José Andrés"


Let me check whether the recommendations make sense. I can do this by checking whether the category labels are consistent between the originally rated restaurants and the recommended restaurants.

In [37]:
# Names of the restaurants this user has already rated.
original_rated_restaurants = [list(set(df['name'][df['business_id'] == business_id[i]]))[0]
                              for i in items_rated_by_this_user]
# Vectorised membership test instead of scanning the name list once per review row.
mask = df['name'].isin(original_rated_restaurants)
original_category = df['categories'][mask]
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', lowercase=True)
original_category_vec = vectorizer.fit_transform(original_category).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
list_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
original_word = [str(word) for word in list_feature_names()]
print('Categories from user rated restaurants: \n%s' % (','.join(original_word)))

Categories from user rated restaurants: 
american,arts,asian,bars,beer,breakfast,brunch,burgers,cafe,casinos,chinese,clubs,dance,dim,entertainment,event,food,fusion,gastropubs,hong,hotels,italian,japanese,kong,lounges,new,nightlife,noodles,planning,restaurants,services,soup,southern,spirits,sports,steakhouses,style,sum,sushi,traditional,travel,wine


In [38]:
# Names of the restaurants the engine just recommended.
recommend_res = [list(set(df['name'][df['business_id'] == business_id[i]]))[0]
                 for i in lucky_user_recommend]
# Vectorised membership test instead of scanning the name list once per review row.
mask = df['name'].isin(recommend_res)
recommend_category = df['categories'][mask]
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', lowercase=True)
recommend_category_vec = vectorizer.fit_transform(recommend_category).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
list_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
recommend_word = [str(word) for word in list_feature_names()]
print('Categories from recommend restaurants: \n%s' % (','.join(recommend_word)))

Categories from recommend restaurants: 
american,bars,cajun,chinese,creole,diners,fast,food,french,italian,lounges,mexican,new,nightlife,plates,restaurants,seafood,small,spanish,steakhouses,szechuan,tacos,tapas,traditional,wine


In [39]:
# Category words shared by the rated and the recommended restaurants,
# listed in the order they appear among the recommended ones.
rated_vocabulary = set(original_word)
common_labels = [word for word in recommend_word if word in rated_vocabulary]
print("Common labels are: \n%s" % (', '.join(common_labels)))

Common labels are: 
american, bars, chinese, food, italian, lounges, new, nightlife, restaurants, steakhouses, traditional, wine
