## Content-based Filtering

Content-based filtering lijkt voor de CELP database een zinvolle optie, aangezien er sprake is van een lange tail en de rating density slechts een paar procent is in vele gevallen.

In [1]:
import sklearn.metrics.pairwise as pw
import pandas as pd
import numpy as np

In [2]:
def load_city(city_name):
    lst=[]
    for file in ['business', 'checkin', 'review', 'tip', 'user']:
        f = open(f'yelp-all/{city_name}/{file}.json')
        lst.append(pd.read_json(f, lines = True))

    business = lst[0]
    checkin = lst[1]
    review = lst[2]
    tip = lst[3]
    user = lst[4]
    return business, checkin, review, tip, user

In [3]:
def number_of_businesses(reviews):
    """
    returns the number of unique businesses in a set of reviews
    """
    return len(reviews['business_id'].unique())

def number_of_users(reviews):
    """
    returns the number of unique users in a set of reviews
    """
    return len(reviews['user_id'].unique())

def number_of_ratings(reviews):
    """
    returns the number of ratings of a set of reviews
    """
    return reviews.shape[0]

def rating_density(reviews):
    """
    Compute the density of a dataset
    """
    return number_of_ratings(reviews)/(number_of_businesses(reviews) * number_of_users(reviews))

def split_data(data, d=0.75):
    """
    Split data in a training and test set
    """
    data = data[['user_id', 'business_id', 'stars']]
    np.random.seed(seed=5)
    mask_test = np.random.rand(data.shape[0]) < d
    mask_test = mask_test
    return data[mask_test], data[~mask_test]

In [4]:
business, checkin, review, tip, user = load_city('stouffville')
number_of_businesses(review)
number_of_users(review)
number_of_ratings(review)
rating_density(review)

0.02642570281124498

0.02642570281124498

In [5]:
review_business = review.set_index('business_id').join(business.set_index('business_id'), rsuffix='review').sort_values(by='address').reset_index()

In [6]:
def split_categories(review_business):
    genres_m = review_business.apply(lambda row: pd.Series([row['review_id']] + row['categories'].lower().split(",")), axis=1)
    stack_genres = genres_m.set_index(0).stack()
    df_stack_genres = stack_genres.to_frame()
    df_stack_genres['review_id'] = stack_genres.index.droplevel(1)
    df_stack_genres.columns = ['category', 'review_id']
    temp =  df_stack_genres.reset_index()[['review_id', 'category']].set_index('review_id')
    review_business = review_business.set_index('review_id')
    return temp.join(review_business, rsuffix='r')[['category', 'business_id', 'user_id', 'stars']].reset_index()

def just_categories(business):
    genres_m = review_business.apply(lambda row: pd.Series([row['business_id']] + row['categories'].lower().split(",")), axis=1)
    stack_genres = genres_m.set_index(0).stack()
    df_stack_genres = stack_genres.to_frame()
    df_stack_genres['business_id'] = stack_genres.index.droplevel(1)
    df_stack_genres.columns = ['category', 'business_id']
    return df_stack_genres.reset_index()[['business_id', 'category']]
    

def pivot_categories(df):
    """
    Create a one-hot encoded matrix for genres
    """
    return df.pivot_table(index='business_id', columns='category', aggfunc='size', fill_value=0)

def pivot_ratings(df):
    """
    Creates a utility matrix for user ratings for businesses
    """
    return df.pivot(values='stars', columns='user_id', index='business_id')
    

In [7]:
df_categories = split_categories(review_business).sort_values(['review_id', 'business_id'])
df_utility_categories = pivot_categories(df_categories)

In [8]:
df_categories.head()

Unnamed: 0,review_id,category,business_id,user_id,stars
0,-2S0jfXSoZQBGsbCvyOH9A,restaurants,yKaAlvQILs53ML8BpavKhw,8pHGSTZaYG42CMk1CcQ6CA,3
1,-2S0jfXSoZQBGsbCvyOH9A,japanese,yKaAlvQILs53ML8BpavKhw,8pHGSTZaYG42CMk1CcQ6CA,3
2,-6OBu_Nj0AnRE0jNOR3nMw,restaurants,6x3zxKZ9GAu1HGH3fDbYyA,TXTwGSSoE0oop0mYZqElTw,3
3,-6OBu_Nj0AnRE0jNOR3nMw,pizza,6x3zxKZ9GAu1HGH3fDbYyA,TXTwGSSoE0oop0mYZqElTw,3
4,-6OBu_Nj0AnRE0jNOR3nMw,caterers,6x3zxKZ9GAu1HGH3fDbYyA,TXTwGSSoE0oop0mYZqElTw,3


Unnamed: 0,review_id,category,business_id,user_id,stars
0,-2S0jfXSoZQBGsbCvyOH9A,restaurants,yKaAlvQILs53ML8BpavKhw,8pHGSTZaYG42CMk1CcQ6CA,3
1,-2S0jfXSoZQBGsbCvyOH9A,japanese,yKaAlvQILs53ML8BpavKhw,8pHGSTZaYG42CMk1CcQ6CA,3
2,-6OBu_Nj0AnRE0jNOR3nMw,restaurants,6x3zxKZ9GAu1HGH3fDbYyA,TXTwGSSoE0oop0mYZqElTw,3
3,-6OBu_Nj0AnRE0jNOR3nMw,pizza,6x3zxKZ9GAu1HGH3fDbYyA,TXTwGSSoE0oop0mYZqElTw,3
4,-6OBu_Nj0AnRE0jNOR3nMw,caterers,6x3zxKZ9GAu1HGH3fDbYyA,TXTwGSSoE0oop0mYZqElTw,3


In [9]:
df_utility_categories.head()

category,amateur sports teams,american (traditional),arts & entertainment,auto parts & supplies,auto repair,automotive,bakeries,bars,beauty & spas,breakfast & brunch,...,pet stores,pizza,public services & government,restaurants,shopping,specialty food,thai,veterinarians,waxing,wineries
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0Hc7Wgai2l9jzEmzpmV0EQ,0,0,0,0,0,0,8,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29-aOjDbsDML8DSXkxmS7g,0,0,0,0,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2YuwHtBrpXTL0g2eD0wDJA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39rLHYJOy2774ZIUouuWLw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3UdIrlCtMyUO0SdGqS3Y1w,0,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,0


category,amateur sports teams,american (traditional),arts & entertainment,auto parts & supplies,auto repair,automotive,bakeries,bars,beauty & spas,breakfast & brunch,...,pet stores,pizza,public services & government,restaurants,shopping,specialty food,thai,veterinarians,waxing,wineries
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0Hc7Wgai2l9jzEmzpmV0EQ,0,0,0,0,0,0,8,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29-aOjDbsDML8DSXkxmS7g,0,0,0,0,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2YuwHtBrpXTL0g2eD0wDJA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39rLHYJOy2774ZIUouuWLw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3UdIrlCtMyUO0SdGqS3Y1w,0,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,0


In [10]:
def create_similarity_matrix_categories(matrix):
    """
    Create a similarity matrix based on categories
    """
    npu = matrix.values
    m1 = npu @ npu.T
    diag = np.diag(m1)
    m2 = m1 / diag
    m3 = np.minimum(m2, m2.T)
    return pd.DataFrame(m3, index = matrix.index, columns = matrix.index)

In [11]:
df_similarity_categories = create_similarity_matrix_categories(df_utility_categories)
df_similarity_categories.head()

business_id,0Hc7Wgai2l9jzEmzpmV0EQ,29-aOjDbsDML8DSXkxmS7g,2YuwHtBrpXTL0g2eD0wDJA,39rLHYJOy2774ZIUouuWLw,3UdIrlCtMyUO0SdGqS3Y1w,4ffMqWpWJOvKJB2_3kuZgQ,4khalYLq_dee7HG105328g,5E62baucwayv5gi3qVhxdA,5P1PHW150N388Dp9dtMQNw,6x3zxKZ9GAu1HGH3fDbYyA,...,u_uzO8Ni_0D7HV_-edpM3A,weqwqBHq9YfKAZFKSbAfCQ,x2WWUrweUlR71lMPzBE4SQ,xIS_V5oUR3BIUwtNf2yq7A,yKaAlvQILs53ML8BpavKhw,yQe-M9TWMOWLFYMJlZ_asw,yZ4UGFwGMPVdjgYMhWdmdg,z6f90BZ2OtwGH0KkTUSZJA,zlF2RzJYTgZRbzFbhe3zTA,zuHzvkj-NenIYUvdUexMBw
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0Hc7Wgai2l9jzEmzpmV0EQ,1.0,0.0,0.053571,0.064516,0.0,0.0,0.0,0.0,0.072727,0.0,...,0.178571,0.089286,0.107143,0.0,0.0,0.160714,0.0,0.133333,0.0,0.0
29-aOjDbsDML8DSXkxmS7g,0.0,1.0,0.0,0.0,0.0,0.085714,0.119048,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2YuwHtBrpXTL0g2eD0wDJA,0.053571,0.0,1.0,0.024194,0.0,0.0,0.0,0.0,0.027273,0.0,...,0.0,0.0,0.125,0.0,0.0,0.4,0.0,0.05,0.0,0.0
39rLHYJOy2774ZIUouuWLw,0.064516,0.0,0.024194,1.0,0.0,0.0,0.0,0.0,0.177419,0.0,...,0.0,0.0,0.048387,0.0,0.0,0.024194,0.0,0.193548,0.0,0.0
3UdIrlCtMyUO0SdGqS3Y1w,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.333333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.375,0.0,0.0,0.0


business_id,0Hc7Wgai2l9jzEmzpmV0EQ,29-aOjDbsDML8DSXkxmS7g,2YuwHtBrpXTL0g2eD0wDJA,39rLHYJOy2774ZIUouuWLw,3UdIrlCtMyUO0SdGqS3Y1w,4ffMqWpWJOvKJB2_3kuZgQ,4khalYLq_dee7HG105328g,5E62baucwayv5gi3qVhxdA,5P1PHW150N388Dp9dtMQNw,6x3zxKZ9GAu1HGH3fDbYyA,...,u_uzO8Ni_0D7HV_-edpM3A,weqwqBHq9YfKAZFKSbAfCQ,x2WWUrweUlR71lMPzBE4SQ,xIS_V5oUR3BIUwtNf2yq7A,yKaAlvQILs53ML8BpavKhw,yQe-M9TWMOWLFYMJlZ_asw,yZ4UGFwGMPVdjgYMhWdmdg,z6f90BZ2OtwGH0KkTUSZJA,zlF2RzJYTgZRbzFbhe3zTA,zuHzvkj-NenIYUvdUexMBw
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0Hc7Wgai2l9jzEmzpmV0EQ,1.0,0.0,0.053571,0.064516,0.0,0.0,0.0,0.0,0.072727,0.0,...,0.178571,0.089286,0.107143,0.0,0.0,0.160714,0.0,0.133333,0.0,0.0
29-aOjDbsDML8DSXkxmS7g,0.0,1.0,0.0,0.0,0.0,0.085714,0.119048,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2YuwHtBrpXTL0g2eD0wDJA,0.053571,0.0,1.0,0.024194,0.0,0.0,0.0,0.0,0.027273,0.0,...,0.0,0.0,0.125,0.0,0.0,0.4,0.0,0.05,0.0,0.0
39rLHYJOy2774ZIUouuWLw,0.064516,0.0,0.024194,1.0,0.0,0.0,0.0,0.0,0.177419,0.0,...,0.0,0.0,0.048387,0.0,0.0,0.024194,0.0,0.193548,0.0,0.0
3UdIrlCtMyUO0SdGqS3Y1w,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.333333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.375,0.0,0.0,0.0


In [12]:
def predict_ratings(similarity, utility, to_predict):
    """Predicts the predicted rating for the input test data.
    
    Arguments:
    similarity -- a dataFrame that describes the similarity between items
    utility    -- a dataFrame that contains a rating for each user (columns) and each movie (rows). 
                  If a user did not rate an item the value np.nan is assumed. 
    to_predict -- A dataFrame containing at least the columns movieId and userId for which to do the predictions
    """
    # copy input (don't overwrite)
    ratings_test_c = to_predict.copy()
    # apply prediction to each row
    ratings_test_c['predicted rating'] = to_predict.apply(lambda row: predict_ids(similarity, utility, row['user_id'], row['business_id']), axis=1)
    return ratings_test_c[['business_id', 'user_id', 'stars', 'predicted rating']]

### Helper functions for predict_ratings_item_based ###

def predict_ids(similarity, utility, user_id, business_id):
    # select right series from matrices and compute
    if user_id in utility.columns and business_id in similarity.index:
        return predict_vectors(utility.loc[:,user_id], similarity[business_id])
    return 0

def predict_vectors(user_ratings, similarities):
    # select only movies actually rated by user
    relevant_ratings = user_ratings.dropna()
    
    # select corresponding similairties
    similarities_s = similarities[relevant_ratings.index]
    
    # select neighborhood
    similarities_s = similarities_s[similarities_s > 0.0]
    relevant_ratings = relevant_ratings[similarities_s.index]
    
    # if there's nothing left return a prediction of 0
    norm = similarities_s.sum()
    if(norm == 0):
        return 0
    
    # compute a weighted average (i.e. neighborhood is all) 
    return np.dot(relevant_ratings, similarities_s)/norm

def mse(predicted_ratings):
    """
    Computes the means square error betweeen actual ratings 
    """
    diff = predicted_ratings['stars'] - predicted_ratings['predicted rating']
    return (diff**2).mean()

 ## Nieuwe Test, vanaf hier proberen

In [13]:
df = just_categories(business).drop_duplicates()
df_reviews_training, df_reviews_test = split_data(review, d=0.9)
df_reviews_training = df_reviews_training.groupby(['user_id', 'business_id'])['user_id', 'business_id', 'stars'].mean().reset_index()
df_utility_ratings = pivot_ratings(df_reviews_training)
df_utility_genres = pivot_categories(df)
df_similarity_genres = create_similarity_matrix_categories(df_utility_genres)
df_predicted_new = predict_ratings(df_similarity_genres, df_utility_ratings, df_reviews_test[['user_id', 'business_id', 'stars']])
mse_content = mse(df_predicted_new)
mse_content

10.30623309170236

10.30623309170236

In [107]:
def business_id_to_name(ids, city):
    business = pd.read_json(open(f'yelp-all/{city}/business.json'), lines = True)
    outcome = [list(business[business['business_id'] == i]['name'].values)[0] for i in ids]
    return outcome
    
def top_recommendations(df_predicted, city, amount = 10, name=False):
    temp = list(df_predicted.sort_values('predicted rating', ascending=False)[:10]['business_id'].values)
    if name:
        return business_id_to_name(temp, city)
    return temp

In [117]:
def total_run(city, name=False):
    business, check_in, review, tip, user = load_city(city)
    df = just_categories(business).drop_duplicates()
    df_reviews_training, df_reviews_test = split_data(review, d=0.9)
    df_reviews_training = df_reviews_training.groupby(['user_id', 'business_id'])['user_id', 'business_id', 'stars'].mean().reset_index()
    df_utility_ratings = pivot_ratings(df_reviews_training)
    df_utility_genres = pivot_categories(df)
    df_similarity_genres = create_similarity_matrix_categories(df_utility_genres)
    df_predicted_new = predict_ratings(df_similarity_genres, df_utility_ratings, df_reviews_test[['user_id', 'business_id', 'stars']])
    print(mse(df_predicted_new))
    return top_recommendations(df_predicted_new, city, name=name)

In [119]:
total_run('westlake', name=True)

14.464566929133857
14.464566929133857


['B Spot',
 'Sibling Revelry Brewing',
 'Arashi Japan Sushi & Steak House',
 'Brio Tuscan Grille',
 "McDonald's",
 'Inn To Pets Inc',
 'Apple Store',
 'Yard House',
 'Yard House',
 'Crocker Park']

['B Spot',
 'Sibling Revelry Brewing',
 'Arashi Japan Sushi & Steak House',
 'Brio Tuscan Grille',
 "McDonald's",
 'Inn To Pets Inc',
 'Apple Store',
 'Yard House',
 'Yard House',
 'Crocker Park']

In [110]:
for i in user.index:
    id_chosen = user.loc[i]['user_id']
    already_rated = list(review[review['user_id'] == id_chosen].values)
    print('\n')
    
    









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































