In [1]:
import pandas as pd
import numpy as np

In [2]:
def load_city(city_name):
    """
    Load all the information of a city into 5 dataframes
    """
    lst=[]
    for file in ['business', 'checkin', 'review', 'tip', 'user']:
        f = open(f'yelp-all/{city_name}/{file}.json')
        lst.append(pd.read_json(f, lines = True))

    business = lst[0]
    checkin = lst[1]
    review = lst[2]
    tip = lst[3]
    user = lst[4]
    return business, checkin, review, tip, user

def just_categories(business):
    """
    Split the  categories column of a dataframe into seperate categories with a seperate entry with combination category
    and business_id as key.
    """
    genres_m = business.apply(lambda row: pd.Series([row['business_id']] + row['categories'].lower().split(",")), axis=1)
    stack_genres = genres_m.set_index(0).stack()
    df_stack_genres = stack_genres.to_frame()
    df_stack_genres['business_id'] = stack_genres.index.droplevel(1)
    df_stack_genres.columns = ['category', 'business_id']
    return df_stack_genres.reset_index()[['business_id', 'category']]

def split_data(data, d=0.75):
    """
    Split data in a training and test set with a standard distrubution of 0.75
    """
    np.random.seed(seed=5)
    mask_test = np.random.rand(data.shape[0]) < d
    mask_test = mask_test
    return data[mask_test], data[~mask_test]

def handle_duplicates (reviews):
    """
    returns the mean of businesses having multiple reviews by the same user
    """
    try:
        return reviews.groupby(['business_id', 'categories', 'user_id'])['stars'].mean().reset_index()
    except:
        return reviews.groupby(['business_id', 'user_id'])['stars'].mean().reset_index()

def join_business_reviews(business, reviews):
    business = business.set_index('business_id')
    reviews = reviews.set_index('business_id')
    temp = reviews.join(business, rsuffix='business').reset_index()
    return temp[['user_id', 'business_id', 'review_id', 'categories', 'stars']]
    
def rating_density(reviews):
    """
    Compute the density of a dataset
    """
    return number_of_ratings(reviews)/(number_of_businesses(reviews) * number_of_users(reviews))

def number_of_businesses(reviews):
    """
    returns the number of unique businesses in a set of reviews
    """
    return len(reviews['business_id'].unique())

def number_of_users(reviews):
    """
    returns the number of unique users in a set of reviews
    """
    return len(reviews['user_id'].unique())

def number_of_ratings(reviews):
    """
    returns the number of ratings of a set of reviews
    """
    return reviews.shape[0]



In [3]:
def pivot_categories(df):
    """
    Create a one-hot encoded matrix for genres
    """
    return df.pivot_table(index='business_id', columns='category', aggfunc='size', fill_value=0)

def pivot_ratings(df):
    """
    Creates a utility matrix for user ratings for businesses
    """
    return df.pivot(values='stars', columns='user_id', index='business_id')

def create_similarity_matrix_categories(matrix):
    """
    Create a similarity matrix based on categories
    """
    npu = matrix.values
    m1 = npu @ npu.T
    diag = np.diag(m1)
    m2 = m1 / diag
    m3 = np.minimum(m2, m2.T)
    return pd.DataFrame(m3, index = matrix.index, columns = matrix.index)

In [4]:
def predict_ratings(similarity, utility, to_predict):
    """Predicts the predicted rating for the input test data.
    
    Arguments:
    similarity -- a dataFrame that describes the similarity between items
    utility    -- a dataFrame that contains a rating for each user (columns) and each movie (rows). 
                  If a user did not rate an item the value np.nan is assumed. 
    to_predict -- A dataFrame containing at least the columns movieId and userId for which to do the predictions
    """
    # copy input (don't overwrite)
    ratings_test_c = to_predict.copy()
    # apply prediction to each row
    ratings_test_c['predicted rating'] = to_predict.apply(lambda row: predict_ids(similarity, utility, row['user_id'], row['business_id']), axis=1)
    return ratings_test_c[['business_id', 'user_id', 'stars', 'predicted rating']]

### Helper functions for predict_ratings_item_based ###

def predict_ids(similarity, utility, user_id, business_id):
    # select right series from matrices and compute
    if user_id in utility.columns and business_id in similarity.index:
        return predict_vectors(utility.loc[:,user_id], similarity[business_id])
    return 0

def predict_vectors(user_ratings, similarities):
    # select only movies actually rated by user
    relevant_ratings = user_ratings.dropna()
    
    # select corresponding similairties
    similarities_s = similarities[relevant_ratings.index]
    
    # select neighborhood
    similarities_s = similarities_s[similarities_s > 0.0]
    relevant_ratings = relevant_ratings[similarities_s.index]
    
    # if there's nothing left return a prediction of 0
    norm = similarities_s.sum()
    if(norm == 0):
        return 0
    
    # compute a weighted average (i.e. neighborhood is all) 
    return np.dot(relevant_ratings, similarities_s)/norm

def mse(predicted_ratings):
    """
    Computes the means square error betweeen actual ratings 
    """
    diff = predicted_ratings['stars'] - predicted_ratings['predicted rating']
    return (diff**2).mean()

def mean_center_columns(matrix):
    for column in matrix.columns:
        matrix[column] -= matrix[column].mean()
    return matrix

In [5]:
def cosine_similarity(matrix, id1, id2):
    """Compute cosine similarity"""
    selected_features = matrix.loc[id1].notna() & matrix.loc[id2].notna()
    
    # if no matching features, return 0
    if not selected_features.any():
        return 0
    
    # get the features from the matrix
    features1 = matrix.loc[id1][selected_features]
    features2 = matrix.loc[id2][selected_features]
    
    # return 1 for the diagonals and 0 if there are no matching features
    if features1.equals(features2):
        return 1
    if features1.max() == 0 or features2.max() == 0:
        return 0
    
    return sum(features1 * features2)/((sum(features1**2)**0.5) * sum((features2**2))**0.5)


def create_similarity_matrix_cosine(matrix):
    """ creates the similarity matrix based on cosine similarity """
    similarity_matrix = pd.DataFrame(0, index=matrix.index, columns=matrix.index, dtype=float)
    id1 = similarity_matrix.columns.values
    for i in id1:
        for j in id1:
            similarity_matrix[i][j] = cosine_similarity(matrix, i, j)
    return similarity_matrix

In [19]:
def content_based_benchmark(city):
    """
    Test the performance of our content-based system
    """
    a, b, c, d, e = load_city(city)
    print(city, "Content-based collaborative filtering")
    print("Rating Density", rating_density(c))
    review_business = join_business_reviews(a,c)
    review_business_clean = handle_duplicates(review_business)
    training, test = split_data(review_business_clean, d=0.9)
    training_split = just_categories(training).drop_duplicates()
    utility_categories = pivot_categories(training_split)
    utility_ratings = pivot_ratings(review_business_clean)
    similarity_categories = create_similarity_matrix_categories(utility_categories)
    predictions = predict_ratings(similarity_categories, utility_ratings, test[['user_id', 'business_id', 'stars']])
    print("MSE", mse(predictions), end='\n\n')
    return predictions

In [13]:
def user_based_benchmark(city):
    """
    Test the performance of our user-based system
    """
    city = 'stouffville'
    a, b, c, d, e = load_city(city)
    print(city, "User-based collaborative filtering")
    print("Rating Density", rating_density(c))
    reviews_clean = handle_duplicates(c)
    training, test = split_data(reviews_clean, d=0.9)
    utility = mean_center_columns(pivot_ratings(training))
    similarity = create_similarity_matrix_cosine(utility)
    predictions = predict_ratings(similarity, utility, test[['user_id', 'business_id', 'stars']])
    print("MSE", mse(predictions), end='\n\n')
    return predictions

In [17]:
def item_based_benchmark(city):
    """
    Test the performance of our user-based system
    """
    city = 'stouffville'
    a, b, c, d, e = load_city(city)
    print(city, "Item-based collaborative filtering")
    print("Rating Density", rating_density(c))
    reviews_clean = handle_duplicates(c)
    training, test = split_data(reviews_clean, d=0.9)
    utility = mean_center_columns(pivot_ratings(training).T).T
    similarity = create_similarity_matrix_cosine(utility)
    predictions = predict_ratings(similarity, utility, test[['user_id', 'business_id', 'stars']])
    print("MSE", mse(predictions), end='\n\n')
    return predictions

In [16]:
item_based_benchmark('stouffville')

stouffville Item-based collaborative filtering
Rating Density 0.02642570281124498
MSE 12.553846153846154



Unnamed: 0,business_id,user_id,stars,predicted rating
3,0Hc7Wgai2l9jzEmzpmV0EQ,OvoOosmjgiYQ5oLyRs9bRA,4.0,0
27,39rLHYJOy2774ZIUouuWLw,LB5ViGU59ww2XRCx803t0w,3.0,0
28,39rLHYJOy2774ZIUouuWLw,PBV_oQCoABX958-zjiLGwQ,3.0,0
39,39rLHYJOy2774ZIUouuWLw,m-JcuuWReGyqSvx47v_DHg,3.0,0
43,39rLHYJOy2774ZIUouuWLw,vEoLjqsqYqHswtp7-kJc2g,3.0,0
46,3UdIrlCtMyUO0SdGqS3Y1w,EiP1OFgs-XGcKZux0OKWIA,4.0,0
56,4khalYLq_dee7HG105328g,R4lGRCrDn6618IiZZedNdw,5.0,0
62,4khalYLq_dee7HG105328g,pn_flI3EBNugBEYFp9okxQ,3.0,0
63,4khalYLq_dee7HG105328g,pvwEROiu2kfTqyp6cG2HIA,5.0,0
79,5P1PHW150N388Dp9dtMQNw,eUvbsSKepPrA8bFTDtl77g,1.0,0


In [None]:
for city in ['stouffville', 'sun city', 'westlake']:
    content_based_benchmark(city)
    user_based_benchmark(city)
    item_based_benchmark(city)

stouffville Content-based collaborative filtering
Rating Density 0.02642570281124498
MSE 0.12250452950721911

stouffville User-based collaborative filtering
Rating Density 0.02642570281124498
MSE 12.367245558057054

stouffville Item-based collaborative filtering
Rating Density 0.02642570281124498
MSE 11.695756678713476

sun city Content-based collaborative filtering
Rating Density 0.0048678109581154734
MSE 0.040468525854130265

stouffville User-based collaborative filtering
Rating Density 0.02642570281124498
MSE 12.367245558057054

stouffville Item-based collaborative filtering
Rating Density 0.02642570281124498
MSE 11.695756678713476

westlake Content-based collaborative filtering
Rating Density 0.0043414322968397595
MSE 0.12593121629337964

stouffville User-based collaborative filtering
Rating Density 0.02642570281124498
