## User-Based filtering

In [1]:
import pandas as pd
import numpy as np

In [2]:
lst=[]
for file in ['business', 'checkin', 'review', 'tip', 'user']:
    f = open(f'yelp-all/stouffville/{file}.json')
    lst.append(pd.read_json(f, lines = True))

business = lst[0]
checkin = lst[1]
review = lst[2]
tip = lst[3]
user = lst[4]

Na het inladen van de modules en de databestanden is het nodig met een aantal helperfuncties een utility matrix te maken. Het verschil tussen user-based en item-based is de oriëntatie van de matrix.

In [3]:
# define a helper function for accessing data
def get_rating(reviews, user_id = None, business_id = None, review_id = None):
    try:
        if user_id and business_id:
            return reviews[(reviews['user_id'] == user_id) & (reviews['business_id'] == business_id)]['stars'].values.mean()
        elif review_id:
            return reviews[(reviews['review_id'] == review_id)]
    except:
        return np.nan# define a helper function to create a pivot table

In [4]:
def pivot_ratings(reviews):
    businessIds = reviews['business_id'].unique()
    userIds = reviews['user_id'].unique()
    
    # create empty data frame
    pivot_data = pd.DataFrame(np.nan, columns=userIds, index=businessIds, dtype=float)
    
    # use the function get_rating to fill the matrix
    for i in pivot_data.columns:
        for j in pivot_data.index:
            pivot_data[i][j] = get_rating(reviews, i, j)
    
    return pivot_data.T


In [5]:
def mean_center_columns(matrix):
    for column in matrix.columns:
        matrix[column] -= matrix[column].mean()
    return matrix.T

In [6]:
def cosine_similarity(matrix, id1, id2):
    """Compute cosine similarity"""
    selected_features = matrix.loc[id1].notna() & matrix.loc[id2].notna()
    
    # if no matching features, return 0
    if not selected_features.any():
        return 0
    
    # get the features from the matrix
    features1 = matrix.loc[id1][selected_features]
    features2 = matrix.loc[id2][selected_features]
    
    # return 1 for the diagonals and 0 if there are no matching features
    if features1.equals(features2):
        return 1
    if features1.max() == 0 or features2.max() == 0:
        return 0
    
    return sum(features1 * features2)/((sum(features1**2)**0.5) * sum((features2**2))**0.5)


def create_similarity_matrix_cosine(matrix):
    """ creates the similarity matrix based on cosine similarity """
    similarity_matrix = pd.DataFrame(0, index=matrix.index, columns=matrix.index, dtype=float)
    id1 = similarity_matrix.columns.values
    for i in id1:
        for j in id1:
            similarity_matrix[i][j] = cosine_similarity(matrix, i, j)
    return similarity_matrix 

In [7]:
def select_neighborhood(similarity_matrix, utility_matrix, target_user, target_business):
    """selects all items with similarity > 0"""
    try:
        all_neighbors = similarity_matrix.loc[utility_matrix[target_user].dropna().index.values][target_business]
        return all_neighbors[all_neighbors > 0]
    except:
        return np.nan

def weighted_mean(neighborhood, utility_matrix, user_id):
    try:
        ratings = utility_matrix[user_id][neighborhood.index.values]
        return (neighborhood * ratings).sum() / neighborhood.sum()
    except:
        return np.nan

In [8]:
def create_prediction_df(utility_matrix, similarity_matrix):
    prediction_df = pd.DataFrame(np.nan, columns=utility_matrix.columns, index=utility_matrix.index, dtype=float)           
    for business in utility_matrix.index:
        for user in utility_matrix.columns:
            neighborhood = select_neighborhood(similarity_matrix, utility_matrix, user, business)
            prediction_df[user][business] = weighted_mean(neighborhood, utility_matrix, user)
    return prediction_df

In [9]:
def top_recommendations(predictions_df, user, amount=10):
    reccs = prediction_df[user].sort_values(ascending=False)[:amount]
    for id in reccs.index:
        print(Sun_business[Sun_business['business_id'] == id]['name'].values, reccs[id])
    return reccs

In [10]:
def get_recommendations_website(user, city, new = False):
    reviews = pd.read_json(open(f'yelp-all/{city}/review.json'), lines=True)
    users = pd.read_json(open(f'yelp-all/{city}/user.json'), lines=True)
    businesses = pd.read_json(open(f'yelp-all/{city}/business.json'), lines=True)
    utility_matrix = pivot_ratings(reviews)
    centered_utility_matrix = mean_center_columns(utility_matrix)
    similarity_matrix = create_similarity_matrix_cosine(centered_utility_matrix)
    prediction_df = create_prediction_df(utility_matrix, similarity_matrix)
    if new == True:
        prediction_df= prediction_df.subtract(utility_matrix, fill_value = 0)
    return top_recommendations(prediction_df, user)

Volgende stap is alles uit te testen en een resultaat te krijgen

In [11]:
get_recommendations_website('--bk6oc1GSNnTZG-UakcfQ', 'stouffville')

  """
  ret = ret.dtype.type(ret / rcount)


NameError: name 'utility_matrix' is not defined

In [None]:
# utility_matrix = pivot_ratings(review)
# adjusted_utility_matrix = mean_center_columns(utility_matrix)
# similarity_matrix = create_similairty_matrix_cosine (adjusted_utility_matrix)
