## User-Based filtering

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load every Yelp table for Stouffville. Each file is newline-delimited JSON
# (one record per line), hence lines=True.
lst = []
for file in ['business', 'checkin', 'review', 'tip', 'user']:
    # The context manager closes the handle as soon as the table is parsed;
    # the explicit encoding keeps the result independent of the OS locale.
    with open(f'yelp-all/stouffville/{file}.json', encoding='utf-8') as f:
        lst.append(pd.read_json(f, lines=True))

# Unpack in the same order as the file list above.
business, checkin, review, tip, user = lst

Na het inladen van de modules en de databestanden maken we met een aantal helperfuncties een utility-matrix. Het verschil tussen user-based en item-based filtering zit in de oriëntatie van die matrix.

In [3]:
def get_rating(reviews, user_id = None, business_id = None, review_id = None):
    """Look up a rating or a review in ``reviews``.

    Two modes, chosen by which arguments are supplied:

    * ``user_id`` and ``business_id``: return the mean of the ``stars`` values
      of all rows matching both ids, or ``np.nan`` when no row matches.
    * ``review_id`` (used only when the pair above is incomplete): return the
      rows of ``reviews`` whose ``review_id`` matches, as a DataFrame.

    Parameters
    ----------
    reviews : pandas.DataFrame
        Frame with ``user_id``, ``business_id``, ``review_id`` and ``stars``
        columns (only the columns needed for the chosen mode are read).
    user_id, business_id, review_id : optional
        Lookup keys; ``None`` means "not supplied".

    Returns
    -------
    float, pandas.DataFrame or None
        See the modes above. ``None`` is returned when no usable key is given,
        ``np.nan`` when the lookup fails (for example a missing column).
    """
    try:
        # Compare against None rather than truthiness so that ids such as 0
        # are still treated as "supplied".
        if user_id is not None and business_id is not None:
            matches = (reviews['user_id'] == user_id) & (reviews['business_id'] == business_id)
            stars = reviews.loc[matches, 'stars']
            # Return NaN directly: averaging an empty array also yields NaN but
            # emits a "Mean of empty slice" RuntimeWarning for every missing pair.
            if stars.empty:
                return np.nan
            return stars.values.mean()
        if review_id is not None:
            return reviews[reviews['review_id'] == review_id]
    except Exception:
        # Best-effort lookup: any data problem is reported as "no rating".
        # Unlike a bare except this lets KeyboardInterrupt/SystemExit through,
        # so a long-running loop around this helper can still be interrupted.
        return np.nan
    return None

In [4]:
def pivot_ratings(reviews):
    """Build the utility matrix (users as rows, businesses as columns).

    Each cell holds the mean ``stars`` value of all reviews the user wrote for
    the business, or NaN when the user never reviewed it.

    Parameters
    ----------
    reviews : pandas.DataFrame
        Frame with ``user_id``, ``business_id`` and ``stars`` columns.

    Returns
    -------
    pandas.DataFrame
        Float frame indexed by user id with one column per business id, both
        in order of first appearance in ``reviews``.
    """
    user_ids = reviews['user_id'].unique()
    business_ids = reviews['business_id'].unique()

    # Nothing to aggregate: return the (empty) frame with the expected labels.
    if len(reviews) == 0:
        return pd.DataFrame(np.nan, index=user_ids, columns=business_ids, dtype=float)

    # One grouped aggregation replaces a full-table scan per cell, and avoids
    # chained-indexing assignment (frame[col][row] = value), which does not
    # write through when pandas copy-on-write is enabled.
    pivot_data = reviews.pivot_table(
        index='user_id', columns='business_id', values='stars', aggfunc='mean'
    )

    # pivot_table sorts its labels and names its axes; restore the
    # first-appearance order and the unnamed axes.
    pivot_data = pivot_data.reindex(index=user_ids, columns=business_ids).astype(float)
    return pivot_data.rename_axis(index=None, columns=None)


In [5]:
def mean_center_columns(matrix):
    """Return a copy of ``matrix`` with each column's mean subtracted.

    Missing values are ignored when computing the column means and stay
    missing in the result. The input frame is left untouched, so the caller
    can keep using the un-centred matrix afterwards.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Numeric frame, possibly containing NaN.

    Returns
    -------
    pandas.DataFrame
        New frame with the same labels as ``matrix``.
    """
    column_means = matrix.mean(axis=0)
    # axis='columns' aligns the means with the column labels, so every row
    # has the per-column mean subtracted.
    return matrix.sub(column_means, axis='columns')

In [6]:
def cosine_similarity(matrix, id1, id2):
    """Compute the cosine similarity between two rows of ``matrix``.

    Only the columns in which both rows have a value are used.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Frame whose rows are the vectors to compare; NaN marks a missing value.
    id1, id2 :
        Index labels of the two rows.

    Returns
    -------
    float
        * ``0.0`` when the rows share no columns,
        * ``1.0`` when the shared parts are identical (this covers the diagonal),
        * ``0.0`` when either shared part is an all-zero vector,
        * otherwise the dot product divided by the product of the norms.
    """
    row1 = matrix.loc[id1]
    row2 = matrix.loc[id2]
    shared = row1.notna() & row2.notna()

    # if no matching features, return 0
    if not shared.any():
        return 0.0

    features1 = row1[shared]
    features2 = row2[shared]

    # identical vectors (including a row compared with itself) are maximally similar
    if features1.equals(features2):
        return 1.0

    norm1 = float((features1 ** 2).sum()) ** 0.5
    norm2 = float((features2 ** 2).sum()) ** 0.5

    # The cosine is undefined for a zero vector. Test the norm rather than the
    # maximum: a mean-centred vector such as [-2, 0] has max 0 but is not zero.
    if norm1 == 0 or norm2 == 0:
        return 0.0

    return float((features1 * features2).sum()) / (norm1 * norm2)


def create_similarity_matrix_cosine(matrix):
    """Create the pairwise cosine-similarity matrix of the rows of ``matrix``.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Frame whose rows are compared with each other.

    Returns
    -------
    pandas.DataFrame
        Square float frame; both its index and its columns are
        ``matrix.index``, and cell ``(a, b)`` is
        ``cosine_similarity(matrix, a, b)``.
    """
    ids = matrix.index
    count = len(ids)

    # Fill a plain array by position instead of assigning through
    # frame[col][row], which does not write through under pandas copy-on-write.
    values = np.zeros((count, count), dtype=float)
    for row in range(count):
        # The similarity is symmetric, so compute each pair once and mirror it.
        for col in range(row, count):
            similarity = cosine_similarity(matrix, ids[row], ids[col])
            values[row, col] = similarity
            values[col, row] = similarity

    return pd.DataFrame(values, index=ids, columns=ids)

In [7]:
def select_neighborhood(similarity_matrix, utility_matrix, target_user, target_business):
    """Select the positively similar neighbours usable for one prediction.

    The candidates are the rows of ``utility_matrix`` that have a value in the
    column ``target_user``. Of those, the ones whose similarity to
    ``target_business`` is greater than 0 are returned.

    NOTE(review): the parameter names follow the call in
    ``create_prediction_df``, which passes a column label as ``target_user``
    and a row label as ``target_business``. With a users-as-rows utility
    matrix the neighbours are therefore users who rated the business given as
    ``target_user`` - confirm the naming is intended.

    Parameters
    ----------
    similarity_matrix : pandas.DataFrame
        Square similarity frame labelled by the row labels of ``utility_matrix``.
    utility_matrix : pandas.DataFrame
        Ratings; NaN marks a missing rating.
    target_user :
        Column label of ``utility_matrix``.
    target_business :
        Column label of ``similarity_matrix``.

    Returns
    -------
    pandas.Series or float
        Similarities (> 0) indexed by neighbour label, or ``np.nan`` when the
        lookup fails (for example an unknown label).
    """
    try:
        candidates = utility_matrix[target_user].dropna().index
        # One .loc call picks rows and column together, so no intermediate
        # frame holding every column of the selected rows is built.
        all_neighbors = similarity_matrix.loc[candidates, target_business]
        return all_neighbors[all_neighbors > 0]
    except Exception:
        # Best-effort: report "no neighbourhood" instead of raising, but let
        # KeyboardInterrupt/SystemExit propagate (a bare except would not).
        return np.nan

def weighted_mean(neighborhood, utility_matrix, user_id):
    """Similarity-weighted mean of the neighbours' ratings.

    Parameters
    ----------
    neighborhood : pandas.Series or float
        Weights indexed by neighbour label, as returned by
        ``select_neighborhood`` (which returns a bare ``np.nan`` on failure).
    utility_matrix : pandas.DataFrame
        Ratings; the column ``user_id`` is read at the neighbour labels.
    user_id :
        Column label of ``utility_matrix``.

    Returns
    -------
    float
        ``sum(weight * rating) / sum(weight)`` over the neighbours that have a
        rating, or ``np.nan`` when there is no usable neighbour, the weights
        sum to zero, or the lookup fails.
    """
    # Handle "no neighbourhood" explicitly instead of relying on an exception,
    # and avoid the 0/0 RuntimeWarning an empty neighbourhood would trigger.
    if not isinstance(neighborhood, pd.Series) or neighborhood.empty:
        return np.nan
    try:
        ratings = utility_matrix[user_id][neighborhood.index.values]
        # A neighbour without a rating contributes nothing to the numerator,
        # so its weight must not count in the denominator either.
        rated = ratings.notna().values
        weights = neighborhood[rated]
        total_weight = weights.sum()
        if total_weight == 0:
            return np.nan
        return (weights * ratings[rated]).sum() / total_weight
    except Exception:
        # Best-effort: unknown labels and similar problems give "no prediction".
        return np.nan

In [8]:
def create_prediction_df(utility_matrix, similarity_matrix):
    """Predict a rating for every cell of ``utility_matrix``.

    For each (row label, column label) pair the neighbourhood is selected with
    ``select_neighborhood`` and the prediction is the ``weighted_mean`` of the
    neighbours' values in that column.

    Parameters
    ----------
    utility_matrix : pandas.DataFrame
        Ratings; NaN marks a missing rating.
    similarity_matrix : pandas.DataFrame
        Pairwise similarities between the rows of ``utility_matrix``.

    Returns
    -------
    pandas.DataFrame
        Float frame with the same index and columns as ``utility_matrix``;
        cells without a usable neighbourhood are NaN.
    """
    row_labels = utility_matrix.index
    column_labels = utility_matrix.columns

    # Fill a plain array by position instead of assigning through
    # frame[col][row], which does not write through under pandas copy-on-write.
    predictions = np.full((len(row_labels), len(column_labels)), np.nan, dtype=float)
    for row_pos, business in enumerate(row_labels):
        for col_pos, user in enumerate(column_labels):
            neighborhood = select_neighborhood(similarity_matrix, utility_matrix, user, business)
            predictions[row_pos, col_pos] = weighted_mean(neighborhood, utility_matrix, user)

    return pd.DataFrame(predictions, index=row_labels, columns=column_labels)

In [42]:
def top_recommendations(prediction_df, user, amount=10, businesses=None):
    """Print and return the highest predicted ratings for one user.

    Parameters
    ----------
    prediction_df : pandas.DataFrame
        Predictions with one row per user and one column per business id.
    user :
        Index label of the row to rank.
    amount : int, default 10
        Maximum number of recommendations.
    businesses : pandas.DataFrame, optional
        Frame with ``business_id`` and ``name`` columns used to look up the
        printed names. Defaults to the module-level ``business`` frame.

    Returns
    -------
    pandas.Series
        The ``amount`` largest predictions of ``user``, indexed by business
        id, in descending order (NaN predictions sort last).
    """
    if businesses is None:
        # Backward-compatible default: the frame loaded at the top of the notebook.
        businesses = business

    # Read the user's row directly; transposing the whole frame is unnecessary.
    reccs = prediction_df.loc[user].sort_values(ascending=False)[:amount]
    for business_id, score in reccs.items():
        names = businesses[businesses['business_id'] == business_id]['name'].values
        print(names, score)
    return reccs

In [10]:
def get_recommendations_website(user, city, new = False):
    """Run the whole user-based pipeline for one city and rank one user.

    Reads ``yelp-all/{city}/review.json``, builds the utility matrix, the
    mean-centred matrix, the cosine similarity matrix and the predictions,
    then delegates to ``top_recommendations``. Progress is printed after
    every stage.

    Parameters
    ----------
    user :
        Id of the user to recommend for (a row label of the utility matrix).
    city : str
        Name of the sub-directory of ``yelp-all`` holding the city's files.
    new : bool, default False
        When true, the user's known ratings are subtracted from the
        predictions (missing ratings count as 0) before ranking.

    Returns
    -------
    pandas.Series
        Whatever ``top_recommendations`` returns for ``user``.
    """
    # Only the reviews feed the matrices below. The user and business tables
    # used to be loaded here too, but were never used, and the user table
    # overwrote the ``user`` argument so that a DataFrame was passed on as the
    # user id.
    # NOTE(review): top_recommendations looks names up in the module-level
    # ``business`` frame, not in a frame for ``city`` - confirm this is intended.
    with open(f'yelp-all/{city}/review.json', encoding='utf-8') as review_file:
        review = pd.read_json(review_file, lines=True)
    print('files read')
    utility_matrix = pivot_ratings(review)
    print('utility_matrix constructed')
    # Centre a copy so utility_matrix keeps the raw ratings that the
    # prediction step averages, even if the centring helper works in place.
    centered_utility_matrix = mean_center_columns(utility_matrix.copy())
    print('matrix centered')
    similarity_matrix = create_similarity_matrix_cosine(centered_utility_matrix)
    print('similarity_matrix created')
    prediction_df = create_prediction_df(utility_matrix, similarity_matrix)
    print('prediction_df made')
    if new:
        prediction_df = prediction_df.subtract(utility_matrix, fill_value = 0)
    return top_recommendations(prediction_df, user)

De volgende stap is alles uit te testen en een resultaat te krijgen.

In [None]:
# Run the full pipeline for one Stouffville user; the cell displays the returned ranking.
get_recommendations_website(user='--bk6oc1GSNnTZG-UakcfQ', city='stouffville')

files read


  """
  ret = ret.dtype.type(ret / rcount)


utility_matrix constructed
matrix centered


### The function above is split into separate steps below. This is not necessary.

In [15]:
# Number of star ratings each business has received.
occurences = review.groupby('business_id')['stars'].count()

# Attach that count to every review. The joined Series is also named 'stars',
# so the 'count' suffix turns it into the 'starscount' column.
review1 = review.join(occurences, on='business_id', rsuffix='count')

# Keep only the reviews of businesses with at least 30 ratings.
review_new = review1.query('starscount >= 30')
review_new

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,starscount
0,R4pI7oeAhg1aaGbn9Pv0Sg,0,2018-09-01 03:08:14,0,IFmI6XI1dT5pdWmyur57sQ,2,Oh boy. I love giving proactive reviews but th...,2,aiKbXyRy-WxJNMCussaefg,48
3,R4pI7oeAhg1aaGbn9Pv0Sg,0,2018-06-19 16:16:10,0,7BxpHVk4OwcJaFGBu3smRA,3,If you get a seat on the patio the view of the...,0,aRFTEoz6QS5YWwmDeZoiYw,48
4,R4pI7oeAhg1aaGbn9Pv0Sg,0,2016-12-19 18:08:21,0,MFjXc-Jq0Z87L4MacIm-Fw,4,The atmosphere alone made the 20 minute drive ...,0,-JwJiOuKHl9AGfua8GQzkA,48
10,R4pI7oeAhg1aaGbn9Pv0Sg,0,2015-08-08 15:15:13,0,RT4HYadgsfyTbAB1ajez0A,3,The atmosphere is amazing at this outdoor rest...,1,v5o9CV8nFMfaCwAzEmkR2Q,48
11,R4pI7oeAhg1aaGbn9Pv0Sg,1,2016-09-06 14:00:50,0,d_Oq-IOyIRloTsTNhS2VXw,4,"Great view - checked, romantic - checked, grea...",1,4D6LLuJfao_eHGA6XZR-bA,48
13,R4pI7oeAhg1aaGbn9Pv0Sg,0,2018-07-08 11:12:21,0,XeoK2v9PTmJHzCgFbcZlWA,4,What a great view and a perfect place to hang ...,0,W30LSAdFOnDwXVXl8jCSIw,48
17,R4pI7oeAhg1aaGbn9Pv0Sg,1,2018-08-15 19:57:41,0,_g56h9TtsFiStgzoCgviVg,4,"First time here on Saturday, just 20 mins from...",4,2Rw4bXIrKd94be-BXMzoyw,48
23,R4pI7oeAhg1aaGbn9Pv0Sg,0,2015-08-10 04:53:09,0,brBo-UGaN7MJj9VGhfmZ7g,4,The view is really the first thing we came her...,0,UmTMCfPlhA6kJLAsLycSfg,48
24,R4pI7oeAhg1aaGbn9Pv0Sg,1,2016-08-14 21:20:32,0,wKg7LIAp5bxF_ZyUYq9hyw,4,Great spot with a view! \n\nThis resto is easy...,0,cCpiu1GqdzxlvANKuiAp0Q,48
28,R4pI7oeAhg1aaGbn9Pv0Sg,0,2015-07-31 00:30:07,0,eIE2neT89zwBZxB7shSlXQ,5,I've had the opportunity to revisit Fishbone t...,1,5MeOxFGpR3ku4fdzz38BlA,48


In [36]:
# Utility matrix of the filtered reviews: one row per user, one column per business.
utility_new = pivot_ratings(reviews=review_new)
utility_new.head(n=5)

  """


Unnamed: 0,R4pI7oeAhg1aaGbn9Pv0Sg,39rLHYJOy2774ZIUouuWLw,yKaAlvQILs53ML8BpavKhw,VmSkQsAjUtzJeliKeGQCOQ
aiKbXyRy-WxJNMCussaefg,2.0,,,
aRFTEoz6QS5YWwmDeZoiYw,3.0,,,
-JwJiOuKHl9AGfua8GQzkA,4.0,,,
v5o9CV8nFMfaCwAzEmkR2Q,3.0,,,
4D6LLuJfao_eHGA6XZR-bA,4.0,,,


In [28]:
# Centre a copy so that utility_new keeps the raw ratings needed by the
# prediction step, whether or not the centring helper works in place. This
# keeps the cells below independent of the order in which they are executed.
centered_utility_new = mean_center_columns(utility_new.copy())
centered_utility_new.head()

Unnamed: 0,R4pI7oeAhg1aaGbn9Pv0Sg,39rLHYJOy2774ZIUouuWLw,yKaAlvQILs53ML8BpavKhw,VmSkQsAjUtzJeliKeGQCOQ
aiKbXyRy-WxJNMCussaefg,-1.958333,,,
aRFTEoz6QS5YWwmDeZoiYw,-0.958333,,,
-JwJiOuKHl9AGfua8GQzkA,0.041667,,,
v5o9CV8nFMfaCwAzEmkR2Q,-0.958333,,,
4D6LLuJfao_eHGA6XZR-bA,0.041667,,,


In [29]:
# Pairwise cosine similarity between the rows (users) of the centred matrix.
similarity_new = create_similarity_matrix_cosine(matrix=centered_utility_new)
similarity_new.head(n=5)

Unnamed: 0,aiKbXyRy-WxJNMCussaefg,aRFTEoz6QS5YWwmDeZoiYw,-JwJiOuKHl9AGfua8GQzkA,v5o9CV8nFMfaCwAzEmkR2Q,4D6LLuJfao_eHGA6XZR-bA,W30LSAdFOnDwXVXl8jCSIw,2Rw4bXIrKd94be-BXMzoyw,UmTMCfPlhA6kJLAsLycSfg,cCpiu1GqdzxlvANKuiAp0Q,5MeOxFGpR3ku4fdzz38BlA,...,tmTA7HiCtL9uMVQaszdE3w,w0YPNwGofPGUEIlsQSRx0Q,UjGEzuLocbqd4SSXPs--mQ,96bTII9Tt5dPg4srZHPWSA,oZfEvCP9JMYzIp13AF2sPw,e5kg9bLvlJz-MEUrGjIeVQ,jAcodA4L4f5SK2WcVF5N0g,UqmS4j0RzJP1NzUhGPV4EA,KkvXuZfi68vZPqgbeSmdow,-xxIawSxb5_L7EUXUNxz6g
aiKbXyRy-WxJNMCussaefg,1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aRFTEoz6QS5YWwmDeZoiYw,1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-JwJiOuKHl9AGfua8GQzkA,-1.0,-1.0,1.0,-1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
v5o9CV8nFMfaCwAzEmkR2Q,1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4D6LLuJfao_eHGA6XZR-bA,-1.0,-1.0,1.0,-1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Predicted rating for every (user, business) cell of the utility matrix.
prediction_df = create_prediction_df(
    utility_matrix=utility_new,
    similarity_matrix=similarity_new,
)
prediction_df

  if sys.path[0] == '':


Unnamed: 0,R4pI7oeAhg1aaGbn9Pv0Sg,39rLHYJOy2774ZIUouuWLw,yKaAlvQILs53ML8BpavKhw,VmSkQsAjUtzJeliKeGQCOQ
aiKbXyRy-WxJNMCussaefg,2.454545,,,
aRFTEoz6QS5YWwmDeZoiYw,2.454545,,,
-JwJiOuKHl9AGfua8GQzkA,4.405405,2.750000,3.000000,4.333333
v5o9CV8nFMfaCwAzEmkR2Q,2.454545,,,
4D6LLuJfao_eHGA6XZR-bA,4.405405,2.750000,3.000000,4.333333
W30LSAdFOnDwXVXl8jCSIw,4.405405,2.750000,3.000000,4.333333
2Rw4bXIrKd94be-BXMzoyw,4.405405,2.750000,3.000000,4.333333
UmTMCfPlhA6kJLAsLycSfg,4.405405,2.750000,3.000000,4.333333
cCpiu1GqdzxlvANKuiAp0Q,4.405405,2.750000,3.000000,4.333333
5MeOxFGpR3ku4fdzz38BlA,4.411376,3.000000,4.000000,4.965621


In [45]:
# Print the names and return the ranked predictions for a single user.
top_recommendations(prediction_df=prediction_df, user='yqzmRdR0Am4DhS0EpnCOPA')

['Maki Zushi'] 4.217391304347826
['Fickle Pickle Restaurant & Deli'] 4.0
['Fishbone By The Lake'] 4.0
['ViPei Bistro'] 1.0


yKaAlvQILs53ML8BpavKhw    4.217391
39rLHYJOy2774ZIUouuWLw    4.000000
R4pI7oeAhg1aaGbn9Pv0Sg    4.000000
VmSkQsAjUtzJeliKeGQCOQ    1.000000
Name: yqzmRdR0Am4DhS0EpnCOPA, dtype: float64