Based on this article: https://cambridgespark.com/content/tutorials/implementing-your-own-recommender-systems-in-Python/index.html

In [61]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt

In [62]:
reviews_df = pd.read_json("../../dataset/review_10k.json", lines=True)

In [63]:
# This cell creates a unique integer per business_id and a unique integer per user_id. 
# It then maps the business_id and user_id into business_id_int and user_id_int in the reviews_df.
# This is so that when we work with user-business matrices, we are dealing with integers as indexes rather than strings.

# Create an integer-based user-id
uid_map = {}
c = 0
for uid in reviews_df.user_id.unique():
    uid_map[uid] = c
    c += 1

# Create an integer-based business-id
bid_map = {}
c = 0
for bid in reviews_df.business_id.unique():
    bid_map[bid] = c
    c += 1
    
reviews_df['user_id_int'] = reviews_df.user_id.apply(lambda x: uid_map[x])
reviews_df['business_id_int'] = reviews_df.business_id.apply(lambda x: bid_map[x])

In [51]:
reviews_df.head(10)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,user_id_int,business_id_int
0,0W4lkclzZThpx3V65bVgig,0,2016-05-28,0,v0i_UHJMo_hPBq9bxWvW4w,5,"Love the staff, love the meat, love the place....",0,bv2nCi5Qv5vroFiqKGopiw,0,0
1,AEx2SYEUJmTxVVB18LlCwA,0,2016-05-28,0,vkVSCC7xljjrAI4UGfnKEQ,5,Super simple place but amazing nonetheless. It...,0,bv2nCi5Qv5vroFiqKGopiw,0,1
2,VR6GpWIda3SfvPC-lg9H3w,0,2016-05-28,0,n6QzIUObkYshz4dz2QRJTw,5,Small unassuming place that changes their menu...,0,bv2nCi5Qv5vroFiqKGopiw,0,2
3,CKC0-MOWMqoeWf6s-szl8g,0,2016-05-28,0,MV3CcKScW05u5LVfF6ok0g,5,Lester's is located in a beautiful neighborhoo...,0,bv2nCi5Qv5vroFiqKGopiw,0,3
4,ACFtxLv8pGrrxMm6EgjreA,0,2016-05-28,0,IXvOzsEMYtiJI0CARmj77Q,4,Love coming here. Yes the place always needs t...,0,bv2nCi5Qv5vroFiqKGopiw,0,4
5,s2I_Ni76bjJNK9yG60iD-Q,0,2016-05-28,0,L_9BTb55X0GDtThi6GlZ6w,4,Had their chocolate almond croissant and it wa...,0,bv2nCi5Qv5vroFiqKGopiw,0,5
6,8QWPlVQ6D-OExqXoaD2Z1g,0,2014-09-24,0,HRPm3vEZ_F-33TYVT7Pebw,5,Cycle Pub Las Vegas was a blast! Got a groupon...,1,_4iMDXbXZ1p1ONG297YEAQ,1,6
7,9_CGhHMz8698M9-PkVf0CQ,2,2012-05-11,0,ymAUG8DZfQcFTBSOiaNN4w,4,Who would have guess that you would be able to...,0,u0LXt3Uea_GidxRW1xcsfg,2,7
8,gkCorLgPyQLsptTHalL61g,0,2015-10-27,0,8UIishPUD92hXtScSga_gw,4,Always drove past this coffee house and wonder...,1,u0LXt3Uea_GidxRW1xcsfg,2,8
9,5r6-G9C4YLbC7Ziz57l3rQ,0,2013-02-09,0,w41ZS9shepfO3uEyhXEWuQ,3,"Not bad!! Love that there is a gluten-free, ve...",1,u0LXt3Uea_GidxRW1xcsfg,2,9


In [53]:
df = reviews_df[['user_id_int', 'business_id_int', 'stars']]
n_users = df.user_id_int.unique().shape[0]
n_businesses = df.business_id_int.unique().shape[0]
print 'Number of users = ' + str(n_users) + ' | Number of businesses = ' + str(n_businesses)

Number of users = 1059 | Number of businesses = 8261


In [54]:
# Split train and test data
train_data, test_data = cv.train_test_split(df, test_size=0.25)

In [55]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_businesses))
for line in train_data.itertuples():
    train_data_matrix[line[1], line[2]] = line[3]

test_data_matrix = np.zeros((n_users, n_businesses))
for line in test_data.itertuples():
    test_data_matrix[line[1], line[2]] = line[3]

In [56]:
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [57]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [58]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [59]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [60]:
print 'User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
print 'Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix))

User-based CF RMSE: 3.92262210994
Item-based CF RMSE: 3.92100355252
