# Collaborative Filtering Using Surprise


In [13]:
# import library
import pandas as pd
import numpy as np

from surprise import Reader
from surprise import Dataset
from surprise import SVD
from surprise.model_selection import cross_validate


In [30]:
# import data
# Amazon ratings of Video Games
df = pd.read_csv("Data/Ratings.csv",header = None)
df.columns =['userID','itemID','rating','unixReviewTime']
df.head()

Unnamed: 0,userID,itemID,rating,unixReviewTime
0,AB9S9279OZ3QO,0078764343,5.0,1373155200
1,A24SSUT5CSW8BH,0078764343,5.0,1377302400
2,AK3V0HEBJMQ7J,0078764343,4.0,1372896000
3,A10BECPH7W8HM7,043933702X,5.0,1404950400
4,A2PRV9OULX1TWP,043933702X,5.0,1386115200


In [33]:
df.dropna()
df.describe()

Unnamed: 0,rating,unixReviewTime
count,1324753.0,1324753.0
mean,3.98,1299233854.71
std,1.38,109255206.4
min,1.0,878947200.0
25%,3.0,1252022400.0
50%,5.0,1346544000.0
75%,5.0,1379894400.0
max,5.0,1406073600.0


In [50]:
# check number of users and games

n_users = df.userID.nunique()
print('Number of users',n_users)
n_games = df.itemID.nunique()
print('Number of games', n_games)
  
max_user = df.groupby('userID')['rating'].count().max()
print('Top Reveiwer', max_user, 'ratings')
max_game = df.groupby('itemID')['rating'].count().max()
print('Top Game', max_game, 'ratings')

Number of users 826767
Number of games 50210
Top Reveiwer 880 ratings
Top Game 16221 ratings


In [34]:
# customize dataset to use Surprise

from surprise import Reader
from surprise import Dataset

reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
data

<surprise.dataset.DatasetAutoFolds at 0x161994ea4e0>

### SVD

In [37]:
# Use the famous SVD algorithm.
model = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2505  1.2549  1.2522  1.2558  1.2530  1.2533  0.0019  
MAE (testset)     0.9807  0.9835  0.9810  0.9831  0.9815  0.9820  0.0011  
Fit time          158.96  166.04  163.27  157.38  89.30   146.99  29.01   
Test time         6.29    5.24    4.70    3.71    3.01    4.59    1.15    


{'fit_time': (158.95942616462708,
  166.03821873664856,
  163.2711832523346,
  157.37851119041443,
  89.3022038936615),
 'test_mae': array([ 0.98073135,  0.9835231 ,  0.98098581,  0.98307183,  0.98151104]),
 'test_rmse': array([ 1.2505102 ,  1.25488836,  1.25224264,  1.25584443,  1.25298003]),
 'test_time': (6.292738199234009,
  5.2379326820373535,
  4.704514741897583,
  3.7082934379577637,
  3.010585069656372)}

### Recommendation

In [69]:
# for specific user AZZTK25SW2VNN

id= 'AZZTK25SW2VNN'
user_df = df[(df['userID'] == id) & (df['rating'] == 5)]
user_df = user_df.set_index('itemID')
print(user_df)

                   userID  rating  unixReviewTime
itemID                                           
B00009VE6E  AZZTK25SW2VNN    5.00      1386892800
B0002J9UB4  AZZTK25SW2VNN    5.00      1386892800
B000FUWCRY  AZZTK25SW2VNN    5.00      1386892800
B0013OL0BK  AZZTK25SW2VNN    5.00      1386892800
B0037JT3TK  AZZTK25SW2VNN    5.00      1360368000


In [73]:
# recommendation for this user
# predict what game would user love to play

trainset = data.build_full_trainset()
model.fit(trainset)

user_df = user_df.reset_index()
user_df['estimate_score'] = user_df['itemID'].apply(lambda x: model.predict(id, x).est)
user_df = user_df.sort_values('estimate_score', ascending=False)
print(user_df.head(10))

   level_0  index      itemID         userID  rating  unixReviewTime  \
3        3      3  B0013OL0BK  AZZTK25SW2VNN    5.00      1386892800   
0        0      0  B00009VE6E  AZZTK25SW2VNN    5.00      1386892800   
1        1      1  B0002J9UB4  AZZTK25SW2VNN    5.00      1386892800   
2        2      2  B000FUWCRY  AZZTK25SW2VNN    5.00      1386892800   
4        4      4  B0037JT3TK  AZZTK25SW2VNN    5.00      1360368000   

   estimate_score  
3            4.76  
0            4.60  
1            4.59  
2            4.57  
4            4.57  
