# Collaborative Filtering Using Surprise


In [1]:
# import library
import pandas as pd
import numpy as np

from surprise import Reader
from surprise import Dataset
from surprise import SVD
from surprise.model_selection import cross_validate


In [2]:
# Load the Amazon Video Games ratings.
# The CSV is read without a header row, so column names are attached afterwards.
ratings_path = "Data/Ratings.csv"
column_names = ['userID', 'itemID', 'rating', 'unixReviewTime']

df = pd.read_csv(ratings_path, header=None)
df.columns = column_names
df.head()

Unnamed: 0,userID,itemID,rating,unixReviewTime
0,AB9S9279OZ3QO,0078764343,5.0,1373155200
1,A24SSUT5CSW8BH,0078764343,5.0,1377302400
2,AK3V0HEBJMQ7J,0078764343,4.0,1372896000
3,A10BECPH7W8HM7,043933702X,5.0,1404950400
4,A2PRV9OULX1TWP,043933702X,5.0,1386115200


In [3]:
# Drop rows that contain missing values.
# dropna() returns a new DataFrame instead of modifying `df` in place, so the
# result must be assigned back; otherwise this line has no effect.
df = df.dropna()

# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,rating,unixReviewTime
count,1324753.0,1324753.0
mean,3.978754,1299234000.0
std,1.378987,109255200.0
min,1.0,878947200.0
25%,3.0,1252022000.0
50%,5.0,1346544000.0
75%,5.0,1379894000.0
max,5.0,1406074000.0


In [4]:
# Size of the dataset: distinct users / games, and the largest number of
# ratings attached to any single user and any single game.

ratings_per_user = df.groupby('userID')['rating'].count()
ratings_per_game = df.groupby('itemID')['rating'].count()

n_users = df['userID'].nunique()
n_games = df['itemID'].nunique()
max_user = ratings_per_user.max()
max_game = ratings_per_game.max()

print('Number of users', n_users)
print('Number of games', n_games)
print('Top Reveiwer', max_user, 'ratings')
print('Top Game', max_game, 'ratings')

Number of users 826767
Number of games 50210
Top Reveiwer 880 ratings
Top Game 16221 ratings


In [5]:
# Wrap the ratings frame in a Surprise Dataset.

from surprise import Dataset
from surprise import Reader

# Columns are passed to load_from_df in the order: user, item, rating.
rating_columns = ['userID', 'itemID', 'rating']

# Ratings are on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(df[rating_columns], reader)
data

<surprise.dataset.DatasetAutoFolds at 0x159bda0eef0>

### SVD

In [6]:
# SVD model from Surprise, created with its default settings.
model = SVD()

# 3-fold cross-validation reporting RMSE and MAE; verbose=True prints the
# per-fold table, and the returned result dict is displayed as the cell output.
cross_validate(
    model,
    data,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.2568  1.2573  1.2547  1.2563  0.0011  
MAE (testset)     0.9858  0.9867  0.9835  0.9853  0.0014  
Fit time          104.01  104.84  99.22   102.69  2.48    
Test time         5.97    4.53    3.79    4.76    0.91    


{'fit_time': (104.01475405693054, 104.84029698371887, 99.22409987449646),
 'test_mae': array([ 0.98581752,  0.98670601,  0.98347126]),
 'test_rmse': array([ 1.25683448,  1.25734866,  1.25472148]),
 'test_time': (5.969317436218262, 4.5317792892456055, 3.7885751724243164)}

### Recommendation

In [7]:
# Full rating history of one specific user: AZZTC2OYVNE2Q.
# NOTE: `id` shadows the Python builtin of the same name; the name is kept
# because the following cells read it.

id = 'AZZTC2OYVNE2Q'
df.loc[df['userID'] == id]

Unnamed: 0,userID,itemID,rating,unixReviewTime
281591,AZZTC2OYVNE2Q,B000B9RI14,5.0,1365811200
576519,AZZTC2OYVNE2Q,B001PB9J14,3.0,1356566400
683761,AZZTC2OYVNE2Q,B002I0K50U,3.0,1365811200
704649,AZZTC2OYVNE2Q,B002P35JMO,5.0,1356566400
889750,AZZTC2OYVNE2Q,B004PAGJOC,4.0,1365811200
948325,AZZTC2OYVNE2Q,B0050SYX8W,4.0,1356566400
1028378,AZZTC2OYVNE2Q,B006W41X1S,5.0,1356566400
1176638,AZZTC2OYVNE2Q,B00BD9OLW0,5.0,1365811200


In [8]:
# Games this user rated 5 stars, indexed by itemID.
is_target_user = df['userID'] == id
is_five_star = df['rating'] == 5

user_df = df[is_target_user & is_five_star].set_index('itemID')
print(user_df)

                   userID  rating  unixReviewTime
itemID                                           
B000B9RI14  AZZTC2OYVNE2Q     5.0      1365811200
B002P35JMO  AZZTC2OYVNE2Q     5.0      1356566400
B006W41X1S  AZZTC2OYVNE2Q     5.0      1356566400
B00BD9OLW0  AZZTC2OYVNE2Q     5.0      1365811200


In [9]:
# Recommendation for this user: predict which games the user would love to play.

# Refit the model on every available rating before predicting.
trainset = data.build_full_trainset()
model.fit(trainset)

# Sanity check: estimated scores for the games the user already rated 5 stars.
# These are items the model was trained on, so they are not recommendations.
user_df = user_df.reset_index()
user_df['estimate_score'] = user_df['itemID'].apply(lambda x: model.predict(id, x).est)
user_df = user_df.sort_values('estimate_score', ascending=False)
print(user_df.head(10))

# Actual recommendations: score only the games this user has NOT rated yet and
# keep the ten with the highest estimated rating.
rated_items = set(df.loc[df['userID'] == id, 'itemID'])
candidate_items = df.loc[~df['itemID'].isin(rated_items), 'itemID'].unique()

recommendations = pd.DataFrame({'itemID': candidate_items})
recommendations['estimate_score'] = [
    model.predict(id, item).est for item in candidate_items
]
recommendations = recommendations.sort_values('estimate_score', ascending=False)
print(recommendations.head(10))

       itemID         userID  rating  unixReviewTime  estimate_score
0  B000B9RI14  AZZTC2OYVNE2Q     5.0      1365811200        4.848344
3  B00BD9OLW0  AZZTC2OYVNE2Q     5.0      1365811200        4.721596
1  B002P35JMO  AZZTC2OYVNE2Q     5.0      1356566400        4.565306
2  B006W41X1S  AZZTC2OYVNE2Q     5.0      1356566400        4.413981
