In [2]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy 
from surprise.model_selection import train_test_split

In [4]:
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.25, random_state=0)

In [5]:
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fe865747390>

In [6]:
# test() method
predictions = algo.test(testset)
type(predictions)

list

In [7]:
len(predictions)

25000

In [8]:
predictions[:5]

[Prediction(uid='120', iid='282', r_ui=4.0, est=3.7364036009807253, details={'was_impossible': False}),
 Prediction(uid='882', iid='291', r_ui=4.0, est=3.8711497507289954, details={'was_impossible': False}),
 Prediction(uid='535', iid='507', r_ui=5.0, est=4.276269069198319, details={'was_impossible': False}),
 Prediction(uid='697', iid='244', r_ui=5.0, est=3.6017863958905774, details={'was_impossible': False}),
 Prediction(uid='751', iid='385', r_ui=4.0, est=3.257111679305999, details={'was_impossible': False})]

In [11]:
[(pred.uid, pred.iid, pred.est) for pred in predictions[:3]]

[('120', '282', 3.7364036009807253),
 ('882', '291', 3.8711497507289954),
 ('535', '507', 4.276269069198319)]

In [13]:
# predict() method for an indivisual item
uid = str(196)
iid = str(302)
pred = algo.predict(uid, iid)
print(pred)

user: 196        item: 302        r_ui = None   est = 4.05   {'was_impossible': False}


In [14]:
accuracy.rmse(predictions)

RMSE: 0.9495


0.9494686037566482

In [15]:
import pandas as pd

ratings = pd.read_csv("ratings.csv")
ratings.to_csv("ratings_noheader", index=False, header=False)

In [17]:
from surprise import Reader

reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(0.5, 5))
data = Dataset.load_from_file("ratings_noheader", reader=reader)

# # from Pandas Dataframe
# reader = Reader(rating_scale=(0.5, 5.0))
# data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [19]:
train_set, test_set = train_test_split(data, test_size=.25, random_state=0)

In [21]:
algo = SVD(n_factors=50, random_state=0) # latent factor =50

In [22]:
algo.fit(train_set)
predictions = algo.test(test_set)
accuracy.rmse(predictions)

RMSE: 0.8682


0.8681952927143516

In [23]:
# Cross-validate and hyperparameter tunning

from surprise.model_selection import cross_validate

# ratings = pd.read_csv("ratings.csv")
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [25]:
algo = SVD(random_state=0)
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8712  0.8680  0.8796  0.8785  0.8745  0.8744  0.0044  
MAE (testset)     0.6717  0.6683  0.6753  0.6726  0.6724  0.6721  0.0023  
Fit time          4.41    4.51    4.53    4.53    4.52    4.50    0.04    
Test time         0.11    0.10    0.10    0.11    0.20    0.12    0.04    


{'test_rmse': array([0.87124655, 0.86797792, 0.8796338 , 0.87850185, 0.87445522]),
 'test_mae': array([0.67167593, 0.66828187, 0.67532411, 0.67259994, 0.6723908 ]),
 'fit_time': (4.4107441902160645,
  4.509663105010986,
  4.5306360721588135,
  4.526982069015503,
  4.517225027084351),
 'test_time': (0.10953807830810547,
  0.10424399375915527,
  0.10464286804199219,
  0.10774087905883789,
  0.19532203674316406)}

In [27]:
from surprise.model_selection import GridSearchCV

param_grid = {'n_epochs': [20, 40, 60],
             'n_factors':[50, 100, 200]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.8763273822176986
{'n_epochs': 20, 'n_factors': 50}
