## 3. Collaborative Filtering (based on user's review)

In [2]:
import surprise
surprise.__version__

'1.1.1'

In [2]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [17]:
ratings = pd.read_csv('ratings_small.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [6]:
print("minimum rating is {}".format(ratings['rating'].min()))
print("maximum rating is {}".format(ratings['rating'].max()))


minimum rating is 0.5
maximum rating is 5.0


In [9]:
# So we need to set the range of the ratings to scale = (0.5, 5) by using the Reader class
reader = Reader(rating_scale=(0.5,5))

data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader=reader)
data

<surprise.dataset.DatasetAutoFolds at 0x7f79922d46d0>

In [10]:
# build the SVD model
svd = SVD(random_state=0)

# evaluate the model
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8932  0.9016  0.8923  0.9042  0.8998  0.8982  0.0047  
MAE (testset)     0.6882  0.6964  0.6872  0.6930  0.6919  0.6913  0.0033  
Fit time          4.44    4.57    4.72    4.66    4.65    4.61    0.10    
Test time         0.08    0.18    0.08    0.08    0.08    0.10    0.04    


{'test_rmse': array([0.8932147 , 0.90164325, 0.89230422, 0.90424108, 0.899764  ]),
 'test_mae': array([0.6882475 , 0.6963735 , 0.68718076, 0.69299626, 0.69193298]),
 'fit_time': (4.4355080127716064,
  4.568915843963623,
  4.722170114517212,
  4.6563849449157715,
  4.6476991176605225),
 'test_time': (0.08398604393005371,
  0.18099498748779297,
  0.08353209495544434,
  0.07916784286499023,
  0.08052587509155273)}

In [11]:
# train the svd using the data.build_full_trainset()
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f79922d4670>

In [13]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [12]:
svd.predict(1, 302)
# if the user1 is seeing the 302 movie, user1 will approximately rate this movie as...

Prediction(uid=1, iid=302, r_ui=None, est=2.7142061734434044, details={'was_impossible': False})

In [14]:
svd.predict(1, 1172, 4)

Prediction(uid=1, iid=1172, r_ui=4, est=3.483496124990946, details={'was_impossible': False})

In [21]:
print(ratings[ratings['userId'] == 2])

    userId  movieId  rating  timestamp
20       2       10     4.0  835355493
21       2       17     5.0  835355681
22       2       39     5.0  835355604
23       2       47     4.0  835355552
24       2       50     4.0  835355586
..     ...      ...     ...        ...
91       2      592     5.0  835355395
92       2      593     3.0  835355511
93       2      616     3.0  835355932
94       2      661     4.0  835356141
95       2      720     4.0  835355978

[76 rows x 4 columns]
