In [32]:
pip install scikit-surprise

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [33]:
import pandas as pd
from surprise import Dataset, Reader
from surprise import KNNBasic
from surprise.model_selection import cross_validate

In [34]:
ratings_df = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1/main/Datasets/ratings.csv')
movies_df = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1/main/Datasets/movies.csv')

In [35]:
data = pd.merge(ratings_df, movies_df, on='movieId')

In [36]:
reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(data[['userId','movieId','rating']], reader)

In [37]:
sim_options = {'name':'cosine',
               'user_based': False,
               'k':10} # content based filtering

In [38]:
algo = KNNBasic(sim_options=sim_options)

In [39]:
cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9727  0.9759  0.9648  0.9842  0.9739  0.9743  0.0062  
Fit time          1.52    1.42    1.40    1.39    1.64    1.47    0.10    
Test time         3.15    2.87    3.01    3.35    3.16    3.11    0.16    


{'test_rmse': array([0.97265777, 0.97590203, 0.96481857, 0.9841798 , 0.97385791]),
 'fit_time': (1.5168449878692627,
  1.4155688285827637,
  1.3994770050048828,
  1.3862550258636475,
  1.6422078609466553),
 'test_time': (3.1537280082702637,
  2.8723762035369873,
  3.0056071281433105,
  3.3509020805358887,
  3.158282995223999)}

In [40]:
algo.fit(data.build_full_trainset())

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x104411fa0>

In [41]:
print(algo.sim)

[[1.         0.90773108 0.96657344 ... 1.         1.         1.        ]
 [0.90773108 1.         0.88581521 ... 1.         1.         1.        ]
 [0.96657344 0.88581521 1.         ... 1.         1.         1.        ]
 ...
 [1.         1.         1.         ... 1.         1.         1.        ]
 [1.         1.         1.         ... 1.         1.         1.        ]
 [1.         1.         1.         ... 1.         1.         1.        ]]


In [48]:
similarities = algo.sim

In [42]:
df = pd.merge(ratings_df, movies_df, on='movieId')

In [43]:
pred = algo.predict(uid=1, iid=10)
pred.est

3.9880309408033616

In [44]:
df[df['movieId']==10]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
27755,7,10,4.0,1322062970,GoldenEye (1995),Action|Adventure|Thriller
27756,9,10,3.0,842686329,GoldenEye (1995),Action|Adventure|Thriller
27757,11,10,3.0,850816421,GoldenEye (1995),Action|Adventure|Thriller
27758,28,10,3.0,884100808,GoldenEye (1995),Action|Adventure|Thriller
27759,29,10,4.0,836583421,GoldenEye (1995),Action|Adventure|Thriller
...,...,...,...,...,...,...
27885,627,10,3.5,1237934561,GoldenEye (1995),Action|Adventure|Thriller
27886,650,10,5.0,965517712,GoldenEye (1995),Action|Adventure|Thriller
27887,653,10,4.0,829071627,GoldenEye (1995),Action|Adventure|Thriller
27888,656,10,4.0,838777501,GoldenEye (1995),Action|Adventure|Thriller


In [47]:
from surprise.model_selection import train_test_split
from surprise import accuracy

In [49]:
trainset, testset = train_test_split(data, test_size=0.25)

In [50]:
algo = KNNBasic(sim_options={'user_based':True})

In [51]:
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x10440c0a0>

In [52]:
predictions = algo.test(testset)

In [59]:
df[(df['userId']==232) & (df['movieId']==56367)]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
27490,232,56367,4.0,1277406098,Juno (2007),Comedy|Drama|Romance


In [53]:
predictions

[Prediction(uid=30, iid=33794, r_ui=5.0, est=4.141956747337646, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=668, iid=65088, r_ui=2.0, est=1.8685541770943217, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid=232, iid=56367, r_ui=4.0, est=3.8696745325970388, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=330, iid=590, r_ui=5.0, est=3.867575165954036, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=192, iid=508, r_ui=5.0, est=3.852543754892401, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=404, iid=1042, r_ui=3.0, est=3.4329940828386314, details={'actual_k': 24, 'was_impossible': False}),
 Prediction(uid=164, iid=104074, r_ui=4.0, est=3.5170877423927904, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=72, iid=1729, r_ui=3.0, est=3.8934398582911522, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=109, iid=1214, r_ui=5.0, e

In [61]:
df[df['userId']==1]['movieId'].unique()

array([   16,    24,    32,    47,    50,   110,   150,   161,   165,
         204,   223,   256,   260,   261,   277,   296,   318,   349,
         356,   377,   380,   457,   480,   527,   589,   590,   592,
         593,   597,   608,   648,   719,   724,   736,   780,   858,
         912,   968,  1061,  1089,  1136,  1196,  1198,  1210,  1213,
        1220,  1221,  1222,  1233,  1243,  1258,  1265,  1267,  1270,
        1287,  1580,  1617,  1721,  1923,  1961,  2021,  2028,  2105,
        2161,  2194,  2396,  2407,  2455,  2467,  2502,  2542,  2571,
        2628,  2716,  2728,  2762,  2858,  2947,  2959,  3256,  3421,
        3578,  4011,  4027,  4033,  4085,  4262,  4306,  4963,  4993,
        4995,  5349,  5378,  5418,  5445,  5952,  6365,  6711,  6807,
        7153,  8825,  8961, 32587, 33493, 33794, 33834, 45950, 48516,
       48780, 49272, 52973, 54286, 57949])

In [62]:
algo.predict(uid=1, iid=10)

Prediction(uid=1, iid=10, r_ui=None, est=3.5810610591502514, details={'actual_k': 40, 'was_impossible': False})

In [63]:
df[df['movieId']==10]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
27755,7,10,4.0,1322062970,GoldenEye (1995),Action|Adventure|Thriller
27756,9,10,3.0,842686329,GoldenEye (1995),Action|Adventure|Thriller
27757,11,10,3.0,850816421,GoldenEye (1995),Action|Adventure|Thriller
27758,28,10,3.0,884100808,GoldenEye (1995),Action|Adventure|Thriller
27759,29,10,4.0,836583421,GoldenEye (1995),Action|Adventure|Thriller
...,...,...,...,...,...,...
27885,627,10,3.5,1237934561,GoldenEye (1995),Action|Adventure|Thriller
27886,650,10,5.0,965517712,GoldenEye (1995),Action|Adventure|Thriller
27887,653,10,4.0,829071627,GoldenEye (1995),Action|Adventure|Thriller
27888,656,10,4.0,838777501,GoldenEye (1995),Action|Adventure|Thriller


In [64]:
# SVD

from surprise import SVD

In [65]:
svd = SVD()
cross_validate(svd, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8637  0.8658  0.8736  0.8757  0.8745  0.8707  0.0049  
Fit time          0.43    0.55    0.49    0.51    0.51    0.50    0.04    
Test time         0.07    0.15    0.05    0.06    0.13    0.09    0.04    


{'test_rmse': array([0.86367135, 0.86576704, 0.87363876, 0.87566494, 0.87454553]),
 'fit_time': (0.43370580673217773,
  0.5457918643951416,
  0.4882988929748535,
  0.50872802734375,
  0.5107200145721436),
 'test_time': (0.06579804420471191,
  0.1472179889678955,
  0.0529780387878418,
  0.061151981353759766,
  0.12544512748718262)}

In [66]:
trainset2 = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10ea0ef40>

In [67]:
svd.predict(1, 10)

Prediction(uid=1, iid=10, r_ui=None, est=3.4199677485643796, details={'was_impossible': False})

In [68]:
# predictions for everyone

df['predicted_rating'] = df.apply(lambda x: svd.predict(x['userId'], x['movieId']).est, axis=1)

In [69]:
df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,predicted_rating
0,1,16,4.0,1217897793,Casino (1995),Crime|Drama,3.765583
1,9,16,4.0,842686699,Casino (1995),Crime|Drama,3.375058
2,12,16,1.5,1144396284,Casino (1995),Crime|Drama,2.667534
3,24,16,4.0,963468757,Casino (1995),Crime|Drama,3.795119
4,29,16,3.0,836820223,Casino (1995),Crime|Drama,3.262813
...,...,...,...,...,...,...,...
105334,668,140098,2.5,1450415424,Runoff (2015),Drama,2.751195
105335,668,140816,2.5,1443288791,Tangerine (2015),Comedy|Drama,2.645866
105336,668,141472,2.5,1442679119,The 50 Year Argument (2014),(no genres listed),2.751195
105337,668,142488,4.0,1451535844,Spotlight (2015),Thriller,2.751195
