In [1]:
from surprise import SVD, NMF, Dataset, Reader, accuracy, KNNBaseline
from surprise.model_selection import cross_validate, train_test_split
import pandas as pd
import numpy as np
from main import get_top_n

In [3]:
# Load the ratings table; the path is relative to the notebook's working directory.
df = pd.read_csv('ratings.csv')
# Preview the first rows to check which columns were read.
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# (number of rows, number of columns) of the loaded ratings frame.
df.shape

(100836, 4)

In [5]:
# What scale are the ratings on (e.g. 1 to 10, or -3 to +3)? Check the data before configuring the Reader.

In [7]:
# Summary statistics of the rating column; min and max give the rating scale.
df.rating.describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [8]:
# Rating bounds match the observed min (0.5) and max (5.0) of df.rating.
reader = Reader(rating_scale=(0.5, 5.0))

In [10]:
# Wrap the frame as a Surprise Dataset. Only the user, item and rating columns
# are passed (in that order); timestamp is left out.
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

In [11]:
# Hold out 25% of the ratings for evaluation. The seed is fixed so the split,
# and every metric computed from it, is reproducible after a kernel restart.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [12]:
# Non-negative matrix factorization with Surprise's default hyper-parameters.
# The factors are randomly initialised, so the seed is fixed to make the fit
# reproducible across runs.
nmf = NMF(random_state=42)

In [13]:
# Train the NMF model on the training split; fit() returns the fitted estimator.
nmf.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x124920828>

In [15]:
# Predict a rating for every (user, item) pair in the held-out test split.
predictions = nmf.test(testset)

In [16]:
# Root-mean-square error of the test predictions; rmse() prints the value and
# also returns it.
accuracy.rmse(predictions)

RMSE: 0.9325


0.932526195227843

In [17]:
# Inspect a few Prediction records: user id, item id, true rating (r_ui),
# estimated rating (est) and details.
predictions[:5]

[Prediction(uid=484, iid=1580, r_ui=3.5, est=3.5374415894266735, details={'was_impossible': False}),
 Prediction(uid=223, iid=780, r_ui=3.5, est=2.7685657935791177, details={'was_impossible': False}),
 Prediction(uid=466, iid=1196, r_ui=2.5, est=4.582975208687068, details={'was_impossible': False}),
 Prediction(uid=89, iid=2997, r_ui=3.0, est=2.817522917793011, details={'was_impossible': False}),
 Prediction(uid=247, iid=1291, r_ui=4.0, est=3.8244904335711025, details={'was_impossible': False})]

In [21]:
# Estimate the rating user 1 would give item 31; r_ui is None because no true
# rating is supplied.
nmf.predict(1, 31)

Prediction(uid=1, iid=31, r_ui=None, est=4.175081599001478, details={'was_impossible': False})

In [22]:
# Query an item id the model has not seen: the result is flagged
# was_impossible=True and a fallback estimate is returned instead of a model
# prediction (presumably the training-set mean rating — confirm).
nmf.predict(1, 1000000000)

Prediction(uid=1, iid=1000000000, r_ui=None, est=3.500588414190699, details={'was_impossible': True, 'reason': 'User and item are unkown.'})

In [24]:
# get_top_n comes from the local `main` module (not shown here); presumably it
# maps each user id to its n highest-estimated (item id, estimate) pairs — verify.
# `predictions` only covers the held-out test split, so the ranking is limited
# to items each user rated in that split.
top_n = get_top_n(predictions, n=3)

In [28]:
# Show the top-n item ids for the first six users only; zipping against a
# bounded range stops the iteration without an explicit break.
for i, (uid, ur) in zip(range(6), top_n.items()):
    item_ids = [iid for iid, _ in ur]
    print(i, uid, item_ids)

0 484 [318, 88129, 364]
1 223 [58559, 6711, 260]
2 466 [69844, 1196, 7153]
3 89 [945, 1256, 58998]
4 247 [608, 260, 6377]
5 387 [1248, 3627, 858]


In [29]:
# k-NN with baseline ratings, using cosine similarity computed between items
# (user_based=False) rather than between users.
knn = KNNBaseline(sim_options={'name': 'cosine', 'user_based':False})

In [30]:
# Fit on the training split; this estimates the baselines and builds the
# similarity matrix (see the progress messages in the output).
knn.fit(trainset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x12108bcc0>

In [31]:
# NOTE(review): per the Surprise docs, get_neighbors takes an *inner* id and
# returns inner ids, not raw movieIds; with user_based=False these are item
# inner ids. Translate with trainset.to_inner_iid / trainset.to_raw_iid if raw
# ids are intended — confirm.
knn.get_neighbors(2, 5)

[28, 36, 42, 60, 68]