In [1]:
import random

import pandas as pd
from surprise import prediction_algorithms as pa
from surprise import Dataset, Reader
from surprise import evaluate, print_perf

In [2]:
# Read the ratings CSV shipped under ./movielens_small into a DataFrame and
# record how many rows it holds.
data = pd.read_csv(filepath_or_buffer='./movielens_small/ratings.csv')
number_of_rows = data.shape[0]


In [3]:
# k-fold cross-validation of KNNBasic (k=10, min_k=1) for several fold counts,
# reporting RMSE, MAE and FCP for each.
#
# The Reader and the three-column rating frame do not depend on the fold count,
# so they are built once instead of on every iteration.
reader = Reader(rating_scale=(0.5, 5))
rating_columns = data[['userId','movieId','rating']]

# Maps fold count -> the object returned by evaluate(). The plain `perf` name is
# still assigned below, but on its own it only ever holds the last fold count.
perf_by_nfolds = {}

for nfolds in [2,4,5,10]:
    # A fresh Dataset per fold count, because split() is called on it below.
    train_data = Dataset.load_from_df(rating_columns, reader)

    algo = pa.knns.KNNBasic(k=10, min_k=1)

    # Surprise's FAQ recommends seeding Python's `random` module to make the
    # randomly generated folds repeatable; reseeding per fold count keeps each
    # configuration reproducible on its own.
    random.seed(0)
    train_data.split(n_folds=nfolds)

    # Label the report so the log shows which fold count it belongs to.
    print("n_folds : ", nfolds)
    perf = evaluate(algo, train_data, measures=['RMSE', 'MAE','FCP'])
    perf_by_nfolds[nfolds] = perf

Evaluating RMSE, MAE, FCP of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9990
MAE:  0.7673
FCP:  0.6415
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9981
MAE:  0.7644
FCP:  0.6418
------------
------------
Mean RMSE: 0.9985
Mean MAE : 0.7658
Mean FCP : 0.6417
------------
------------
Evaluating RMSE, MAE, FCP of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9668
MAE:  0.7449
FCP:  0.6560
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9712
MAE:  0.7424
FCP:  0.6551
------------
Fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9757
MAE:  0.7444
FCP:  0.6567
------------
Fold 4
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9755
MAE:  0.7491
FCP:  0.6549
--

In [4]:
# Chronological hold-out evaluation over the whole table: the rows are ordered
# by timestamp, the first part is used to fit KNNBasic and the remainder is
# scored with RMSE and MAE.
#
# NOTE: `data` is rebound to its timestamp-sorted version, exactly as before.
data = data.sort_values(['timestamp'], ascending=[1])

# Neither of these depends on the split point, so they are built once.
reader = Reader(rating_scale=(0.5, 5))
rating_columns = ['userId','movieId','rating']

# Each value is the fraction of rows used for TRAINING (the slice before the
# cut). The loop variable was previously named `test_ratio`, which described
# the opposite of what the slicing below does.
for train_ratio in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    split = int(train_ratio*number_of_rows)
    train_df = data[:split]
    test_data = data[split:]

    train_data = Dataset.load_from_df(train_df[rating_columns], reader)
    train_data = train_data.build_full_trainset()

    # min_k=1 is the documented KNNBasic default; it is spelled out here so
    # this cell reads the same as the other evaluation cells.
    algo = pa.knns.KNNBasic(k=10, min_k=1)

    # Label the run before training so its log lines are grouped under it.
    print("train ratio : ", train_ratio)
    algo.train(train_data)

    # itertuples yields plain (userId, movieId, rating) values per row, whereas
    # iterrows builds a Series per row and upcasts the integer ids to float.
    squared_error_sum = 0
    abs_error_sum = 0
    for user_id, movie_id, rating in test_data[rating_columns].itertuples(index=False):
        error = algo.predict(user_id, movie_id).est - rating
        squared_error_sum += error**2
        abs_error_sum += abs(error)
    print("rmse : " , (squared_error_sum/len(test_data))**0.5)
    print("mae : " , (abs_error_sum/len(test_data)))

Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.07101198352
mae :  0.851547586044
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.07653132499
mae :  0.845806077894
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.06700145423
mae :  0.83260725111
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.04385117435
mae :  0.812186292471
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.04688371602
mae :  0.815524191461
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.0564615367
mae :  0.824689506046
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.07783786041
mae :  0.844698255158
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.08963494702
mae :  0.861704630163
Computing the msd similarity matrix...
Done computing similarity m

In [5]:
# Per-user chronological hold-out evaluation: within every user's ratings
# (ordered by timestamp) the earliest part is used to fit KNNBasic and the
# rest of that user's ratings is scored with RMSE and MAE.
#
# NOTE: `data` is rebound to its (userId, timestamp)-sorted version, as before.
data = data.sort_values(['userId','timestamp'], ascending=[1,1])

# Neither of these depends on the split fraction, so they are built once.
reader = Reader(rating_scale=(0.5, 5))
rating_columns = ['userId','movieId','rating']

# Row-aligned bookkeeping computed in one pass over the table. This replaces
# the per-user filter + DataFrame.append loop, which rescanned the whole frame
# for every user, grew the result quadratically, and relied on a method that
# pandas 2.0 removed.
ratings_by_user = data.groupby('userId')
position_in_user = ratings_by_user.cumcount()                    # 0-based rank of the row within its user
ratings_per_user = ratings_by_user['userId'].transform('size')  # that user's total number of rows

# Each value is the fraction of every user's ratings used for TRAINING. The
# loop variable was previously named `test_ratio`, which described the
# opposite of what the slicing does.
for train_ratio in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    # Same cut as int(ratio * n_user_rows): astype(int) truncates toward zero.
    train_count = (ratings_per_user*train_ratio).astype(int)
    is_train = position_in_user < train_count
    train_df = data[is_train]
    test_data = data[~is_train]

    print("train ratio : ", train_ratio)
    print(len(test_data))

    train_data = Dataset.load_from_df(train_df[rating_columns], reader)
    train_data = train_data.build_full_trainset()

    algo = pa.knns.KNNBasic(k=10, min_k=1)

    algo.train(train_data)

    # itertuples yields plain (userId, movieId, rating) values per row, whereas
    # iterrows builds a Series per row and upcasts the integer ids to float.
    squared_error_sum = 0
    abs_error_sum = 0
    for user_id, movie_id, rating in test_data[rating_columns].itertuples(index=False):
        error = algo.predict(user_id, movie_id).est - rating
        squared_error_sum += error**2
        abs_error_sum += abs(error)
    print("rmse : " , (squared_error_sum/len(test_data))**0.5)
    print("mae : " , (abs_error_sum/len(test_data)))

90282
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.12087219213
mae :  0.865185105974
80251
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.08382382958
mae :  0.831532150016
70298
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.05615176561
mae :  0.81038249199
60248
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.03998488822
mae :  0.797442874335
50166
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.02781108133
mae :  0.785584907944
40259
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.01697544096
mae :  0.77579488882
30287
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.01112408584
mae :  0.772419230223
20256
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.00869999428
mae :  0.77079063005
10299
Computing the