In [1]:
import pandas as pd
from surprise import prediction_algorithms as pa
from surprise import Dataset, Reader
from surprise import evaluate, print_perf

In [2]:
data = pd.read_csv('./movielens_small/ratings.csv')
number_of_rows = len(data)


In [3]:
for nfolds in [2,4,5,10]:
    reader = Reader(rating_scale=(1, 5))
    train_data = Dataset.load_from_df(data[['userId','movieId','rating']], reader)

    algo = pa.knns.KNNBasic(k=10, min_k=1)

    train_data.split(n_folds=nfolds)

    perf = evaluate(algo, train_data, measures=['RMSE', 'MAE','FCP'])

Evaluating RMSE, MAE, FCP of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9957
MAE:  0.7639
FCP:  0.6428
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0036
MAE:  0.7712
FCP:  0.6421
------------
------------
Mean RMSE: 0.9997
Mean MAE : 0.7675
Mean FCP : 0.6424
------------
------------
Evaluating RMSE, MAE, FCP of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9768
MAE:  0.7470
FCP:  0.6576
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9749
MAE:  0.7492
FCP:  0.6471
------------
Fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9670
MAE:  0.7420
FCP:  0.6536
------------
Fold 4
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9633
MAE:  0.7368
FCP:  0.6590
--

In [None]:
data = data.sort_values(['timestamp'], ascending=[1])
for test_ratio in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    split = int(test_ratio*number_of_rows)
    train_data = data[:split]
    test_data = data[split:]

    reader = Reader(rating_scale=(1, 5))
    train_data = Dataset.load_from_df(train_data[['userId','movieId','rating']], reader)
    train_data = train_data.build_full_trainset()

    algo = pa.knns.KNNBasic(k=10)

    algo.train(train_data)

    s = 0
    a = 0
    for index,row in test_data.iterrows():
        ans = algo.predict(row['userId'],row['movieId'])
        s += (ans.est-row['rating'])**2
        a += abs(ans.est-row['rating'])
    print("rmse : " , (s/len(test_data))**0.5)
    print("mae : " , (a/len(test_data)))

Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.07101198352
mae :  0.851547586044
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.07653132499
mae :  0.845806077894
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.06700145423
mae :  0.83260725111
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.04385117435
mae :  0.812186292471
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.04681014379
mae :  0.815498098866
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.05644160379
mae :  0.824657058312
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.07779533901
mae :  0.844681589602
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.08919321789
mae :  0.861579636413
Computing the msd similarity matrix...
Done computing similarity 

In [None]:
data = data.sort_values(['userId','timestamp'], ascending=[1,1])
for test_ratio in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    train_data = pd.DataFrame()
    test_data = pd.DataFrame()
    users = data.userId.unique()
    for user in users:
        temp_data = data['userId'] == user
        temp_data = data[temp_data]
        x = temp_data[:int(test_ratio*len(temp_data))]
        train_data = train_data.append(x)
        x = temp_data[int(test_ratio*len(temp_data)):]
        test_data = test_data.append(x)
    print(len(test_data))
    reader = Reader(rating_scale=(1, 5))
    train_data = Dataset.load_from_df(train_data[['userId','movieId','rating']], reader)
    train_data = train_data.build_full_trainset()

    algo = pa.knns.KNNBasic(k=10, min_k=1)

    algo.train(train_data)

    s = 0
    a = 0
    for index,row in test_data.iterrows():
        ans = algo.predict(row['userId'],row['movieId'])
        s += (ans.est-row['rating'])**2
        a += abs(ans.est-row['rating'])
    print("rmse : " , (s/len(test_data))**0.5)
    print("mae : " , (a/len(test_data)))

90282
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.11935865724
mae :  0.864378209891
80251
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.082118489
mae :  0.830784479277
70298
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.05506367345
mae :  0.809942994211
60248
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.03884134251
mae :  0.796960405592
50166
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.02653142847
mae :  0.785124181847
40259
Computing the msd similarity matrix...
Done computing similarity matrix.
