In [1]:
import pandas as pd
from surprise import prediction_algorithms as pa
from surprise import Dataset, Reader
from surprise import evaluate, print_perf

In [12]:
# Load the ratings table from the local MovieLens "small" folder.
# Later cells read its userId, movieId, rating and timestamp columns.
data = pd.read_csv('./movielens_small/ratings.csv')

# Total number of rating rows in the table.
number_of_rows = data.shape[0]


In [3]:
# k-fold cross-validation of KNNBasic for several fold counts.
# The reader and the dataset do not depend on the number of folds, so they are
# built once instead of on every iteration.
reader = Reader(rating_scale=(0.5, 5))
cv_data = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)

# Scores returned by evaluate(), keyed by the number of folds, so every run
# stays available after the loop instead of being overwritten by the next one.
cv_perf = {}
for nfolds in [2, 4, 5, 10]:
    # A fresh estimator per run keeps the runs independent of each other.
    algo = pa.knns.KNNBasic(k=10, min_k=1)

    # Re-partition the same ratings into `nfolds` folds.
    cv_data.split(n_folds=nfolds)

    # evaluate() prints the per-fold and mean RMSE / MAE / FCP as it goes.
    perf = evaluate(algo, cv_data, measures=['RMSE', 'MAE', 'FCP'])
    cv_perf[nfolds] = perf

Evaluating RMSE, MAE, FCP of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9970
MAE:  0.7656
FCP:  0.6419
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9983
MAE:  0.7664
FCP:  0.6406
------------
------------
Mean RMSE: 0.9976
Mean MAE : 0.7660
Mean FCP : 0.6413
------------
------------
Evaluating RMSE, MAE, FCP of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9694
MAE:  0.7415
FCP:  0.6583
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9742
MAE:  0.7444
FCP:  0.6560
------------
Fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9756
MAE:  0.7465
FCP:  0.6499
------------
Fold 4
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9719
MAE:  0.7453
FCP:  0.6504
--

In [4]:
# Chronological hold-out evaluation: train KNNBasic on the earliest ratings
# and measure RMSE / MAE on all later ones, for growing training fractions.
data = data.sort_values(['timestamp'], ascending=[1])
n_ratings = len(data)

reader = Reader(rating_scale=(0.5, 5))

# NOTE: each value is the fraction of rows used for TRAINING (the rows before
# the cut); the remaining rows form the test set.
for train_ratio in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    split = int(train_ratio * n_ratings)
    train_df = data[:split]
    test_df = data[split:]

    trainset = Dataset.load_from_df(
        train_df[['userId', 'movieId', 'rating']], reader
    ).build_full_trainset()

    # min_k=1 is spelled out to match the other evaluation cells.
    algo = pa.knns.KNNBasic(k=10, min_k=1)
    algo.train(trainset)

    # Accumulate squared and absolute errors over the held-out ratings.
    # Iterating the three columns directly avoids building a Series per row.
    squared_error = 0.0
    absolute_error = 0.0
    for uid, iid, rating in zip(test_df['userId'], test_df['movieId'], test_df['rating']):
        error = algo.predict(uid, iid).est - rating
        squared_error += error ** 2
        absolute_error += abs(error)

    n_test = len(test_df)
    # Single-argument print(...) produces the same text under Python 2 and 3.
    print("rmse :  %s" % ((squared_error / n_test) ** 0.5))
    print("mae :  %s" % (absolute_error / n_test))

Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.07101198352
mae :  0.851547586044
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.07653132499
mae :  0.845806077894
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.06700145423
mae :  0.83260725111
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.04385117435
mae :  0.812186292471
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.04688371602
mae :  0.815524191461
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.0564615367
mae :  0.824689506046
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.07783786041
mae :  0.844698255158
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.08963494702
mae :  0.861704630163
Computing the msd similarity matrix...
Done computing similarity m

In [15]:
# Per-user chronological hold-out: for every user, the earliest fraction of
# their ratings goes to training and the rest to testing; KNNBasic is then
# scored with RMSE / MAE on the held-out part.
data = data.sort_values(['userId', 'timestamp'], ascending=[1, 1])

# Position of each rating inside its user's time-ordered history (0 = oldest)
# and the length of that history. Both are independent of the ratio, so they
# are computed once instead of slicing and appending one user at a time.
position_in_user = data.groupby('userId').cumcount()
ratings_per_user = data['userId'].map(data['userId'].value_counts())

reader = Reader(rating_scale=(0.5, 5))

# NOTE: each value is the fraction of a user's ratings used for TRAINING.
for train_ratio in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    # astype(int) truncates exactly like int(ratio * len(user_rows)) would.
    cutoff = (train_ratio * ratings_per_user).astype(int)
    in_train = position_in_user < cutoff
    train_df = data[in_train]
    test_df = data[~in_train]
    print(len(test_df))

    trainset = Dataset.load_from_df(
        train_df[['userId', 'movieId', 'rating']], reader
    ).build_full_trainset()

    algo = pa.knns.KNNBasic(k=10, min_k=1)
    algo.train(trainset)

    # Accumulate squared and absolute errors over the held-out ratings.
    squared_error = 0.0
    absolute_error = 0.0
    for uid, iid, rating in zip(test_df['userId'], test_df['movieId'], test_df['rating']):
        error = algo.predict(uid, iid).est - rating
        squared_error += error ** 2
        absolute_error += abs(error)

    n_test = len(test_df)
    # Single-argument print(...) produces the same text under Python 2 and 3.
    print("rmse :  %s" % ((squared_error / n_test) ** 0.5))
    print("mae :  %s" % (absolute_error / n_test))

90282
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.1217902951
mae :  0.865171303189
80251
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.08449910686
mae :  0.832050695354
70298
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.05592028753
mae :  0.810273848775
60248
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.03963529646
mae :  0.797319013612
50166
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.02801622072
mae :  0.785885215947
40259
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.01735978678
mae :  0.776017272848
30287
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.01059767827
mae :  0.772225491851
20256
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse :  1.00831361121
mae :  0.770954508162
10299
Computing t

In [16]:
from sklearn.model_selection import train_test_split


def _movies_by_user(frame):
    """Map each userId in `frame` to the list of movieIds on its rows.

    Ids are converted to plain ints and kept in row order.
    """
    movies = {}
    for uid, iid in zip(frame['userId'], frame['movieId']):
        movies.setdefault(int(uid), []).append(int(iid))
    return movies


# Random 80/20 split of the ratings; the fixed seed makes the split (and the
# ranking metrics computed from it) repeatable between runs.
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

rated = _movies_by_user(train_df)    # movies each user rated in the training split
totest = _movies_by_user(test_df)    # movies each user rated in the held-out split

reader = Reader(rating_scale=(0.5, 5))
trainset = Dataset.load_from_df(
    train_df[['userId', 'movieId', 'rating']], reader
).build_full_trainset()

algo = pa.knns.KNNBasic(k=10, min_k=1)
algo.train(trainset)

users = data.userId.unique()
items = data.movieId.unique()
item_ids = [int(item) for item in items]

# For every user, score each movie they did not rate in the training split and
# keep the (movieId, estimated rating) pairs sorted by estimate, best first.
predicted = {}
for raw_uid in users:
    uid = int(raw_uid)
    # A user whose ratings all fell into the test split has no entry in
    # `rated`; treat that as "nothing seen" and give them no recommendations
    # instead of failing with a KeyError.
    seen = set(rated.get(uid, ()))
    if not seen:
        predicted[uid] = []
        continue
    # Set membership keeps the users x items scan from also being linear in
    # the number of movies the user has rated.
    candidates = [
        (iid, algo.predict(uid, iid).est)
        for iid in item_ids
        if iid not in seen
    ]
    predicted[uid] = sorted(candidates, key=lambda pair: pair[1], reverse=True)
print("done")

Computing the msd similarity matrix...
Done computing similarity matrix.
done


In [17]:
# Mean precision@N and recall@N of the ranked lists in `predicted`, measured
# against the held-out movies in `totest`, for several list lengths N.

# Held-out movies as sets, built once so membership tests are O(1) for every N.
relevant_by_user = {user: set(movies) for user, movies in totest.items()}

for top_n in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
    precision = 0.0
    recall = 0.0
    evaluated_users = 0
    for user in predicted:
        # Only users that have held-out ratings can be scored.
        if user not in totest:
            continue
        relevant = relevant_by_user[user]
        # Number of the top-N recommended movies that appear in the held-out set.
        hits = sum(1 for entry in predicted[user][:top_n] if entry[0] in relevant)
        evaluated_users += 1
        precision += hits * 1.0 / top_n
        recall += hits * 1.0 / len(totest[user])

    if evaluated_users == 0:
        # No user has both a ranking and held-out ratings, so the averages are
        # undefined; report NaN instead of dividing by zero.
        mean_precision = float('nan')
        mean_recall = float('nan')
    else:
        mean_precision = precision / evaluated_users
        mean_recall = recall / evaluated_users

    # Single-argument print(...) produces the same text under Python 2 and 3.
    print(mean_precision)
    print(mean_recall)
    print("----------------------------------")

0.0008968609865470852
0.00052548634797506
----------------------------------
0.0007473841554559044
0.000570056652746962
----------------------------------
0.0006477329347284504
0.0007635689621106723
----------------------------------
0.000672645739910314
0.0008152157170095687
----------------------------------
0.0005979073243647235
0.000840348594340651
----------------------------------
0.0006477329347284504
0.001050426298812974
----------------------------------
0.0005979073243647236
0.0010730603577080507
----------------------------------
0.0005792227204783261
0.0011688166746740944
----------------------------------
0.0005812987875768146
0.0014419566058724352
----------------------------------
0.0005680119581464875
0.0015005731816788467
----------------------------------
