# Testing the KNNBasic algorithm

In [4]:
from __future__ import (absolute_import, division, print_function, unicode_literals)                                
import pickle
import os
import random

import pandas as pd

from surprise import KNNBasic
from surprise import KNNBaseline
from surprise import Dataset
from surprise import Reader
from surprise.accuracy import rmse

In [5]:
my_seed = 1
random.seed(my_seed)

reader = Reader(line_format='user item rating', sep=',')
train_file = '../output/trainData.txt'
test_file = '../output/testData.txt'
data = Dataset.load_from_folds([(train_file, test_file)], reader)

algo = KNNBasic()

for trainset, testset in data.folds():
    algo.train(trainset)                             
    predictions = algo.test(testset)
    rmse(predictions)

Computing the msd similarity matrix...


MemoryError: 

In [None]:
# Build a pandas dataframe with all the predictions

def get_Iu(uid):
    """Return the number of items rated by given user
    
    Args:
        uid: The raw id of the user.
    Returns:
        The number of items rated by the user.
    """
    
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError:  # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """Return the number of users that have rated given item
    
    Args:
        iid: The raw id of the item.
    Returns:
        The number of users that have rated the item.
    """
    
    try:
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:  # item was not part of the trainset
        return 0

df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])    
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)

In [None]:
df.head()

In [25]:
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]

In [26]:
# Let's take a look at the best predictions of the algorithm
best_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
69,XJYFjbH-oTmu2dATTqyauQ,0x2qB4R790SX3W4W8FQuWA,4.0,4.001405,{u'was_impossible': False},0,36,0.001405
48,tRY6Y1sVehnJ2h3IsHcbnQ,0x2qB4R790SX3W4W8FQuWA,4.0,4.001405,{u'was_impossible': False},0,36,0.001405
13,d_TBs6J3twMy9GChqUEXkg,0x2qB4R790SX3W4W8FQuWA,4.0,4.001405,{u'was_impossible': False},0,36,0.001405
65,C0jquh-km5UnawqDqSQpBw,4uiijOUDzc-DeIb2XcKW_A,3.0,2.942759,{u'was_impossible': False},0,28,0.057241
42,0lDl1Jg1Qz2KhvkfJKk6fw,4P1h3LKaCkgQL5dSy5d9gw,3.0,3.071899,{u'was_impossible': False},0,42,0.071899
90,Yej5B4nd8PqpHMQcmCTDrg,4P1h3LKaCkgQL5dSy5d9gw,3.0,3.071899,{u'was_impossible': False},0,42,0.071899
47,hfuhalehYPI89JCRSW1mSA,4P1h3LKaCkgQL5dSy5d9gw,3.0,3.071899,{u'was_impossible': False},0,42,0.071899
53,85gI4BQZ9zMtqkQ33niGWw,4P1h3LKaCkgQL5dSy5d9gw,3.0,3.071899,{u'was_impossible': False},0,42,0.071899
76,xH-vP1j0jR_BEdWcBkjDQg,4P1h3LKaCkgQL5dSy5d9gw,3.0,3.071899,{u'was_impossible': False},0,42,0.071899
82,dIIKEfOgo0KqUfGQvGikPg,4P1h3LKaCkgQL5dSy5d9gw,3.0,3.071899,{u'was_impossible': False},0,42,0.071899


It's interesting to note that these perfect predictions are actually lucky shots: $|U_i|$ is always very small, meaning that very few users have rated the target item. This implies that the set of neighbors is very small (see the ``actual_k`` field)... And, it just happens that all the ratings from the neighbors are the same (and mostly, are equal to that of the target user).

This may be a bit surprising but these lucky shots are actually very important to the accuracy of the algorithm... Try running the same algorithm with a value of ``min_k`` equal to $10$. This means that if there are less than $10$ neighbors, the prediction is set to the mean of all ratings. You'll see your accuracy decrease!

In [27]:
# Now, let's look at the prediction with the biggest error
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
3,W1Nl6_R7amuZ6NStXI3uBA,2PqCZxon6AZHJrQ5iam4LA,1.0,3.298947,{u'was_impossible': False},0,2,2.298947
54,p0RVQxuOSA2NV4Odv_kgWQ,32S_SP-ZaVmQucdEY5ynlQ,1.0,3.377895,{u'was_impossible': False},0,2,2.377895
46,3iq8oDotfNXL57CbqzaaAA,32S_SP-ZaVmQucdEY5ynlQ,1.0,3.377895,{u'was_impossible': False},0,2,2.377895
43,ub50UE95-gu7yIoed1zssA,4P-vTvE6cncJyUyLh73pxw,1.0,3.38988,{u'was_impossible': False},0,17,2.38988
67,-d8tk22ca-MpY7htQIw08Q,29XtLRA0y7Nfi4eZF-ZI4A,1.0,3.475882,{u'was_impossible': False},0,12,2.475882
86,1-0K54tp4ig3lYjd1eRF_g,8KWWHywMjN-154i_Qi0deQ,1.0,3.517,{u'was_impossible': False},0,16,2.517
20,T-6kD-3JWxzC9QszSa1e6A,8reh8dD3VasFglGhlkNlgg,1.0,3.5425,{u'was_impossible': False},0,0,2.5425
1,SxV1Jq7UANuSYpn42JXvOA,-3pfhzz9CB7F2DpbF1Ko7Q,1.0,3.667429,{u'was_impossible': False},0,1,2.667429
59,7BT9J9gmv3GsesY7Ju7pXA,0BJK4_RQnNiiXJcYPEgG3w,1.0,3.766283,{u'was_impossible': False},0,53,2.766283
24,9sdni8QHrai8l7ikHsV5Jg,7GI_V9oLCUGdn2ogqB0IBg,1.0,3.772632,{u'was_impossible': False},0,2,2.772632


Let's focus first on the last two predictions. Well, we can't do much about them. We should have predicted $5$, but the only available neighbor had a rating of $1$, so we were screwed. The only way to avoid this kind of errors would be to increase the ``min_k`` parameter, but it would actually worsen the accuracy (see note above).

How about the other ones? It seems that for each prediction, the users are some kind of outsiders: they rated their item with a rating of $1$ when the most of the ratings for the item where high (or inversely, rated a *bad* item with a rating of $5$). See the plot below as an illustration for the first rating.

These are situations where baseline estimates would be quite helpful, in order to deal with highly biased users (and items).

In [28]:
from collections import Counter

import matplotlib.pyplot as plt
import matplotlib
%matplotlib notebook
matplotlib.style.use('ggplot')

counter = Counter([r for (_, r) in trainset.ir[trainset.to_inner_iid('302')]])
pd.DataFrame.from_dict(counter, orient='index').plot(kind='bar', legend=False)
plt.xlabel('Rating value')
plt.ylabel('Number of users')
plt.title('Number of users having rated item 302')

ValueError: Item 302 is not part of the trainset.