# Surprise homework

In [1]:
import io 
import pandas as pd
from collections import defaultdict
from surprise import Dataset
from surprise import SVD
from surprise import NormalPredictor
from surprise import KNNWithMeans
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise import KNNBaseline
from surprise import get_dataset_dir
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold

### Functions

In [2]:
# используйте полезные функции из FAQ
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls


def comparison(algo, trainset, testset, data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

### Load data

In [15]:
item_path = '/Users/evgenijbuss/Downloads/ml-100k/u.item'
item_df = pd.read_csv(item_path, sep='|',encoding='ISO-8859-1', header = None)

data_path = '/Users/evgenijbuss/Downloads/ml-100k/u.data'
data_df = pd.read_csv(data_path, sep='\t',encoding='ISO-8859-1', header = None)


Unnamed: 0,0,1,2,3
1257,3,335,1,889237269
1343,3,245,1,889237247
1682,3,337,1,889236983
2523,3,343,3,889237122
3758,3,323,2,889237269
3840,3,331,4,889237455
4419,3,294,2,889237224
5897,3,332,1,889237224
6178,3,328,5,889237455
7110,3,334,3,889237122


In [4]:
data = Dataset.load_builtin('ml-100k')
data

<surprise.dataset.DatasetAutoFolds at 0x7fa2ac51c4c0>

In [5]:
trainset, testset = train_test_split(data, test_size=.25)
trainset

<surprise.trainset.Trainset at 0x7fa2abff9b20>

In [6]:
ratings = trainset.all_ratings()

for _ in range (0,5):
    print(next(ratings))
    

(0, 0, 3.0)
(0, 862, 3.0)
(0, 942, 2.0)
(0, 976, 4.0)
(0, 203, 4.0)


In [7]:
testset

[('802', '288', 3.0),
 ('280', '571', 3.0),
 ('655', '1403', 3.0),
 ('312', '608', 5.0),
 ('276', '720', 2.0),
 ('436', '239', 3.0),
 ('588', '941', 5.0),
 ('276', '1074', 3.0),
 ('690', '655', 4.0),
 ('476', '300', 5.0),
 ('293', '143', 4.0),
 ('130', '195', 5.0),
 ('280', '235', 5.0),
 ('682', '300', 2.0),
 ('21', '985', 2.0),
 ('313', '67', 1.0),
 ('141', '1244', 3.0),
 ('834', '181', 5.0),
 ('472', '71', 2.0),
 ('693', '98', 4.0),
 ('749', '578', 3.0),
 ('406', '528', 4.0),
 ('75', '237', 2.0),
 ('500', '223', 4.0),
 ('727', '771', 3.0),
 ('297', '1007', 4.0),
 ('63', '333', 4.0),
 ('58', '116', 5.0),
 ('551', '56', 5.0),
 ('487', '627', 4.0),
 ('904', '237', 5.0),
 ('598', '751', 3.0),
 ('499', '176', 4.0),
 ('312', '382', 4.0),
 ('376', '154', 4.0),
 ('116', '1020', 3.0),
 ('621', '542', 2.0),
 ('655', '1085', 2.0),
 ('251', '288', 4.0),
 ('392', '288', 4.0),
 ('215', '77', 3.0),
 ('541', '1084', 4.0),
 ('615', '528', 4.0),
 ('405', '675', 1.0),
 ('426', '1204', 4.0),
 ('532', '1

### Define algorithms

In [8]:
# внимательно изучите документацию по метрикам и алгоритмам
sim_options_cos = {'name': 'cosine',
               'user_based': True
               }
sim_options_pearson = {'name': 'pearson',
               'user_based': True
               }
sim_options_msd = {'name': 'msd',
               'user_based': True
               }
k = 30
min_k = 1
algo_random = NormalPredictor()
algo_svd = SVD()
algo_knn_cos = KNNWithMeans(k, min_k,sim_options=sim_options_cos,verbose=True)
algo_knn_pearson = KNNWithMeans(k, min_k,sim_options=sim_options_pearson,verbose=True)
algo_knn_msd = KNNWithMeans(k, min_k,sim_options=sim_options_msd,verbose=True)

### Select algorithm

In [9]:
comparison(algo_random,trainset,testset, data)
comparison(algo_knn_cos,trainset,testset, data)
comparison(algo_knn_pearson,trainset,testset, data)
comparison(algo_knn_msd,trainset,testset, data)
comparison(algo_svd,trainset,testset, data)

RMSE: 1.5182
Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5180  1.5337  1.5119  1.5327  1.5158  1.5224  0.0090  
Fit time          0.09    0.12    0.11    0.11    0.12    0.11    0.01    
Test time         0.11    0.15    0.15    0.10    0.15    0.13    0.02    
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0285
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0212  1.

### Calculate precision@k and recall@k

In [10]:
k = 5
threshold = 3.52
algo_svd.fit(trainset)
predictions = algo_svd.test(testset)
precision_k, recall_k = precision_recall_at_k(predictions, k, threshold)
print('precision@k: ',sum(prec for prec in precision_k.values()) / len(precision_k))
print('recall@k: ', sum(rec for rec in recall_k.values()) / len(recall_k))

precision@k:  0.737716507599859
recall@k:  0.365607108988558


### Predict

In [11]:
# обратите внимание на функцию build_anti_testset
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)
testset = trainset.build_anti_testset()
predictions = algo.test(testset)
top_n = get_top_n(predictions, n=5)

In [16]:
my_user = '3'
for uid, user_ratings in top_n.items():
    if (uid == my_user):
        list = [(iid, round(est,3)) for (iid, est) in user_ratings]

In [18]:
print('User: ', my_user)
for i in range(0, len(list)):
    print(item_df[item_df[0] == int(list[i][0])][[1, 2]], list[i][1])

User:  3
                  1            2
312  Titanic (1997)  01-Jan-1997 4.139
                              1            2
168  Wrong Trousers, The (1993)  01-Jan-1993 4.07
                         1            2
407  Close Shave, A (1995)  28-Apr-1996 3.964
                             1            2
11  Usual Suspects, The (1995)  14-Aug-1995 3.947
                   1            2
143  Die Hard (1988)  01-Jan-1988 3.944
