# Surprise homework

In [1]:
import io 
import os
import pandas as pd
from collections import defaultdict
from surprise import SVD
from surprise import Dataset
from surprise import KNNWithMeans
from surprise import NormalPredictor
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise.model_selection import train_test_split

In [2]:
# Load the movielens-100k dataset (download it if needed),
data = Dataset.load_builtin('ml-100k')
k = 30
trainset, testset = train_test_split(data, test_size=.25)

### A random rating based on the distribution of all ratings

In [3]:
algo = NormalPredictor()

cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5176  1.5164  1.5116  1.5144  1.5186  1.5157  0.0025  
Fit time          0.28    0.26    0.28    0.26    0.47    0.31    0.08    
Test time         0.30    0.26    0.33    0.37    0.40    0.33    0.05    


{'test_rmse': array([1.51763662, 1.51641254, 1.51158091, 1.51439183, 1.51858262]),
 'fit_time': (0.2842373847961426,
  0.25952887535095215,
  0.27729082107543945,
  0.2635636329650879,
  0.4677588939666748),
 'test_time': (0.29537296295166016,
  0.2608180046081543,
  0.33211302757263184,
  0.3689548969268799,
  0.4010148048400879)}

### kNN cosine

In [4]:
algo = KNNWithMeans(k, sim_options={'name': 'cosine'})

cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9529  0.9575  0.9598  0.9617  0.9610  0.9586  0.0032  
Fit time          2.95    2.95    2.92    3.04    3.65    3.10    0.28    
Test time         6.68    6.08    6.01    6.07    8.04    6.58    0.77    


{'test_rmse': array([0.95288586, 0.957459  , 0.95979542, 0.96174343, 0.96100972]),
 'fit_time': (2.9457361698150635,
  2.946728467941284,
  2.920332908630371,
  3.0370304584503174,
  3.6523311138153076),
 'test_time': (6.684844017028809,
  6.084195852279663,
  6.013605356216431,
  6.066106557846069,
  8.042350053787231)}

### kNN Mean Squared Difference

In [5]:
algo = KNNWithMeans(k, sim_options={'name': 'msd'})

cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9533  0.9566  0.9488  0.9497  0.9549  0.9526  0.0030  
Fit time          1.46    1.20    1.35    1.19    1.05    1.25    0.14    
Test time         7.59    8.85    6.36    6.17    8.38    7.47    1.07    


{'test_rmse': array([0.9532662 , 0.95659084, 0.94884208, 0.94965886, 0.95485827]),
 'fit_time': (1.4614648818969727,
  1.1975500583648682,
  1.3470699787139893,
  1.1900548934936523,
  1.0482687950134277),
 'test_time': (7.585597038269043,
  8.852206468582153,
  6.35832667350769,
  6.172762155532837,
  8.383087635040283)}

### kNN Pearson

In [6]:
algo = KNNWithMeans(k, sim_options={'name': 'pearson'})

cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9577  0.9480  0.9548  0.9580  0.9451  0.9527  0.0052  
Fit time          5.01    5.46    6.67    4.37    3.81    5.06    0.98    
Test time         11.57   7.68    14.81   6.47    6.92    9.49    3.21    


{'test_rmse': array([0.95773475, 0.94802945, 0.95484489, 0.9579754 , 0.94511902]),
 'fit_time': (5.014451026916504,
  5.459513902664185,
  6.67278528213501,
  4.367302179336548,
  3.808903455734253),
 'test_time': (11.56595253944397,
  7.681387662887573,
  14.810839414596558,
  6.468941688537598,
  6.922971248626709)}

### SVD

In [7]:
algo = SVD()

cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9409  0.9324  0.9386  0.9320  0.9357  0.9359  0.0035  
Fit time          10.22   11.70   8.75    8.80    7.98    9.49    1.32    
Test time         0.37    0.42    0.26    0.30    0.22    0.31    0.07    


{'test_rmse': array([0.94089764, 0.93242543, 0.93859992, 0.93199761, 0.93565388]),
 'fit_time': (10.22478199005127,
  11.695600271224976,
  8.74755597114563,
  8.80346131324768,
  7.978696584701538),
 'test_time': (0.3686349391937256,
  0.4189133644104004,
  0.26429009437561035,
  0.3001980781555176,
  0.22037649154663086)}

По средним значениям RMSE видно, что лучший алгоритм - SVD, его и будем использовать дальше

### Calculate precision@k and recall@k

In [8]:
#from FAQ
def precision_recall_at_k(predictions, k=10, threshold=3.52):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        
        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

algo = SVD()
kf = KFold(n_splits=5)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=3.52)

#усредним по всем пользователям
precision_at_k = sum(prec for prec in precisions.values()) / len(precisions)
recall_at_k = sum(rec for rec in recalls.values()) / len(recalls)
print(precision_at_k)
print(recall_at_k)

0.7286574566064472
0.4153230228244317


### Predict

In [9]:
#основано на примерах из FAQ
n = 5
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = defaultdict(list)
for uid, iid, true_r, est, _ in predictions:
    top_n[uid].append((iid, est))

for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

In [10]:
print("User 34")
file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.item')
films_data = pd.read_csv(file_path, sep="|", encoding='ansi', usecols=[0,1,2], names=['id','name', 'date'])
for id, score in top_n['34']:
    name = films_data.at[int(id) - 1, 'name'] #вычитаем -1, т.к. Python
    date = films_data.at[int(id) - 1, 'date']
    print("{:<6} ('{:<30}', '{:<11}') {:<10.3f}".format(id, name, date, score))

User 34
427    ('To Kill a Mockingbird (1962)  ', '01-Jan-1962') 5.000     
496    ('It's a Wonderful Life (1946)  ', '01-Jan-1946') 5.000     
483    ('Casablanca (1942)             ', '01-Jan-1942') 5.000     
408    ('Close Shave, A (1995)         ', '28-Apr-1996') 5.000     
318    ('Schindler's List (1993)       ', '01-Jan-1993') 5.000     
