[Surprise · A Python scikit for recommender systems.](http://surpriselib.com/)

In [1]:
# Uncomment ONE of the following lines to install the scikit-surprise package
# if it is not already available in this environment.
# !pip install scikit-surprise
# !conda install -c conda-forge scikit-surprise

## SVD

In [2]:
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from collections import defaultdict

# Fetch the built-in MovieLens-100k ratings (downloaded on first use if it is
# not already cached locally).
data = Dataset.load_builtin(name='ml-100k')

# Bare expression: shows the dataset object's repr as the cell output.
data

<surprise.dataset.DatasetAutoFolds at 0x7f9801370a10>

In [3]:
# Tabulate the raw rating records for a quick visual check; the frame is only
# displayed, not stored.
pd.DataFrame(
    data=data.raw_ratings,
    columns=["user", "movie", "rating", "timestamp"],
)

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596
...,...,...,...,...
99995,880,476,3.0,880175444
99996,716,204,5.0,879795543
99997,276,1090,1.0,874795795
99998,13,225,2.0,882399156


In [4]:
# SVD with its default hyper-parameters.
algo = SVD()

# 5-fold cross-validation measuring RMSE and MAE; verbose=True prints the
# per-fold summary table, and the returned dict is shown as the cell output.
cross_validate(
    algo=algo,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9359  0.9357  0.9424  0.9322  0.9380  0.9368  0.0033  
MAE (testset)     0.7387  0.7377  0.7410  0.7362  0.7385  0.7384  0.0016  
Fit time          3.55    3.94    3.64    3.65    3.74    3.70    0.13    
Test time         0.10    0.16    0.14    0.10    0.14    0.13    0.02    


{'test_rmse': array([0.93586987, 0.93570784, 0.9424264 , 0.93221746, 0.93797189]),
 'test_mae': array([0.73868457, 0.73768141, 0.74101961, 0.73620254, 0.73845929]),
 'fit_time': (3.5545201301574707,
  3.9404048919677734,
  3.6355769634246826,
  3.646739959716797,
  3.737112283706665),
 'test_time': (0.10431909561157227,
  0.15501022338867188,
  0.143996000289917,
  0.09977197647094727,
  0.1444110870361328)}

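The top-N recommendation example below is adapted from [`examples/top_n_recommendations.py`](https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py) in the Surprise repository.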

In [5]:
# Build a trainset from the whole dataset (no held-out folds).
trainset = data.build_full_trainset()
# Bare expression: shows the trainset object's repr as the cell output.
trainset

<surprise.trainset.Trainset at 0x7f97e00222d0>

In [6]:
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n best-rated items.

    Args:
        predictions(list of Prediction objects): Predictions as returned by
            the test method of an algorithm. Each one is unpacked as a
            5-tuple: (user id, item id, true rating, estimate, details).
        n(int): How many recommendations to keep per user. Default is 10.

    Returns:
        A defaultdict(list) keyed by (raw) user id. Each value is a list of
        (raw item id, rating estimation) tuples, ordered from the highest
        estimate to the lowest and cut to the first n entries. Items with
        equal estimates keep the order in which they appeared in
        ``predictions``.
    """
    per_user = defaultdict(list)

    # Bucket every (item, estimate) pair under the user it was predicted for.
    for prediction in predictions:
        user_id, item_id, _true_rating, estimate, _details = prediction
        per_user[user_id].append((item_id, estimate))

    # Rank each bucket by estimate (descending) and truncate it to n items.
    # Only existing keys are reassigned, so iterating the dict here is safe.
    for user_id in per_user:
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = ranked[:n]

    return per_user


# Train the SVD model on the full MovieLens trainset. `algo` (the SVD
# instance) and `trainset` (the full trainset built from `data`) are reused
# from the earlier cells rather than being re-created here.
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

# Keep the 10 highest-estimated items per user.
top_n = get_top_n(predictions, n=10)

# One line per user: the user id followed by the recommended item ids.
for uid, user_ratings in top_n.items():
    print(uid, [pair[0] for pair in user_ratings])

196 ['408', '169', '114', '357', '603', '515', '318', '654', '488', '923']
186 ['483', '408', '205', '133', '513', '515', '496', '169', '318', '196']
22 ['408', '64', '246', '493', '199', '318', '480', '528', '519', '474']
244 ['408', '127', '286', '480', '483', '498', '302', '474', '275', '963']
166 ['169', '408', '318', '496', '1007', '64', '174', '272', '487', '515']
298 ['64', '272', '169', '480', '408', '12', '302', '657', '513', '180']
115 ['408', '175', '134', '340', '223', '285', '114', '169', '168', '179']
253 ['169', '114', '408', '513', '174', '603', '194', '165', '963', '651']
305 ['114', '603', '515', '133', '137', '1203', '213', '513', '443', '19']
6 ['663', '60', '428', '923', '603', '922', '657', '856', '179', '647']
62 ['408', '478', '187', '23', '185', '480', '493', '175', '499', '1194']
286 ['318', '8', '480', '178', '114', '919', '657', '515', '464', '1203']
200 ['408', '64', '12', '316', '190', '114', '604', '963', '520', '272']
210 ['408', '603', '169', '511', '31

## KNN

In [7]:
from surprise import KNNBasic

# Retrieve the trainset (rebuilt from the full dataset held in `data`).
trainset = data.build_full_trainset()

# Build an algorithm, and train it.
knn = KNNBasic()
# Last expression of the cell: its value (the fitted KNNBasic object) is
# shown as the cell output.
knn.fit(trainset)


Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f98120c4e90>

In [8]:
# 5-fold cross-validation of the KNN model on the same dataset, reporting
# RMSE and MAE; verbose=True prints the per-fold summary table.
cross_validate(
    algo=knn,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9767  0.9795  0.9908  0.9767  0.9689  0.9785  0.0071  
MAE (testset)     0.7704  0.7746  0.7835  0.7701  0.7650  0.7727  0.0062  
Fit time          0.25    0.29    0.24    0.27    0.26    0.26    0.02    
Test time         2.40    2.20    2.32    2.39    2.14    2.29    0.10    


{'test_rmse': array([0.97671444, 0.9794865 , 0.99076876, 0.9766554 , 0.96888674]),
 'test_mae': array([0.77043152, 0.77461604, 0.7834983 , 0.7700901 , 0.76502243]),
 'fit_time': (0.2541639804840088,
  0.29253196716308594,
  0.2423717975616455,
  0.2653348445892334,
  0.25690770149230957),
 'test_time': (2.398871898651123,
  2.204775810241699,
  2.3211841583251953,
  2.385383367538452,
  2.1381382942199707)}

In [9]:
import pandas as pd

from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate


# Toy ratings table. The column names are arbitrary; only the column order
# handed to load_from_df below matters.
ratings_dict = {
    'itemID': [1, 1, 1, 2, 2],
    'userID': [9, 32, 2, 45, 'user_foo'],
    'rating': [3, 2, 4, 3, 1],
}
df = pd.DataFrame(data=ratings_dict)

# A reader is still needed, but only the rating_scale parameter is required.
reader = Reader(rating_scale=(1, 5))

# Columns must be passed as user id, item id, rating (in that order).
# NOTE: this rebinds `data`, replacing the dataset loaded earlier in the notebook.
data = Dataset.load_from_df(df.loc[:, ['userID', 'itemID', 'rating']], reader)

# The resulting dataset can be used like any other, e.g. with cross_validate.
cross_validate(NormalPredictor(), data, cv=2)

{'test_rmse': array([0.80148281, 1.81316248]),
 'test_mae': array([0.73206114, 1.68162048]),
 'fit_time': (6.175041198730469e-05, 2.8371810913085938e-05),
 'test_time': (4.315376281738281e-05, 1.6689300537109375e-05)}