# Memory based with sklearn

In [44]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

from scipy.spatial.distance import correlation
from surprise import KNNWithMeans, Dataset, Reader

In [45]:
np.random.seed(10)
r_width = 10
r_height = 500

# Sample r_height points with r_width features each, drawn around 3 centers.
# `c` holds the center label of every sampled point.
R, c = make_blobs(
    n_samples=r_height, n_features=r_width, centers=3, random_state=10
)

# Min-max scale the whole matrix onto 0..10 and round to integer ratings.
raw_min, raw_max = R.min(), R.max()
R = np.round((R - raw_min) * 10 / (raw_max - raw_min)).astype(int)

# Add a per-row bias: one integer offset from -2..2 per object, broadcast
# across all of that object's items.
bias = np.random.randint(-2, 3, [R.shape[0], 1])
R = R + bias
# The bias can push ratings outside the 0..10 range, so clamp them back.
R = np.clip(R, 0, 10)
R[:10]

array([[ 5,  8,  0,  5,  7,  6,  6,  3,  9,  6],
       [10, 10,  3,  8, 10,  8,  9,  6, 10,  9],
       [ 4,  1,  2,  4,  2,  2,  4,  3,  4,  4],
       [ 4,  2,  4,  6,  3,  4,  6,  5,  6,  5],
       [ 7,  3,  5,  8,  7,  6,  7,  7,  7,  7],
       [ 7,  4,  5,  8,  6,  7,  8,  7,  8,  7],
       [ 5,  8,  0,  5,  7,  6,  5,  2,  7,  6],
       [ 5,  7,  0,  3,  6,  4,  5,  2,  7,  4],
       [ 5,  0,  3,  6,  4,  4,  4,  3,  6,  4],
       [ 8,  1,  6,  7,  6,  3,  3,  8,  2,  1]])

In [114]:
# Long-format view of R: one row per (object, item) pair. The cartesian
# product iterates objects in the outer position and items in the inner one,
# which matches the row-major order of R.ravel().
pair_index = pd.MultiIndex.from_product(
    [np.arange(R.shape[0]), np.arange(R.shape[1])],
    names=["object", "item"],
)
R_frame = pd.Series(R.ravel(), index=pair_index, name="rank").to_frame()

R_frame.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,rank
object,item,Unnamed: 2_level_1
61,3,9
458,1,10
131,8,7
451,3,8
224,5,5
59,2,4
342,0,6
123,0,6
93,1,1
351,2,0


In [115]:
# Hold out 20% of the (object, item) ratings for evaluation; the fixed
# random_state keeps the split reproducible.
split_parts = train_test_split(R_frame, train_size=0.8, random_state=100)
R_fr_train, R_fr_test = split_parts

In [116]:
# The rating scale is bounded by the smallest and largest value present in R.
lowest_rating, highest_rating = R.min(), R.max()
reader = Reader(rating_scale=(lowest_rating, highest_rating))

# reset_index() turns the (object, item) index levels into ordinary columns
# placed ahead of `rank` before the frame is handed to surprise.
train_ratings = R_fr_train.reset_index()
train_set = Dataset.load_from_df(
    df=train_ratings, reader=reader
).build_full_trainset()

In [117]:
# Fit KNNWithMeans with its default settings on the training ratings.
# `algo` is bound to whatever fit() returns, exactly as in a chained call.
knn_model = KNNWithMeans()
algo = knn_model.fit(train_set)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [123]:
# Estimate a rating for every held-out (object, item) pair, in index order,
# and store the estimates next to the true ratings.
predicted_ratings = []
for object_id, item_id in R_fr_test.index:
    prediction = algo.predict(uid=object_id, iid=item_id)
    predicted_ratings.append(prediction.est)

R_fr_test["surpr_predict"] = predicted_ratings

In [124]:
from sklearn.metrics import mean_absolute_error

In [126]:
# Mean absolute error between the held-out ratings and the KNN estimates.
mean_absolute_error(
    y_true=R_fr_test["rank"],
    y_pred=R_fr_test["surpr_predict"],
)

0.5891990700535977

In [94]:
from ranx import Qrels, Run, evaluate

In [147]:
# Mirror the (object, item) index levels as string columns; these are the
# query/document id columns passed to ranx below.
# NOTE(review): presumably ranx requires string ids — confirm against its docs.
id_strings = R_fr_test.index.to_frame()[["object", "item"]].astype("str")
R_fr_test[["object_str", "item_str"]] = id_strings

# Both ranx objects are built from the same flattened frame and differ only
# in the score column: true ratings for qrels, KNN estimates for the run.
eval_frame = R_fr_test.reset_index()
id_columns = dict(q_id_col="object_str", doc_id_col="item_str")

qrels = Qrels.from_df(df=eval_frame, score_col="rank", **id_columns)
surprise_run = Run.from_df(
    df=eval_frame, score_col="surpr_predict", **id_columns
)

In [150]:
# Ranking quality of the KNN estimates, judged on the top 3 items per object.
# NOTE(review): qrels are built from the raw `rank` ratings — confirm which
# rating values ranx counts as relevant before interpreting precision/recall.
top3_metrics = ["precision@3", "recall@3", "ndcg@3"]
evaluate(qrels, surprise_run, top3_metrics)

{'precision@3': 0.6666666666666667,
 'recall@3': 0.9473910276805599,
 'ndcg@3': 0.9739427843133937}