# Memory based with sklearn

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from scipy.spatial.distance import (
    correlation as orig_correlation,
    cosine as orig_cosine
)
from surprise import KNNWithMeans, Dataset, Reader
from ranx import Qrels, Run, evaluate

## Task generation

In [2]:
np.random.seed(10)
r_width = 10
r_height = 500

R, group = make_blobs(
    n_samples=r_height,
    n_features=r_width,
    centers=5,
    random_state=10
)
R = np.round((R-R.min())*10/(R.max()-R.min())).astype(int)

# add bias for each object
bias = np.random.randint(-2,3, [R.shape[0], 1])
R = R + bias
# sometimes bias can lead to ratings
R = np.where(R<0, 0, R)
R = np.where(R>10, 10, R)
R[:10, :]

array([[ 5,  7,  2,  7,  4,  7,  3,  7,  3,  1],
       [ 8,  5,  7,  9,  6,  7,  8,  7,  7,  8],
       [ 3,  6,  1,  7,  2,  6,  1,  6,  2,  0],
       [ 6,  0,  6,  6,  4,  1,  3,  6,  2,  0],
       [ 8,  9,  2,  7,  9,  7,  8,  5,  9,  8],
       [ 8,  9,  4, 10,  5,  8,  5, 10,  6,  4],
       [ 7,  4,  8,  2,  1,  3,  1,  6,  0,  5],
       [ 3,  0,  2,  5,  3,  3,  4,  3,  5,  4],
       [ 7,  3,  8,  2,  1,  3,  1,  7,  1,  5],
       [ 7,  8,  1,  5,  7,  6,  7,  3,  9,  7]])

In [6]:
np.random.seed(10)
R_frame = pd.Series(
    R.ravel(),
    index = pd.MultiIndex.from_tuples(
        [
            (j,i) 
            for j in np.arange(R.shape[0]) 
            for i in np.arange(R.shape[1])
        ],
        names = ["object", "item"]
    ),
    name = "rank"
).to_frame()

R_frame["relevant"] = (R_frame["rank"] > 5).astype("int")
R_frame["random_predict"] = np.random.rand(R_frame.shape[0])
R_frame.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,rank,relevant,random_predict
object,item,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
38,7,6,1,0.03733
126,4,6,1,0.120591
147,3,8,1,0.285638
215,2,3,0,0.204547
44,5,3,0,0.501746
264,3,10,1,0.79065
401,7,8,1,0.507431
233,0,6,1,0.794765
361,9,7,1,0.169455
312,2,6,1,0.181615


In [7]:
R_fr_train, R_fr_test = train_test_split(
    R_frame, 
    train_size=0.8, 
    random_state=100
)

In [10]:
metrics = [
    "precision@3", 
    "recall@3", 
    "ndcg@3"
]
R_fr_test[["object_str", "item_str"]] = \
    R_fr_test.index.to_frame()[["object", "item"]].astype("str")

qrels = Qrels.from_df(
    df=R_fr_test.reset_index(),
    q_id_col="object_str", 
    doc_id_col="item_str",
    score_col="relevant"
)

random_run = Run.from_df(
    df=R_fr_test.reset_index(),
    q_id_col="object_str",
    doc_id_col="item_str",
    score_col="random_predict"
)

evaluate(
    qrels, 
    random_run, 
    metrics=metrics
)

{'precision@3': 0.311804008908686,
 'recall@3': 0.6076837416481069,
 'ndcg@3': 0.5401559043967112}

## surprise

In [11]:
reader = Reader(
    rating_scale=(
        R.min().min(), R.max().max()
    )
)
train_set = Dataset.load_from_df(
    df=R_fr_train["rank"].reset_index(), 
    reader=reader
).build_full_trainset()
algo = KNNWithMeans().fit(train_set)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [13]:
R_fr_test["surpr_predict"] = [
    algo.predict(uid=uid, iid=iid).est 
    for uid, iid in R_fr_test.index
]

In [14]:
mean_absolute_error(
    R_fr_test["rank"],
    R_fr_test["surpr_predict"]
)

0.6451729505731039

In [17]:
surprise_run = Run.from_df(
    df=R_fr_test.reset_index(),
    q_id_col="object_str",
    doc_id_col="item_str",
    score_col="surpr_predict"
)
evaluate(
    qrels, 
    surprise_run, 
    metrics=metrics
)

{'precision@3': 0.3266518188567186,
 'recall@3': 0.6303266518188567,
 'ndcg@3': 0.6346257036375941}

## sklearn

In [11]:
def correlation(a,b):
    cond = ((a != -1) & (b != -1))
    # in case if there are only two
    # observations it's impossible
    # to compute coorrelation coeficient
    # it's invalid case - so we return 
    # the biggest possible distance
    if sum(cond) <=1:
        return 2

    a_std = a[cond].std()
    b_std = b[cond].std()

    # Pearson coefficient uses standard 
    # deviations in the denominator, so 
    # if any of them is equal to zero, 
    # we have to return the biggest
    # possible distance.
    if a_std==0 or b_std==0:
        return 2
    return orig_correlation(a,b)

R_mat_train = R_fr_train.unstack().fillna(-1).to_numpy()

In [12]:
nn = NearestNeighbors(
    n_neighbors=40,
    metric=correlation
).fit(R_mat_train)

In [13]:
distances, indices = nn.kneighbors(R_mat_train)

In [14]:
for i in range(500):
    a = correlation(
        R_mat_train[i,:],
        R_mat_train[23,:]
    )

In [15]:
nn.kneighbors(R_mat_train[[23],:])

(array([[0.        , 0.00773735, 0.01793955, 0.02036468, 0.03078565,
         0.12830688, 0.1367376 , 0.1594134 , 0.19499183, 0.19572999,
         0.21355416, 0.21951847, 0.2424083 , 0.24593219, 0.24878308,
         0.25488075, 0.25593612, 0.25793885, 0.26449012, 0.26531659,
         0.27145967, 0.27193466, 0.28108243, 0.2833314 , 0.29896649,
         0.30095485, 0.31178408, 0.32141895, 0.33011679, 0.33980562,
         0.34359015, 0.34828176, 0.351791  , 0.37609985, 0.38677301,
         0.39023288, 0.39090246, 0.39342912, 0.40267617, 0.41134453]]),
 array([[ 23,  43, 438, 434, 499, 277, 405, 387, 373, 334, 211, 194, 340,
         166, 329,  39, 439, 314, 303, 257,   4, 350,  46, 283, 198, 284,
         270,  47,  57, 156, 328, 425,  55, 371, 351, 173,  24, 196, 421,
         445]]))

In [16]:
def get_estimations(obj):
    dist, ind = distances[obj, :], indices[obj, :]
    collaboration_R = np.where(
        R_mat_train[ind,:] == -1,
        np.NaN, R_mat_train[ind,:]
    )
    collaboration_r_bar = np.nanmean(collaboration_R, axis=1)

    sim = 1-dist

    scores = np.nansum(
        (collaboration_R - collaboration_r_bar[:, np.newaxis])*sim[:, np.newaxis],
        axis=0
    )/np.abs(sim).sum()

    m = np.nanmean(np.where(
        R_mat_train[obj,:]==-1,
        np.NaN, 
        R_mat_train[obj,:]
    ))

    return scores+m

In [17]:
sklearn_predictions = np.concatenate(
    [
        get_estimations(i)[np.newaxis, :]
        for i in range(500)
    ]
)

In [18]:
algo.predict(uid=300, iid=2).est 

6.55170660504411

In [19]:
sklearn_predictions[300,:]

array([6.42680978, 4.46057007, 3.90834435, 6.76853161, 3.8343215 ,
       4.76347161, 3.38226289, 7.71132283, 2.91157822, 2.9756443 ])

In [20]:
R[300, :]

array([8, 2, 6, 7, 5, 3, 3, 8, 3, 2])