# Memory based with sklearn

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from scipy.spatial.distance import correlation as orig_correlation
from surprise import KNNWithMeans, Dataset, Reader
from ranx import Qrels, Run, evaluate

## Task generation

In [2]:
np.random.seed(10)
r_width = 10
r_height = 500

R, group = make_blobs(
    n_samples=r_height,
    n_features=r_width,
    centers=5,
    random_state=10
)
R = np.round((R-R.min())*10/(R.max()-R.min())).astype(int)

# add bias for each object
bias = np.random.randint(-2,3, [R.shape[0], 1])
R = R + bias
# sometimes bias can lead to ratings
R = np.where(R<0, 0, R)
R = np.where(R>10, 10, R)
R[:10, :]

array([[ 5,  7,  2,  7,  4,  7,  3,  7,  3,  1],
       [ 8,  5,  7,  9,  6,  7,  8,  7,  7,  8],
       [ 3,  6,  1,  7,  2,  6,  1,  6,  2,  0],
       [ 6,  0,  6,  6,  4,  1,  3,  6,  2,  0],
       [ 8,  9,  2,  7,  9,  7,  8,  5,  9,  8],
       [ 8,  9,  4, 10,  5,  8,  5, 10,  6,  4],
       [ 7,  4,  8,  2,  1,  3,  1,  6,  0,  5],
       [ 3,  0,  2,  5,  3,  3,  4,  3,  5,  4],
       [ 7,  3,  8,  2,  1,  3,  1,  7,  1,  5],
       [ 7,  8,  1,  5,  7,  6,  7,  3,  9,  7]])

In [3]:
R_frame = pd.Series(
    R.ravel(),
    index = pd.MultiIndex.from_tuples(
        [
            (j,i) 
            for j in np.arange(R.shape[0]) 
            for i in np.arange(R.shape[1])
        ],
        names = ["object", "item"]
    ),
    name = "rank"
).to_frame()

R_frame.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,rank
object,item,Unnamed: 2_level_1
436,9,2
402,4,1
302,3,5
230,3,6
264,6,8
64,5,3
398,3,2
296,2,8
241,9,3
380,6,5


In [4]:
R_fr_train, R_fr_test = train_test_split(
    R_frame, 
    train_size=0.8, 
    random_state=100
)

In [5]:
reader = Reader(
    rating_scale=(
        R.min().min(), R.max().max()
    )
)
train_set = Dataset.load_from_df(
    df=R_fr_train.reset_index(), 
    reader=reader
).build_full_trainset()

## surprise

In [6]:
algo = KNNWithMeans().fit(train_set)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [7]:
R_fr_test["surpr_predict"] = [
    algo.predict(uid=uid, iid=iid).est 
    for uid, iid in R_fr_test.index
]

In [8]:
mean_absolute_error(
    R_fr_test["rank"],
    R_fr_test["surpr_predict"]
)

0.6451729505731039

In [9]:
R_fr_test[["object_str", "item_str"]] = \
    R_fr_test.index.to_frame()[["object", "item"]].astype("str")

qrels = Qrels.from_df(
    df=R_fr_test.reset_index(),
    q_id_col="object_str", 
    doc_id_col="item_str",
    score_col="rank"
)
surprise_run = Run.from_df(
    df=R_fr_test.reset_index(),
    q_id_col="object_str",
    doc_id_col="item_str",
    score_col="surpr_predict"
)

In [10]:
evaluate(
    qrels, 
    surprise_run, 
    [
        "precision@3", 
        "recall@3", 
        "ndcg@3"
    ]
)

{'precision@3': 0.6651818856718633,
 'recall@3': 0.964206172446707,
 'ndcg@3': 0.9911405135015423}

## sklearn

In [17]:
def correlation(a,b):
    '''
    NEED testing!!!
    '''
    cond = ((a != -1) & (b != -1))
    return orig_correlation(a[cond], b[cond])

R_mat_train = R_fr_train.unstack().fillna(-1).to_numpy()

In [23]:
nn = NearestNeighbors(
    n_neighbors=40,
    metric=correlation
).fit(R_mat_train)

In [27]:
distances, indices = nn.kneighbors(R_mat_train)

  dist = 1.0 - uv / math.sqrt(uu * vv)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [122]:
def get_estimations(obj):
    dist, ind = distances[obj, :], indices[obj, :]
    collaboration_R = np.where(
        R_mat_train[ind,:] == -1,
        np.NaN, R_mat_train[ind,:]
    )
    collaboration_r_bar = np.nanmean(collaboration_R, axis=1)

    sim = 1-dist

    scores = np.nansum(
        (collaboration_R - collaboration_r_bar[:, np.newaxis])*sim[:, np.newaxis],
        axis=0
    )/np.abs(sim).sum()

    m = np.nanmean(np.where(
        R_mat_train[obj,:]==-1,
        np.NaN, 
        R_mat_train[obj,:]
    ))

    return scores+m

In [149]:
predictions = np.concatenate(
    [
        get_estimations(i)[np.newaxis, :]
        for i in range(500)
    ]
)

In [151]:
predictions[300, :]

array([7.28898973, 2.18974413, 5.98889745, 6.88389521, 5.17644765,
       3.4250464 , 3.25849608, 7.11749231, 3.29885262, 2.51499556])

In [137]:
R[300, :]

array([8, 2, 6, 7, 5, 3, 3, 8, 3, 2])