# Selfmade memory based

In [28]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from scipy.spatial.distance import (
    correlation as orig_correlation,
    cosine as orig_cosine
)
from surprise import KNNWithMeans, Dataset, Reader
from ranx import Qrels, Run, evaluate

from typing import Union, Callable

from IPython.display import HTML
# HTML snippet used to render small headers inside cell outputs;
# "{}" is filled with the header text through str.format.
header_pattern = "<text style='font-size:20px'>{}</text>"

## Task generation

In [2]:
# Synthetic rating matrix: r_height objects (rows) x r_width items
# (columns), sampled from 5 blobs produced by make_blobs.
np.random.seed(10)
r_width = 50
r_height = 500

R, group = make_blobs(
    n_samples=r_height,
    n_features=r_width,
    centers=5,
    random_state=10
)
# min-max scale the blob coordinates to integer ratings in 0..10
R = np.round((R - R.min()) * 10 / (R.max() - R.min())).astype(int)

# every object gets its own integer bias in -2..2
bias = np.random.randint(-2, 3, size=(R.shape[0], 1))
R = R + bias
# the bias can push ratings outside the 0..10 scale, so clip them back
R = np.clip(R, 0, 10)
R[:10, :]

array([[ 4,  7,  0,  3,  7,  5,  5,  0,  3,  4,  3,  4,  5,  2,  0,  2,
         1,  1,  8,  3,  2,  6,  1,  6,  4,  6,  1,  1,  5,  1,  1,  4,
         2,  2,  8,  8,  4,  4,  0,  8,  4,  6,  4,  7,  5,  5,  1,  4,
         5,  1],
       [ 9, 10,  6, 10,  7,  5, 10,  9,  7,  8,  5, 10, 10,  6, 10,  5,
         3,  3,  4,  4,  5, 10,  7,  4, 10,  6,  4,  9,  5,  9,  6,  9,
        10,  7,  7,  3,  6,  6,  7, 10,  6,  6,  9,  6,  5,  9,  7,  4,
        10,  5],
       [ 4,  6,  0,  3,  5,  5,  4,  0,  1,  3,  1,  3,  4,  0,  0,  2,
         0,  1,  7,  1,  0,  5,  0,  5,  3,  4,  0,  0,  4,  0,  1,  3,
         0,  3,  6,  7,  4,  3,  0,  7,  3,  5,  3,  6,  4,  5,  1,  2,
         4,  0],
       [ 5,  6,  1,  3,  6,  7,  5,  1,  3,  4,  3,  4,  5,  1,  0,  2,
         1,  1,  7,  3,  2,  6,  1,  6,  4,  5,  1,  1,  5,  1,  1,  5,
         2,  3,  7,  7,  5,  4,  1,  7,  4,  5,  4,  7,  5,  5,  0,  4,
         6,  1],
       [ 8,  8,  5, 10,  5,  4,  8,  7,  6,  7,  4,  9, 10,  5,  9, 

In [3]:
np.random.seed(10)

# Long representation of R: one row per (object, item) pair.
# R.ravel() walks R row by row, which matches the order produced by
# from_product (first level changes slowest).
R_frame = pd.DataFrame(
    {"rank": R.ravel()},
    index=pd.MultiIndex.from_product(
        [np.arange(R.shape[0]), np.arange(R.shape[1])],
        names=["object", "item"]
    )
)

# an item counts as relevant when its rating is above 5
R_frame["relevant"] = R_frame["rank"].gt(5).astype("int")
# random scores, used later as a baseline ranking
R_frame["random_predict"] = np.random.rand(len(R_frame))
R_frame.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,rank,relevant,random_predict
object,item,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
440,47,2,0,0.975159
283,2,3,0,0.229527
396,33,0,0,0.397484
14,6,5,0,0.270233
198,15,5,0,0.59817
20,33,1,0,0.709187
422,24,9,1,0.814357
267,4,2,0,0.505066
408,25,0,0,0.244018
99,44,3,0,0.754604


In [4]:
# Split the (object, item) pairs into train and test samples.
# The parts are copied because later cells add columns to them;
# working on explicit copies keeps those assignments unambiguous.
R_fr_train, R_fr_test = (
    part.copy()
    for part in train_test_split(
        R_frame,
        train_size=0.8,
        random_state=100
    )
)

# Represent the train/test samples as object x item matrices; pairs
# that did not fall into a sample are NaN. Both axes are reindexed
# against the full rating frame, so the two matrices always share the
# same shape and row/column order, even if some object or item has no
# observations in one of the samples.
all_objects = R_frame.index.get_level_values("object").unique()
all_items = R_frame.index.get_level_values("item").unique()

R_mat_train = R_fr_train["rank"].unstack().reindex(
    index=all_objects, columns=all_items
)
R_mat_test = R_fr_test["rank"].unstack().reindex(
    index=all_objects, columns=all_items
)

In [5]:
# ranking metrics, all computed on the top-3 items
metrics = ["precision@3", "recall@3", "ndcg@3"]

# string copies of the object/item ids; they are passed to ranx below
# as query ids and document ids
R_fr_test[["object_str", "item_str"]] = (
    R_fr_test.index.to_frame()[["object", "item"]].astype("str")
)

test_records = R_fr_test.reset_index()
id_columns = {"q_id_col": "object_str", "doc_id_col": "item_str"}

# ground truth: binary relevance of every test (object, item) pair
qrels = Qrels.from_df(
    df=test_records, score_col="relevant", **id_columns
)
# baseline: items are ranked by uniformly random scores
random_run = Run.from_df(
    df=test_records, score_col="random_predict", **id_columns
)

evaluate(qrels, random_run, metrics=metrics)

{'precision@3': 0.3713333333333333,
 'recall@3': 0.3102086885336885,
 'ndcg@3': 0.4036283432588485}

## surprise

In [6]:
# the rating scale is taken from the generated rating matrix itself
reader = Reader(rating_scale=(R.min(), R.max()))

# surprise reads the frame as (user, item, rating) columns;
# here they are object, item, rank
train_ratings = R_fr_train["rank"].reset_index()
train_set = Dataset.load_from_df(
    df=train_ratings,
    reader=reader
).build_full_trainset()

algo = KNNWithMeans().fit(train_set)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [7]:
# estimated rating for every (object, item) pair of the test sample
R_fr_test["surpr_predict"] = [
    algo.predict(uid=object_id, iid=item_id).est
    for object_id, item_id in R_fr_test.index
]

In [8]:
# mean absolute error between the true ratings and the surprise estimates
mean_absolute_error(
    y_true=R_fr_test["rank"],
    y_pred=R_fr_test["surpr_predict"]
)

0.414035940075337

In [9]:
# rank the test items of every object by the ratings estimated above
# and score that ranking against the same qrels as the random baseline
surprise_run = Run.from_df(
    df=R_fr_test.reset_index(),
    score_col="surpr_predict",
    q_id_col="object_str",
    doc_id_col="item_str"
)
evaluate(qrels, surprise_run, metrics=metrics)

{'precision@3': 0.7666666666666667,
 'recall@3': 0.7254612137862139,
 'ndcg@3': 0.9273804112635808}

## Selfmade

### Similarity measure

**Note** Many sources use difference instead of similarity, but similarity is the inverse of difference, so you can search not for items that have the smallest difference, but for items that have the strongest similarity.

We need a method to estimate how close the objects are to each other. The following cell defines such a function - it implements Pearson's correlation coefficient, which can solve some problems related to the RecSys domain.

In [10]:
def correlation(
        a : np.ndarray,
        b : np.ndarray
        ) -> float:
    '''
    Pearson correlation coefficient adapted to sparse
    relevance vectors: positions where either input is
    NaN are ignored.

    Parameters
    ----------
    a : (N,) array-like
        input array, NaN marks a missing value;
    b : (N,) array-like
        input array, NaN marks a missing value;

    Returns
    ----------
    out : float
        The Pearson correlation coefficient computed
        using only the positions observed in both
        arrays. If the coefficient cannot be computed
        (fewer than two common positions, or one of the
        inputs is constant on them), 0 is returned,
        indicating neutral similarity.
        The coefficient is a number ranging from -1 to 1.
    '''
    # accept any array-like (lists, integer arrays) and make
    # sure boolean masking below works
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    # positions observed in both arrays
    cond = ~(np.isnan(a) | np.isnan(b))
    # with fewer than two common observations
    # the correlation coefficient is undefined
    if np.count_nonzero(cond) <= 1:
        return 0.

    sub_a = a[cond]
    sub_b = b[cond]

    variation_a = (sub_a - sub_a.mean())
    variation_b = (sub_b - sub_b.mean())

    # a constant input has zero variance, which would
    # lead to division by zero in the coefficient
    if (variation_a==0).all() or (variation_b==0).all():
        return 0.

    cov = (variation_a*variation_b).sum()
    return cov/np.sqrt(
        (variation_a**2).sum()*(variation_b**2).sum()
    )

Here are some cases where this function has been used and the result.

In [11]:
# Each case: label to print and the two vectors to compare.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
correlation_cases = [
    (
        "Total correlation -",
        np.array([0,1,2,3,4]),
        np.array([5,6,7,8,9])
    ),
    (
        "Total correlation with empty -",
        np.array([np.nan, 1, 2, np.nan]),
        np.array([10, 10, 20, np.nan])
    ),
    (
        "Constant variable -",
        np.array([1,1,1,1]),
        np.array([3,2,1,2])
    ),
    (
        "Not enough common elements -",
        np.array([np.nan, 2, np.nan, 3]),
        np.array([1, np.nan, 10, np.nan])
    ),
]
for label, first, second in correlation_cases:
    result = correlation(first, second)
    print(label, result)

Total correlation - 1.0
Total correlation with empty - 1.0
Constant variable - 0.0
Not enough common elements - 0.0


### Prediction function

We need a way to turn the results of the previous steps into predictions. The following cell implements the classical formula for generating predictions from a collaboration (the set of users similar to the user under consideration):

$$\frac{\sum_{i}(r_{ij}-\overline{r_i})\,sim_i}{\sum_i{|sim_i|}}$$

Where:

- $r_{ij}$  - relevance of the $j$-th item for the $i$-th user of the collaboration.
- $\overline{r_i}$ - average relevance of the $i$-th user.
- $sim_i$ similarity of the $i$-th object of the collaboration to the object for which we are making a prediction.

In [12]:
def basic_prediction(
        collaboration : np.ndarray,
        similarities : np.ndarray
        ) -> np.ndarray:
    r"""
    Basic function for creating a prediction.
    Uses the formula
    \frac{\sum_{i}(x_{ij}-\overline{x_i})sim_i}{\sum_i{|sim_i|}}.

    The result is a similarity-weighted deviation from the
    users' own mean relevances, not an absolute relevance.
    The denominator sums the similarities of all users of the
    collaboration, including those with no value for an item.

    Parameters
    ----------
    collaboration : array-like (
        <users number>,
        <items number>
        )
        relevances matrix of the collaboration,
        NaN marks a missing relevance;
    similarities : array-like (<users number>)
        similarities of the users from the
        collaboration to the user under consideration.

    Returns
    -------
    out : np.ndarray(<items number>)
        scores of the items under consideration. Items that
        have no observed relevance for any user of the
        collaboration get NaN.
    """
    # accept lists / integer arrays as well as float arrays
    collaboration = np.asarray(collaboration, dtype=float)
    similarities = np.asarray(similarities, dtype=float)

    # mean relevance of every user over the items they have rated
    users_mean = np.nanmean(collaboration, axis=1, keepdims=True)
    weighed_collab = (collaboration - users_mean)*similarities[:, np.newaxis]
    # missing relevances contribute nothing to the numerator
    res = np.nansum(weighed_collab, axis=0)/np.abs(similarities).sum()

    # nansum yields 0 for columns without any observation;
    # such items must be marked as unknown instead
    is_empty = np.isnan(collaboration).all(axis=0)
    res[is_empty] = np.nan

    return res

Consider how it works.

We will consider collaborations that have a matrix of relevance:

$$P_u=\left( \begin{array}{cccc}
2 & 7 & 9 & - \\
0 & - & 3 & -
\end{array}\right)
$$

And estimations of similarity:

$$
\overline{sim} = 
\left( 
\begin{array}{c}
0.7 \\
0.6
\end{array} 
\right)
$$

The average relevance of the users in the collaboration will take shape:

$$\overline{P_u}= 
\left(
\begin{array}{c}
6\\
1.5
\end{array}
\right)
$$

Sum of similarities:

$$\sum_i \left|sim_i\right|=0.7+0.6=1.3$$

So we finally can compute estimated relevances for user with this collaboration:

$$\left( \frac{(2-6)0.7 + (0-1.5)0.6}{1.3}, \frac{(7-6)0.7}{1.3}, \frac{(9-6)0.7 + (3-1.5)0.6}{1.3}, - \right)=$$
$$=\left(-2.8461, 0.5384, 2.3077, - \right)$$

Now let's try to apply it to the same numbers:

In [13]:
# Same numbers as in the worked example above; np.nan marks a missing
# relevance (the np.NaN alias was removed in NumPy 2.0).
collaboration = np.array([
    [2, 7, 9, np.nan],
    [0, np.nan, 3, np.nan]
])
similarities = np.array([0.7, 0.6])

basic_prediction(
    collaboration=collaboration,
    similarities=similarities
)

array([-2.84615385,  0.53846154,  2.30769231,         nan])

### Collaborative filter

Finally, consider a class that implements such an algorithm:

In [14]:
class CollaborativeFilter:
    """
    Class implements collaborative filtering based
    on nearest neighbours. It uses information about
    previous relevancies, we will call them
    "fit relevancies", to estimate relevancies for
    some new users, we will call them "predict relevancies".

    Attributes
    ----------
    similarity : Callable[[np.ndarray, np.ndarray], float]
        Similarity calculation function
        used to find the collaboration;
    prediction : Callable[[np.ndarray, np.ndarray], np.ndarray]
        Function that produces scores for all items
        available in the train sample. It is called as
        prediction(collaboration, similarities);
    sim_threshold : float = -np.inf
        Similarity threshold. Only fit users whose
        similarity is strictly higher than the threshold
        are included in the collaboration;
    n_nearest : int = 1000
        Maximum number of rows that can be
        included in the collaboration. If more rows pass
        the threshold, the most similar ones are kept.
    """
    def __init__(
            self, 
            similarity : Callable[[np.ndarray, np.ndarray], float] = correlation,
            prediction : Callable[[np.ndarray, np.ndarray], np.ndarray] = basic_prediction,
            sim_threshold : float = -np.inf,
            n_nearest : int = 1000
            ):
        self.similarity = similarity
        self.prediction = prediction
        self.sim_threshold = sim_threshold
        self.n_nearest = n_nearest
        
    
    def fit(self, X:Union[np.ndarray, pd.DataFrame]):
        '''
        Remember the train sample.

        Parameters
        ----------
        X : np.ndarray(
            <users number>,
            <items number>
        )
            Relevances matrix;

        Returns
        -------
        out : CollaborativeFilter
            Model instance.
        '''
        self.X=np.array(X)
        return self

    def get_similarities(self, X:np.ndarray) -> np.ndarray:
        '''
        Compute similarities between every user of
        the training sample ("fit users") and every
        user for which predictions are needed
        ("predict users").

        Parameters
        ----------
        X : np.ndarray (
            <users number>, 
            <number of items>)
            Predict relevancies;
        
        Returns
        -------
        out : np.ndarray (
            <users number in fit relevancies>, 
            <users number in predict relevancies>
            )
            array where each element is the similarity
            between the i-th fit user and the j-th predict
            user.
        '''
        # outer pass walks over the fit rows, inner pass over the
        # predict rows, so element [i, j] compares fit row i
        # with predict row j
        return np.apply_along_axis(
            func1d=lambda history_row: np.apply_along_axis(
                func1d=(
                    lambda predict_row: 
                    self.similarity(history_row, predict_row)
                ),
                arr=X, axis=1
            ), 
            arr=self.X, axis=1
        )

    def _neighbour_index(self, user_sim:np.ndarray) -> np.ndarray:
        '''
        Row numbers of the fit users that form the
        collaboration of one predict user.

        Parameters
        ----------
        user_sim : np.ndarray (<users number in fit relevancies>)
            Similarities of all fit users to the predict user.

        Returns
        -------
        out : np.ndarray
            Indices of rows of self.X, in storage order. Only
            rows with similarity above sim_threshold are used;
            if more than n_nearest rows pass, the most similar
            ones are kept.
        '''
        passed = np.flatnonzero(user_sim > self.sim_threshold)
        if passed.size > self.n_nearest:
            # stable sort: ties are resolved in favour of earlier rows
            closest = np.argsort(
                -user_sim[passed], kind="stable"
            )[:self.n_nearest]
            # keep storage order so that results are unchanged
            # whenever the n_nearest limit does not bind
            passed = np.sort(passed[closest])
        return passed
    

    def get_collaborations(self, X:np.ndarray):
        '''
        Get collaborations for the given predict
        relevancies.

        Parameters
        ----------
        X : np.array (<users number>, <items number>)
            Predict relevancies.

        Returns
        ----------
        out : np.array (
            <collaboration size>,
            <items number>,
            <users number in predict relevancies>
            )
            Rows of the fit relevancies selected for every
            predict user. The per-user results are stacked by
            np.apply_along_axis, so every predict user has to
            end up with the same number of neighbours.
        '''
        similarities = self.get_similarities(X=X)
        # walk over the columns of similarities (one column per
        # predict user) and pick the rows of the collaboration
        return np.apply_along_axis(
            func1d=lambda user_sim: (
                self.X[self._neighbour_index(user_sim), :]
            ),
            axis=0, arr=similarities
        )


    def predict(self, X:np.ndarray)->np.ndarray:
        '''
        Get predictions for users with the
        given relevancies.

        Parameters
        ----------
        X : np.ndarray (<users number>, <items number>)
            matrix that describes the relevancies of the
            users for which predictions have to be made;
        
        Returns
        -------
        out : np.ndarray (<users number>, <items number>)
            output of the prediction function for every given
            user. A row consists of NaN when no fit user
            passes the similarity threshold for that user.

        Raises
        ------
        ValueError
            If the number of items differs from the number of
            items in the train sample.
        '''
        X = np.asarray(X)
        if X.shape[1] != self.X.shape[1]:
            # message: "The numbers of items in the train data set
            # and in the data set for prediction do not match."
            raise ValueError(
                "Количества игр в обучающем наборе данных "
                "и наборе для предсказания не совпадают."
                )

        predictions = np.full(X.shape, np.nan, dtype=float)
        # nothing to compare: np.apply_along_axis cannot iterate
        # over an empty dimension
        if X.shape[0] == 0 or self.X.shape[0] == 0:
            return predictions
        
        similarities = self.get_similarities(X)

        for user_idx in range(X.shape[0]):
            user_sim = similarities[:, user_idx]
            neighbours = self._neighbour_index(user_sim)
            if neighbours.size == 0:
                # no collaboration - leave the row as NaN
                continue
            predictions[user_idx] = self.prediction(
                self.X[neighbours, :],
                user_sim[neighbours]
            )

        return predictions

#### Get similarities

Each row of the training sample must be compared with every prediction row using the similarity function passed to the constructor.

Here's an example that performs element-wise concatenation instead of computing a similarity, so we can see that the result contains all combinations of rows.

In [41]:
# rows that play the role of the train sample
history = np.array([
    ["11h", "12h"],
    ["21h", "22h"],
    ["31h", "32h"]
])
# rows that play the role of the sample
# we make predictions for
predict = np.array([
    ["11p", "12p"],
    ["21p", "22p"]
])

# instead of a real similarity, glue the paired elements of both
# rows together - the output then shows which rows were compared
cf = CollaborativeFilter(
    similarity=lambda hist_row, pred_row: "|".join(
        h + "_" + p for h, p in zip(hist_row, pred_row)
    )
)
cf.fit(history)
ans = cf.get_similarities(predict)

display(HTML(header_pattern.format("Raw output")))
display(ans)

display(HTML(header_pattern.format("Interpretation")))
# ans[i][j] holds the comparison of history row i with predict row j
for hist_i, hist in enumerate(ans):
    for pred_i, pred in enumerate(hist):
        print(f"history {hist_i + 1}, predictions {pred_i + 1}")
        print(pred)

array([['11h_11p|12h_12p', '11h_21p|12h_22p'],
       ['21h_11p|22h_12p', '21h_21p|22h_22p'],
       ['31h_11p|32h_12p', '31h_21p|32h_22p']], dtype='<U15')

history 1, predictions 1
11h_11p|12h_12p
history 1, predictions 2
11h_21p|12h_22p
history 2, predictions 1
21h_11p|22h_12p
history 2, predictions 2
21h_21p|22h_22p
history 3, predictions 1
31h_11p|32h_12p
history 3, predictions 2
31h_21p|32h_22p


So as a result we get the matrix $[sim_{ij}]$, where $sim_{ij}$ is the similarity of the $i$-th fit user to the $j$-th prediction user.

### Get collaboration

In [19]:
# NOTE(review): the only top-level assignment of `similarities` visible
# in this notebook is the two-element example vector used for
# basic_prediction, while the output below is a 2-D matrix. It appears to
# come from a cell that is not shown - TODO: compute it explicitly in a
# preceding cell so the notebook survives Restart & Run All.
similarities

array([[-0.58829171,  0.08276059, -0.4258411 , ...,  0.53354002,
        -0.45328433, -0.91364277],
       [-0.21633961, -0.13245324, -0.72969394, ...,  0.97336648,
        -0.43185521, -0.77991383],
       [-0.66560255, -0.01863067, -0.2192645 , ...,  0.23918729,
        -0.48696592, -0.38150302],
       ...,
       [-0.3899503 , -0.36707203, -0.44233627, ...,  0.97574887,
        -0.11202768, -0.64677417],
       [ 0.00776151, -0.74330462,  0.92450033, ...,  0.40421051,
         0.95807103,  0.93155164],
       [ 0.04428005, -0.97302555,  0.91855865, ...,  0.27027781,
         0.9647146 ,  0.91132238]])