In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np
import pandas as pd

## Задание 1. Реализовать метрики Recall@k и  Money Recall@k

*Recall* - доля рекомендованных товаров среди релевантных = Какой % купленных товаров был среди рекомендованных

$$\Large Recall@K(i) = \frac {\sum_{j=1}^{K}\mathbb{1}_{r_{ij}}}{|Rel_i|}$$

$\Large |Rel_i|$ -- количество релевантных товаров для пользователя $i$

$$\Large MoneyRecall@K(i) = \frac {\sum_{j=1}^{K}\mathbb{1}_{r_{ij}}\cdot Price(j)}{\sum_{s\in Rel_i}Price(s)}$$


In [3]:
def recall(recommended_list, bought_list):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    flags = np.isin(bought_list, recommended_list)
    
    return flags.sum() / len(bought_list)
    

def recall_at_k(recommended_list, bought_list, k=5):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    return flags.sum() / len(bought_list)


'''
или:

def recall_at_k(recommended_list, bought_list, k=5):
    return recall(recommended_list[:k], bought_list)
'''


def money_recall_at_k(recommended_list, bought_list, prices_recommended, prices_bought, k=5):
    bought_list = np.array(bought_list)
    prices_bought = np.array(prices_bought)
    recommended_list = np.array(recommended_list)[:k]
    prices_recommended = np.array(prices_recommended)[:k]
    
    flags = np.isin(recommended_list, bought_list)
    
    recall = np.dot(flags, prices_recommended).sum() / prices_bought.sum()
    
    return recall

In [4]:
recommended_list = [143, 156, 1134, 27, 1543, 3345, 533, 11, 43] #id товаров
bought_list = [521, 32, 143, 991, 55]
prices = [23, 2233, 534, 56, 1]
prices_bought = [300, 100, 23, 4]

In [5]:
recall(recommended_list, bought_list)

0.2

In [6]:
recall_at_k(recommended_list, bought_list, k=5)

0.2

In [7]:
money_recall_at_k(recommended_list, bought_list, prices, prices_bought)

0.053864168618266976

In [8]:
recommended_list = [143, 156, 1134, 27, 1543, 3345, 533, 11, 43] #id товаров
bought_list = [143, 156]
prices = [23, 2233, 534, 56, 1]
prices_bought = [23, 2233]

In [9]:
recall(recommended_list, bought_list)

1.0

In [10]:
recall_at_k(recommended_list, bought_list, k=5)

1.0

In [11]:
money_recall_at_k(recommended_list, bought_list, prices, prices_bought)

1.0

## Задание 2. Реализовать метрику MRR@k

Mean Reciprocal Rank

- Считаем для первых k рекоммендаций
- Найти ранк первого релевантного предсказания $\Large rank_j$
- Посчитать reciprocal rank = $\Large\frac{1}{rank_j}$

$$\Large  MMR(i)@k=\frac {1}{\min\limits_{j\in Rel(i)} rank_j}$$

In [12]:
recommended_list = [156, 143, 1134, 27, 1543, 3345, 533, 11, 43]
bought_list = [521, 32, 143, 991]

recommended_list_3_users = [[143, 156, 1134, 991, 27, 1543, 3345, 533, 11, 43], # юзер 1
                    [1134, 533, 14, 4, 15, 1543, 1, 99, 27, 3345], # юзер 2
                    [991, 3345, 27, 533, 43, 143, 1543, 156, 1134, 11] # юзер 3
                    ]

bought_list_3_users = [[521, 32, 143], # юзер 1
                       [143, 156, 991, 43, 11], # юзер 2
                       [1, 2]] # юзер 3

In [13]:
def mrr_at_k(recommended_list, bought_list, k=5):
    bought_list = np.array(bought_list, dtype=object)
    recommended_list = np.array(recommended_list, dtype=object)[:k]

    if True in [isinstance(ii, list) for ii in bought_list]:
        assert len(recommended_list) == len(bought_list)
        mrr = []
        for i in range(len(bought_list)):
            rank = np.nonzero(np.isin(recommended_list[i][:k], bought_list[i]))[0]
            if len(rank) == 0:
                break
            rank = rank[0] + 1
            rr = 1 / rank
            mrr.append(rr)
        return sum([1 / r for r in mrr]) / len(bought_list) if len(mrr) > 0 else 0
    else:
        rank = np.nonzero(np.isin(recommended_list, bought_list))[0]
        if len(rank) == 0:
            return 0
        else:
            rank = rank[0] + 1
            return 1 / rank

In [14]:
mrr_at_k(recommended_list, bought_list, k=5)

0.5

In [15]:
mrr_at_k(recommended_list_3_users, bought_list_3_users, k=5)

0.3333333333333333

## Задание 3*. Реализовать метрику nDCG@k
Normalized discounted cumulative gain. Эту метрику реализовать будет немного сложнее.

$$\Large DCG@K(i) = \sum_{j=1}^{K}\frac{\mathbb{1}_{r_{ij}}}{\log_2 (j+1)}$$


$\Large \mathbb{1}_{r_{ij}}$ -- индикаторная функция показывает что пользователь $i$ провзаимодействовал с продуктом $j$

Для подсчета $nDCG$ нам необходимо найти максимально возможный $DCG$ для пользователя $i$  и рекомендаций длины $K$.
Максимальный $DCG$ достигается когда мы порекомендовали максимально возможное количество релевантных продуктов и все они в начале списка рекомендаций.

$$\Large IDCG@K(i) = max(DCG@K(i)) = \sum_{j=1}^{K}\frac{\mathbb{1}_{j\le|Rel_i|}}{\log_2 (j+1)}$$

$$\Large nDCG@K(i) = \frac {DCG@K(i)}{IDCG@K(i)}$$

$\Large |Rel_i|$ -- количество релевантных продуктов для пользователя $i$



In [16]:
recommended_list = [156, 143, 1134, 27, 1543, 3345, 533, 11, 43]
bought_list = [521, 32, 143, 991]

In [17]:
def ndcg_at_k(recommended_list, bought_list, k=5):
    bought_list = np.array(bought_list, dtype=object)
    recommended_list = np.array(recommended_list, dtype=object)[:k]
    
    flags = np.isin(recommended_list, bought_list)
    
    dcg = 1 / len(recommended_list) * (sum([j / (i + 1) if i + 1 <= 2 else j / np.log2(i + 1) for i, j in enumerate(flags)]))
    ideal_dcg = 1 / len(recommended_list) * (sum([1 / (i + 1) if i + 1 <= 2 else 1 / np.log2(i + 1) for i in range(len(flags))]))
    ndcg = dcg / ideal_dcg
    
    return ndcg

In [18]:
ndcg_at_k(recommended_list, bought_list, k=5)

0.16331296355715133