# Метрики

## Imports

In [5]:
import time

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import numba as nb

from tqdm.auto import tqdm
from rectools import Columns

In [6]:
np.random.seed(23)

## Read data

In [7]:
interactions = pd.read_csv('data_original/interactions.csv')

interactions.rename(
    columns={
        'track_id': Columns.Item,
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    },
    inplace=True)

interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime])

In [8]:
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

In [9]:
def headtail(df):
    return pd.concat([df.head(), df.tail()])

headtail(interactions)

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0
5476250,319709,4436,2021-08-15,3921,45.0


In [10]:
interactions.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       int64         
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 208.9 MB


Выделим небольшой кусок из данных, чтобы не слишком страдать

In [11]:
sample_users = [57607, 403227, 70720]
df = interactions[interactions[Columns.User].isin(sample_users)].reset_index(drop=True)
del df[Columns.Datetime], df[Columns.Weight], df['watched_pct']
df

Unnamed: 0,user_id,item_id
0,70720,4880
1,57607,4151
2,57607,10440
3,70720,4881
4,403227,6353
5,403227,1736
6,70720,6327
7,57607,13865
8,403227,5336
9,403227,181


In [12]:
print('Users', df[Columns.User].unique())
sample_items = df[Columns.Item].unique()
print('Items', sample_items)

Users [ 70720  57607 403227]
Items [ 4880  4151 10440  4881  6353  1736  6327 13865  5336   181]


## Regression

В регрессией все относительно просто. По (user, item) мы знаем таргет (рейтинг чаще всего) и по такой же паре предсказываем его

In [13]:
df['target'] = np.random.choice([3, 4, 5], df.shape[0])
df['predict'] = np.random.rand(df.shape[0]) * 3 + 2
df

Unnamed: 0,user_id,item_id,target,predict
0,70720,4880,5,2.658958
1,57607,4151,3,4.500017
2,57607,10440,4,3.280296
3,70720,4881,5,4.263719
4,403227,6353,3,4.990597
5,403227,1736,4,2.000249
6,70720,6327,3,4.927865
7,57607,13865,5,3.257364
8,403227,5336,4,4.044672
9,403227,181,5,4.257889


Общая оценка

In [14]:
mae = (df['target'] - df['predict']).abs().mean()
print(mae)

1.374467645324227


Оценка по пользователю с последюущим усреднением

In [15]:
df['diff'] = (df['target'] - df['predict']).abs()
average_mae = df.groupby(Columns.User)['diff'].mean()
print(average_mae.mean())
average_mae

1.3944881931989215


user_id
57607     1.320786
70720     1.668396
403227    1.194283
Name: diff, dtype: float64

Видно, что в данном случае метрики близки к друг другу, но это не всегда так

In [16]:
del df['target'], df['predict'], df['diff']

## Classification

Сгенерируем случайные рекомендации.

In [17]:
top_k = 5
recs = np.array([
    np.random.choice(sample_items, top_k, replace=False),
    np.random.choice(sample_items, top_k, replace=False),
    np.random.choice(sample_items, top_k, replace=False),
])
recs

array([[ 4880,  6327, 10440,  1736,  4881],
       [  181,  1736,  4880, 13865,  6353],
       [ 4151,  4881,  6353, 10440,  1736]])

Преобразуем в длинный датафрейм

In [18]:
df_recs = pd.DataFrame({
    Columns.User: np.repeat(sample_users, top_k),
    Columns.Item: recs.ravel()
})
df_recs

Unnamed: 0,user_id,item_id
0,57607,4880
1,57607,6327
2,57607,10440
3,57607,1736
4,57607,4881
5,403227,181
6,403227,1736
7,403227,4880
8,403227,13865
9,403227,6353


In [19]:
df_recs[Columns.Rank] = df_recs.groupby(Columns.User).cumcount() + 1
headtail(df_recs)

Unnamed: 0,user_id,item_id,rank
0,57607,4880,1
1,57607,6327,2
2,57607,10440,3
3,57607,1736,4
4,57607,4881,5
10,70720,4151,1
11,70720,4881,2
12,70720,6353,3
13,70720,10440,4
14,70720,1736,5


Ключевой момент. Именно ради него преобразовывали данные и именно это позволяет считать метрики быстрее.

In [20]:
df_recs = df.merge(df_recs, how='left', left_on=Columns.UserItem, right_on=Columns.UserItem)
df_recs = df_recs.sort_values(by=[Columns.User, Columns.Rank])
df_recs

Unnamed: 0,user_id,item_id,rank
2,57607,10440,3.0
1,57607,4151,
7,57607,13865,
3,70720,4881,2.0
0,70720,4880,
6,70720,6327,
9,403227,181,1.0
5,403227,1736,2.0
4,403227,6353,5.0
8,403227,5336,


### Precision@K

In [21]:
df_recs[f'TP@5'] = df_recs['rank'] < 6
df_recs

Unnamed: 0,user_id,item_id,rank,TP@5
2,57607,10440,3.0,True
1,57607,4151,,False
7,57607,13865,,False
3,70720,4881,2.0,True
0,70720,4880,,False
6,70720,6327,,False
9,403227,181,1.0,True
5,403227,1736,2.0,True
4,403227,6353,5.0,True
8,403227,5336,,False


In [22]:
df_recs[df_recs[Columns.Rank].notnull()]

Unnamed: 0,user_id,item_id,rank,TP@5
2,57607,10440,3.0,True
3,70720,4881,2.0,True
9,403227,181,1.0,True
5,403227,1736,2.0,True
4,403227,6353,5.0,True


Посчитаем вручную (1/5 + 1/5 + 3/5) / 3

In [23]:
df_recs['TP@5/5'] = df_recs['TP@5'] / top_k

p5 = df_recs.groupby(Columns.User)['TP@5/5'].sum().mean()

print(f'Precision@5 = {p5}')

Precision@5 = 0.3333333333333333


In [24]:
df_recs

Unnamed: 0,user_id,item_id,rank,TP@5,TP@5/5
2,57607,10440,3.0,True,0.2
1,57607,4151,,False,0.0
7,57607,13865,,False,0.0
3,70720,4881,2.0,True,0.2
0,70720,4880,,False,0.0
6,70720,6327,,False,0.0
9,403227,181,1.0,True,0.2
5,403227,1736,2.0,True,0.2
4,403227,6353,5.0,True,0.2
8,403227,5336,,False,0.0


Используем тот факт, что мы знаем количество пользователей, а значит groupby не нужен

In [25]:
p5 = df_recs['TP@5/5'].sum() / len(sample_users)
print(f'Precision@5 = {p5}')

Precision@5 = 0.3333333333333333


### Recall@K

In [26]:
df_recs['actual'] = df_recs.groupby(Columns.User)[Columns.Item].transform('count')
df_recs

Unnamed: 0,user_id,item_id,rank,TP@5,TP@5/5,actual
2,57607,10440,3.0,True,0.2,3
1,57607,4151,,False,0.0,3
7,57607,13865,,False,0.0,3
3,70720,4881,2.0,True,0.2,3
0,70720,4880,,False,0.0,3
6,70720,6327,,False,0.0,3
9,403227,181,1.0,True,0.2,4
5,403227,1736,2.0,True,0.2,4
4,403227,6353,5.0,True,0.2,4
8,403227,5336,,False,0.0,4


In [27]:
df_recs['TP@5/actual'] = df_recs['TP@5'] / df_recs['actual']
df_recs

Unnamed: 0,user_id,item_id,rank,TP@5,TP@5/5,actual,TP@5/actual
2,57607,10440,3.0,True,0.2,3,0.333333
1,57607,4151,,False,0.0,3,0.0
7,57607,13865,,False,0.0,3,0.0
3,70720,4881,2.0,True,0.2,3,0.333333
0,70720,4880,,False,0.0,3,0.0
6,70720,6327,,False,0.0,3,0.0
9,403227,181,1.0,True,0.2,4,0.25
5,403227,1736,2.0,True,0.2,4,0.25
4,403227,6353,5.0,True,0.2,4,0.25
8,403227,5336,,False,0.0,4,0.0


In [28]:
(1/3 + 1/3 + 3/4) / 3

0.47222222222222215

In [29]:
r5 = df_recs.groupby(Columns.User)['TP@5/actual'].sum().mean()
print(f'Recall@5 = {r5}')

Recall@5 = 0.47222222222222215


In [30]:
r5 = df_recs['TP@5/actual'].sum() / len(sample_users)
print(f'Recall@5 = {r5}')

Recall@5 = 0.47222222222222215


## Ranking

### MAP@K

In [31]:
df_recs

Unnamed: 0,user_id,item_id,rank,TP@5,TP@5/5,actual,TP@5/actual
2,57607,10440,3.0,True,0.2,3,0.333333
1,57607,4151,,False,0.0,3,0.0
7,57607,13865,,False,0.0,3,0.0
3,70720,4881,2.0,True,0.2,3,0.333333
0,70720,4880,,False,0.0,3,0.0
6,70720,6327,,False,0.0,3,0.0
9,403227,181,1.0,True,0.2,4,0.25
5,403227,1736,2.0,True,0.2,4,0.25
4,403227,6353,5.0,True,0.2,4,0.25
8,403227,5336,,False,0.0,4,0.0


In [32]:
df_recs['cumTP@5'] = df_recs.groupby(Columns.User)['TP@5'].cumsum()
df_recs

Unnamed: 0,user_id,item_id,rank,TP@5,TP@5/5,actual,TP@5/actual,cumTP@5
2,57607,10440,3.0,True,0.2,3,0.333333,1
1,57607,4151,,False,0.0,3,0.0,1
7,57607,13865,,False,0.0,3,0.0,1
3,70720,4881,2.0,True,0.2,3,0.333333,1
0,70720,4880,,False,0.0,3,0.0,1
6,70720,6327,,False,0.0,3,0.0,1
9,403227,181,1.0,True,0.2,4,0.25,1
5,403227,1736,2.0,True,0.2,4,0.25,2
4,403227,6353,5.0,True,0.2,4,0.25,3
8,403227,5336,,False,0.0,4,0.0,3


In [33]:
df_recs['Prec@5'] = df_recs['cumTP@5'] / df_recs[Columns.Rank]
df_recs

Unnamed: 0,user_id,item_id,rank,TP@5,TP@5/5,actual,TP@5/actual,cumTP@5,Prec@5
2,57607,10440,3.0,True,0.2,3,0.333333,1,0.333333
1,57607,4151,,False,0.0,3,0.0,1,
7,57607,13865,,False,0.0,3,0.0,1,
3,70720,4881,2.0,True,0.2,3,0.333333,1,0.5
0,70720,4880,,False,0.0,3,0.0,1,
6,70720,6327,,False,0.0,3,0.0,1,
9,403227,181,1.0,True,0.2,4,0.25,1,1.0
5,403227,1736,2.0,True,0.2,4,0.25,2,1.0
4,403227,6353,5.0,True,0.2,4,0.25,3,0.6
8,403227,5336,,False,0.0,4,0.0,3,


In [34]:
df_recs['Prec@5/actual'] = df_recs['Prec@5'] / df_recs['actual']
df_recs

Unnamed: 0,user_id,item_id,rank,TP@5,TP@5/5,actual,TP@5/actual,cumTP@5,Prec@5,Prec@5/actual
2,57607,10440,3.0,True,0.2,3,0.333333,1,0.333333,0.111111
1,57607,4151,,False,0.0,3,0.0,1,,
7,57607,13865,,False,0.0,3,0.0,1,,
3,70720,4881,2.0,True,0.2,3,0.333333,1,0.5,0.166667
0,70720,4880,,False,0.0,3,0.0,1,,
6,70720,6327,,False,0.0,3,0.0,1,,
9,403227,181,1.0,True,0.2,4,0.25,1,1.0,0.25
5,403227,1736,2.0,True,0.2,4,0.25,2,1.0,0.25
4,403227,6353,5.0,True,0.2,4,0.25,3,0.6,0.15
8,403227,5336,,False,0.0,4,0.0,3,,


In [35]:
ap = df_recs.groupby(Columns.User)['Prec@5/actual'].sum()
print(ap.mean())
ap

0.3092592592592593


user_id
57607     0.111111
70720     0.166667
403227    0.650000
Name: Prec@5/actual, dtype: float64

## Naive vs Numba vs Pandas

In [36]:
df

Unnamed: 0,user_id,item_id
0,70720,4880
1,57607,4151
2,57607,10440
3,70720,4881
4,403227,6353
5,403227,1736
6,70720,6327
7,57607,13865
8,403227,5336
9,403227,181


In [37]:
target = df.values
target

array([[ 70720,   4880],
       [ 57607,   4151],
       [ 57607,  10440],
       [ 70720,   4881],
       [403227,   6353],
       [403227,   1736],
       [ 70720,   6327],
       [ 57607,  13865],
       [403227,   5336],
       [403227,    181]])

In [38]:
target[target[:, 0] == 513902][:, 1]

array([], dtype=int64)

In [39]:
recs

array([[ 4880,  6327, 10440,  1736,  4881],
       [  181,  1736,  4880, 13865,  6353],
       [ 4151,  4881,  6353, 10440,  1736]])

In [40]:
def precision_naive(target, users, recs, k):
    precision = []
    for i, user in enumerate(users):
        p = 0
        user_target = target[target[:, 0] == user][:, 1]
        for rec in recs[i]:
            if rec in user_target:
                p += 1
        precision.append(p / k)
    return sum(precision) / len(users)

In [41]:
precision_naive(target, sample_users, recs, 5)

0.3333333333333333

In [42]:
@nb.njit(cache=True, parallel=True)
def precision_numba(target, users, recs, k):
    precision = np.zeros(len(users))
    for i in nb.prange(len(users)):
        user = users[i]
        p = 0
        user_target = target[target[:, 0] == user][:, 1]
        for rec in recs[i]:
            if rec in user_target:
                p += 1
        precision[i] = p / k
    return precision.mean()

In [43]:
precision_numba(target, np.array(sample_users), recs, 5)

0.3333333333333333

In [44]:
precision_numba(target, np.array(sample_users), recs, 5)

0.3333333333333333

In [45]:
def precision_pandas(df, users, recs, k):
    df_recs = pd.DataFrame({
        Columns.User: np.repeat(users, k),
        Columns.Item: recs.ravel()
    })
    df_recs[Columns.Rank] = df_recs.groupby(Columns.User).cumcount() + 1
    df_recs = df.merge(df_recs, how='left', left_on=Columns.UserItem, right_on=Columns.UserItem)
    tp_k = f'TP@{k}'
    df_recs[tp_k] = df_recs[Columns.Rank] < (k + 1)
    p = df_recs[tp_k].sum() / k / len(users)
    return p

In [46]:
precision_pandas(df, sample_users, recs, 5)

0.3333333333333333

Посмотрим через `timeit`

In [47]:
%timeit precision_naive(target, sample_users, recs, 5)

98.7 µs ± 37.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [48]:
%timeit precision_numba(target, sample_users, recs, 5)

The slowest run took 46.65 times longer than the fastest. This could mean that an intermediate result is being cached.
457 µs ± 557 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [49]:
%timeit precision_pandas(df, sample_users, recs, 5)

5.88 ms ± 257 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [50]:
def generate_subsample(users_count, top_k):
    users = np.random.choice(interactions[Columns.User].unique(), users_count, replace=False)
    df = interactions[interactions[Columns.User].isin(users)].reset_index(drop=True)
    del df[Columns.Datetime], df[Columns.Weight], df['watched_pct']

    recs = np.random.choice(df[Columns.Item], size=(users_count, top_k))
    return df, users, recs

In [51]:
top_k = 10
df, users, recs = generate_subsample(10000, top_k)
target = df.values

In [52]:
%timeit precision_naive(target, users, recs, top_k)

3.31 s ± 66.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [53]:
precision_numba(target, users, recs, top_k)

0.03048000000000012

In [54]:
%timeit precision_numba(target, users, recs, top_k)

670 ms ± 32.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [55]:
%timeit precision_pandas(df, users, recs, top_k)

41.5 ms ± 3.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## RecTools

Рассмотрим, как использовать библиотеку от МТС для подсчета метрик.

Полный гайд тут - [RecTools/examples/3_metrics.ipynb](https://github.com/MobileTeleSystems/RecTools/blob/main/examples/3_metrics.ipynb)

Вначале вспомним, какие данные есть у нас

In [56]:
df.shape, users.shape, recs.shape

((56805, 2), (10000,), (10000, 10))

In [57]:
from rectools.metrics import Precision, Recall, MAP, calc_metrics

In [58]:
metrics = {
    "prec@1": Precision(k=1),
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "MAP@5": MAP(k=5),
    "MAP@10": MAP(k=10),
}

In [59]:
catalog = df[Columns.Item].unique()

In [60]:
df_recs = pd.DataFrame({
    Columns.User: np.repeat(users, top_k),
    Columns.Item: recs.ravel()
})
df_recs[Columns.Rank] = df_recs.groupby(Columns.User).cumcount() + 1

In [61]:
metric_values = calc_metrics(
    metrics,
    reco=df_recs,
    interactions=df,
)

In [62]:
metric_values

{'prec@1': 0.0289,
 'prec@10': 0.03048,
 'recall@10': 0.08462718883430417,
 'MAP@5': 0.01938148253052879,
 'MAP@10': 0.026219275497237186}

Как посчитать одну метрику

In [63]:
metrics['prec@10'].calc(df_recs, df)

0.03048

In [64]:
%timeit metrics['prec@10'].calc(df_recs, df)

27.3 ms ± 520 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [65]:
metrics['prec@10'].calc_per_user(df_recs, df)

user_id
24         0.0
263        0.0
477        0.0
509        0.1
522        0.0
          ... 
1097062    0.0
1097151    0.1
1097227    0.0
1097343    0.0
1097398    0.0
Length: 10000, dtype: float64

## Homework

### PFound
Исходные данные - Yandex Cup 2022 Analytics
- Ссылка - https://yandex.ru/cup/analytics/analysis/ , пример A. Рассчитать pFound
- Данные - https://yadi.sk/d/guqki4UI4hFlXQ
- Формула
$$pFound@K = \sum_{i=1}^{k} pLook[i]\ pRel[i]$$

$$pLook[1] = 1$$

$$pLook[i] = pLook[i-1]\ (1 - pRel[i-1])\ (1 - pBreak)$$

$$pBreak = 0.15$$

**Задача** - написать функцию, которая принимает на вход dataframe (после join), а на выходе дает средний pFound по всем query.
- Запрещается использовать циклы for для расчет метрики (как полностью, так и ее частей).
- Усложнение, если задача показалась легкой - попробуйте обойтись без groupby (не уверен, что это возможно, но вдруг вы справитесь)

### MRR
Исходные данные - результат `generate_subsample`

**Задача** - по аналогии с precision написать три версии функции подсчета Mean Reciprocal Rank (naive, numba, pandas) и протестировать на разных размерах выборки
- Протестируйте для всех комбинаций (users_count, top_k):
  - users_count - [100, 1000, 10000, 100000]
  - top_k - [10, 50, 100]
- Результатом тестирования должен быть график, где будут отражены следующие показатели:
  - Алгоритм - naive, numba, pandas
  - Скорость работы (время)
  - users_count
  - top_k

In [304]:
df_s, users_s, recs_s = generate_subsample(100, 10)

In [305]:
df

Unnamed: 0,user_id,item_id
0,81354,7626
1,368484,13865
2,704013,15297
3,33662,12173
4,365812,9728
...,...,...
488,630935,10755
489,696314,5070
490,328801,10440
491,541820,10821


In [310]:
def mrr_naive(target, users, recs):
    mrr = []
    for i, user in enumerate(users):
        user_target = target[target[:, 0] == user][:, 1]
        for rank, rec in enumerate(recs[i], start=1):
            if rec in user_target:
                mrr.append(1 / rank)
                break
    return sum(mrr) / len(users)

In [311]:
@nb.njit(cache=True, parallel=True)
def mrr_numba(target, users, recs):
    mrr = np.zeros(len(users))
    for i in nb.prange(len(users)):
        user = users[i]
        user_target = target[target[:, 0] == user][:, 1]
        rank = 0
        for rec in recs[i]:
            rank += 1
            if rec in user_target:
                mrr[i] = 1 / rank
                break
    return mrr.mean()

In [313]:
mrr_numba(df_s.values, users_s, recs_s)

0.09267460317460317

In [315]:
%timeit mrr_numba(df_s.values, users_s, recs_s)

86.6 µs ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [316]:
mrr_naive(df_s.values, users_s, recs_s)

0.09267460317460316

In [317]:
%timeit mrr_naive(df_s.values, users_s, recs_s)

4.6 ms ± 744 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [318]:
def mrr_pandas(df, users, recs):
    k = recs.shape[1]
    df_recs = pd.DataFrame({
        Columns.User: np.repeat(users, k),
        Columns.Item: recs.ravel()
    })
    df_recs[Columns.Rank] = df_recs.groupby(Columns.User).cumcount() + 1
    df_recs = df.merge(df_recs, how='left', on=[Columns.User, Columns.Item])
    df_recs = df_recs.groupby([Columns.User])[Columns.Rank].min()
    df_recs = 1 / df_recs
    return df_recs.sum() / len(users)

In [319]:
%timeit mrr_pandas(df_s, users_s, recs_s)

4.92 ms ± 1.18 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [320]:
mrr_pandas(df_s, users_s, recs_s)

0.09267460317460317

In [353]:
users_count = [100, 1000, 10000, 100000]
top_k = [10, 50, 100]
algos = {
    "naive": mrr_naive,
    "numba": mrr_numba,
    "pandas": mrr_pandas,
}
data_format = {
    "naive": lambda x: x.values,
    "numba": lambda x: x.values,
    "pandas": None,
}

In [354]:
from itertools import product
import seaborn as sns
import time
from tqdm import tqdm
from matplotlib import pyplot as plt

In [355]:
results = {}

In [356]:
for algo_name, algo in algos.items():
    results[algo_name] = {
        'users': [],
        'top_k': [],
        'time': [],
    }
    for cur_users_count, cur_top_k in tqdm(list(product(users_count, top_k))):
        df, users, recs = generate_subsample(cur_users_count, cur_top_k)
        df = df if data_format[algo_name] is None else data_format[algo_name](df)

        time_start = time.time_ns()
        algo_result = algo(df, users, recs)
        algo_time = time.time_ns() - time_start
        results[algo_name]['users'].append(cur_users_count)
        results[algo_name]['top_k'].append(cur_top_k)
        results[algo_name]['time'].append(algo_time)


100%|██████████| 12/12 [08:23<00:00, 41.96s/it] 
100%|██████████| 12/12 [02:12<00:00, 11.07s/it]
100%|██████████| 12/12 [00:14<00:00,  1.21s/it]


In [357]:
for algo_name, algo in algos.items():
    sns.lineplot(x=results[algo_name]['users'], y=results[algo_name]['time'], hue=results[algo_name]['top_k'])
    plt.title(f'Algorithm: {algo_name}')
    plt.xlabel('Users')
    plt.ylabel('Time')
    plt.legend([f'Top-{k}' for k in sorted(set(results[algo_name]['top_k']))])
    plt.savefig(f'figs/{algo_name}.png')
    plt.clf()

<Figure size 640x480 with 0 Axes>

In [67]:
qid_query = pd.read_csv("/home/kpavel/PycharmProjects/RecoService/notebooks/hw2/yacup/qid_query.tsv", sep="\t", names=["qid", "query"])
qid_url_rating = pd.read_csv("/home/kpavel/PycharmProjects/RecoService/notebooks/hw2/yacup/qid_url_rating.tsv", sep="\t", names=["qid", "url", "rating"])
hostid_url = pd.read_csv("/home/kpavel/PycharmProjects/RecoService/notebooks/hw2/yacup/hostid_url.tsv", sep="\t", names=["hostid", "url"])

qid_url_rating_hostid = pd.merge(qid_url_rating, hostid_url, on="url")

In [68]:
qid_url_rating_hostid = qid_url_rating_hostid.sort_values(['hostid', 'rating']).reset_index(drop=True)

In [69]:
p = 10

In [None]:
qid_url_rating_hostid.groupby

In [415]:
qid_url_rating_hostid.groupby("qid").count().index

Int64Index([  5308,  48815,  49587,  55082,  58989,  60304,  63179,  70357,
             70618,  79514,  91889,  99543, 402111, 405851, 407522, 409073,
            438518, 690263, 692059, 692308],
           dtype='int64', name='qid')

In [416]:
max_by_host = qid_url_rating_hostid[qid_url_rating_hostid.qid == 5308].groupby("hostid").head(p).reset_index(drop=True)

In [419]:
qid_url_rating_hostid[qid_url_rating_hostid.qid == 5308]

Unnamed: 0,qid,url,rating,hostid
6,5308,http://3pu.info/seo-tools/domains,0.0,10
26,5308,http://art-domain.info/,0.0,47
44,5308,http://caxapa.ru/92670.html,0.0,88
59,5308,http://distributed.org.ua/index.php?go=Pages&i...,0.0,119
79,5308,http://forum.oszone.net/nextoldesttothread-114...,0.14,160
80,5308,http://forum.oszone.net/post-572043.html,0.14,160
94,5308,http://gudzonhost.ru/indexns.php?n=5&id=1938,0.07,195
97,5308,http://home-bank.kz/,0.07,202
98,5308,http://home.live.com/,0.0,203
104,5308,http://hosting.agava.ru/faq/general/domain.shtml,0.14,208


In [422]:
max_by_host = max_by_host.sort_values('rating', ascending=False)[:p]

In [423]:
max_by_host

Unnamed: 0,qid,url,rating,hostid
4,5308,http://forum.oszone.net/nextoldesttothread-114...,0.14,160
5,5308,http://forum.oszone.net/post-572043.html,0.14,160
9,5308,http://hosting.agava.ru/faq/general/domain.shtml,0.14,208
13,5308,http://www.bijid.ru/,0.14,707
21,5308,http://www.securitylab.ru/forum/forum18/topic4...,0.07,1044
6,5308,http://gudzonhost.ru/indexns.php?n=5&id=1938,0.07,195
7,5308,http://home-bank.kz/,0.07,202
20,5308,http://www.nic.ru/,0.07,960
10,5308,http://masterhost.ru/support/doc/php/,0.07,323
17,5308,http://www.hoster.ru/,0.07,835


In [438]:
dftmp = max_by_host#.sort_values(['hostid', 'rating'], ascending=False)#.groupby('hostid').cumcount() + 1

In [458]:
pbreak = 0.15
dftmp['pLook'] = (dftmp.groupby('hostid').cumcount() + 1) * (1 - dftmp['rating']) * (1 - 0.15)

In [464]:
dftmp

Unnamed: 0,qid,url,rating,hostid,pLook,pFound
4,5308,http://forum.oszone.net/nextoldesttothread-114...,0.14,160,0.731,0.10234
5,5308,http://forum.oszone.net/post-572043.html,0.14,160,1.462,0.20468
9,5308,http://hosting.agava.ru/faq/general/domain.shtml,0.14,208,0.731,0.10234
13,5308,http://www.bijid.ru/,0.14,707,0.731,0.10234
21,5308,http://www.securitylab.ru/forum/forum18/topic4...,0.07,1044,0.7905,0.055335
6,5308,http://gudzonhost.ru/indexns.php?n=5&id=1938,0.07,195,0.7905,0.055335
7,5308,http://home-bank.kz/,0.07,202,0.7905,0.055335
20,5308,http://www.nic.ru/,0.07,960,0.7905,0.055335
10,5308,http://masterhost.ru/support/doc/php/,0.07,323,0.7905,0.055335
17,5308,http://www.hoster.ru/,0.07,835,0.7905,0.055335


In [460]:
dftmp['pFound'] = dftmp['pLook'] * dftmp['rating']

In [472]:
dftmp['pFound'][:3].sum()

  dftmp['pFound'][:3].sum()


0.40936000000000006

In [469]:
dftmp['pFound'].sum()

0.8437100000000001

In [462]:
def plook(ind, rels):
    if ind == 0:
        return 1
    return plook(ind-1, rels)*(1-rels[ind-1])*(1-pbreak)

def pfound(group):
    max_by_host = group.groupby("hostid")["rating"].max() # максимальный рейтинг хоста
    top10 = max_by_host.sort_values(ascending=False)[:p] # берем топ10 урлов с наивысшим рейтингом
    # print(top10)
    # pfound = ''
    pfound = 0
    for ind, val in enumerate(top10):
        pfound += val*plook(ind, top10.values)
    return pfound

In [430]:
qid_url_rating_hostid

Unnamed: 0,qid,url,rating,hostid
0,70357,http://09spravki.ru/requisites.php,0.00,1
1,49587,http://16x.zp.ua/find.php?id=13,0.00,5
2,49587,http://16x.zp.ua/sorties.php?sid=812&action=a&...,0.00,5
3,402111,http://24-job.com/board/job_australia/232-1-2-...,0.07,7
4,402111,http://24-job.com/board/job_australia/232-1-2-...,0.07,7
...,...,...,...,...
798,690263,http://z-oleg.com/secur/avz_doc/,0.61,1180
799,99543,http://zapil77.by.ru/kubok-kvn-g-dubna.html,0.00,1182
800,63179,http://zhitejnik.ru/aktery-rezhissery-tele/221...,0.00,1184
801,48815,http://zoolife.com.ua/pageid838.html,0.14,1188


In [431]:
qid_pfound = qid_url_rating_hostid.groupby('qid').apply(pfound) # группируем по qid и вычисляем pfound
#qid_max = qid_pfound.idxmax() # берем qid с максимальным pfound
qid_pfound#.sum()
#qid_query[qid_query["qid"] == qid_max]

  top10 = max_by_host.sort_values(ascending=False)[:p] # берем топ10 урлов с наивысшим рейтингом
  top10 = max_by_host.sort_values(ascending=False)[:p] # берем топ10 урлов с наивысшим рейтингом
  top10 = max_by_host.sort_values(ascending=False)[:p] # берем топ10 урлов с наивысшим рейтингом
  top10 = max_by_host.sort_values(ascending=False)[:p] # берем топ10 урлов с наивысшим рейтингом
  top10 = max_by_host.sort_values(ascending=False)[:p] # берем топ10 урлов с наивысшим рейтингом
  top10 = max_by_host.sort_values(ascending=False)[:p] # берем топ10 урлов с наивысшим рейтингом
  top10 = max_by_host.sort_values(ascending=False)[:p] # берем топ10 урлов с наивысшим рейтингом
  top10 = max_by_host.sort_values(ascending=False)[:p] # берем топ10 урлов с наивысшим рейтингом
  top10 = max_by_host.sort_values(ascending=False)[:p] # берем топ10 урлов с наивысшим рейтингом
  top10 = max_by_host.sort_values(ascending=False)[:p] # берем топ10 урлов с наивысшим рейтингом
  top10 = max_by_host.sort_val

qid
5308      0.415820
48815     0.655448
49587     0.493599
55082     0.497771
58989     0.497771
60304     0.735836
63179     0.460028
70357     0.655448
70618     0.263596
79514     0.764755
91889     0.344494
99543     0.772245
402111    0.324649
405851    0.718072
407522    0.460028
409073    0.808643
438518    0.497771
690263    0.770353
692059    0.655448
692308    0.852628
dtype: float64

In [362]:
df.groupby('hostid').cumsum()

  df.groupby('hostid').cumsum()


Unnamed: 0,qid,rating,rating_max
0,405851,0.0,0.41
1,811702,0.41,0.82
2,1217553,0.55,1.23
3,407522,0.14,0.41
4,456337,0.28,0.82
5,505924,0.28,1.23
6,1196187,0.42,1.64
7,1886450,0.56,2.05
8,2578758,0.97,2.46
9,2678301,1.11,2.87


In [132]:
df['pLook'] = dfasd

Unnamed: 0,url,rating,hostid,rating__,rating_max
0,http://24-job.com/board/job_australia/232-1-2-...,0.07,7,,
1,http://24-job.com/board/job_australia/232-1-2-...,0.07,7,,
2,http://802351.info/5964-v-avstralii.html,0.00,13,,
3,http://auscommunity.com/blog/jobs/,0.00,53,,
4,http://auscommunity.com/tag/%D1%84%D0%BE%D1%82...,0.00,53,,
...,...,...,...,...,...
798,http://www.youtube.com/watch?v=QDcomRWogFE,0.14,1155,0.41,0.41
799,http://www.youtube.com/watch?v=Y3n47xZb0b4,0.14,1155,0.41,0.41
800,http://www.yuga.ru/articles/culture?id=3378,0.00,1156,,
801,http://www.zexe.de/modules.php?name=Pages&pa=s...,0.14,1160,,
