In [78]:
import pandas as pd
import numpy as np
from rectools import Columns

# PFound

In [6]:
#  id запроса и текст запроса, разделённые табуляцией
qid_query = pd.read_csv("/Users/dmitry/Downloads/yandex_cup_analytics_A/hidden_task/qid_query.tsv", 
                        sep="\t", 
                        names=["qid", "query"])
#  id запроса, URL документа, релевантность документа запросу;
qid_url_rating = pd.read_csv("/Users/dmitry/Downloads/yandex_cup_analytics_A/hidden_task/qid_url_rating.tsv", 
                             sep="\t", 
                             names=["qid", "url", "rating"])
# hostid_url.tsv — id хоста и URL документа.
hostid_url = pd.read_csv("/Users/dmitry/Downloads/yandex_cup_analytics_A/hidden_task/hostid_url.tsv", 
                         sep="\t", 
                         names=["hostid", "url"])

In [4]:
qid_query[:3]

Unnamed: 0,qid,query
0,402111,работа фотографом в австралии
1,405851,производительность видеокарт
2,407522,ёлочные игрушки из пластиковых бутылок


In [7]:
qid_url_rating[:3]

Unnamed: 0,qid,url,rating
0,402111,http://24-job.com/board/job_australia/232-1-2-...,0.07
1,402111,http://24-job.com/board/job_australia/232-1-2-...,0.07
2,402111,http://802351.info/5964-v-avstralii.html,0.0


In [8]:
hostid_url[:3]

Unnamed: 0,hostid,url
0,1,http://09spravki.ru/requisites.php
1,10,http://3pu.info/seo-tools/domains
2,1006,http://www.priroda.su/item/820


In [9]:
qid_url_rating_hostid = pd.merge(qid_url_rating, hostid_url, on="url")

In [10]:
qid_url_rating_hostid[:5]

Unnamed: 0,qid,url,rating,hostid
0,402111,http://24-job.com/board/job_australia/232-1-2-...,0.07,7
1,402111,http://24-job.com/board/job_australia/232-1-2-...,0.07,7
2,402111,http://802351.info/5964-v-avstralii.html,0.0,13
3,402111,http://auscommunity.com/blog/jobs/,0.0,53
4,402111,http://auscommunity.com/tag/%D1%84%D0%BE%D1%82...,0.0,53


In [75]:
def p_found(data: pd.DataFrame, 
            group_col: list, 
            rating_col: str, 
            K: int, 
            p_break: float = 0.15) ->float:
    
    # create data with max rating of each group per item
    max_by_group = data.groupby(
        group_col)[rating_col].max().reset_index().sort_values(
            [group_col[0], rating_col], ascending=False).groupby(group_col[0]).head(K)
    
    # create numbers(index) for each items
    max_by_group['numbers'] = max_by_group.groupby(group_col[0]).cumcount()
    
    # calculate (1 - Rel) and (1 - pBreak)
    max_by_group['1 - Rel'] = (1 - max_by_group[rating_col]).shift(1)
    max_by_group['1 - pBreak'] = 1 - p_break
    
    # first values for n == 0 will be 1
    max_by_group.loc[max_by_group['numbers'] == 0, ['1 - Rel', '1 - pBreak']] = 1
    
    # lets multiply (1 - Rel) and (1 - pBreak)
    max_by_group['multiply'] = max_by_group['1 - Rel'] * max_by_group['1 - pBreak']
    
    # Plook is a recursion function. For it we will use a pd.cumprod method
    max_by_group['Plook'] = max_by_group.groupby(group_col[0])['multiply'].cumprod()
    max_by_group['PFound'] = max_by_group['Plook'] * max_by_group[rating_col]
    
    return max_by_group.groupby(group_col[0])['PFound'].sum().mean()

In [77]:
p_found(qid_url_rating_hostid, ['qid', 'hostid'], 'rating', 5)

0.5370314635332146

# MRR

In [140]:
interactions = pd.read_csv('/Users/dmitry/Downloads/kion_train/interactions.csv')

interactions.rename(
    columns={
        'track_id': Columns.Item,
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    }, 
    inplace=True) 

interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime])

In [141]:
interactions[: 10]

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
5,1032142,6686,2021-05-13,11286,100.0
6,1016458,354,2021-08-14,1672,25.0
7,884009,693,2021-08-04,703,14.0
8,648682,1449,2021-06-13,26246,75.0
9,203219,13582,2021-08-22,6975,100.0


In [151]:
def generate_subsample(data: pd.DataFrame, users_count: int, top_k: int):
    users = np.random.choice(data[Columns.User].unique(), users_count, replace=False)
    df = data[data[Columns.User].isin(users)].reset_index(drop=True)
    del df[Columns.Datetime], df[Columns.Weight], df['watched_pct']
    
    recs = np.random.choice(df[Columns.Item], size=(users_count, top_k))
    return df, users, recs

In [201]:
sample, users, recs = generate_subsample(interactions, 100000, 5)

In [202]:
def mrr_pandas(df: pd.DataFrame, users: np.array, recs: np.array, K: int) -> float:
    # create recs dataset
    df_recs = pd.DataFrame({'user_id': np.repeat(users, 5), 'item_id': recs.ravel()})
    # calculate rank
    df_recs['rank'] = df_recs.groupby('user_id').cumcount() + 1
    # join data
    df_recs = sample.merge(df_recs, how='left', left_on=['user_id', 'item_id'], right_on=['user_id', 'item_id'])
    df_recs = df_recs.sort_values(by=['user_id', 'rank'])
    # take min rank and calculate reciprocal rank
    df_recs[df_recs['rank'].notna()].groupby('user_id')[['rank']].min()
    df_recs['1/rank'] = 1/df_recs['rank']
    df_recs = df_recs.fillna(0)
    mrr = df_recs['1/rank'].sum() / df_recs['user_id'].nunique()
    return mrr

In [203]:
mrr_pandas(sample, users, recs, 5)

0.07146466666666665