In [1]:
import pandas as pd
from hnmchallenge.data_reader import DataReader
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np



from hnmchallenge.constant import *
import datetime

In [2]:
# Project dataset wrapper; used below to fetch the holdout interactions.
# NOTE(review): StratifiedDataset is not imported by name in the import cell --
# presumably it arrives via the star import or a since-removed import; confirm
# this cell survives Restart Kernel -> Run All.
dataset = StratifiedDataset()
# Data reader; used below to locate the preprocessed-data directory.
dr = DataReader()

In [3]:
# Prefix of the recommendations file name (presumably the data split -- confirm).
KIND = "train"
# Cutoff value embedded in the recommendations file name.
# NOTE(review): presumably the number of recommendations kept per user -- confirm.
CUTOFF = 40
# File name of the cosine recommendations that are loaded and evaluated below.
RECS_NAME = f"{KIND}_cosine_recs_{CUTOFF}_tw_True.feather"

In [4]:
# Load the model's recommendations from the preprocessed-data directory.
recs_path = dr.get_preprocessed_data_path() / RECS_NAME
recs = pd.read_feather(recs_path)

In [5]:
# retrieve the holdout
holdout = dataset.get_holdout()

# Ground truth: the unique (user, item) pairs of the holdout. Deduplicating the
# two columns directly keeps the item column's original dtype and avoids the
# costly per-user list aggregation followed by an explode.
items_groundtruth = holdout[[DEFAULT_USER_COL, DEFAULT_ITEM_COL]].drop_duplicates()

# Left-join the recommendations with the ground truth: the item column is
# filled where the recommended item is in the user's holdout and NaN otherwise.
merged = pd.merge(
    recs,
    items_groundtruth,
    left_on=[DEFAULT_USER_COL, "recs"],
    right_on=[DEFAULT_USER_COL, DEFAULT_ITEM_COL],
    how="left",
)
# The ground truth is unique per (user, item), so the join must not add rows.
assert len(merged) == len(recs), "merge duplicated recommendation rows"

In [6]:
# Preview the recs/ground-truth join; article_id is NaN for recommendations
# that are not in the user's holdout.
merged

Unnamed: 0,customer_id,recs,itemknn_score,itemknn_rank,article_id
0,0,1482,0.009862,1,
1,0,1638,0.009794,2,
2,0,1797,0.009470,3,
3,0,4861,0.006824,4,
4,0,9027,0.006576,5,
...,...,...,...,...,...
38509195,1136205,12589,0.040956,36,
38509196,1136205,1490,0.040618,37,
38509197,1136205,11813,0.040546,38,
38509198,1136205,17499,0.040533,39,


In [7]:
# Turn the joined article_id into a hit flag: 1 where the recommendation
# matched a ground-truth item, left as NaN otherwise.
is_hit = merged["article_id"].notna()
merged.loc[is_hit, "article_id"] = 1

In [8]:
# Total number of hits per user, broadcast onto each of that user's rows.
hit_flags_by_user = merged.groupby(DEFAULT_USER_COL)["article_id"]
merged["hit_sum"] = hit_flags_by_user.transform("sum")
# Keep only the users with at least one hit.
# NOTE(review): the predictions built from this frame therefore exclude users
# with zero hits -- confirm that the reported metrics are meant to be
# conditional on having a hit.
merged = merged.loc[merged["hit_sum"] > 0]

In [9]:
# Prediction frame for the evaluation functions: user, recommended item, rank.
pred = merged[[DEFAULT_USER_COL, "recs", "itemknn_rank"]].rename(
    columns={"recs": DEFAULT_ITEM_COL, "itemknn_rank": "rank"}
)

In [14]:
from hnmchallenge.evaluation.python_evaluation import map_at_k, recall_at_k

In [11]:
# Preview the prediction frame that is passed to the evaluation functions.
pred

Unnamed: 0,customer_id,article_id,rank
40,1,3161,1
41,1,7534,2
42,1,2997,3
43,1,3503,4
44,1,1482,5
...,...,...,...
38509155,1136204,20539,36
38509156,1136204,17633,37
38509157,1136204,16502,38
38509158,1136204,16477,39


In [12]:
# Ground truth for the metrics: the (user, item) columns of the holdout.
gt_columns = [DEFAULT_USER_COL, DEFAULT_ITEM_COL]
gt = holdout.loc[:, gt_columns].copy()

In [17]:
# MAP of the predictions against the holdout ground truth.
# NOTE(review): k is left at the function's default here; confirm it is the
# intended cutoff. pred only contains users with at least one hit.
map_at_k(gt, pred)

0.27536951840566765

In [18]:
# Recall of the predictions against the holdout ground truth.
# NOTE(review): k is left at the function's default here; confirm it is the
# intended cutoff. pred only contains users with at least one hit.
recall_at_k(gt, pred)

0.639320047724101

In [None]:
# Sum the hit flags per user and expose the user id as a regular column.
hit_totals = merged.groupby(DEFAULT_USER_COL)["article_id"].sum()
hit_per_user = hit_totals.reset_index()

In [None]:
# Preview the per-user hit totals (the article_id column holds the summed hit flags).
hit_per_user

In [None]:
# Users whose summed hit flags are strictly positive.
has_hit = hit_per_user["article_id"] > 0
at_least_one_hit = hit_per_user[has_hit]

In [None]:
# Preview the users that have at least one hit.
at_least_one_hit

In [None]:
# Preview the deduplicated (user, item) ground-truth pairs from the holdout.
items_groundtruth

In [None]:
# Count the distinct ground-truth items of each user (the count column is named 0).
items_groundtruth.groupby(DEFAULT_USER_COL).size().reset_index()