In [43]:
import pandas as pd
from hnmchallenge.data_reader import DataReader
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


from hnmchallenge.models_prediction.recs_interface import RecsInterface
from hnmchallenge.datasets.all_items_last_month_last_week import AILMLWDataset
from hnmchallenge.evaluation.python_evaluation import map_at_k, recall_at_k
from hnmchallenge.datasets.all_items_last_month_last_2nd_week import AILML2WDataset
from hnmchallenge.datasets.all_items_last_month_last_3rd_week import AILML3WDataset
from hnmchallenge.constant import *

In [44]:
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [45]:
dataset = AILMLWDataset()
dr = DataReader()

In [46]:
fd=dataset.get_full_data()

In [47]:
DATASET_NAME = f"cutf_200_Popularity_cutoff_200"
kind="train"

In [48]:
base_df = RecsInterface.load_recommendations(dataset, DATASET_NAME, kind)

[1;36mloading recs model:
 cutf_200_Popularity_cutoff_200[0m


In [49]:
base_df

Unnamed: 0,customer_id,Popularity_cutoff_200_recs,Popularity_cutoff_200_score,Popularity_cutoff_200_rank,relevance
0,0,67,0.583459,18,
1,0,111,0.379699,67,
2,0,394,0.326316,111,
3,0,486,0.404511,58,
4,0,865,0.334586,104,
...,...,...,...,...,...
274395995,1371979,102563,0.270677,168,
274395996,1371979,102590,0.285714,148,
274395997,1371979,102629,0.269925,169,
274395998,1371979,102825,0.343609,96,


In [53]:
filter_indices = []
cutoff = base_df.groupby(DEFAULT_USER_COL).size().values
i = 0
for cut in cutoff:
    filter_indices.extend(range(i, i + 12))
    i = i + cut
customer_article_df = base_df.loc[filter_indices]

In [54]:
RECS_NAME= "Popularity_cutoff_200" 
cutoff=12   # retrieve the holdout
holdout = dataset.get_holdout()
# retrieve items per user in holdout
item_per_user = holdout.groupby(DEFAULT_USER_COL)[DEFAULT_ITEM_COL].apply(list)
item_per_user_df = item_per_user.to_frame()
# items groundtruth
holdout_groundtruth = (
    item_per_user_df.reset_index().explode(DEFAULT_ITEM_COL).drop_duplicates()
)

# merge recs and item groundtruth
merged = pd.merge(
    customer_article_df,
    holdout_groundtruth,
    left_on=[DEFAULT_USER_COL, f"{RECS_NAME}_recs"],
    right_on=[DEFAULT_USER_COL, "article_id"],
    how="left",
)

# we have to remove the user for which we do not do at least one hit,
# since we would not have the relavance for the items
merged.loc[merged["article_id"].notnull(), "article_id"] = 1
merged["hit_sum"] = merged.groupby(DEFAULT_USER_COL)["article_id"].transform(
    "sum"
)

merged_filtered = merged[merged["hit_sum"] > 0]

pred = (
    merged[
        [DEFAULT_USER_COL, f"{RECS_NAME}_recs", f"{RECS_NAME}_rank"]
    ]
    .copy()
    .rename(
        {
            f"{RECS_NAME}_recs": DEFAULT_ITEM_COL,
            f"{RECS_NAME}_rank": "rank",
        },
        axis=1,
    )
)
pred_filtered = (
    merged_filtered[
        [DEFAULT_USER_COL, f"{RECS_NAME}_recs", f"{RECS_NAME}_rank"]
    ]
    .copy()
    .rename(
        {
            f"{RECS_NAME}_recs": DEFAULT_ITEM_COL,
            f"{RECS_NAME}_rank": "rank",
        },
        axis=1,
    )
)
ground_truth = holdout_groundtruth[[DEFAULT_USER_COL, DEFAULT_ITEM_COL]].copy()
logger.info(
    f"Remaining Users (at least one hit): {merged_filtered[DEFAULT_USER_COL].nunique()}"
)
logger.info("\nMetrics on ALL users")
logger.info(f"MAP@{cutoff}: {map_at_k(ground_truth, pred)}")
logger.info(f"RECALL@{cutoff}: {recall_at_k(ground_truth, pred)}")
logger.info("\nMetrics on ONE-HIT users")
logger.info(f"MAP@{cutoff}: {map_at_k(ground_truth, pred_filtered)}")
logger.info(
    f"RECALL@{cutoff}: {recall_at_k(ground_truth, pred_filtered)}",
)


INFO:__main__:Remaining Users (at least one hit): 2322
INFO:__main__:
Metrics on ALL users


68984


INFO:__main__:MAP@12: 0.0005499728872257096


68984


INFO:__main__:RECALL@12: 0.012314566651308067
INFO:__main__:
Metrics on ONE-HIT users
INFO:__main__:MAP@12: 0.016339073924366215
INFO:__main__:RECALL@12: 0.3658518802212901


2322
2322
