In [1]:
import pandas as pd
from hnmchallenge.data_reader import DataReader
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


from hnmchallenge.models_prediction.recs_interface import RecsInterface
from hnmchallenge.datasets.all_items_last_month_last_week import AILMLWDataset
from hnmchallenge.evaluation.python_evaluation import map_at_k, recall_at_k
from hnmchallenge.datasets.all_items_last_month_last_2nd_week import AILML2WDataset
from hnmchallenge.datasets.all_items_last_month_last_3rd_week import AILML3WDataset
from hnmchallenge.constant import *

In [2]:
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [3]:
dataset = AILMLWDataset()
dr = DataReader()

In [4]:
fd=dataset.get_full_data()

In [7]:

time_weight=True,
remove_seen=False,
DATASET_NAME = f"cutf_12_ItemKNN_tw_True_rs_False"
kind="full"

In [11]:
base_df = RecsInterface.load_recommendations(dataset, DATASET_NAME, kind)

[1;36mloading recs model:
 cutf_12_ItemKNN_tw_True_rs_False[0m


In [12]:
base_df

Unnamed: 0,customer_id,ItemKNN_tw_True_rs_False_recs,ItemKNN_tw_True_rs_False_score,ItemKNN_tw_True_rs_False_rank
0,0,100206,0.019349,1.0
1,0,82209,0.001501,2.0
2,0,2880,0.001332,3.0
3,0,96355,0.001310,4.0
4,0,89354,0.001293,5.0
...,...,...,...,...
16638337,1371979,102098,,
16638338,1371979,38400,,
16638339,1371979,102327,,
16638340,1371979,76736,,


In [17]:
base_df.rename(columns = {"ItemKNN_tw_True_rs_False_recs":'article_id'}, inplace = True)

In [18]:
user_map_dict, item_map_dict = dataset.get_new_raw_mapping_dict()

grp_recs_df = base_df.groupby(DEFAULT_USER_COL)[DEFAULT_ITEM_COL].apply(list)
grp_recs_df = grp_recs_df.to_frame().reset_index()
# map back to original ids
grp_recs_df[DEFAULT_USER_COL] = grp_recs_df[DEFAULT_USER_COL].apply(
    lambda x: user_map_dict.get(x)
)
grp_recs_df[DEFAULT_ITEM_COL] = grp_recs_df[DEFAULT_ITEM_COL].apply(
    lambda x: " ".join(list(map(item_map_dict.get, x)))
)
grp_recs_df = grp_recs_df.rename(
    columns={DEFAULT_ITEM_COL: DEFAULT_PREDICTION_COL}
)

final_recs = grp_recs_df

In [19]:
final_recs

Unnamed: 0,customer_id,prediction
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0750424014 0351484002 0794321011 08...
1,00007d2de826758b65a93dd24ce629ed66842531df6699...,0160442007 0160442010 0666444002 0372860001 03...
2,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,0783504004 0559630003 0600886001 0351484002 05...
3,0008968c0d451dbc5a9968da03196fe20051965edde741...,0893059004 0868134001 0808462002 0778064001 08...
4,000aa7f0dc06cd7174389e76c9e132a67860c5f65f9706...,0706016001 0706016002 0706016003 0796210001 07...
...,...,...
1371975,ffe5801cb2a5b51d4d068322d7f8082e995f427a6f22a6...,0924243001 0924243002 0923758001 0918522001 09...
1371976,ffeb3ca867aba57a312fe9d28d67dd46ef2240fe92a94c...,0924243001 0924243002 0923758001 0918522001 09...
1371977,fff456fa60aac9174456c2f36ede5e0f25429a16c88a34...,0924243001 0924243002 0923758001 0918522001 09...
1371978,fffa8d3cea26d4f5186472b923629b35fa28051f258030...,0924243001 0924243002 0923758001 0918522001 09...


In [20]:
final_recs.to_csv('submission_item_1month.csv', index=False)

In [53]:
filter_indices = []
cutoff = base_df.groupby(DEFAULT_USER_COL).size().values
i = 0
for cut in cutoff:
    filter_indices.extend(range(i, i + 12))
    i = i + cut
customer_article_df = base_df.loc[filter_indices]

In [54]:
RECS_NAME= "Popularity_cutoff_200" 
cutoff=12   # retrieve the holdout
holdout = dataset.get_holdout()
# retrieve items per user in holdout
item_per_user = holdout.groupby(DEFAULT_USER_COL)[DEFAULT_ITEM_COL].apply(list)
item_per_user_df = item_per_user.to_frame()
# items groundtruth
holdout_groundtruth = (
    item_per_user_df.reset_index().explode(DEFAULT_ITEM_COL).drop_duplicates()
)

# merge recs and item groundtruth
merged = pd.merge(
    customer_article_df,
    holdout_groundtruth,
    left_on=[DEFAULT_USER_COL, f"{RECS_NAME}_recs"],
    right_on=[DEFAULT_USER_COL, "article_id"],
    how="left",
)

# we have to remove the user for which we do not do at least one hit,
# since we would not have the relavance for the items
merged.loc[merged["article_id"].notnull(), "article_id"] = 1
merged["hit_sum"] = merged.groupby(DEFAULT_USER_COL)["article_id"].transform(
    "sum"
)

merged_filtered = merged[merged["hit_sum"] > 0]

pred = (
    merged[
        [DEFAULT_USER_COL, f"{RECS_NAME}_recs", f"{RECS_NAME}_rank"]
    ]
    .copy()
    .rename(
        {
            f"{RECS_NAME}_recs": DEFAULT_ITEM_COL,
            f"{RECS_NAME}_rank": "rank",
        },
        axis=1,
    )
)
pred_filtered = (
    merged_filtered[
        [DEFAULT_USER_COL, f"{RECS_NAME}_recs", f"{RECS_NAME}_rank"]
    ]
    .copy()
    .rename(
        {
            f"{RECS_NAME}_recs": DEFAULT_ITEM_COL,
            f"{RECS_NAME}_rank": "rank",
        },
        axis=1,
    )
)
ground_truth = holdout_groundtruth[[DEFAULT_USER_COL, DEFAULT_ITEM_COL]].copy()
logger.info(
    f"Remaining Users (at least one hit): {merged_filtered[DEFAULT_USER_COL].nunique()}"
)
logger.info("\nMetrics on ALL users")
logger.info(f"MAP@{cutoff}: {map_at_k(ground_truth, pred)}")
logger.info(f"RECALL@{cutoff}: {recall_at_k(ground_truth, pred)}")
logger.info("\nMetrics on ONE-HIT users")
logger.info(f"MAP@{cutoff}: {map_at_k(ground_truth, pred_filtered)}")
logger.info(
    f"RECALL@{cutoff}: {recall_at_k(ground_truth, pred_filtered)}",
)


INFO:__main__:Remaining Users (at least one hit): 2322
INFO:__main__:
Metrics on ALL users


68984


INFO:__main__:MAP@12: 0.0005499728872257096


68984


INFO:__main__:RECALL@12: 0.012314566651308067
INFO:__main__:
Metrics on ONE-HIT users
INFO:__main__:MAP@12: 0.016339073924366215
INFO:__main__:RECALL@12: 0.3658518802212901


2322
2322
