In [1]:
import pandas as pd
from hnmchallenge.data_reader import DataReader
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np



from hnmchallenge.models.top_pop import TopPop
from hnmchallenge.evaluation.python_evaluation import map_at_k, recall_at_k
from hnmchallenge.constant import *
from hnmchallenge.models.sgmc.sgmc import SGMC
from hnmchallenge.models.ease.ease import EASE
from hnmchallenge.models.itemknn.itemknn import ItemKNN 

In [2]:
# Shared handles for the cells below: `dataset` is handed to FeatureManager and queried
# for the holdout split; `dr` resolves the preprocessed-data directory.
# NOTE(review): StratifiedDataset is not imported by name in the visible import cell —
# presumably it arrives via `from hnmchallenge.constant import *`; confirm, or import it explicitly.
dataset = StratifiedDataset()
dr = DataReader()

In [3]:
from hnmchallenge.feature_manager import FeatureManager

In [4]:
# Feature manager bound to `dataset`; "train" is passed as the second argument
# (presumably selects the training split — the log printed by the next cell says
# "for training"; confirm in FeatureManager).
fm = FeatureManager(dataset, "train")

In [5]:
# Build the features frame for the candidates in the named feather file; the returned
# frame is displayed as the cell output.
# NOTE(review): per the progress log this loads and joins item and user features —
# confirm the exact join logic in FeatureManager.create_features_df.
fm.create_features_df("cosine_recs_100_tw_True.feather")

Creating features df for training...
Loading item features...
join item features...
Loading user features...
Final number of features loaded: 101


Unnamed: 0,customer_id,article_id,itemknn_score,itemknn_rank,relevance,colour_group_code,department_no,Accessories_x,Blouses,Dressed,...,age,0_x,ACTIVE,LEFT CLUB,PRE-CREATE,0_y,Monthly,NONE,Regularly,FN
0,1,3161,0.010078,1,1,9,1676,0,0,0,...,26.0,0,1,0,0,0,0,1,0,0.0
1,1,7534,0.009785,2,0,51,1338,0,0,0,...,26.0,0,1,0,0,0,0,1,0,0.0
2,1,2997,0.007242,3,0,10,1676,0,0,0,...,26.0,0,1,0,0,0,0,1,0,0.0
3,1,3503,0.007069,4,0,9,3948,1,0,0,...,26.0,0,1,0,0,0,0,1,0,0.0
4,1,1482,0.006179,5,0,71,1747,0,0,0,...,26.0,0,1,0,0,0,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48331695,1136204,10716,0.011835,96,0,51,1422,0,0,0,...,22.0,0,1,0,0,0,0,1,0,0.0
48331696,1136204,12879,0.011771,97,0,11,1322,0,0,0,...,22.0,0,1,0,0,0,0,1,0,0.0
48331697,1136204,15711,0.011720,98,0,33,5999,0,0,0,...,22.0,0,1,0,0,0,0,1,0,0.0
48331698,1136204,16688,0.011698,99,0,10,1647,0,0,0,...,22.0,0,1,0,0,0,0,1,0,0.0


In [None]:
# Load the precomputed recommendations from the preprocessed-data directory.
recs_path = dr.get_preprocessed_data_path() / "cosine_recs_100_tw_True.feather"
recs = pd.read_feather(recs_path)

In [4]:
# Inspect the loaded recommendations (one row per customer / recommended item / rank).
recs

Unnamed: 0,customer_id,recs,itemknn_score,itemknn_rank
0,0,1482,0.009862,1
1,0,1638,0.009794,2
2,0,1797,0.009470,3
3,0,4861,0.006824,4
4,0,9027,0.006576,5
...,...,...,...,...
96272995,1136205,4459,0.032402,96
96272996,1136205,5696,0.032185,97
96272997,1136205,146,0.032159,98
96272998,1136205,3115,0.032108,99


In [5]:
# Fetch the holdout interactions from the dataset; the cells below use them as the
# ground truth when labelling the recommendations.
holdout = dataset.get_holdout()

In [6]:
# For every user, gather the holdout items into a single Python list.
holdout_items_by_user = holdout.groupby(DEFAULT_USER_COL)[DEFAULT_ITEM_COL]
item_per_user = holdout_items_by_user.apply(list)

In [7]:
# Wrap the per-user Series in a one-column DataFrame so that its index can be reset
# and the lists exploded in a later cell.
item_per_user_df = item_per_user.to_frame()

In [8]:
# Inspect the per-user lists of holdout items.
item_per_user_df

Unnamed: 0_level_0,article_id
customer_id,Unnamed: 1_level_1
0,"[1652, 7053, 11572]"
1,"[3161, 8254, 16695, 13392, 2427]"
2,"[8443, 3023, 7068, 8089, 3215, 7193]"
3,"[632, 3]"
4,[4]
...,...
1136201,[17418]
1136202,[20154]
1136203,[4770]
1136204,[17478]


In [9]:
# Flatten the per-user lists back to one (user, item) row each, then drop repeated pairs.
user_item_lists = item_per_user_df.reset_index()
items_true = user_item_lists.explode(DEFAULT_ITEM_COL).drop_duplicates()

In [10]:
# Left join keeps every recommendation row; "article_id" ends up non-null only where
# the recommended item also appears among that user's holdout items.
merged = recs.merge(
    items_true,
    how="left",
    left_on=[DEFAULT_USER_COL, "recs"],
    right_on=[DEFAULT_USER_COL, "article_id"],
)

In [11]:
# Turn every matched row into a positive label (1); unmatched rows stay NaN for now.
is_hit = merged["article_id"].notna()
merged.loc[is_hit, "article_id"] = 1

In [12]:
# Broadcast each user's total number of hits onto all of that user's rows.
hits_by_user = merged.groupby(DEFAULT_USER_COL)["article_id"]
merged["hit_sum"] = hits_by_user.transform("sum")

In [13]:
# Keep only users whose recommendations contain at least one holdout item.
has_hit = merged["hit_sum"] > 0
merged = merged[has_hit]

In [14]:
# Preview the rows kept after the hit_sum > 0 filter.
merged

Unnamed: 0,customer_id,recs,itemknn_score,itemknn_rank,article_id,hit_sum
100,1,3161,0.010078,1,1,1
101,1,7534,0.009785,2,,1
102,1,2997,0.007242,3,,1
103,1,3503,0.007069,4,,1
104,1,1482,0.006179,5,,1
...,...,...,...,...,...,...
96272895,1136204,10716,0.011835,96,,1
96272896,1136204,12879,0.011771,97,,1
96272897,1136204,15711,0.011720,98,,1
96272898,1136204,16688,0.011698,99,,1


In [15]:
# hit_sum was only a helper for the per-user filter; remove it from the frame.
merged = merged.drop(columns="hit_sum")

In [16]:
# Check that the helper column is gone.
merged

Unnamed: 0,customer_id,recs,itemknn_score,itemknn_rank,article_id
100,1,3161,0.010078,1,1
101,1,7534,0.009785,2,
102,1,2997,0.007242,3,
103,1,3503,0.007069,4,
104,1,1482,0.006179,5,
...,...,...,...,...,...
96272895,1136204,10716,0.011835,96,
96272896,1136204,12879,0.011771,97,
96272897,1136204,15711,0.011720,98,
96272898,1136204,16688,0.011698,99,


In [17]:
# Rows without a holdout match are still NaN after the left merge: label them 0.
# The column originates from an exploded list column (object dtype) and holds NaN,
# so without an explicit cast the 0/1 label stays in a non-integer dtype; int8 is
# enough for a binary label and keeps this very tall frame compact.
merged["article_id"] = merged["article_id"].fillna(0).astype("int8")

In [19]:
# Final schema: the recommended item takes the item column name, the hit flag becomes
# "relevance", and rows are renumbered from 0.
column_names = {"recs": DEFAULT_ITEM_COL, "article_id": "relevance"}
merged = merged.rename(columns=column_names).reset_index(drop=True)

In [20]:
# Labelled frame: one row per (customer, recommended item) with its score, rank and relevance flag.
merged

Unnamed: 0,customer_id,article_id,itemknn_score,itemknn_rank,relevance
0,1,3161,0.010078,1,1
1,1,7534,0.009785,2,0
2,1,2997,0.007242,3,0
3,1,3503,0.007069,4,0
4,1,1482,0.006179,5,0
...,...,...,...,...,...
48331695,1136204,10716,0.011835,96,0
48331696,1136204,12879,0.011771,97,0
48331697,1136204,15711,0.011720,98,0
48331698,1136204,16688,0.011698,99,0


In [None]:
# Number of distinct users remaining in the labelled frame (only users with at least
# one hit survived the hit_sum filter).
merged[DEFAULT_USER_COL].nunique()