In [20]:
import math
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from xgboost import plot_importance

from hnmchallenge.constant import *
from hnmchallenge.data_reader import DataReader
from hnmchallenge.dataset import Dataset
from hnmchallenge.evaluation.python_evaluation import map_at_k, recall_at_k
from hnmchallenge.feature_manager import FeatureManager
from hnmchallenge.filtered_dataset import FilterdDataset
from hnmchallenge.models.ease.ease import EASE
from hnmchallenge.models.itemknn.itemknn import ItemKNN
from hnmchallenge.models.sgmc.sgmc import SGMC
from hnmchallenge.models.top_pop import TopPop
from hnmchallenge.stratified_dataset import StratifiedDataset

In [21]:
# Instantiate the project's stratified dataset wrapper and the DataReader that
# every load/save call below goes through.
# NOTE(review): `dataset` is not referenced again in the cells visible here.
dataset = StratifiedDataset()
dr = DataReader()

In [22]:
# Load the sample submission; its user column is used below to enumerate users.
sample_sub = dr.get_sample_submission()

In [23]:
# Distinct values of the user column in the sample submission.
# These are later looked up as keys of the raw -> new mapping dict.
raw_users = sample_sub[DEFAULT_USER_COL].unique()

In [24]:
# Number of distinct users found in the sample submission.
len(raw_users)

1371980

In [25]:
# Keep only the first element returned by the reader; the second is discarded.
# presumably this maps new (integer) user id -> raw customer id — verify against DataReader.
new_raw_user_map_dict, _ = dr.get_filtered_new_raw_mapping_dict()

In [26]:
# Keep only the first element returned by the reader; the second is discarded.
# presumably this maps raw customer id -> new (integer) user id — verify against DataReader.
raw_new_user_map_dict, _ = dr.get_filtered_raw_new_mapping_dict()

In [27]:
# Collect, in the order they appear in `raw_users`, every id that is not a key
# of the raw -> new mapping dict.
missing_users = [
    raw_id
    for raw_id in raw_users
    if raw_id not in raw_new_user_map_dict
]

In [28]:
# Number of submission users that have no entry in the raw -> new mapping.
len(missing_users)

235774

In [29]:
# Put every key of the new -> raw mapping into an array (iterating a dict
# yields its keys) so the largest one can be found.
user_keys = np.array(list(new_raw_user_map_dict))

In [30]:
# Despite its name, `max_user` is one past the largest existing key, i.e. the
# first id that is free to hand out.
max_user = np.max(user_keys) + 1

In [31]:
# Reserve one consecutive id per missing user, starting at `max_user`.
n_missing = len(missing_users)
new_keys = np.arange(max_user, max_user + n_missing)

In [32]:
# Pair each freshly reserved id with the missing user at the same position.
added_new_raw_user_map_dict = {
    new_id: raw_id for new_id, raw_id in zip(new_keys, missing_users)
}

In [33]:
# Inverse of the dict above: swap keys and values.
added_raw_new_user_map_dict = dict(
    zip(added_new_raw_user_map_dict.values(), added_new_raw_user_map_dict.keys())
)

In [34]:
# Merge the entries for the previously missing users into both mapping dicts.
# NOTE: dict.update mutates the loaded dicts in place.
new_raw_user_map_dict.update(added_new_raw_user_map_dict)
raw_new_user_map_dict.update(added_raw_new_user_map_dict)

In [35]:
# Size of the new -> raw mapping after the merge.
len(new_raw_user_map_dict)

1371980

In [36]:
# Save both user-id mapping dictionaries (including the entries merged in
# above) into the mapping-dict directory reported by the DataReader.
# NOTE: any existing file with the same name is truncated and replaced.
dict_dp = dr.get_mapping_dict_path()
user_mappings_to_save = {
    "filtered_raw_new_user_ids_dict.pkl": raw_new_user_map_dict,
    "filtered_new_raw_user_ids_dict.pkl": new_raw_user_map_dict,
}
for file_name, mapping in user_mappings_to_save.items():
    # Write-only binary mode is sufficient: the file is never read back here.
    with open(dict_dp / file_name, "wb") as f:
        pickle.dump(mapping, f)

In [39]:
# Fetch the new -> raw mapping through the DataReader again, rebinding the name.
# presumably this reads the pickle written above, acting as a round-trip check — TODO confirm.
new_raw_user_map_dict, _ = dr.get_filtered_new_raw_mapping_dict()

In [40]:
# Size of the reloaded mapping; len() on the dict counts its keys directly.
len(new_raw_user_map_dict)

1371980

In [41]:
# One-column frame holding every key of the new -> raw mapping.
filtered_all_users = pd.DataFrame({DEFAULT_USER_COL: list(new_raw_user_map_dict)})

In [42]:
# Display the frame of user ids built above.
filtered_all_users

Unnamed: 0,customer_id
0,0
1,1
2,2
3,3
4,4
...,...
1371975,1371975
1371976,1371976
1371977,1371977
1371978,1371978


In [44]:
# Save the user-id frame as a feather file under the preprocessed-data directory.
filtered_all_users.to_feather(dr.get_preprocessed_data_path() / "filtered_all_users.feather")

In [50]:
# Display the frame returned by the DataReader accessor.
# presumably it loads the feather file written above — TODO confirm against DataReader.
dr.get_filtered_all_customers_ids_df()

Unnamed: 0,customer_id
0,0
1,1
2,2
3,3
4,4
...,...
1371975,1371975
1371976,1371976
1371977,1371977
1371978,1371978


In [51]:
# Load the customer table; its user column is remapped with the raw -> new dict below.
raw_customer = dr.get_customer()

In [54]:
# Translate the user column of the customer table through the raw -> new
# mapping dict.
# `Series.map` yields NaN for every value that is not a key of the dict, with
# no error. Re-running this cell on an already-mapped column would therefore
# silently replace the ids with NaN, so the result is validated before the
# column is overwritten.
mapped_customer_ids = raw_customer[DEFAULT_USER_COL].map(raw_new_user_map_dict)
unmapped_count = int(mapped_customer_ids.isna().sum())
if unmapped_count:
    raise ValueError(
        f"{unmapped_count} customer ids have no entry in raw_new_user_map_dict; "
        "reload raw_customer with dr.get_customer() before re-running this cell"
    )
raw_customer[DEFAULT_USER_COL] = mapped_customer_ids

In [55]:
# Display the customer table after its user column has been remapped.
raw_customer

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,263061,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,24390,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,863220,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,1136206,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,1016506,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...
...,...,...,...,...,...,...,...
1371975,439410,,,ACTIVE,NONE,24.0,7aa399f7e669990daba2d92c577b52237380662f36480b...
1371976,184159,,,ACTIVE,NONE,21.0,3f47f1279beb72215f4de557d950e0bfa73789d24acb5e...
1371977,102731,1.0,1.0,ACTIVE,Regularly,21.0,4563fc79215672cd6a863f2b4bf56b8f898f2d96ed590e...
1371978,935187,1.0,1.0,ACTIVE,Regularly,18.0,8892c18e9bc3dca6aa4000cb8094fc4b51ee8db2ed14d7...


In [56]:
# Save the customer table as a feather file under the preprocessed-data directory.
raw_customer.to_feather(dr.get_preprocessed_data_path() / "filtered_all_customers.feather")

In [57]:
# Display the frame returned by the DataReader accessor.
# presumably it loads the feather file written above — TODO confirm against DataReader.
dr.get_filtered_all_customers()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,263061,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,24390,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,863220,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,1136206,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,1016506,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...
...,...,...,...,...,...,...,...
1371975,439410,,,ACTIVE,NONE,24.0,7aa399f7e669990daba2d92c577b52237380662f36480b...
1371976,184159,,,ACTIVE,NONE,21.0,3f47f1279beb72215f4de557d950e0bfa73789d24acb5e...
1371977,102731,1.0,1.0,ACTIVE,Regularly,21.0,4563fc79215672cd6a863f2b4bf56b8f898f2d96ed590e...
1371978,935187,1.0,1.0,ACTIVE,Regularly,18.0,8892c18e9bc3dca6aa4000cb8094fc4b51ee8db2ed14d7...
