In [None]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from xgboost import plot_importance

from hnmchallenge.constant import *
from hnmchallenge.data_reader import DataReader
from hnmchallenge.dataset import Dataset
from hnmchallenge.evaluation.python_evaluation import map_at_k, recall_at_k
from hnmchallenge.feature_manager import FeatureManager
from hnmchallenge.filtered_dataset import FilterdDataset
from hnmchallenge.models.ease.ease import EASE
from hnmchallenge.models.itemknn.itemknn import ItemKNN
from hnmchallenge.models.sgmc.sgmc import SGMC
from hnmchallenge.models.top_pop import TopPop
from hnmchallenge.stratified_dataset import StratifiedDataset

In [None]:
# Instantiate the project's dataset object and data reader; `dr` is used below
# to locate the preprocessed-data directory.
# NOTE(review): `dataset` is not referenced again in the visible cells —
# confirm it is still needed.
dataset = StratifiedDataset()
dr = DataReader()

In [None]:
# Directory, under the preprocessed-data path, that the saved XGBoost model is
# loaded from.
base_load_path = dr.get_preprocessed_data_path() / "xgb_models"

In [None]:
# Selects which feature dataset / trained model pair this notebook works on.
VERSION: int = 0
# File name of the feature table (read from dataset_dfs/full/ further down).
DATASET: str = f"dataset_v00_{VERSION}.feather"
# File name of the saved ranker. It embeds the complete dataset file name, so
# for VERSION = 0 it resolves to "xgb_dataset_v00_0.feather.json".
MODEL_NAME: str = f"xgb_{DATASET}.json"

In [None]:
# Create an empty ranker and restore the trained model into it from disk.
model = xgb.XGBRanker()
model.load_model(base_load_path/MODEL_NAME)

In [None]:
from xgboost import plot_importance

In [None]:
# Draw the loaded model's feature-importance chart on a 20x20 inch figure.
fig, ax = plt.subplots(figsize=(20, 20))
plot_importance(model, ax=ax)
plt.show()

In [None]:
# Inspect the tree limit recorded on the model; the same value is passed to
# predict() further down.
model.best_ntree_limit

In [None]:
# Read the feature table of the selected dataset version from the
# preprocessed-data directory.
features = pd.read_feather(
    dr.get_preprocessed_data_path() / f"dataset_dfs/full/{DATASET}"
)

# Remember to use the best_ntree_limit when taking the predictions

In [None]:
# Preview the first rows of the loaded feature table.
features.head()

In [None]:
# Split the feature table in two: the identifier columns are copied aside so
# the scores can be attached to them later, everything else becomes the model
# input.
_id_cols = [DEFAULT_USER_COL, DEFAULT_ITEM_COL]
customer_article_df = features.loc[:, _id_cols].copy()
X = features.drop(columns=_id_cols)

In [None]:
# Preview the model input (feature columns only, identifiers removed).
X.head()

In [None]:
# Score every row of X, limiting the ensemble to the tree limit recorded on the
# model.
# NOTE(review): newer XGBoost releases deprecate `ntree_limit` in favour of
# `iteration_range` — confirm against the XGBoost version pinned for this
# project before upgrading.
y_pred = model.predict(X, ntree_limit=model.best_ntree_limit)

In [None]:
# Attach each score to its identifier pair. X and customer_article_df were
# both taken from `features`, so their rows line up one-to-one.
customer_article_df["predicted_score"] = y_pred

In [None]:
# Display the identifier pairs together with their predicted score.
customer_article_df

In [None]:
# Order rows by user (ascending) and, within each user, by predicted score
# from highest to lowest.
sorted_scores = customer_article_df.sort_values(
    by=[DEFAULT_USER_COL, "predicted_score"],
    ascending=[True, False],
)

In [None]:
# Display the pairs after sorting by user and descending score.
sorted_scores

In [None]:
# Renumber the rows 0..n-1 in sorted order (old index discarded) so that index
# labels coincide with row positions.
sorted_scores_index = sorted_scores.reset_index(drop=True)

In [None]:
# Maximum number of rows kept per user (presumably the number of predictions
# expected per customer in the submission — confirm).
TOP_K = 12

# Number of candidate rows per user. The frame is sorted by user and groupby
# sorts its keys the same way, so consecutive entries of `cutoff` describe
# consecutive row ranges of `sorted_scores_index`.
cutoff = sorted_scores_index.groupby(DEFAULT_USER_COL).size().values

# Collect the row labels of the best-scored rows of every user.
i = 0
filter_indices = []
for cut in cutoff:
    # Clamp to the user's own row count: a user with fewer than TOP_K
    # candidates must not spill over into the next user's rows (which would
    # duplicate them) or run past the end of the frame (KeyError in .loc).
    filter_indices.extend(range(i, i + min(cut, TOP_K)))
    i = i + cut

In [None]:
# Average number of candidate rows per user.
cutoff.mean()

In [None]:
# Keep only the selected rows. Selection is by index label, which equals the
# row position because the index was reset after sorting.
final_df = sorted_scores_index.loc[filter_indices]

In [None]:
# Display the rows retained for each user.
final_df

In [None]:
# The score was only needed for ranking; leave just the identifier columns.
final_final_df = final_df.drop(columns="predicted_score")

In [None]:
from hnmchallenge.submission_handler import SubmissionHandler
# Helper object used below to produce the submission.
sh = SubmissionHandler()

In [None]:
# Hand the selected identifier pairs to the submission handler (presumably
# this writes the submission file — see SubmissionHandler).
# NOTE(review): sub_name is hard-coded and does not follow VERSION / DATASET —
# confirm this is intended when switching dataset versions.
sh.create_submission_filtered_data([final_final_df], sub_name="dataset_v00")
# Alternative call kept for reference (full-users variant):
#sh.create_submission_filtered_data_full_users([final_final_df], sub_name="iknn_lastday")