In [130]:
import math
import re
import time

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from tqdm import tqdm

from hnmchallenge.constant import *
from hnmchallenge.data_reader import DataReader
from hnmchallenge.datasets.all_items_last_mont__last_day_last_week import AILMLDWDataset
from hnmchallenge.datasets.all_items_last_month_last_day import AILMLDDataset
from hnmchallenge.datasets.all_items_last_month_last_week import AILMLWDataset
from hnmchallenge.datasets.last2month_last_day import L2MLDDataset
from hnmchallenge.datasets.last_month_last_day import LMLDDataset
from hnmchallenge.datasets.last_month_last_week_dataset import LMLWDataset
from hnmchallenge.datasets.last_month_last_week_user import LMLUWDataset
from hnmchallenge.datasets.last_week_last_week import LWLWDataset
from hnmchallenge.evaluation.python_evaluation import map_at_k, recall_at_k
from hnmchallenge.feature_manager import FeatureManager
from hnmchallenge.models.itemknn.itemknn import ItemKNN
from hnmchallenge.models_prediction.recs_interface import RecsInterface
from hnmchallenge.datasets.first_week_dataset import FirstWeekDataset

In [131]:
VERSION = 0
NAME = f"cutf_200_ItemKNN_tw_True_rs_False"
DATASET = f"{NAME}_{VERSION}.feather"
MODEL_NAME = f"lgbm_{DATASET}_5.pkl"

In [132]:
dataset = FirstWeekDataset()
base_load_path = dataset._DATASET_PATH / "lgbm_models"
model = joblib.load(base_load_path / MODEL_NAME)

In [133]:
print("Read Dataset...")
features = pd.read_feather(dataset._DATASET_PATH / f"dataset_dfs/full/{DATASET}")
print(features.shape)

Read Dataset...
(12682400, 81)


In [134]:
features = features.rename(columns=lambda x: re.sub("[^A-Za-z0-9_]+", "", x))

cat = [
    "index_code_gbm",
    "product_group_name_gbm",
    "index_group_name_gbm",
    "graphical_appearance_no_gbm",
]
cat_index = [i for i, c in enumerate(features.columns) if c in cat]
print("Categorical conversion...")
for col in cat:
    features[col] = pd.Categorical(features[col])

Categorical conversion...


In [135]:
customer_article_df = features[[DEFAULT_USER_COL, DEFAULT_ITEM_COL]]
X = features.drop([DEFAULT_USER_COL, DEFAULT_ITEM_COL], axis=1)

In [136]:
s = time.time()
print("Computing Predictions...")
y_pred = []

batch_size = 30_000_000
idx = 0
while idx + batch_size < X.shape[0]:
    end = idx + batch_size
    batch = X.loc[idx : end - 1, :]
    score = model.predict(batch, num_iteration=model.best_iteration_, n_jobs=72)
    y_pred.extend(score)
    idx = end
last_batch = X.loc[idx:, :]
score = model.predict(last_batch, num_iteration=model.best_iteration_, n_jobs=72)
y_pred.extend(score)
y_pred = np.array(y_pred)
print(y_pred.shape)

print(f"Took: {math.ceil((time.time()-s)/60)} minutes")

Computing Predictions...
(12682400,)
Took: 1 minutes


In [137]:
customer_article_df["predicted_score"] = y_pred
print("Sorting scores...")
customer_article_df = customer_article_df.sort_values(
    [DEFAULT_USER_COL, "predicted_score"], ascending=[True, False]
)
print(customer_article_df.head(20))
customer_article_df = customer_article_df.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_article_df["predicted_score"] = y_pred


Sorting scores...
     customer_id  article_id  predicted_score
3              6        2687         0.723911
12             6       62737         0.564720
2              6       98054         0.555936
72             6       37912         0.353578
27             6       35307         0.201512
139            6       39230         0.172506
67             6       39629         0.170546
160            6       98086         0.169760
62             6       61890         0.158974
59             6        1408         0.063472
26             6        1146         0.004017
24             6       38400        -0.065058
66             6          67        -0.083361
33             6        3596        -0.111731
49             6       40366        -0.160905
115            6       39651        -0.217799
152            6       36709        -0.377455
134            6       46107        -0.410020
92             6       44709        -0.412651
69             6         394        -0.446366


In [138]:
customer_article_df["rank"] = np.tile(np.arange(1,201), customer_article_df[DEFAULT_USER_COL].nunique())

In [139]:
print("Filtering predictions...")
cutoff = customer_article_df.groupby(DEFAULT_USER_COL).size().values
i = 0
filter_indices = []
for cut in cutoff:
    filter_indices.extend(range(i, i + 200))
    i = i + cut
customer_article_df = customer_article_df.loc[filter_indices]
customer_article_df = customer_article_df.drop("predicted_score", axis=1)

Filtering predictions...


In [140]:
customer_article_df

Unnamed: 0,customer_id,article_id,rank
0,6,2687,1
1,6,62737,2
2,6,98054,3
3,6,37912,4
4,6,35307,5
...,...,...,...
12682395,1356690,92948,196
12682396,1356690,82757,197
12682397,1356690,57010,198
12682398,1356690,75064,199


In [141]:
eval_data = dataset.get_evaluation_data()

In [142]:
item_per_user = eval_data.groupby(DEFAULT_USER_COL)[DEFAULT_ITEM_COL].apply(list)
item_per_user_df = item_per_user.to_frame()
# items groundtruth
eval_groundtruth = (
    item_per_user_df.reset_index().explode(DEFAULT_ITEM_COL).drop_duplicates()
)

In [143]:
map_list = []
rec_list = []
cut_list = []
for cutoff in [12, 50, 100]:
    filtered_res = customer_article_df[customer_article_df["rank"]<=cutoff]
    
    map_res = map_at_k(eval_groundtruth, filtered_res)
    rec_res = recall_at_k(eval_groundtruth, filtered_res)

    print(map_res)

    map_list.append(map_res)
    rec_list.append(rec_res)
    cut_list.append(cutoff)

63412
63412
0.03245902317812821
63412
63412
0.03511422289376874
63412
63412
0.035528640901562325


In [144]:
res_df = pd.DataFrame(zip(map_list, rec_list, cut_list), columns=["map", "recall", "cutoff"])

In [16]:
res_df.to_csv("fourth_week_res.csv")

In [145]:
res_df

Unnamed: 0,map,recall,cutoff
0,0.032459,0.071917,12
1,0.035114,0.120215,50
2,0.035529,0.14073,100
