In [1]:
import cudf
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import GroupKFold
import xgboost as xgb
import glob

import gc

In [2]:
# Number of aids predicted per session (fixed).
n_pred = 20
# Weight per event type — presumably the evaluation-metric weights; TODO confirm.
weight = dict(clicks=0.1, carts=0.3, orders=0.6)

# Event type whose labels this notebook trains on.
event_type = "orders"

# Path templates keyed by data split. The feature template keeps two "{}"
# placeholders that are filled in later with .format(kind, chunk).
path_input_feat, path_input_label = {}, {}
for i in ("train1",):
    path_input_feat[i] = f"/kaggle/input/otto-make-data-feature/feature_label_{i}_{{}}_{{}}.pickle"
    path_input_label[i] = f"/kaggle/input/otto-make-data-feature/n_ground_truth_{i}_{event_type}.csv"

In [3]:
# Glob pattern matching every "feat" chunk of the train1 split.
feat_pattern = path_input_feat["train1"].format("feat", "*")
list_files = glob.glob(feat_pattern)
# Show how many chunk files were found.
len(list_files)

15

In [4]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` in place to the narrowest fitting dtype.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are considered; every other column is left untouched. Range checks are
    inclusive, so a value equal to a dtype's min or max still fits that dtype.
    A column is never widened: when no narrower dtype can hold its range
    (this includes empty, all-NaN and infinite-valued columns, whose min/max
    fail every range check) the current dtype is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the final memory footprint and the relative reduction.
    use_float16 : bool, default True
        Allow float columns to be stored as float16. float16 keeps only about
        three significant decimal digits, so pass ``False`` to stop at float32
        when feature precision matters. Columns that already are float16 are
        not widened.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one that fits wins.
    int_targets = (np.int8, np.int16, np.int32)
    float_targets = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue
        # min()/max() skip NaN; they are NaN only for empty or all-NaN columns,
        # in which case every comparison below is False and the dtype is kept.
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            targets, limits_of = int_targets, np.iinfo
        else:
            targets, limits_of = float_targets, np.finfo
        for target in targets:
            # Stop as soon as the candidate is not narrower than the current dtype.
            if np.dtype(target).itemsize >= col_type.itemsize:
                break
            limits = limits_of(target)
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte frame so the report never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [5]:
# Random seed; handed to the booster through "random_state" below.
rand = 64

n_splits = 5    # number of GroupKFold folds
n_round = 1000  # maximum number of boosting rounds

# NOTE: despite the "lgb_" prefix, this dict is passed to xgb.train.
lgb_params = {
    # learning task
    "objective": "rank:pairwise",
    # tree shape
    "max_depth": 20,
    "max_leaves": 80,
    # row sampling / step size / column sampling
    "subsample": 0.8,
    "learning_rate": 0.05,
    "colsample_bytree": 0.6,
    # regularisation
    "alpha": 0,
    "lambda": 50,
    # reproducibility
    "random_state": rand,
    # "eval_metric": "logloss",  # disabled: no eval_metric is passed
    # tree construction algorithm
    "tree_method": "gpu_hist",
}

# train

In [6]:
# Load the data: for every chunk, read the feature frame and the matching
# label frame and join them column-wise.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading;
# only point these paths at files you trust.
df_Xy = []
for n_chunk in range(len(list_files)):
    feat = pd.read_pickle(path_input_feat["train1"].format("feat", n_chunk))
    label = pd.read_pickle(path_input_feat["train1"].format(event_type, n_chunk))
    df_Xy.append(pd.concat([feat,label], axis = 1))
df_Xy = pd.concat(df_Xy)

# Down-sample the negatives: keep every positive row and half of the negative
# rows. The sample is seeded with `rand` so that a Restart & Run All rebuilds
# exactly the same training set.
df_Xy = pd.concat([
    df_Xy[df_Xy[f"label_{event_type}"] == 1],
    df_Xy[df_Xy[f"label_{event_type}"] == 0].sample(frac = 0.5, random_state = rand)
])
# Sort by session so that all rows of one session are contiguous.
df_Xy = df_Xy.sort_values("session")

df_Xy = reduce_mem_usage(df_Xy)
display(df_Xy)
# Features (still containing "session"), target, and the session ids used as CV groups.
df_X = df_Xy.drop([ f"label_{event_type}"], axis = 1)
df_y = df_Xy[f"label_{event_type}"]
df_session = df_Xy["session"]

display(df_y.value_counts())

# Free the combined frame and the last chunk's temporaries.
del df_Xy, feat, label
gc.collect()

Mem. usage decreased to 798.67 Mb (0.0% reduction)


Unnamed: 0,session,n_type_all,n_type_clicks,n_type_carts,n_type_orders,rate_type_clicks,rate_type_carts,rate_type_orders,n_type_all_aid,n_type_clicks_aid,...,n_alltypes_allses_lastweek,n_clicks_allses_diff,n_clicks_allses_diffrate,n_carts_allses_diff,n_carts_allses_diffrate,n_orders_allses_diff,n_orders_allses_diffrate,n_alltypes_allses_diff,n_alltypes_allses_diffrate,label_orders
5,8643220,1,1,0,0,1.0,0.0,0.0,0,0,...,43,-28,0.278564,-1,0.571289,-1,0.000000,-30,0.298828,0
4,8643220,1,1,0,0,1.0,0.0,0.0,0,0,...,26,-18,0.274414,-1,0.000000,0,0.000000,-19,0.264160,0
3,8643220,1,1,0,0,1.0,0.0,0.0,0,0,...,4,43,10.445312,3,6.000000,1,2.000000,47,11.335938,0
6,8643220,1,1,0,0,1.0,0.0,0.0,0,0,...,120,-74,0.343506,0,0.909180,0,0.799805,-74,0.381836,0
0,8643220,1,1,0,0,1.0,0.0,0.0,0,0,...,675,-358,0.438477,-22,0.210571,-8,0.105286,-388,0.424805,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1336829,11098494,1,1,0,0,1.0,0.0,0.0,0,0,...,1063,-316,0.568359,-75,0.634277,-91,0.265137,-482,0.546387,0
1336827,11098494,1,1,0,0,1.0,0.0,0.0,0,0,...,49,-20,0.453369,-6,0.235352,-3,0.222168,-29,0.404053,0
1336831,11098494,1,1,0,0,1.0,0.0,0.0,1,1,...,11,-4,0.526367,-2,0.000000,0,0.000000,-6,0.434814,0
1336836,11098509,1,1,0,0,1.0,0.0,0.0,0,0,...,39,-15,0.597168,-1,0.000000,0,0.000000,-16,0.582520,0


0    10908654
1      257504
Name: label_orders, dtype: int64

0

In [7]:
# Group-K-Fold modeling: one ranker per fold, with sessions as the CV groups so
# that a session never appears in both the training and the validation part.
list_model = []
# Open question from the author: would shuffling be better?
folds = GroupKFold(n_splits = n_splits)
for tr_idx,va_idx in folds.split(df_X,df_y, df_session):
    X_tr, X_va, y_tr, y_va = df_X.iloc[tr_idx], df_X.iloc[va_idx], df_y.iloc[tr_idx], df_y.iloc[va_idx]
    # Number of rows per session, in row order, for the DMatrix `group` argument.
    # size() counts rows; count() on a feature column would skip NaN values and
    # could under-report a group.
    tr_g = X_tr.groupby("session", sort = False).size().values
    va_g = X_va.groupby("session", sort = False).size().values
    # The group sizes must account for every row handed to the DMatrix.
    assert tr_g.sum() == len(X_tr), "train group sizes do not cover all rows"
    assert va_g.sum() == len(X_va), "valid group sizes do not cover all rows"
    # "session" is only the grouping key, not a feature.
    X_tr = X_tr.drop(["session"], axis = 1)
    X_va = X_va.drop(["session"], axis = 1)
    X_tr, X_va, y_tr, y_va = cudf.from_pandas(X_tr), cudf.from_pandas(X_va), cudf.from_pandas(y_tr), cudf.from_pandas(y_va)
    d_tr = xgb.DMatrix(X_tr, y_tr, group=tr_g)
    d_va = xgb.DMatrix(X_va, y_va, group=va_g)
    # NOTE(review): xgb.train appears to return the booster of the last round,
    # not the best one (best_iteration is stored on the model) — confirm that
    # the inference code limits prediction to the best iteration.
    model = xgb.train(
        lgb_params,
        d_tr,
        num_boost_round = n_round,
        evals = [(d_tr, 'train'), (d_va, 'valid')],
        early_stopping_rounds = 20,
        verbose_eval = 50,
    )
    list_model.append(model)
    # Drop this fold's data before building the next one.
    del X_tr, X_va, y_tr, y_va, d_tr, d_va, model
    gc.collect()

# save model
pd.to_pickle(list_model,f"xgbranker_models_{event_type}.pkl")

[0]	train-map:0.99467	valid-map:0.99474
[50]	train-map:0.99546	valid-map:0.99550
[100]	train-map:0.99552	valid-map:0.99554
[150]	train-map:0.99557	valid-map:0.99560
[189]	train-map:0.99560	valid-map:0.99560
[0]	train-map:0.99466	valid-map:0.99463
[50]	train-map:0.99546	valid-map:0.99547
[100]	train-map:0.99553	valid-map:0.99550
[150]	train-map:0.99558	valid-map:0.99552
[200]	train-map:0.99562	valid-map:0.99556
[227]	train-map:0.99564	valid-map:0.99555
[0]	train-map:0.99467	valid-map:0.99462
[30]	train-map:0.99542	valid-map:0.99535
[0]	train-map:0.99464	valid-map:0.99463
[50]	train-map:0.99546	valid-map:0.99541
[100]	train-map:0.99553	valid-map:0.99546
[150]	train-map:0.99559	valid-map:0.99550
[173]	train-map:0.99561	valid-map:0.99549
[0]	train-map:0.99472	valid-map:0.99458
[50]	train-map:0.99549	valid-map:0.99527
[100]	train-map:0.99556	valid-map:0.99535
[150]	train-map:0.99560	valid-map:0.99537
[200]	train-map:0.99566	valid-map:0.99540
[212]	train-map:0.99567	valid-map:0.99540


In [8]:
# Release the training frames after all folds have been fitted.
del df_X
del df_y
gc.collect()

66