In [1]:
import cudf
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import GroupKFold
import xgboost as xgb
import glob

import gc

In [2]:
# Number of aids predicted for each session (fixed).
n_pred = 20
# Weight assigned to each event type.
weight = dict(clicks=0.1, carts=0.3, orders=0.6)

# Event type whose label column and ground-truth file are used below.
event_type = "orders"

# Path templates keyed by dataset split. The doubled braces in the feature
# template survive the f-string as two literal "{}" placeholders that are
# filled in later through str.format.
path_input_feat, path_input_label = {}, {}
for i in ("train1",):
    path_input_label[i] = f"/kaggle/input/otto-make-data-feature/n_ground_truth_{i}_{event_type}.csv"
    path_input_feat[i] = f"/kaggle/input/otto-make-data-feature/feature_label_{i}_{{}}_{{}}.pickle"

In [3]:
# Glob for every "feat" chunk of the train1 split; the second placeholder of
# the template is replaced by a wildcard so all chunk files match.
feat_chunk_pattern = path_input_feat["train1"].format("feat", "*")
list_files = glob.glob(feat_chunk_pattern)
len(list_files)

15

In [4]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the narrowest dtype that can hold their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column is left untouched. Integer columns are cast
    to the first of int8/int16/int32/int64 whose limits contain the column's
    min and max; float columns are cast to float16 or float32 when their range
    fits, otherwise to float64.

    The dtype limits are treated as inclusive, so a column whose extreme value
    equals a dtype's limit (e.g. a max of 127) still gets that dtype.

    Note that a float16/float32 cast only checks the value *range*: values are
    rounded to the precision of the narrower type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the resulting memory usage and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its columns downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Narrowest integer type first. A column without a usable min/max
            # (no rows) matches nothing and keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit float32 (or min/max is NaN/inf).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [5]:
# Seed forwarded to the booster through "random_state".
rand = 64

# Number of GroupKFold splits and the upper bound on boosting rounds.
n_splits = 5
n_round = 1000

# Parameters handed to xgb.train (the name says "lgb", but the values are
# XGBoost settings).
lgb_params = dict(
    objective="binary:logistic",
    max_depth=30,
    max_leaves=200,
    subsample=0.8,
    learning_rate=0.05,
    colsample_bytree=0.6,
    alpha=0,
    # "lambda" is a reserved word in Python, so it has to be unpacked from a
    # mapping instead of being written as a plain keyword argument.
    **{"lambda": 50},
    random_state=rand,
    eval_metric="logloss",
    tree_method="gpu_hist",
)

# train

In [6]:
# Load the data: for every chunk, read the feature frame and the label frame
# for the selected event type, then place them side by side.
train1_template = path_input_feat["train1"]
chunk_frames = []
for n_chunk, _ in enumerate(list_files):
    feat = pd.read_pickle(train1_template.format("feat", n_chunk))
    label = pd.read_pickle(train1_template.format(event_type, n_chunk))
    chunk_frames.append(pd.concat([feat, label], axis=1))

# Stack all chunks into one frame and release the per-chunk list before the
# dtype downcast runs.
df_Xy = pd.concat(chunk_frames)
del chunk_frames
df_Xy = reduce_mem_usage(df_Xy)


gc.collect()

Mem. usage decreased to 2221.09 Mb (0.0% reduction)


0

In [7]:
# Train a bag of XGBoost models: 5 rounds of negative down-sampling, each
# followed by a session-grouped K-fold, so 5 * n_splits boosters in total.
list_model = []

# The label masks are identical in every bagging round, so build them once.
label_col = f"label_{event_type}"
is_positive = df_Xy[label_col] == 1
is_negative = df_Xy[label_col] == 0

for i in range(5):
    # Down-sample the negatives: keep every positive row plus a 20% draw of the
    # negative rows. The draw is seeded with rand + i so that each round gets a
    # different sample while a Restart & Run All reproduces the same models.
    df_Xy_sample = pd.concat([
        df_Xy[is_positive],
        df_Xy[is_negative].sample(frac = 0.2, random_state = rand + i)
    ])

    df_X = df_Xy_sample.drop(["session", label_col], axis = 1)
    df_y = df_Xy_sample[label_col]
    df_session = df_Xy_sample["session"]

    # Show the class balance of this round's training sample.
    display(df_y.value_counts())

    # Group-K-Fold modeling: rows of one session never straddle train and
    # validation.
    # Open question from the author: would shuffling be better?
    folds = GroupKFold(n_splits = n_splits)
    for tr_idx,va_idx in folds.split(df_X,df_y, df_session):
        X_tr, X_va, y_tr, y_va = df_X.iloc[tr_idx], df_X.iloc[va_idx], df_y.iloc[tr_idx], df_y.iloc[va_idx]
        # Move the fold to cudf objects before building the DMatrix inputs.
        X_tr, X_va, y_tr, y_va = cudf.from_pandas(X_tr), cudf.from_pandas(X_va), cudf.from_pandas(y_tr), cudf.from_pandas(y_va)
        d_tr = xgb.DMatrix(X_tr, y_tr)
        d_va = xgb.DMatrix(X_va, y_va)
        # NOTE(review): with early stopping, xgb.train is documented to return
        # the booster of the last round rather than the best one - confirm the
        # inference code accounts for best_iteration.
        model = xgb.train(
            lgb_params,
            d_tr,
            num_boost_round = n_round,
            evals = [(d_tr, 'train'), (d_va, 'valid')],
            early_stopping_rounds = 20,
            verbose_eval = 50,
        )
        list_model.append(model)
        # Drop the fold's data before the next fold is built.
        del X_tr, X_va, y_tr, y_va, d_tr, d_va, model
        gc.collect()

# The masks are only needed while sampling.
del is_positive, is_negative

# save model
pd.to_pickle(list_model,f"xgb_models_{event_type}.pkl")

0    5698420
1     260742
Name: label_orders, dtype: int64

[0]	train-logloss:0.64880	valid-logloss:0.64880
[50]	train-logloss:0.09136	valid-logloss:0.09176
[100]	train-logloss:0.06308	valid-logloss:0.06376
[150]	train-logloss:0.06068	valid-logloss:0.06164
[200]	train-logloss:0.06010	valid-logloss:0.06133
[250]	train-logloss:0.05977	valid-logloss:0.06121
[300]	train-logloss:0.05950	valid-logloss:0.06114
[350]	train-logloss:0.05926	valid-logloss:0.06110
[400]	train-logloss:0.05903	valid-logloss:0.06108
[450]	train-logloss:0.05883	valid-logloss:0.06106
[500]	train-logloss:0.05862	valid-logloss:0.06104
[550]	train-logloss:0.05841	valid-logloss:0.06104
[591]	train-logloss:0.05826	valid-logloss:0.06103
[0]	train-logloss:0.64876	valid-logloss:0.64875
[50]	train-logloss:0.09151	valid-logloss:0.09131
[100]	train-logloss:0.06326	valid-logloss:0.06312
[150]	train-logloss:0.06087	valid-logloss:0.06095
[200]	train-logloss:0.06030	valid-logloss:0.06060
[250]	train-logloss:0.05996	valid-logloss:0.06047
[300]	train-logloss:0.05970	valid-logloss:0.06041
[350]	

0    5698420
1     260742
Name: label_orders, dtype: int64

[0]	train-logloss:0.64879	valid-logloss:0.64879
[50]	train-logloss:0.09136	valid-logloss:0.09181
[100]	train-logloss:0.06307	valid-logloss:0.06382
[150]	train-logloss:0.06069	valid-logloss:0.06171
[200]	train-logloss:0.06010	valid-logloss:0.06136
[250]	train-logloss:0.05978	valid-logloss:0.06125
[300]	train-logloss:0.05953	valid-logloss:0.06120
[350]	train-logloss:0.05928	valid-logloss:0.06116
[400]	train-logloss:0.05906	valid-logloss:0.06112
[450]	train-logloss:0.05884	valid-logloss:0.06110
[500]	train-logloss:0.05863	valid-logloss:0.06109
[550]	train-logloss:0.05842	valid-logloss:0.06108
[581]	train-logloss:0.05830	valid-logloss:0.06108
[0]	train-logloss:0.64888	valid-logloss:0.64888
[50]	train-logloss:0.09140	valid-logloss:0.09170
[100]	train-logloss:0.06311	valid-logloss:0.06371
[150]	train-logloss:0.06072	valid-logloss:0.06163
[200]	train-logloss:0.06014	valid-logloss:0.06130
[250]	train-logloss:0.05979	valid-logloss:0.06117
[300]	train-logloss:0.05952	valid-logloss:0.06109
[350]	

0    5698420
1     260742
Name: label_orders, dtype: int64

[0]	train-logloss:0.64884	valid-logloss:0.64884
[50]	train-logloss:0.09143	valid-logloss:0.09139
[100]	train-logloss:0.06314	valid-logloss:0.06328
[150]	train-logloss:0.06075	valid-logloss:0.06119
[200]	train-logloss:0.06015	valid-logloss:0.06084
[250]	train-logloss:0.05981	valid-logloss:0.06072
[300]	train-logloss:0.05955	valid-logloss:0.06066
[350]	train-logloss:0.05931	valid-logloss:0.06061
[400]	train-logloss:0.05910	valid-logloss:0.06060
[450]	train-logloss:0.05888	valid-logloss:0.06059
[490]	train-logloss:0.05872	valid-logloss:0.06059
[0]	train-logloss:0.64884	valid-logloss:0.64884
[50]	train-logloss:0.09134	valid-logloss:0.09161
[100]	train-logloss:0.06304	valid-logloss:0.06361
[150]	train-logloss:0.06067	valid-logloss:0.06152
[200]	train-logloss:0.06007	valid-logloss:0.06116
[250]	train-logloss:0.05975	valid-logloss:0.06104
[300]	train-logloss:0.05950	valid-logloss:0.06098
[350]	train-logloss:0.05927	valid-logloss:0.06095
[400]	train-logloss:0.05905	valid-logloss:0.06092
[450]	

0    5698420
1     260742
Name: label_orders, dtype: int64

[0]	train-logloss:0.64879	valid-logloss:0.64881
[50]	train-logloss:0.09143	valid-logloss:0.09169
[100]	train-logloss:0.06317	valid-logloss:0.06359
[150]	train-logloss:0.06079	valid-logloss:0.06144
[200]	train-logloss:0.06020	valid-logloss:0.06108
[250]	train-logloss:0.05986	valid-logloss:0.06095
[300]	train-logloss:0.05960	valid-logloss:0.06089
[350]	train-logloss:0.05937	valid-logloss:0.06085
[400]	train-logloss:0.05914	valid-logloss:0.06082
[450]	train-logloss:0.05893	valid-logloss:0.06080
[500]	train-logloss:0.05874	valid-logloss:0.06079
[520]	train-logloss:0.05865	valid-logloss:0.06079
[0]	train-logloss:0.64879	valid-logloss:0.64880
[50]	train-logloss:0.09139	valid-logloss:0.09171
[100]	train-logloss:0.06309	valid-logloss:0.06379
[150]	train-logloss:0.06069	valid-logloss:0.06174
[200]	train-logloss:0.06009	valid-logloss:0.06141
[250]	train-logloss:0.05977	valid-logloss:0.06130
[300]	train-logloss:0.05951	valid-logloss:0.06124
[350]	train-logloss:0.05928	valid-logloss:0.06121
[400]	

0    5698420
1     260742
Name: label_orders, dtype: int64

[0]	train-logloss:0.64879	valid-logloss:0.64880
[50]	train-logloss:0.09144	valid-logloss:0.09164
[100]	train-logloss:0.06316	valid-logloss:0.06358
[150]	train-logloss:0.06078	valid-logloss:0.06147
[200]	train-logloss:0.06018	valid-logloss:0.06114
[250]	train-logloss:0.05984	valid-logloss:0.06101
[300]	train-logloss:0.05957	valid-logloss:0.06096
[350]	train-logloss:0.05933	valid-logloss:0.06092
[400]	train-logloss:0.05910	valid-logloss:0.06089
[450]	train-logloss:0.05889	valid-logloss:0.06087
[489]	train-logloss:0.05873	valid-logloss:0.06086
[0]	train-logloss:0.64880	valid-logloss:0.64880
[50]	train-logloss:0.09142	valid-logloss:0.09168
[100]	train-logloss:0.06313	valid-logloss:0.06370
[150]	train-logloss:0.06074	valid-logloss:0.06159
[200]	train-logloss:0.06014	valid-logloss:0.06124
[250]	train-logloss:0.05981	valid-logloss:0.06112
[300]	train-logloss:0.05955	valid-logloss:0.06106
[350]	train-logloss:0.05932	valid-logloss:0.06102
[400]	train-logloss:0.05910	valid-logloss:0.06100
[450]	

In [8]:
# Drop the feature and label frames left over from the last bagging round,
# then run a garbage-collection pass.
del df_X
del df_y
gc.collect()

237