### 【Pandas → cuDF】

In [1]:
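# Load the cudf.pandas extension so that subsequent pandas calls run on the GPU where supported (falling back to CPU pandas otherwise); load it before importing pandas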
%load_ext cudf.pandas

### 【データのインポート】

In [2]:
# ライブラリのインポート
import numpy as np
import pandas as pd
import os
import cudf
from cuml.preprocessing import TargetEncoder

# Load the competition train / test splits.
train_df = pd.read_csv("/kaggle/input/playground-series-s5e8/train.csv")
test_df = pd.read_csv("/kaggle/input/playground-series-s5e8/test.csv")

# Load the external bank-marketing dataset (';'-delimited) and map its
# yes/no target onto 1/0.
orig_df = pd.read_csv("/kaggle/input/bank-marketing-dataset-full/bank-full.csv",delimiter=";")
orig_df['y'] = orig_df.y.map({'yes':1,'no':0})

# Give the external rows synthetic ids starting at 1,000,000.
# `id` is kept as a regular column: moving it into the index would make the
# concat below (ignore_index=True) discard it, leaving the external rows with
# a missing id.
orig_df['id'] = (np.arange(len(orig_df))+1e6).astype('int')

# Stack train, test and external rows (in that order) into one frame so the
# feature engineering below is applied consistently to all of them.
all_df = pd.concat([train_df,test_df,orig_df],axis=0,ignore_index=True)

### 【前処理】

In [3]:
def preprocess1(df):
    """Split the feature columns of ``df`` into categorical and numeric names.

    Every column except ``id`` and ``y`` is inspected: columns whose dtype is
    ``object`` are treated as categorical, all others as numeric.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``id`` and ``y`` (``drop`` raises otherwise).

    Returns
    -------
    tuple
        ``(df, CATS, NUMS)`` where ``df`` is the unmodified input frame,
        ``CATS`` is the list of object-dtype column names and ``NUMS`` the
        list of the remaining feature column names, both in column order.
    """
    CATS = []
    NUMS = []
    for c in df.drop(["id", "y"], axis=1).columns:
        if df[c].dtype == 'object':
            CATS.append(c)
        else:
            NUMS.append(c)
    return df, CATS, NUMS

# Derive the categorical / numeric feature-name lists from the combined
# (competition + external) frame. No placeholder assignments are needed:
# both lists are created by the call itself.
all_df, CATS, NUMS = preprocess1(all_df)

In [4]:
def preprocess2(df, CATS, NUMS):
    """Label-encode every feature column and record its cardinality.

    * A column listed in ``NUMS`` keeps its values (cast to int32) and gets an
      additional label-encoded copy named ``"<col>2"``.
    * Any other column (i.e. one from ``CATS``) is replaced in place by its
      label codes.

    ``df`` is modified in place and also returned.

    Returns
    -------
    tuple
        ``(df, CATS1, SIZES)`` where ``CATS1`` lists the new ``"<col>2"``
        columns created for the numeric features and ``SIZES`` maps every
        label-encoded column name to its number of distinct values.
    """
    CATS1 = []   # names of the label-encoded copies of the numeric columns
    SIZES = {}   # encoded column name -> number of distinct levels

    for col in NUMS + CATS:
        is_numeric = col in NUMS
        target = f"{col}2" if is_numeric else col
        if is_numeric:
            CATS1.append(target)

        # factorize() yields integer codes plus the array of distinct values
        codes, levels = df[col].factorize()
        df[target] = codes
        SIZES[target] = len(levels)

        # Store both the source column and the encoded column as int32
        df[col] = df[col].astype('int32')
        df[target] = df[target].astype('int32')

    return df, CATS1, SIZES

# Label-encode the combined (competition + external) frame.
# CATS1 (list) and SIZES (dict) are created by the call; no placeholders are
# required beforehand.
all_df, CATS1, SIZES = preprocess2(all_df, CATS, NUMS)

In [5]:
from itertools import combinations

def preprocess3(df, CATS, CATS1, SIZES):
    """Add one combined categorical column for every pair of encoded columns.

    For each unordered pair ``(c1, c2)`` drawn from ``CATS + CATS1`` a new
    column is computed as ``df[c1] * SIZES[c2] + df[c2]`` and named by joining
    the two (alphabetically sorted) column names with ``"_"``.

    Returns
    -------
    tuple
        ``(df, CATS2)`` where ``df`` is the input frame with the pair columns
        appended (the input itself is returned when there are no pairs) and
        ``CATS2`` lists the new column names in creation order.
    """
    CATS2 = []
    pair_columns = {}

    for left, right in combinations(CATS + CATS1, 2):
        pair_name = "_".join(sorted((left, right)))
        # Mixed-radix code: distinct per (left, right) value pair as long as
        # the codes of `right` lie in [0, SIZES[right]).
        pair_columns[pair_name] = df[left] * SIZES[right] + df[right]
        CATS2.append(pair_name)

    # Attach all new columns in a single concat instead of one by one.
    if pair_columns:
        df = pd.concat([df, pd.DataFrame(pair_columns)], axis=1)

    return df, CATS2

# Add the pairwise combination columns to the combined (competition +
# external) frame. CATS2 is created by the call; no placeholder is needed.
all_df, CATS2 = preprocess3(all_df, CATS, CATS1, SIZES)

In [6]:
def preprocess4(df, CATS, CATS1, CATS2):
    """Count-encode every categorical column of ``df``.

    For each column ``c`` in ``CATS + CATS1 + CATS2`` a new int32 column
    ``CE_<c>`` is left-merged onto ``df``. Its value is the number of rows in
    the same ``c`` group whose ``y`` is not missing (``groupby(c).y.count()``).

    NOTE(review): because ``count()`` skips missing ``y``, rows without a
    label do not contribute to the counts, and a level that only occurs in
    unlabelled rows gets a missing ``CE_`` value — confirm this is intended.

    Returns
    -------
    tuple
        ``(df, CE)`` where ``df`` is the merged frame and ``CE`` is the list
        of the new column names, in creation order.
    """
    CC = CATS+CATS1+CATS2
    # Collect the new feature names in a local list so the function does not
    # depend on a module-level `CE` having been created by the caller.
    CE = []

    print(f"Processing {len(CC)} columns... ",end="")
    for i,c in enumerate(CC):
        # Lightweight progress indicator: print every 10th column index
        if i%10==0: print(f"{i}, ",end="")
        tmp = df.groupby(c).y.count()
        tmp = tmp.astype('int32')
        tmp.name = f"CE_{c}"
        CE.append( f"CE_{c}" )
        df = df.merge(tmp, on=c, how='left')
    print()
    return df, CE

# Count-encode the combined (competition + external) frame.
# CE starts as an empty list of count-encoding feature names; the call below
# rebinds it to the list returned by preprocess4.
CE = []
all_df, CE = preprocess4(all_df, CATS, CATS1, CATS2)

Processing 136 columns... 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 


In [7]:
# Cast the target column to int32 (original note: y is int64, converted for GPU compatibility).
# NOTE(review): presumably the test rows carry no label, so y would hold missing
# values for them; plain pandas cannot cast missing values to an integer dtype —
# confirm this cell is only ever run under cudf.pandas.
all_df["y"] = all_df["y"].astype("int32")

# Split the combined frame back into its parts by row position. This relies on
# all_df still being in the concatenation order: train rows, test rows, external rows.
train1 = all_df.iloc[:len(train_df)]
test1 = all_df.iloc[len(train_df):len(train_df)+len(test_df)]

# External dataset rows: the last len(orig_df) rows of the combined frame
orig = all_df.iloc[-len(orig_df):]

In [8]:
def preprocess5(train, test, CATS, CATS1, CATS2, orig):
    """Add target-encoding features computed from the external dataset.

    For each column ``c`` in ``CATS + CATS1 + CATS2`` the mean of ``y`` per
    level of ``c`` is computed on ``orig`` only, stored as float32 under the
    name ``TE_ORIG_<c>`` and left-merged onto both ``train`` and ``test``.
    Levels that do not occur in ``orig`` receive a missing value.

    Side effect: each new feature name is appended to the module-level list
    ``TE_ORIG``, which must exist before this function is called.

    Returns
    -------
    tuple
        ``(train, test)`` with the ``TE_ORIG_*`` columns appended.
    """
    CC = CATS+CATS1+CATS2

    print(f"Processing {len(CC)} columns... ",end="")
    for i,c in enumerate(CC):
        # Lightweight progress indicator: print every 10th column index
        if i%10==0: print(f"{i}, ",end="")
        tmp = orig.groupby(c).y.mean()
        tmp = tmp.astype('float32')
        tmp.name = f"TE_ORIG_{c}"
        TE_ORIG.append( f"TE_ORIG_{c}" )
        train = train.merge(tmp, on=c, how='left')
        test = test.merge(tmp, on=c, how='left')
    # Terminate the progress line; this must run before returning
    # (placed after `return` it would never execute).
    print()
    return train, test

# TE_ORIG is the module-level list that preprocess5 appends the new
# "TE_ORIG_<col>" feature names to; it has to exist (and be empty) before the call.
TE_ORIG = []
# train2 / test2 are train1 / test1 plus the external-data target-encoding columns.
train2, test2 = preprocess5(train1, test1, CATS, CATS1, CATS2, orig)

Processing 136 columns... 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 

### 【① Light GBM (内部データセットのみ)】

In [9]:
############################################################
############ Light GBM with OOF Target Encoding ############
############################################################
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
# from category_encoders import TargetEncoder # CPU版
from cuml.preprocessing import TargetEncoder
import cudf
import warnings
import gc
warnings.filterwarnings("ignore")

# Imported once up front (instead of on every fold); used after each fold to
# clear numba's queue of pending GPU deallocations.
import numba.cuda as cuda

# Number of outer CV folds. Shared by the splitter and by the averaging of the
# test predictions so the two can never drift apart.
N_SPLITS = 5

# Out-of-fold predictions for the training rows, fold-averaged predictions for
# the test rows, and the fitted booster of every fold.
pred_lgb1 = np.zeros(len(train1))
pred_lgb_test1 = np.zeros(len(test1))
models_lgb1 = []

# Model inputs: every column except the row id and the target.
X = train1.drop(["id","y"], axis=1).copy()
y = train1["y"].copy()
test_ = test1.drop(["id","y"], axis=1).copy()

# LightGBM hyper-parameters
lgbm_params = {
    'objective': 'binary',
    'device': 'gpu',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    "num_leaves": 77,
    "max_depth": 15,
    "min_data_in_leaf": 23,
    "min_gain_to_split": 0.17931836655003727,
    "learning_rate": 0.019740808893122665,
    "feature_fraction": 0.7499258780711098,
    "bagging_fraction": 0.9392312065171743,
    "bagging_freq": 3,
    "lambda_l1": 0.13817541814163015,
    "lambda_l2": 5.987592011786754,
    "min_sum_hessian_in_leaf": 2.7742348039686524,
    'verbosity': -1
}

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Columns that are target-encoded inside every fold (does not change per fold).
CC = CATS1+CATS2

for fold, (train_idx, valid_idx) in enumerate(skf.split(X,y)):
    print("#"*25)
    print(f"### Fold {fold+1}")
    print("#"*25)

    # Per-fold copies, so the in-place target encoding below never touches X / test_.
    X_train = X.iloc[train_idx,:].copy()
    y_train = y.iloc[train_idx].copy()
    X_valid = X.iloc[valid_idx,:].copy()
    y_valid = y.iloc[valid_idx].copy()
    X_test = test_.copy()

    # Target encoding: each encoder is fitted on this fold's training rows
    # only and then applied to the validation and test rows.
    print(f"Target encoding {len(CC)} features... ",end="")

    for i,c in enumerate(CC):
        if i%10==0: print(f"{i}, ",end="")

        TE0 = TargetEncoder(n_folds=5, smooth=10, split_method='random', stat='mean')
        X_train[c] = TE0.fit_transform(X_train[c],y_train).astype('float32')
        X_valid[c] = TE0.transform(X_valid[c]).astype('float32')
        X_test[c] = TE0.transform(X_test[c]).astype('float32')

    print()

    # Drop the last encoder explicitly; TE0 only exists if at least one
    # column was encoded.
    if CC:
        del TE0
    gc.collect()

    # Columns that were not target-encoded are handed to LightGBM as categoricals.
    for c in CATS:
        if c not in CC:
            X_train[c] = X_train[c].astype('category')
            X_valid[c] = X_valid[c].astype('category')
            X_test[c]  = X_test[c].astype('category')

    # LightGBM datasets
    lgb_train = lgb.Dataset(
        X_train,y_train,categorical_feature=CATS)

    lgb_valid = lgb.Dataset(
        X_valid,y_valid,categorical_feature=CATS)

    # --------------------------
    # Training (early stopping after 100 rounds without improvement, log every 100 rounds)
    # --------------------------
    model_lgb = lgb.train(
        lgbm_params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=["train", "valid"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(100),
        ]
    )

    # Out-of-fold prediction for this fold's validation rows
    pred_lgb1[valid_idx] = model_lgb.predict(
        X_valid, num_iteration=model_lgb.best_iteration)

    # Test prediction, averaged over the folds
    pred_lgb_test1 += model_lgb.predict(
        X_test, num_iteration=model_lgb.best_iteration)/N_SPLITS

    # Keep the fitted model
    models_lgb1.append(model_lgb)

    # Release the per-fold frames
    del X_train, X_valid
    gc.collect()

    # Clear numba's pending GPU deallocations
    cuda.current_context().deallocations.clear()

#########################
### Fold 1
#########################
Target encoding 127 features... 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 




[100]	train's auc: 0.972643	valid's auc: 0.972087
[200]	train's auc: 0.97519	valid's auc: 0.974054
[300]	train's auc: 0.976902	valid's auc: 0.975059
[400]	train's auc: 0.978169	valid's auc: 0.97555
[500]	train's auc: 0.979261	valid's auc: 0.975798
[600]	train's auc: 0.980167	valid's auc: 0.975902
[700]	train's auc: 0.981012	valid's auc: 0.975991
[800]	train's auc: 0.98178	valid's auc: 0.976044
[900]	train's auc: 0.982533	valid's auc: 0.976098
[1000]	train's auc: 0.983242	valid's auc: 0.976151
[1100]	train's auc: 0.983905	valid's auc: 0.976169
[1200]	train's auc: 0.984519	valid's auc: 0.976193
[1300]	train's auc: 0.985125	valid's auc: 0.976214
[1400]	train's auc: 0.985708	valid's auc: 0.976233
[1500]	train's auc: 0.986272	valid's auc: 0.976237
[1600]	train's auc: 0.986815	valid's auc: 0.976244
[1700]	train's auc: 0.987322	valid's auc: 0.976248
[1800]	train's auc: 0.987836	valid's auc: 0.976263
[1900]	train's auc: 0.988327	valid's auc: 0.976263
[2000]	train's auc: 0.988776	valid's auc: 0

In [10]:
# AUC of the out-of-fold predictions on the training data
# (f1_score and plt are imported here but not used in this cell)
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

AUC_lgb1 = roc_auc_score(y,pred_lgb1)
print(f"LGB1: AUC score = {AUC_lgb1}")

LGB1: AUC score = 0.9756430131312648


### 【② Light GBM (外部データセットのTE追加)】

In [11]:
############################################################
############ Light GBM with OOF Target Encoding ############
############################################################
# Imported once up front (instead of on every fold); used after each fold to
# clear numba's queue of pending GPU deallocations.
import numba.cuda as cuda

# Number of outer CV folds. Shared by the splitter and by the averaging of the
# test predictions so the two can never drift apart.
N_SPLITS = 5

# Out-of-fold predictions for the training rows, fold-averaged predictions for
# the test rows, and the fitted booster of every fold.
pred_lgb2 = np.zeros(len(train2))
pred_lgb_test2 = np.zeros(len(test2))
models_lgb2 = []

# Model inputs: every column except the row id and the target. train2 / test2
# additionally carry the TE_ORIG_* columns derived from the external dataset.
X = train2.drop(["id","y"], axis=1).copy()
y = train2["y"].copy()
test_ = test2.drop(["id","y"], axis=1).copy()

# LightGBM hyper-parameters
lgbm_params = {
    'objective': 'binary',
    'device': 'gpu',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    "num_leaves": 77,
    "max_depth": 15,
    "min_data_in_leaf": 23,
    "min_gain_to_split": 0.17931836655003727,
    "learning_rate": 0.019740808893122665,
    "feature_fraction": 0.7499258780711098,
    "bagging_fraction": 0.9392312065171743,
    "bagging_freq": 3,
    "lambda_l1": 0.13817541814163015,
    "lambda_l2": 5.987592011786754,
    "min_sum_hessian_in_leaf": 2.7742348039686524,
    'verbosity': -1
}

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Columns that are target-encoded inside every fold (does not change per fold).
CC = CATS1+CATS2

for fold, (train_idx, valid_idx) in enumerate(skf.split(X,y)):
    print("#"*25)
    print(f"### Fold {fold+1}")
    print("#"*25)

    # Per-fold copies, so the in-place target encoding below never touches X / test_.
    X_train = X.iloc[train_idx,:].copy()
    y_train = y.iloc[train_idx].copy()
    X_valid = X.iloc[valid_idx,:].copy()
    y_valid = y.iloc[valid_idx].copy()
    X_test = test_.copy()

    # Target encoding: each encoder is fitted on this fold's training rows
    # only and then applied to the validation and test rows.
    print(f"Target encoding {len(CC)} features... ",end="")

    for i,c in enumerate(CC):
        if i%10==0: print(f"{i}, ",end="")

        TE0 = TargetEncoder(n_folds=5, smooth=10, split_method='random', stat='mean')
        X_train[c] = TE0.fit_transform(X_train[c],y_train).astype('float32')
        X_valid[c] = TE0.transform(X_valid[c]).astype('float32')
        X_test[c] = TE0.transform(X_test[c]).astype('float32')

    print()

    # Drop the last encoder explicitly; TE0 only exists if at least one
    # column was encoded.
    if CC:
        del TE0
    gc.collect()

    # Columns that were not target-encoded are handed to LightGBM as categoricals.
    for c in CATS:
        if c not in CC:
            X_train[c] = X_train[c].astype('category')
            X_valid[c] = X_valid[c].astype('category')
            X_test[c]  = X_test[c].astype('category')

    # LightGBM datasets
    lgb_train = lgb.Dataset(
        X_train,y_train,categorical_feature=CATS)

    lgb_valid = lgb.Dataset(
        X_valid,y_valid,categorical_feature=CATS)

    # --------------------------
    # Training (early stopping after 100 rounds without improvement, log every 100 rounds)
    # --------------------------
    model_lgb = lgb.train(
        lgbm_params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=["train", "valid"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(100),
        ]
    )

    # Out-of-fold prediction for this fold's validation rows
    pred_lgb2[valid_idx] = model_lgb.predict(
        X_valid, num_iteration=model_lgb.best_iteration)

    # Test prediction, averaged over the folds
    pred_lgb_test2 += model_lgb.predict(
        X_test, num_iteration=model_lgb.best_iteration)/N_SPLITS

    # Keep the fitted model
    models_lgb2.append(model_lgb)

    # Release the per-fold frames
    del X_train, X_valid
    gc.collect()

    # Clear numba's pending GPU deallocations
    cuda.current_context().deallocations.clear()

#########################
### Fold 1
#########################
Target encoding 127 features... 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 
[100]	train's auc: 0.973105	valid's auc: 0.972376
[200]	train's auc: 0.975684	valid's auc: 0.974321
[300]	train's auc: 0.977462	valid's auc: 0.975372
[400]	train's auc: 0.978813	valid's auc: 0.975855
[500]	train's auc: 0.979971	valid's auc: 0.976142
[600]	train's auc: 0.98095	valid's auc: 0.976273
[700]	train's auc: 0.981829	valid's auc: 0.976357
[800]	train's auc: 0.982635	valid's auc: 0.976398
[900]	train's auc: 0.983404	valid's auc: 0.976464
[1000]	train's auc: 0.984129	valid's auc: 0.976494
[1100]	train's auc: 0.984831	valid's auc: 0.976512
[1200]	train's auc: 0.985456	valid's auc: 0.976524
[1300]	train's auc: 0.986084	valid's auc: 0.976538
[1400]	train's auc: 0.986693	valid's auc: 0.976544
[1500]	train's auc: 0.987283	valid's auc: 0.976555
[1600]	train's auc: 0.987822	valid's auc: 0.97656
[1700]	train's auc: 0.98836	valid's auc: 0.97

In [12]:
# AUC of the out-of-fold predictions on the training data
# (f1_score and plt are imported here but not used in this cell)
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

AUC_lgb2 = roc_auc_score(y,pred_lgb2)
print(f"LGB2: AUC score = {AUC_lgb2}")

LGB2: AUC score = 0.9759960227090922


### 【③ XGBoost (内部データセットのみ)】

In [13]:
#################################################
############ XGBoost ############################
#################################################
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# Number of outer CV folds. Shared by the splitter and by the averaging of the
# test predictions so the two can never drift apart.
N_SPLITS = 5

# Out-of-fold predictions for the training rows, fold-averaged predictions for
# the test rows, and the fitted booster of every fold.
pred_xgb1 = np.zeros(len(train1))
pred_xgb_test1 = np.zeros(len(test1))
models_xgb1 = []

# Model inputs: every column except the row id and the target.
X = train1.drop(["id","y"],axis=1).copy()
y = train1["y"].copy()
test_ = test1.drop(["id","y"],axis=1).copy()

# Columns that are target-encoded inside every fold (does not change per fold).
CC = CATS1+CATS2

# Columns that are not target-encoded are passed to XGBoost as categoricals.
# They are cast once, for train AND test, with an explicit category list
# 0..SIZES[c]-1 (the label codes produced by preprocess2). XGBoost works on
# the category codes, so every frame must share the same value -> code
# mapping; inferring the categories separately per frame (or leaving the test
# frame as plain integers) does not guarantee that.
for c in CATS:
    if c not in CC:
        cat_dtype = pd.CategoricalDtype(categories=np.arange(SIZES[c], dtype="int32"))
        X[c] = X[c].astype(cat_dtype)
        test_[c] = test_[c].astype(cat_dtype)

# Evaluation history; refilled by xgb.train on every fold
evals_result_xgb = {}

# XGBoost hyper-parameters.
# GPU training is selected with tree_method="hist" + device="cuda"; the older
# "gpu_hist" spelling is deprecated in XGBoost 2.x.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "tree_method": "hist",
    "device": "cuda",
    "grow_policy": "lossguide",
    "learning_rate": 0.020187734867721113,
    "max_depth": 10,
    "min_child_weight": 0.0015137166209180514,
    "subsample": 0.6786153011677415,
    "colsample_bytree": 0.7917555828184474,
    "colsample_bylevel": 0.5539530181906183,
    "reg_alpha": 5.3805307261170965,
    "reg_lambda": 1.2434258141601598e-08,
    "gamma": 3.715076866606369,
    "max_leaves": 91,
}

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

for fold,(train_idx,valid_idx) in enumerate(skf.split(X,y)):

    print("#"*25)
    print(f"### Fold {fold+1}")
    print("#"*25)

    # Per-fold copies, so the in-place target encoding below never touches X / test_.
    X_train = X.iloc[train_idx,:].copy()
    y_train = y.iloc[train_idx].copy()
    X_valid = X.iloc[valid_idx,:].copy()
    y_valid = y.iloc[valid_idx].copy()
    X_test = test_.copy()

    # Target encoding: each encoder is fitted on this fold's training rows
    # only and then applied to the validation and test rows.
    print(f"Target encoding {len(CC)} features... ",end="")
    for i,c in enumerate(CC):
        if i%10==0: print(f"{i}, ",end="")
        TE0 = TargetEncoder(n_folds=5, smooth=0, split_method='random', stat='mean')
        X_train[c] = TE0.fit_transform(X_train[c],y_train).astype('float32')
        X_valid[c] = TE0.transform(X_valid[c]).astype('float32')
        X_test[c] = TE0.transform(X_test[c]).astype('float32')
    print()

    # Convert to DMatrix
    dtrain = xgb.DMatrix(X_train,label=y_train,enable_categorical=True)
    dvalid = xgb.DMatrix(X_valid,label=y_valid,enable_categorical=True)
    dtest = xgb.DMatrix(X_test,enable_categorical=True)

    # Training (early stopping after 100 rounds without improvement, log every 100 rounds)
    model_xgb = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=5000,
        evals=[(dtrain,"train"),(dvalid,"valid")],
        early_stopping_rounds=100,
        evals_result=evals_result_xgb,
        verbose_eval=100,
    )

    # Out-of-fold prediction for this fold's validation rows
    pred_xgb1[valid_idx] = model_xgb.predict(
        dvalid, iteration_range=(0,model_xgb.best_iteration+1))

    # Test prediction, averaged over the folds
    pred_xgb_test1 += model_xgb.predict(
        dtest, iteration_range=(0,model_xgb.best_iteration+1))/N_SPLITS

    # Keep the fitted model
    models_xgb1.append(model_xgb)

#########################
### Fold 1
#########################
Target encoding 127 features... 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 
[0]	train-auc:0.95835	valid-auc:0.95912
[100]	train-auc:0.97277	valid-auc:0.97230
[200]	train-auc:0.97506	valid-auc:0.97390
[300]	train-auc:0.97673	valid-auc:0.97480
[400]	train-auc:0.97800	valid-auc:0.97531
[500]	train-auc:0.97912	valid-auc:0.97562
[600]	train-auc:0.98012	valid-auc:0.97581
[700]	train-auc:0.98102	valid-auc:0.97595
[800]	train-auc:0.98184	valid-auc:0.97603
[900]	train-auc:0.98252	valid-auc:0.97608
[1000]	train-auc:0.98309	valid-auc:0.97612
[1100]	train-auc:0.98360	valid-auc:0.97616
[1200]	train-auc:0.98397	valid-auc:0.97617
[1300]	train-auc:0.98430	valid-auc:0.97618
[1400]	train-auc:0.98463	valid-auc:0.97621
[1500]	train-auc:0.98493	valid-auc:0.97622
[1600]	train-auc:0.98519	valid-auc:0.97624
[1700]	train-auc:0.98538	valid-auc:0.97624
[1800]	train-auc:0.98560	valid-auc:0.97624
[1900]	train-auc:0.98578	valid-auc:0.97625
[2

In [14]:
# AUC of the out-of-fold predictions on the training data
# (f1_score is imported here but not used in this cell)
from sklearn.metrics import f1_score

AUC_xgb1 = roc_auc_score(y,pred_xgb1)
print(f"XGB1: AUC score = {AUC_xgb1}")

XGB1: AUC score = 0.975560668843006


### 【④ XGBoost (外部データセットのTE追加)】

In [15]:
#################################################
############ XGBoost ############################
#################################################
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# Number of outer CV folds. Shared by the splitter and by the averaging of the
# test predictions so the two can never drift apart.
N_SPLITS = 5

# Out-of-fold predictions for the training rows, fold-averaged predictions for
# the test rows, and the fitted booster of every fold.
pred_xgb2 = np.zeros(len(train2))
pred_xgb_test2 = np.zeros(len(test2))
models_xgb2 = []

# Model inputs: every column except the row id and the target. train2 / test2
# additionally carry the TE_ORIG_* columns derived from the external dataset.
X = train2.drop(["id","y"],axis=1).copy()
y = train2["y"].copy()
test_ = test2.drop(["id","y"],axis=1).copy()

# Columns that are target-encoded inside every fold (does not change per fold).
CC = CATS1+CATS2

# Columns that are not target-encoded are passed to XGBoost as categoricals.
# They are cast once, for train AND test, with an explicit category list
# 0..SIZES[c]-1 (the label codes produced by preprocess2). XGBoost works on
# the category codes, so every frame must share the same value -> code
# mapping; inferring the categories separately per frame (or leaving the test
# frame as plain integers) does not guarantee that.
for c in CATS:
    if c not in CC:
        cat_dtype = pd.CategoricalDtype(categories=np.arange(SIZES[c], dtype="int32"))
        X[c] = X[c].astype(cat_dtype)
        test_[c] = test_[c].astype(cat_dtype)

# Evaluation history; refilled by xgb.train on every fold
evals_result_xgb = {}

# XGBoost hyper-parameters.
# GPU training is selected with tree_method="hist" + device="cuda"; the older
# "gpu_hist" spelling is deprecated in XGBoost 2.x.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "tree_method": "hist",
    "device": "cuda",
    "grow_policy": "lossguide",
    "learning_rate": 0.020187734867721113,
    "max_depth": 10,
    "min_child_weight": 0.0015137166209180514,
    "subsample": 0.6786153011677415,
    "colsample_bytree": 0.7917555828184474,
    "colsample_bylevel": 0.5539530181906183,
    "reg_alpha": 5.3805307261170965,
    "reg_lambda": 1.2434258141601598e-08,
    "gamma": 3.715076866606369,
    "max_leaves": 91,
}

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

for fold,(train_idx,valid_idx) in enumerate(skf.split(X,y)):

    print("#"*25)
    print(f"### Fold {fold+1}")
    print("#"*25)

    # Per-fold copies, so the in-place target encoding below never touches X / test_.
    X_train = X.iloc[train_idx,:].copy()
    y_train = y.iloc[train_idx].copy()
    X_valid = X.iloc[valid_idx,:].copy()
    y_valid = y.iloc[valid_idx].copy()
    X_test = test_.copy()

    # Target encoding: each encoder is fitted on this fold's training rows
    # only and then applied to the validation and test rows.
    print(f"Target encoding {len(CC)} features... ",end="")
    for i,c in enumerate(CC):
        if i%10==0: print(f"{i}, ",end="")
        TE0 = TargetEncoder(n_folds=5, smooth=0, split_method='random', stat='mean')
        X_train[c] = TE0.fit_transform(X_train[c],y_train).astype('float32')
        X_valid[c] = TE0.transform(X_valid[c]).astype('float32')
        X_test[c] = TE0.transform(X_test[c]).astype('float32')
    print()

    # Convert to DMatrix
    dtrain = xgb.DMatrix(X_train,label=y_train,enable_categorical=True)
    dvalid = xgb.DMatrix(X_valid,label=y_valid,enable_categorical=True)
    dtest = xgb.DMatrix(X_test,enable_categorical=True)

    # Training (early stopping after 100 rounds without improvement, log every 100 rounds)
    model_xgb = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=5000,
        evals=[(dtrain,"train"),(dvalid,"valid")],
        early_stopping_rounds=100,
        evals_result=evals_result_xgb,
        verbose_eval=100,
    )

    # Out-of-fold prediction for this fold's validation rows
    pred_xgb2[valid_idx] = model_xgb.predict(
        dvalid, iteration_range=(0,model_xgb.best_iteration+1))

    # Test prediction, averaged over the folds
    pred_xgb_test2 += model_xgb.predict(
        dtest, iteration_range=(0,model_xgb.best_iteration+1))/N_SPLITS

    # Keep the fitted model
    models_xgb2.append(model_xgb)

#########################
### Fold 1
#########################
Target encoding 127 features... 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 
[0]	train-auc:0.96149	valid-auc:0.96197
[100]	train-auc:0.97333	valid-auc:0.97269
[200]	train-auc:0.97567	valid-auc:0.97426
[300]	train-auc:0.97736	valid-auc:0.97512
[400]	train-auc:0.97874	valid-auc:0.97562
[500]	train-auc:0.97994	valid-auc:0.97592
[600]	train-auc:0.98098	valid-auc:0.97612
[700]	train-auc:0.98195	valid-auc:0.97626
[800]	train-auc:0.98283	valid-auc:0.97635
[900]	train-auc:0.98361	valid-auc:0.97641
[1000]	train-auc:0.98432	valid-auc:0.97645
[1100]	train-auc:0.98493	valid-auc:0.97650
[1200]	train-auc:0.98544	valid-auc:0.97651
[1300]	train-auc:0.98584	valid-auc:0.97653
[1400]	train-auc:0.98621	valid-auc:0.97655
[1500]	train-auc:0.98651	valid-auc:0.97657
[1600]	train-auc:0.98682	valid-auc:0.97658
[1700]	train-auc:0.98706	valid-auc:0.97658
[1800]	train-auc:0.98726	valid-auc:0.97659
[1900]	train-auc:0.98748	valid-auc:0.97660
[1

In [16]:
# AUC of the out-of-fold predictions on the training data
# (f1_score is imported here but not used in this cell)
from sklearn.metrics import f1_score

AUC_xgb2 = roc_auc_score(y,pred_xgb2)
# Label fixed: this is the score of the second XGBoost model, not the first.
print(f"XGB2: AUC score = {AUC_xgb2}")

XGB1: AUC score = 0.9759705085750096


### 【スコア提出】

In [17]:
# AUC of the equal-weight average of the four models' out-of-fold predictions.
# `y` is whatever the most recently executed training cell assigned; the four
# prediction arrays are assumed to be in the same row order as that `y` — TODO confirm.
# (f1_score is imported here but not used in this cell)
from sklearn.metrics import f1_score

AUC_ave = roc_auc_score(y,(pred_lgb1 + pred_lgb2 + pred_xgb1 + pred_xgb2)/4)
print(f"CV Average: AUC score = {AUC_ave}")

CV Average: AUC score = 0.9761545412818875


In [18]:
# Build the submission file from the sample submission template
sample_submission = pd.read_csv("/kaggle/input/playground-series-s5e8/sample_submission.csv")

# Equal-weight average of the four models' fold-averaged test predictions.
# NOTE(review): values are assigned by position, so this assumes the test
# predictions are in the same row order as sample_submission — confirm.
sample_submission['y'] = (pred_lgb_test1 + pred_lgb_test2 + pred_xgb_test1 + pred_xgb_test2) / 4
sample_submission.to_csv('submission.csv', index=False)
print('Submission file saved.')

Submission file saved.
