In [1]:
# Load the cudf.pandas extension before pandas is imported so that pandas calls can run on the GPU.
# NOTE(review): the original comment promised a "2x" speed-up; the actual gain depends on the operation — confirm.
%load_ext cudf.pandas

In [2]:
# ライブラリのインポート
import numpy as np
import pandas as pd
import os
import cudf
from cuml.preprocessing import TargetEncoder

# Read the competition train / test tables
train_df = pd.read_csv("/kaggle/input/playground-series-s5e8/train.csv")
test_df = pd.read_csv("/kaggle/input/playground-series-s5e8/test.csv")

# Optional extra training rows from the original bank-marketing dataset (currently disabled)
# orig_df = pd.read_csv("/kaggle/input/bank-marketing-dataset-full/bank-full.csv",delimiter=";")
# orig_df['y'] = orig_df.y.map({'yes':1,'no':0})
# orig_df['id'] = (np.arange(len(orig_df))+1e6).astype('int')
# orig_df = orig_df.set_index('id')

# Stack train on top of test so feature engineering sees both; rows are re-indexed 0..n-1
all_df = pd.concat(objs=[train_df, test_df], axis=0, ignore_index=True)
# all_df = pd.concat([train_df,orig_df,test_df],axis=0,ignore_index=True)

In [3]:
# Partition the feature columns (everything except "id" and the target "y")
# into categorical (object dtype) and numeric lists, reporting cardinality and NA count.
CATS, NUMS = [], []
for c in all_df.drop(columns=["id", "y"]).columns:
    is_categorical = all_df[c].dtype == 'object'
    if is_categorical:
        CATS.append(c)
    else:
        NUMS.append(c)
    t = "CAT" if is_categorical else "NUM"
    n = all_df[c].nunique()
    na = all_df[c].isna().sum()
    print(f"[{t}] {c} has {n} unique and {na} NA")
print("CATS:", CATS )
print("NUMS:", NUMS )

[NUM] age has 78 unique and 0 NA
[CAT] job has 12 unique and 0 NA
[CAT] marital has 3 unique and 0 NA
[CAT] education has 4 unique and 0 NA
[CAT] default has 2 unique and 0 NA
[NUM] balance has 8469 unique and 0 NA
[CAT] housing has 2 unique and 0 NA
[CAT] loan has 2 unique and 0 NA
[CAT] contact has 3 unique and 0 NA
[NUM] day has 31 unique and 0 NA
[CAT] month has 12 unique and 0 NA
[NUM] duration has 1798 unique and 0 NA
[NUM] campaign has 52 unique and 0 NA
[NUM] pdays has 614 unique and 0 NA
[NUM] previous has 53 unique and 0 NA
[CAT] poutcome has 4 unique and 0 NA
CATS: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
NUMS: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']


In [4]:
# Integer-encode every feature column with factorize().
#   - categorical columns (CATS) are encoded in place;
#   - numeric columns (NUMS) keep their values and get an encoded copy named "<col>2".
# CATS1 collects the names of those "<col>2" copies; SIZES maps each encoded
# column to its number of distinct codes (max code + 1).
CATS1 = []
SIZES = {}

for c in NUMS + CATS:
    if c in NUMS:
        # numeric column: write the codes to a new "<col>2" column
        n = f"{c}2"
        CATS1.append(n)
    else:
        # categorical column: overwrite the strings with their codes
        n = c

    codes, _ = all_df[c].factorize()
    all_df[n] = codes
    SIZES[n] = all_df[n].max()+1

    # print(c)
    # store both the source column and the encoded column as int32
    all_df[c] = all_df[c].astype('int32')
    all_df[n] = all_df[n].astype('int32')

print("New CATS:", CATS1 )
print("Cardinality of all CATS:", SIZES )

New CATS: ['age2', 'balance2', 'day2', 'duration2', 'campaign2', 'pdays2', 'previous2']
Cardinality of all CATS: {'age2': 78, 'balance2': 8469, 'day2': 31, 'duration2': 1798, 'campaign2': 52, 'pdays2': 614, 'previous2': 53, 'job': 12, 'marital': 3, 'education': 4, 'default': 2, 'housing': 2, 'loan': 2, 'contact': 3, 'month': 12, 'poutcome': 4}


In [5]:
# Build one combined categorical feature for every pair of encoded columns.
from itertools import combinations

pairs = combinations(CATS + CATS1, 2)
new_cols = {}
CATS2 = []

for c1, c2 in pairs:
    name = "_".join(sorted((c1, c2)))
    # Codes of c2 lie in [0, SIZES[c2]), so c1 * SIZES[c2] + c2 gives a
    # distinct integer for every distinct (c1, c2) value pair.
    new_cols[name] = all_df[c1] * SIZES[c2] + all_df[c2]
    CATS2.append(name)

if new_cols:
    new_df = pd.DataFrame(new_cols)
    # Drop pair columns left over from a previous run of this cell first;
    # otherwise a re-run would append duplicate column names and
    # all_df[name] would return a DataFrame instead of a Series.
    all_df = pd.concat(
        [all_df.drop(columns=CATS2, errors="ignore"), new_df], axis=1)

print(f"Created {len(CATS2)} new CAT columns")

Created 120 new CAT columns


In [6]:
# Count encoding: for every categorical-like column c, add CE_<c> holding the
# number of rows with a non-null target "y" that share the row's value of c
# (count() skips NaN, so rows whose y is missing do not contribute).
CE = []
CC = CATS+CATS1+CATS2
ce_cols = {}

print(f"Processing {len(CC)} columns... ",end="")
for i,c in enumerate(CC):
    if i%10==0: print(f"{i}, ",end="")
    # transform("count") broadcasts the per-group count back to every row,
    # which is what merging the grouped counts on c produced, without
    # rebuilding the whole frame once per column.
    tmp = all_df.groupby(c)["y"].transform("count").astype('int32')
    ce_cols[f"CE_{c}"] = tmp
    CE.append( f"CE_{c}" )
print()

if ce_cols:
    # Single concat at the end; dropping existing CE_ columns first keeps the
    # cell safe to re-run (no duplicated / suffixed columns).
    all_df = pd.concat(
        [all_df.drop(columns=CE, errors="ignore"), pd.DataFrame(ce_cols)],
        axis=1)

Processing 136 columns... 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 


In [7]:
# Free memory: drop the leftover count-encoding intermediate `tmp` and run a garbage-collection pass.
# Note: `del tmp` raises NameError if `tmp` does not exist (e.g. when this cell is run twice in a row).
import gc
del tmp
gc.collect()

0

In [8]:
# Split the combined frame back into its train / test parts
# (train rows come first because of the concat order).
# `train` is an explicit copy because a column is assigned on it below;
# assigning on a bare iloc slice is chained assignment and triggers
# SettingWithCopyWarning.
train = all_df.iloc[:len(train_df)].copy()
test = all_df.iloc[len(train_df):]
# train = all_df.iloc[:(len(train_df)+len(orig_df))]
# test = all_df.iloc[len(train_df)+len(orig_df):]

# y is not int32 after the concat with the unlabeled test rows; cast the train target to int32
train["y"] = train["y"].astype("int32")

### 【Light GBM】

In [9]:
###################################################
############ Light GBM with OOF Target Encoding ###
###################################################
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
# from category_encoders import TargetEncoder
from cuml.preprocessing import TargetEncoder
import cudf
import warnings
import gc
warnings.filterwarnings("ignore")

# Containers for the LightGBM run: out-of-fold predictions for the training
# rows, accumulated test predictions, and the fitted booster of every fold.
pred_lgb = np.zeros(len(train))
pred_lgb_test = np.zeros(len(test))
models_lgb = []

# Model inputs: features without the id / target columns, copied so that the
# fold loop never touches `train` / `test` themselves.
y = train["y"].copy()
X = train.drop(columns=["id", "y"]).copy()
test_ = test.drop(columns=["id", "y"]).copy()

# LightGBM hyper-parameters passed to lgb.train in the fold loop.
# Commented-out entries are earlier values kept for reference.
lgbm_params = {
    'objective': 'binary',
    'device': 'gpu',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    # 'learning_rate': 0.01,
    'num_leaves': 31,
    'max_depth': 7,
    # 'feature_fraction': 0.8,
    'feature_fraction': 0.7,
    # 'bagging_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    # 'min_data_in_leaf': 20,
    'min_data_in_leaf': 200,
    'lambda_l1': 0.1,
    'lambda_l2': 1.0,
    'max_bin': 255,
    'verbosity': -1
}


# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
kf = KFold(n_splits=7, shuffle=True, random_state=42)

# Imported once, ahead of the loop, instead of on every fold.
import numba.cuda as cuda

# Cross-validation: per fold, target-encode the high-cardinality columns on
# the fold's training rows, train LightGBM with early stopping, store the
# out-of-fold predictions and add this fold's share of the test prediction.
for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    print("#"*25)
    print(f"### Fold {fold+1}")
    print("#"*25)

    X_train = X.iloc[train_idx,:].copy()
    y_train = y.iloc[train_idx].copy()
    X_valid = X.iloc[valid_idx,:].copy()
    y_valid = y.iloc[valid_idx].copy()
    X_test = test_.copy()

    # Target encoding: the encoder is fitted on the training part of the fold
    # only and then applied to the validation and test rows.
    CC = CATS1+CATS2
    print(f"Target encoding {len(CC)} features... ",end="")
    for i,c in enumerate(CC):
        if i%10==0: print(f"{i}, ",end="")
        TE0 = TargetEncoder(n_folds=10, smooth=10, split_method='random', stat='mean')
        X_train[c] = TE0.fit_transform(X_train[c],y_train).astype('float32')
        X_valid[c] = TE0.transform(X_valid[c]).astype('float32')
        X_test[c] = TE0.transform(X_test[c]).astype('float32')
    print()

    # Drop the last encoder explicitly
    del TE0
    gc.collect()

    # Columns that were not target-encoded are handed over as category dtype
    for c in CATS:
        if c not in CC:
            X_train[c] = X_train[c].astype('category')
            X_valid[c] = X_valid[c].astype('category')
            X_test[c]  = X_test[c].astype('category')

    # Datasets
    lgb_train = lgb.Dataset(
        X_train,y_train,categorical_feature=CATS)

    lgb_valid = lgb.Dataset(
        X_valid,y_valid,categorical_feature=CATS)
    # --------------------------
    # Training
    # --------------------------
    model_lgb = lgb.train(
        lgbm_params,
        lgb_train,
        # num_boost_round=1,
        num_boost_round=2000,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=["train", "valid"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(100),
        ]
    )

    # Out-of-fold prediction for this fold's validation rows
    pred_lgb[valid_idx] = model_lgb.predict(
        X_valid, num_iteration=model_lgb.best_iteration)

    # Test prediction, averaged over folds; the divisor follows the splitter
    # so changing n_splits cannot silently mis-scale the average.
    pred_lgb_test += model_lgb.predict(
        X_test, num_iteration=model_lgb.best_iteration)/kf.n_splits

    # Keep the fitted model
    models_lgb.append(model_lgb)

    # Release the per-fold frames (X_test is rebuilt from test_ on the next fold)
    del X_train, X_valid, X_test
    gc.collect()

    # Clear the pending deallocation queue of the current CUDA context
    cuda.current_context().deallocations.clear()

#########################
### Fold 1
#########################
Target encoding 127 features... 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 




[100]	train's auc: 0.97359	valid's auc: 0.972594
[200]	train's auc: 0.975756	valid's auc: 0.974023
[300]	train's auc: 0.977183	valid's auc: 0.974591
[400]	train's auc: 0.978221	valid's auc: 0.974783
[500]	train's auc: 0.979188	valid's auc: 0.974912
[600]	train's auc: 0.980067	valid's auc: 0.974994
[700]	train's auc: 0.980852	valid's auc: 0.975088
[800]	train's auc: 0.981601	valid's auc: 0.975148
[900]	train's auc: 0.982321	valid's auc: 0.975218
[1000]	train's auc: 0.982979	valid's auc: 0.975246
[1100]	train's auc: 0.983621	valid's auc: 0.97528
[1200]	train's auc: 0.984209	valid's auc: 0.975277
#########################
### Fold 2
#########################
Target encoding 127 features... 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 
[100]	train's auc: 0.97362	valid's auc: 0.972254
[200]	train's auc: 0.975912	valid's auc: 0.973745
[300]	train's auc: 0.977277	valid's auc: 0.974282
[400]	train's auc: 0.978338	valid's auc: 0.974521
[500]	train's auc: 0.979284	valid's auc: 0.97463
[

In [10]:
# ROC AUC of the LightGBM out-of-fold predictions against the training target
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

AUC_lgb = roc_auc_score(y_true=y, y_score=pred_lgb)
print(f"LGB: AUC score = {AUC_lgb}")

LGB: AUC score = 0.9752234614742816


In [11]:
#################################################
############ XGBoost ############################
#################################################
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# Containers for the XGBoost run: out-of-fold predictions for the training
# rows, accumulated test predictions, and the fitted booster of every fold.
pred_xgb = np.zeros(len(train))
pred_xgb_test = np.zeros(len(test))
models_xgb = []

# Model inputs
X = train.drop(["id","y"],axis=1)
y = train["y"]
# `test` is overwritten here, so on a second run of this cell "id"/"y" are
# already gone; errors="ignore" keeps the cell re-runnable instead of raising
# KeyError.
test = test.drop(["id","y"],axis=1,errors="ignore")

# Dict that receives the evaluation history from xgb.train
# (the same dict is passed on every fold of the loop below).
evals_result_xgb = {}

# XGBoost hyper-parameters passed to xgb.train in the fold loop.
# Commented-out entries are alternative GPU settings kept for reference.
xgb_params = {
    "objective": "binary:logistic",  
    "eval_metric": "auc",           
    "learning_rate": 0.1,
    "max_depth": 0,
    "subsample": 0.8,
    "colsample_bytree": 0.7,
    "tree_method": "hist",
    "device": "cuda",
    # "tree_method": "gpu_hist",
    # "gpu_id": 0,
    "grow_policy": "lossguide", 
    "max_leaves": 32,           
    "alpha": 2.0,
}

# skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)
kf = KFold(n_splits=7, shuffle=True, random_state=42)

# Cross-validation: per fold, target-encode the high-cardinality columns on
# the fold's training rows, train XGBoost with early stopping, store the
# out-of-fold predictions and add this fold's share of the test prediction.
for fold,(train_idx,valid_idx) in enumerate(kf.split(X)):

    print("#"*25)
    print(f"### Fold {fold+1}")
    print("#"*25)

    # Per-fold training / validation data
    X_train = X.iloc[train_idx,:].copy()
    y_train = y.iloc[train_idx]
    X_valid = X.iloc[valid_idx,:].copy()
    y_valid = y.iloc[valid_idx]
    X_test = test.copy()

    # Target encoding: the encoder is fitted on the training part of the fold
    # only and then applied to the validation and test rows.
    CC = CATS1+CATS2
    print(f"Target encoding {len(CC)} features... ",end="")
    for i,c in enumerate(CC):
        if i%10==0: print(f"{i}, ",end="")
        TE0 = TargetEncoder(n_folds=10, smooth=0, split_method='random', stat='mean')
        # The original spelled this argument with a full-width "ｙ" (U+FF59),
        # which only resolved to y_train through identifier normalisation.
        X_train[c] = TE0.fit_transform(X_train[c],y_train).astype('float32')
        X_valid[c] = TE0.transform(X_valid[c]).astype('float32')
        X_test[c] = TE0.transform(X_test[c]).astype('float32')
    print()

    # Columns that were not target-encoded are handed over as category dtype.
    # NOTE(review): each frame derives its categories independently; if a
    # value were missing from one split the category codes could differ
    # between frames — confirm, or cast with a shared CategoricalDtype.
    X_train[CATS] = X_train[CATS].astype('category')
    X_valid[CATS] = X_valid[CATS].astype('category')
    X_test[CATS] = X_test[CATS].astype('category')

    # Convert to DMatrix
    dtrain = xgb.DMatrix(X_train,label=y_train,enable_categorical=True)
    dvalid = xgb.DMatrix(X_valid,label=y_valid,enable_categorical=True)
    dtest = xgb.DMatrix(X_test,enable_categorical=True)

    # Training
    model_xgb = xgb.train(
        xgb_params,
        dtrain,
        # num_boost_round=1,
        num_boost_round=2000,
        evals=[(dtrain,"train"),(dvalid,"valid")],
        early_stopping_rounds=100,
        evals_result=evals_result_xgb,
        verbose_eval=100,
    )

    # Out-of-fold prediction for this fold's validation rows
    pred_xgb[valid_idx] = model_xgb.predict(
        dvalid, iteration_range=(0,model_xgb.best_iteration+1))

    # Test prediction, averaged over folds; the divisor follows the splitter
    # so changing n_splits cannot silently mis-scale the average.
    pred_xgb_test += model_xgb.predict(
        dtest, iteration_range=(0,model_xgb.best_iteration+1))/kf.n_splits

    # Keep the fitted model
    models_xgb.append(model_xgb)

#########################
### Fold 1
#########################
Target encoding 127 features... 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 
[0]	train-auc:0.94863	valid-auc:0.94625
[100]	train-auc:0.97624	valid-auc:0.97457
[200]	train-auc:0.97868	valid-auc:0.97511
[300]	train-auc:0.98049	valid-auc:0.97523
[400]	train-auc:0.98209	valid-auc:0.97532
[500]	train-auc:0.98350	valid-auc:0.97533
[600]	train-auc:0.98484	valid-auc:0.97535
[667]	train-auc:0.98566	valid-auc:0.97535
#########################
### Fold 2
#########################
Target encoding 127 features... 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 
[0]	train-auc:0.95302	valid-auc:0.95273
[100]	train-auc:0.97628	valid-auc:0.97419
[200]	train-auc:0.97873	valid-auc:0.97487
[300]	train-auc:0.98055	valid-auc:0.97505
[400]	train-auc:0.98213	valid-auc:0.97514
[500]	train-auc:0.98361	valid-auc:0.97517
[545]	train-auc:0.98419	valid-auc:0.97514
#########################
### Fold 3
#########################
Target enco

In [12]:
# ROC AUC of the XGBoost out-of-fold predictions against the training target
from sklearn.metrics import f1_score

AUC_xgb = roc_auc_score(y_true=y, y_score=pred_xgb)
print(f"XGB: AUC score = {AUC_xgb}")

XGB: AUC score = 0.975315562846599


### 【CatBoost】

In [13]:
# #################################################
# ############ CatBoost (Classifier版) ############
# #################################################
# from catboost import CatBoostClassifier, Pool
# from sklearn.model_selection import StratifiedKFold
# import numpy as np

# # 学習、バリデーションデータ
# pred_cb = np.zeros(len(train_df))
# pred_cb_test = np.zeros(len(test_df))
# models_cb = []
# cb_auc_valid = []  # foldごとのAUC履歴

# # 入力データ
# X = train.drop(["id","y"],axis=1)
# y = train["y"]
# X_test = test.drop(["id","y"],axis=1)

# # X = X_train_enc
# # y = y_train
# # X_test = X_test_enc

# # CatBoostパラメータ
# cat_params = {
#     "loss_function": "Logloss",
#     "eval_metric": "AUC",
#     "depth": 8,                   # 6〜10
#     "learning_rate": 0.05,        # 0.03〜0.1
#     # "iterations": 1,          # 大きめ＋ES
#     "iterations": 2000,          # 大きめ＋ES
#     "bootstrap_type": "Bayesian", # 精度安定
#     "boosting_type": "Ordered",   # 多カテゴリに強い
#     "random_strength": 1.0,       # 0.5〜2.0で微調整
#     "task_type": "GPU",
#     # "task_type": "CPU",           # このデータ規模ならCPUの方が速い/安定なこと多い
#     "verbose": 100,
# }

# # Stratified KFold
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

#     print("#" * 25)
#     print(f"### Fold {fold+1}")
#     print("#" * 25)

#     # データ分割
#     X_train_kf = X.iloc[train_idx, :]
#     y_train_kf = y.iloc[train_idx]
#     X_valid_kf = X.iloc[valid_idx, :]
#     y_valid_kf = y.iloc[valid_idx]

#     # object型をカテゴリ型に変換
#     for col in cat_col:
#         X_train_kf.loc[:, col] = X_train_kf.loc[:, col].astype("category")
#         X_valid_kf.loc[:, col] = X_valid_kf.loc[:, col].astype("category")

#     # Poolを作成
#     train_pool = Pool(X_train_kf, y_train_kf, cat_features=cat_col)
#     valid_pool = Pool(X_valid_kf, y_valid_kf, cat_features=cat_col)

#     # モデル作成 & 学習
#     model_cb = CatBoostClassifier(**cat_params)
#     model_cb.fit(
#         train_pool,
#         eval_set=valid_pool,
#         early_stopping_rounds=100,
#         use_best_model=True
#     )

#     # バリデーション予測
#     pred_cb[valid_idx] = model_cb.predict_proba(X_valid_kf)[:, 1]

#     # モデル保存
#     models_cb.append(model_cb)

#     # foldごとのベストスコアを保存
#     cb_auc_valid.append(model_cb.get_best_score()["validation"]["AUC"])

# # テスト予測
# for model in models_cb:
#     pred_cb_test += model.predict_proba(X_test)[:, 1]

# # FOLD数で割って平均化
# pred_cb_test = pred_cb_test / skf.n_splits

# print("各foldのAUC:", cb_auc_valid)
# print("平均AUC:", np.mean(cb_auc_valid))

In [14]:
# from sklearn.metrics import f1_score

# # スコア表示
# AUC_cb = roc_auc_score(y,pred_cb)
# F1_cb = f1_score(y,np.round(pred_cb,0))
# print(f"CB: AUC score = {AUC_cb}, F1 = {F1_cb}")

# # # 学習履歴を一番短いfoldに揃える
# # min_len = min(len(m) for m in cb_auc_valid)
# # cb_auc_score = [m[:min_len] for m in cb_auc_valid]

# # # foldごとの結果を平均する
# # cb_auc_score = np.average(cb_auc_score,axis=0)

In [15]:
# # 提出データ作成
# sample_submission = pd.read_csv("/kaggle/input/playground-series-s5e8/sample_submission.csv")

# sample_submission['y'] = pred_cb_test
# sample_submission.to_csv('submission.csv', index=False)
# print('Submission file saved.')

In [16]:
# import  matplotlib.pyplot as plt
# # 履歴の可視化
# plt.plot(cb_auc_score, label='Validation')
# plt.xlabel('Iteration')
# plt.ylabel('AUC')
# plt.grid()
# plt.legend()
# plt.title("CabBoost AUC")
# plt.show()

### Stacking

In [17]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score

# Combine the LightGBM and XGBoost predictions three ways (AUC-weighted
# average, logistic-regression meta-learner, simple average), rank every
# candidate by ROC AUC on the training rows and keep the predictions of the
# winner in final_oof / final_test.

print("# STACKING ENSEMBLE IMPLEMENTATION")
print("# " + "="*50)
print("# Combining top 2 models: LightGBM, XGBoost")
# print("# Combining top 3 models: LightGBM, XGBoost, CatBoost")
print("# Meta-learner: Logistic Regression")
print("# " + "="*50)

# Level-1 features: out-of-fold predictions (train) and fold-averaged predictions (test)
stacking_train = pd.DataFrame({
    'lgb': pred_lgb,
    'xgb': pred_xgb,
    # 'cat': pred_cb
})

stacking_test = pd.DataFrame({
    'lgb': pred_lgb_test,
    'xgb': pred_xgb_test,
    # 'cat': pred_cb_test
})

print(f"# Stacking train shape: {stacking_train.shape}")
print(f"# Stacking test shape: {stacking_test.shape}")

print("\n# METHOD 1: WEIGHTED AVERAGE")
print("# " + "-"*30)

# Each model is weighted by its share of the summed OOF AUCs
scores = [AUC_lgb, AUC_xgb]
# scores = [AUC_lgb, AUC_xgb, AUC_cb]
total_score = sum(scores)
weights = [score/total_score for score in scores]

print(f"# Model weights:")
print(f"# LightGBM: {weights[0]:.4f}")
print(f"# XGBoost:  {weights[1]:.4f}")
# print(f"# CatBoost: {weights[2]:.4f}")

weighted_oof = (stacking_train['lgb'] * weights[0] +
                stacking_train['xgb'] * weights[1])
                # stacking_train['cat'] * weights[2])

weighted_test = (stacking_test['lgb'] * weights[0] +
                 stacking_test['xgb'] * weights[1])
                 # stacking_test['cat'] * weights[2])

weighted_score = roc_auc_score(y, weighted_oof)
print(f"# Weighted Average ROC AUC: {weighted_score:.6f}")

print("\n# METHOD 2: LOGISTIC REGRESSION META-LEARNER")
print("# " + "-"*40)

meta_learner = LogisticRegression(random_state=42, max_iter=1000)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(meta_learner, stacking_train, y,
                           cv=skf, scoring='roc_auc', n_jobs=1)
                           # cv=skf, scoring='roc_auc', n_jobs=-1)

print(f"# Meta-learner CV scores: {[f'{score:.6f}' for score in cv_scores]}")
print(f"# Meta-learner mean CV: {cv_scores.mean():.6f} ± {cv_scores.std():.6f}")

# Score the meta-learner on predictions it did not train on. Predicting the
# rows it was fitted on gives an in-sample AUC, which is not comparable with
# the out-of-fold scores of the other candidates and biases the ranking.
meta_oof = cross_val_predict(meta_learner, stacking_train, y,
                             cv=skf, method='predict_proba', n_jobs=1)[:, 1]
meta_score = roc_auc_score(y, meta_oof)

# The model used for the test set is refitted on all training rows
meta_learner.fit(stacking_train, y)
meta_test = meta_learner.predict_proba(stacking_test)[:, 1]

print(f"# Meta-learner ROC AUC: {meta_score:.6f}")

coefficients = meta_learner.coef_[0]
print(f"# Meta-learner coefficients:")
print(f"# LightGBM: {coefficients[0]:.4f}")
print(f"# XGBoost:  {coefficients[1]:.4f}")
# print(f"# CatBoost: {coefficients[2]:.4f}")
print(f"# Intercept: {meta_learner.intercept_[0]:.4f}")

print("\n# METHOD 3: SIMPLE AVERAGE (BASELINE)")
print("# " + "-"*35)

simple_oof = (stacking_train['lgb'] + stacking_train['xgb']) / 2
# simple_oof = (stacking_train['lgb'] + stacking_train['xgb'] + stacking_train['cat']) / 3
simple_test = (stacking_test['lgb'] + stacking_test['xgb']) / 2
# simple_test = (stacking_test['lgb'] + stacking_test['xgb'] + stacking_test['cat']) / 3
simple_score = roc_auc_score(y, simple_oof)

print(f"# Simple Average ROC AUC: {simple_score:.6f}")

print("\n# ENSEMBLE METHODS COMPARISON")
print("# " + "="*40)
ensemble_results = [
    ('Individual LightGBM', AUC_lgb),
    ('Individual XGBoost', AUC_xgb),
    # ('Individual CatBoost', AUC_cb),
    ('Weighted Average', weighted_score),
    ('Meta-learner (LogReg)', meta_score),
    ('Simple Average', simple_score)
]

# Predictions belonging to every ranked candidate:
# method name -> (train predictions, test predictions, label used in the log line)
candidate_predictions = {
    'Individual LightGBM': (stacking_train['lgb'], stacking_test['lgb'], 'LightGBM'),
    'Individual XGBoost': (stacking_train['xgb'], stacking_test['xgb'], 'XGBoost'),
    'Weighted Average': (weighted_oof, weighted_test, 'Weighted Average'),
    'Meta-learner (LogReg)': (meta_oof, meta_test, 'Meta-learner'),
    'Simple Average': (simple_oof, simple_test, 'Simple Average'),
}

ensemble_results.sort(key=lambda x: x[1], reverse=True)

for i, (method, score) in enumerate(ensemble_results, 1):
    print(f"# {i}. {method:<25}: {score:.6f}")

best_method, best_score = ensemble_results[0]
print(f"\n# BEST ENSEMBLE METHOD: {best_method}")
print(f"# BEST ENSEMBLE SCORE: {best_score:.6f}")

# Use the predictions of whichever candidate ranked first. Looking them up by
# name also covers the case where a single model wins; an if/elif on the
# three ensemble names would fall through to the simple average there.
final_oof, final_test, best_label = candidate_predictions[best_method]
print(f"# Using {best_label} predictions for final submission")

print("\n# STACKING ENSEMBLE COMPLETED!")
print("# " + "="*50)

# STACKING ENSEMBLE IMPLEMENTATION
# Combining top 2 models: LightGBM, XGBoost
# Meta-learner: Logistic Regression
# Stacking train shape: (750000, 2)
# Stacking test shape: (250000, 2)

# METHOD 1: WEIGHTED AVERAGE
# ------------------------------
# Model weights:
# LightGBM: 0.5000
# XGBoost:  0.5000
# Weighted Average ROC AUC: 0.975619

# METHOD 2: LOGISTIC REGRESSION META-LEARNER
# ----------------------------------------
# Meta-learner CV scores: ['0.976289', '0.975522', '0.974900', '0.975846', '0.975542']
# Meta-learner mean CV: 0.975620 ± 0.000454
# Meta-learner ROC AUC: 0.975620
# Meta-learner coefficients:
# LightGBM: 3.6253
# XGBoost:  4.0175
# Intercept: -4.1071

# METHOD 3: SIMPLE AVERAGE (BASELINE)
# -----------------------------------
# Simple Average ROC AUC: 0.975619

# ENSEMBLE METHODS COMPARISON
# 1. Meta-learner (LogReg)    : 0.975620
# 2. Weighted Average         : 0.975619
# 3. Simple Average           : 0.975619
# 4. Individual XGBoost       : 0.975316
# 5. Indivi

In [18]:
# Build the submission: take the sample file, overwrite its "y" column with
# the selected test predictions, and write it out as submission.csv.
sample_submission = pd.read_csv(
    "/kaggle/input/playground-series-s5e8/sample_submission.csv"
)
sample_submission = sample_submission.assign(y=final_test)
sample_submission.to_csv('submission.csv', index=False)
print('Submission file saved.')

Submission file saved.


### 【Neural Net】

In [19]:
# import random
# import os
# import pandas as pd
# import numpy as np
# from tqdm.notebook import tqdm
# import matplotlib.pyplot as plt

# import torch
# import torch.nn as nn
# from torch.utils.data import Dataset, DataLoader

# from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import f1_score

# # pytorch実装
# import torch # Tensorの作成や操作
# import torch.nn as nn # ニューラルネットワーク
# import torch.nn.functional as F # 関数をメソッドとして提供
# import torch.optim as optim # オプティマイザ
# from torch.utils.data import Dataset, DataLoader
# from torch.autograd import Variable

# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import roc_auc_score
# from tqdm.notebook import tqdm
# # from tqdm import tqdm
# import matplotlib.pyplot as plt
# import time

# # GPUの使用状況確認
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# print(device)

# all_df = all_df_NN
# all_df = all_df.drop(["id","y"],axis=1)
# y = train_df["y"]

# # 設定
# SEED = 42
# TARGET = "y"

# CATEGORICAL = cat_col
# NUMERICAL = num_col
# USE = CATEGORICAL + NUMERICAL
# # df_train = train_df.drop("id",axis=1)
# # df_test = test_df.drop("id",axis=1)

In [20]:
# # (1) 住宅ローン + ローン
# all_df["housing_loan"] = all_df["housing"].astype(str) + "_" + all_df["loan"].astype(str)

# # (2) コンタクト時間 x 年齢
# all_df["duration_x_age"] = all_df["duration"] * all_df["age"]

# # (3) sin,cos(コンタクト時間)
# all_df['duration_sin'] = np.sin(2*np.pi * all_df['duration'] / 400)
# all_df['duration_cos'] = np.cos(2*np.pi * all_df['duration'] / 400)

# # (4) monthを数値に直し周期的に使う
# month_map = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
#     'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
#     'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
# all_df['month_num'] = all_df['month'].map(month_map).astype('int')
# all_df['month_sin'] = np.sin(2 * np.pi * all_df['month_num'] / 12)
# all_df['month_cos'] = np.cos(2 * np.pi * all_df['month_num'] / 12)

# # (5) コンタクト時間をカテゴリ化
# all_df['duration_bin'] = pd.cut(
#     all_df['duration'],
#     bins=[0, 60, 300, 600, 900, float('inf')],
#     labels=['short', 'medium', 'long', 'very_long', 'extreme'],
#     right=False)
# all_df['duration_bin'] = all_df['duration_bin'].astype("object")

# # (6) 連絡手段 + 年齢
# all_df['age_group'] = pd.cut(
#     all_df['age'],
#     bins=[0, 30, 45, 60, 100],
#     labels=['young', 'mid', 'senior', 'elder'])
# all_df["contact_age"] = all_df["contact"].astype(str) + "_" + all_df["age_group"].astype(str)
# all_df = all_df.drop("age_group",axis=1)

# # (7) sin,cos(pdays)
# all_df['pdays_sin'] = np.sin(2*np.pi * all_df['pdays'] / 90)
# all_df['pdays_cos'] = np.cos(2*np.pi * all_df['pdays'] / 90)

In [21]:
# # 標準化 + ラベルエンコード
# def preprocessing(all_df, cat_cols=CATEGORICAL, num_cols=NUMERICAL, target=TARGET):

#     # 訓練データとテストデータに分離
#     train = all_df[:len(train_df)]
#     test = all_df[len(train_df):]

#     # y = train[target]
#     # train = train.drop("y",axis=1)
#     train_len = len(train)

#     # 訓練データ + テストデータ
#     # df = pd.concat([train.drop(columns=target), test])
#     # y = train[target]
#     # train_len = len(train)
    
#     # 欠損埋め
#     # df[cat_cols] = df[cat_cols].fillna('None')
#     # df[num_cols] = df[num_cols].fillna(0)

#     # train = df[:train_len]
#     # test = df[train_len:]

#     # 標準化
#     scaler = StandardScaler()

#     # フィッティング
#     # scaler.fit(df[num_cols])
#     scaler.fit(train[num_cols])

#     # 適用
#     train[num_cols] = scaler.transform(train[num_cols])
#     test[num_cols] = scaler.transform(test[num_cols])
#     df = pd.concat([train, test])
    
#     # ラベルエンコーダ
#     for col in df.columns:
#         if col in cat_cols:
#             df[col] = LabelEncoder().fit_transform(df[col])
#             df[col]= df[col].astype('category')
            
#     return pd.concat([df.iloc[:train_len], y], axis=1), df.iloc[train_len:]

In [22]:
# # 前処理の実施
# df_train, df_test = preprocessing(all_df)

In [23]:
# # データセット関数
# class CustomDataset(Dataset):

#     # オブジェクト定義
#     def __init__(self, df, target, cat_cols=CATEGORICAL):
#         self.df_cat = df[cat_cols]
#         self.df_num = df.drop(cat_cols, axis=1)
#         self.X_cats = self.df_cat.values.astype(np.int64)
#         self.X_nums = self.df_num.values.astype(np.float32)
#         self.target = target.values.astype(np.int64)

#     # データセットのサイズを返す
#     def __len__(self):
#         return len(self.target)

#     # 指定したインデックスのデータとラベルを返す
#     def __getitem__(self, idx):
#         return [self.X_cats[idx], self.X_nums[idx], self.target[idx]]

In [24]:
# # NNモデル作成
# class NN_Model(nn.Module):

#     # ネットワーク構造の定義
#     def __init__(self, embedding_sizes, n_num):
#         super().__init__()
#         self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories, size in embedding_sizes])
#         n_emb = sum(e.embedding_dim for e in self.embeddings)
#         self.n_emb, self.n_num = n_emb, n_num
#         self.lin1 = nn.Linear(self.n_emb + self.n_num, 100)
#         self.lin2 = nn.Linear(100, 70)
#         self.lin3 = nn.Linear(70, 2)
#         self.bn1 = nn.BatchNorm1d(self.n_num)
#         self.bn2 = nn.BatchNorm1d(100)
#         self.bn3 = nn.BatchNorm1d(70)
#         self.emb_drop = nn.Dropout(0.6)
#         self.drops = nn.Dropout(0.3)
 
#     # 順伝播
#     def forward(self,x_cat,x_num):
#         x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
#         x = torch.cat(x, dim=1)
#         x = self.emb_drop(x)
#         x2 = self.bn1(x_num)
#         x = torch.cat([x, x2], dim=1)
#         x = F.relu(self.lin1(x))
#         x = self.drops(x)
#         x = self.bn2(x)
#         x = F.relu(self.lin2(x))
#         x = self.drops(x)
#         x = self.bn3(x)
#         x = self.lin3(x)
#         return x

In [25]:
# # ラベルエンコード済みカテゴリ変数の埋め込み
# # 各カテゴリ列の変数の種類
# cat_sizes = [len(df_train[col].cat.categories) for col in CATEGORICAL]

# # (入力サイズ, 50と割る2の小さい方)でエンコード
# emb_sizes = [(size, min(50, (size+1)//2)) for size in cat_sizes]

In [26]:
# # 記録用
# hist = {
#     'train_loss': [], 'train_auc': [],
#     'val_loss': [], 'val_auc': []
# }

# # パラメータ
# bs = 64 # バッチサイズ
# EPOCHS = 5 # エポック
# save_every = 1
# FOLDS = 5 # FOLD数
# LR=1e-3 # 学習率

# patience = 3

# # stratified KFoldの宣言
# skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# fold_results = []

# val_results = []
# test_results = []

# # SKFによるデータ分割
# for fold, (train_idx, val_idx) in enumerate(skf.split(df_train.drop(columns=TARGET), df_train[TARGET])):
    
#     print(f"\n========== Fold {fold+1} ==========")

#     # 学習データ
#     X_train = df_train.drop(columns=TARGET).iloc[train_idx] 
#     y_train = df_train[TARGET].iloc[train_idx]

#     # バリデーションデータ
#     X_val = df_train.drop(columns=TARGET).iloc[val_idx]
#     y_val = df_train[TARGET].iloc[val_idx]

#     # Datasetの作成
#     train_dataset = CustomDataset(X_train, y_train)
#     val_dataset = CustomDataset(X_val, y_val)
    
#     # DataLoaderの作成
#     train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True, num_workers=0)
#     val_loader = DataLoader(val_dataset, batch_size=bs, shuffle=False, num_workers=0)

#     # モデル構築
#     model = NN_Model(emb_sizes, len(NUMERICAL)).to(device)

#     # 最適化設定
#     optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-5)
#     # optimizer = torch.optim.Adam(model.parameters(), lr=LR)

#     # 損失関数
#     criterion = nn.CrossEntropyLoss()

#     hist = {"train_auc": [], "val_auc": []}
#     best_val_auc = 0
#     counter = 0

#     # 学習・予測エポックのループ
#     for epoch in range(EPOCHS):

#         # 開始時間
#         start_time = time.time()

#         # 学習モード
#         model.train()

#         # ラベル、予測値の保存場所
#         y_true_train, y_pred_train = [], []

#         # プログレスバー
#         train_iter = tqdm(train_loader, desc=f"<Train> Epoch {epoch+1}", leave=False)
        
#         for i, (cat_data, num_data, target) in enumerate(train_iter):

#             # DataLoaderから取り出した、カテゴリ、数値、ターゲット
#             cat_data, num_data, target = cat_data.to(device), num_data.to(device), target.to(device)

#             # パラメータの勾配を初期化
#             optimizer.zero_grad()

#             # 予測値の算出
#             output = model(cat_data, num_data)

#             # ラベルと予測値とのロス計算
#             loss = criterion(output, target)

#             # 各パラメータの勾配を算出
#             loss.backward()

#             # パラメータ更新
#             optimizer.step()

#             # ソフトマックスの分類結果を格納
#             probs = torch.softmax(output, dim=1)[:, 1].detach().cpu().numpy()
#             y_pred_train.extend(probs)

#             # ラベルの格納
#             y_true_train.extend(target.cpu().numpy())

#             # プログレスバーの後ろにロス値を表示
#             if i % 10 == 0:
#                 train_iter.set_postfix(loss=loss.item())

#         # チェックポイント保存
#         if (epoch + 1) % save_every == 0:
#             torch.save(model.state_dict(), f"model_epoch{epoch+1}.pt")        
    
#         # histに残すAUCスコア
#         train_auc = roc_auc_score(y_true_train, y_pred_train)

#         # 評価モード
#         model.eval()

#         # ラベル、予測値の保存場所        
#         y_true_val, y_pred_val = [], []

#         # プログレスバー
#         val_iter = tqdm(val_loader, desc=f"<Val> Epoch {epoch+1}", leave=False)

#         # 勾配を更新しない
#         with torch.no_grad():
            
#             for cat_data, num_data, target in val_iter:
    
#                 # DataLoaderから取り出した、カテゴリ、数値、ターゲット
#                 cat_data, num_data, target = cat_data.to(device), num_data.to(device), target.to(device)

#                 # 予測値の算出
#                 output = model(cat_data, num_data)

#                 # ソフトマックスの分類結果を格納
#                 probs = torch.softmax(output, dim=1)[:, 1].cpu().numpy()
#                 y_pred_val.extend(probs)

#                 # ラベルの格納
#                 y_true_val.extend(target.cpu().numpy())

#                 # プログレスバーの後ろにロス値を表示
#                 val_iter.set_postfix(loss=criterion(output, target).item())

#         # histに残すAUCスコア        
#         val_auc = roc_auc_score(y_true_val, y_pred_val)

#         # 差分時刻
#         elapsed = time.time() - start_time

#         # 履歴追加
#         hist["train_auc"].append(train_auc)
#         hist["val_auc"].append(val_auc)

#         # 進捗
#         print(f"Epoch {epoch+1}/{EPOCHS} - TrainAUC: {train_auc:.4f} | ValAUC: {val_auc:.4f} | Time: {elapsed:.1f}s")

#         # チェックポイント
#         if (epoch + 1) % save_every == 0:
#             torch.save(model.state_dict(), f"model_fold{fold+1}_epoch{epoch+1}.pth")

#         # EarlyStopping判定
#         if val_auc > best_val_auc:
#             best_val_auc = val_auc
#             counter = 0
#             torch.save(model.state_dict(), f"best_model_fold{fold+1}.pth")
#         else:
#             counter += 1
#             if counter >= patience:
#                 print(f"Early stopping at epoch {epoch+1}")
#                 break

    
#     # foldごとに保存
#     torch.save(model.state_dict(), f"model_fold{fold+1}.pth")
    
#     # Append this fold's training history (the hist dict of per-epoch AUCs) to fold_results
#     fold_results.append(hist)

#     # Per fold: predict on df_train and on df_test with the model in eval mode
#     model.eval()
#     with torch.no_grad():
#         X_val_cat = torch.from_numpy(df_train[CATEGORICAL].values.astype(np.int64)).to(device)
#         X_val_num = torch.from_numpy(df_train[NUMERICAL].values.astype(np.float32)).to(device)

#         # 予測
#         preds = torch.softmax(model(X_val_cat, X_val_num),dim=1)[:,1].cpu().numpy()
#         val_results.append(preds)
        
#         X_test_cat = torch.from_numpy(df_test[CATEGORICAL].values.astype(np.int64)).to(device)
#         X_test_num = torch.from_numpy(df_test[NUMERICAL].values.astype(np.float32)).to(device)

#         # 予測
#         preds = torch.softmax(model(X_test_cat, X_test_num),dim=1)[:,1].cpu().numpy()
#         # preds = torch.softmax(model(X_test_cat, X_test_num).squeeze()).cpu().numpy()
#         test_results.append(preds)
    
# # shape = (n_folds, n_test_samples) → 平均化
# val_results = np.mean(val_results, axis=0)        
# test_results = np.mean(test_results, axis=0)        

In [27]:
# Expose the NN outputs under the names the stacking cell reads
# (pred_NN, pred_NN_test, AUC_NN). Disabled together with the NN training cell.
# NOTE(review): val_results is the mean over folds of predictions made on df_train
# by every fold's model, so it does not look like an out-of-fold prediction — confirm.
# pred_NN = val_results
# pred_NN_test = test_results
# NOTE(review): `hist` is whatever history dict was bound last (presumably the final
# fold's), and np.average runs over its per-epoch validation AUCs; this is not a
# cross-fold score — confirm before using it as an ensemble weight.
# AUC_NN = np.average(hist["val_auc"])

In [28]:
# # ======================
# # Plot the validation AUC of every fold
# # ======================
# import matplotlib.pyplot as plt
# plt.figure(figsize=(10,5))
# # One curve per entry of fold_results; the x position is the index within hist["val_auc"].
# # Note that this loop rebinds the name `hist`.
# for i, hist in enumerate(fold_results):
#     plt.plot(hist["val_auc"], label=f"Fold {i+1} Val AUC")
# plt.xlabel("Epoch")
# plt.ylabel("AUC")
# plt.legend()
# plt.title("Validation AUC per Fold")
# plt.show() 

In [29]:
# Blending / stacking of the LightGBM, XGBoost, CatBoost and NN predictions (cell is disabled).
# NOTE(review): StackingClassifier is imported but never referenced in this cell.
# from sklearn.ensemble import StackingClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import cross_val_score, StratifiedKFold
# from sklearn.metrics import roc_auc_score

# # Banner describing what the cell does.
# print("# STACKING ENSEMBLE IMPLEMENTATION")
# print("# " + "="*50)
# print("# Combining top 4 models: LightGBM, XGBoost, CatBoost, NN")
# print("# Meta-learner: Logistic Regression")
# print("# " + "="*50)

# # Level-1 feature frames: one column per base model.
# # The pred_* / pred_*_test arrays come from other cells; presumably they are
# # row-aligned with y and with the test set respectively — TODO confirm.
# stacking_train = pd.DataFrame({
#     'lgb': pred_lgb,
#     'xgb': pred_xgb, 
#     'cat': pred_cb,
#     'NN': pred_NN,
# })

# # Same column names and order as stacking_train, built from the test-set predictions.
# stacking_test = pd.DataFrame({
#     'lgb': pred_lgb_test,
#     'xgb': pred_xgb_test,
#     'cat': pred_cb_test,
#     'NN': pred_NN_test,
# })

# print(f"# Stacking train shape: {stacking_train.shape}")
# print(f"# Stacking test shape: {stacking_test.shape}")

# # Method 1: blend the four models with weights proportional to their AUC scores.
# print("\n# METHOD 1: WEIGHTED AVERAGE")
# print("# " + "-"*30)

# # Each weight is the model's AUC divided by the sum of the four AUCs, so the weights sum to 1.
# # The order of `scores` must match the order the weights are applied in below (lgb, xgb, cat, NN).
# scores = [AUC_lgb, AUC_xgb, AUC_cb, AUC_NN]  
# total_score = sum(scores)
# weights = [score/total_score for score in scores]

# print(f"# Model weights:")
# print(f"# LightGBM: {weights[0]:.4f}")
# print(f"# XGBoost:  {weights[1]:.4f}")
# print(f"# CatBoost: {weights[2]:.4f}")
# print(f"# NN: {weights[3]:.4f}")

# # Weighted sum of the training-side predictions.
# weighted_oof = (stacking_train['lgb'] * weights[0] + 
#                 stacking_train['xgb'] * weights[1] + 
#                 stacking_train['cat'] * weights[2] +
#                 stacking_train['NN'] * weights[3])

# # Same weights applied to the test-side predictions.
# weighted_test = (stacking_test['lgb'] * weights[0] + 
#                  stacking_test['xgb'] * weights[1] + 
#                  stacking_test['cat'] * weights[2] +
#                  stacking_test['NN'] * weights[3])

# # `y` is defined outside this cell; presumably the training target — TODO confirm.
# weighted_score = roc_auc_score(y, weighted_oof)
# print(f"# Weighted Average ROC AUC: {weighted_score:.6f}")

# # Method 2: logistic regression fitted on the base-model predictions.
# print("\n# METHOD 2: LOGISTIC REGRESSION META-LEARNER")
# print("# " + "-"*40)

# meta_learner = LogisticRegression(penalty="l2",random_state=42, max_iter=1000)
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# # 5-fold cross-validated ROC AUC of the meta-learner on the stacked features.
# cv_scores = cross_val_score(meta_learner, stacking_train, y, 
#                            cv=skf, scoring='roc_auc', n_jobs=-1)

# print(f"# Meta-learner CV scores: {[f'{score:.6f}' for score in cv_scores]}")
# print(f"# Meta-learner mean CV: {cv_scores.mean():.6f} ± {cv_scores.std():.6f}")

# # Refit on all rows, then predict both frames.
# # NOTE(review): meta_oof / meta_score are computed on the same rows the model was just
# # fitted on, so meta_score is an in-sample figure; cv_scores.mean() is the cross-validated one.
# meta_learner.fit(stacking_train, y)
# meta_oof = meta_learner.predict_proba(stacking_train)[:, 1]
# meta_test = meta_learner.predict_proba(stacking_test)[:, 1]
# meta_score = roc_auc_score(y, meta_oof)

# print(f"# Meta-learner ROC AUC: {meta_score:.6f}")

# # Coefficients follow the column order of stacking_train (lgb, xgb, cat, NN).
# coefficients = meta_learner.coef_[0]
# print(f"# Meta-learner coefficients:")
# print(f"# LightGBM: {coefficients[0]:.4f}")
# print(f"# XGBoost:  {coefficients[1]:.4f}")
# print(f"# CatBoost: {coefficients[2]:.4f}")
# print(f"# NN: {coefficients[3]:.4f}")
# print(f"# Intercept: {meta_learner.intercept_[0]:.4f}")

# # Method 3: unweighted mean of the four model columns, used as the baseline.
# print("\n# METHOD 3: SIMPLE AVERAGE (BASELINE)")
# print("# " + "-"*35)

# simple_oof = (stacking_train['lgb'] + stacking_train['xgb'] + stacking_train['cat'] + stacking_train['NN']) / 4
# simple_test = (stacking_test['lgb'] + stacking_test['xgb'] + stacking_test['cat'] + stacking_test['NN']) / 4
# simple_score = roc_auc_score(y, simple_oof)

# print(f"# Simple Average ROC AUC: {simple_score:.6f}")

# # Rank the individual models and the three ensemble methods by score and pick the winner.
# print("\n# ENSEMBLE METHODS COMPARISON")
# print("# " + "="*40)
# # (label, score) pairs; meta_score here is the in-sample value computed after the full refit.
# ensemble_results = [
#     ('Individual LightGBM', AUC_lgb),
#     ('Individual XGBoost', AUC_xgb),
#     ('Individual CatBoost', AUC_cb),
#     ('Individual NN', AUC_NN),
#     ('Weighted Average', weighted_score),
#     ('Meta-learner (LogReg)', meta_score),
#     ('Simple Average', simple_score)
# ]

# # Highest score first.
# ensemble_results.sort(key=lambda x: x[1], reverse=True)

# for i, (method, score) in enumerate(ensemble_results, 1):
#     print(f"# {i}. {method:<25}: {score:.6f}")

# best_method, best_score = ensemble_results[0]
# print(f"\n# BEST ENSEMBLE METHOD: {best_method}")
# print(f"# BEST ENSEMBLE SCORE: {best_score:.6f}")

# # Selection is by substring of the winning label.
# # NOTE(review): if an 'Individual ...' entry ranks first, neither substring matches and
# # the else branch silently selects the simple average rather than that model's predictions.
# # NOTE(review): final_oof / final_test are not read by the submission cell, which writes
# # test_results instead — confirm which predictions are meant to be submitted.
# if 'Meta-learner' in best_method:
#     final_oof = meta_oof
#     final_test = meta_test
#     print("# Using Meta-learner predictions for final submission")
# elif 'Weighted' in best_method:
#     final_oof = weighted_oof
#     final_test = weighted_test
#     print("# Using Weighted Average predictions for final submission")
# else:
#     final_oof = simple_oof
#     final_test = simple_test
#     print("# Using Simple Average predictions for final submission")

# print("\n# STACKING ENSEMBLE COMPLETED!")
# print("# " + "="*50)

In [30]:
# # Build the submission file from the competition's sample_submission template.
# sample_submission = pd.read_csv("/kaggle/input/playground-series-s5e8/sample_submission.csv")

# # NOTE(review): this writes test_results (the fold-averaged NN predictions), not the
# # final_test chosen by the stacking cell — confirm which one is intended.
# # Assumes test_results is in the same row order as sample_submission — TODO confirm.
# sample_submission['y'] = test_results
# sample_submission.to_csv('submission.csv', index=False)
# print('Submission file saved.')