In [None]:
# Load the cudf.pandas extension, which is meant to speed up regular pandas operations
%load_ext cudf.pandas

### 【データのインポート】

In [None]:
# ライブラリのインポート
import numpy as np
import pandas as pd
import os
import cudf
from cuml.preprocessing import TargetEncoder

# Load the competition train / test frames
train_df = pd.read_csv("/kaggle/input/playground-series-s5e8/train.csv")
test_df = pd.read_csv("/kaggle/input/playground-series-s5e8/test.csv")

# Load the external dataset (';'-delimited) and map its yes/no target to 1/0
orig_df = pd.read_csv("/kaggle/input/bank-marketing-dataset-full/bank-full.csv",delimiter=";")
orig_df['y'] = orig_df.y.map({'yes':1,'no':0})
# Synthetic ids starting at 1,000,000 (presumably chosen to stay clear of the
# competition ids -- verify). The ids are kept as a regular column: moving them
# into the index would discard them, because the concat below uses
# ignore_index=True and would leave `id` missing for every external row.
orig_df['id'] = np.arange(len(orig_df)) + 1_000_000

# Stack train, test and external rows (in that order) into one frame so the
# feature engineering below is applied consistently to all of them
all_df = pd.concat([train_df,test_df,orig_df],axis=0,ignore_index=True)

### 【前処理】

In [None]:
# Collect the names of the categorical and the numeric feature columns
def preprocess1(df):
    """Split the feature columns of ``df`` into categorical and numeric names.

    Every column except ``id`` and ``y`` is inspected. Columns whose dtype is
    ``object`` are reported as categorical, all remaining columns as numeric.

    Args:
        df: DataFrame that contains ``id`` and ``y`` plus the feature columns.

    Returns:
        Tuple ``(df, CATS, NUMS)``: the input frame (returned as is), the list
        of object-dtype column names and the list of the other column names,
        both in column order.

    Raises:
        KeyError: if ``id`` or ``y`` is missing (raised by ``DataFrame.drop``).
    """
    CATS = []
    NUMS = []
    for c in df.drop(["id","y"],axis=1).columns:
        # Only the dtype decides the bucket; no per-column statistics are needed
        if df[c].dtype=='object':
            CATS.append(c)
        else:
            NUMS.append(c)
    return df, CATS, NUMS

# Apply to the combined frame (competition train/test + external data).
# CATS and NUMS are both produced by the call, so no placeholders are needed.
all_df, CATS, NUMS = preprocess1(all_df)

In [None]:
# Label-encode the columns and record the number of distinct values of each
def preprocess2(df, CATS, NUMS):
    """Label-encode every feature column and collect the cardinalities.

    * A numeric column ``c`` keeps its values (cast to int32) and receives an
      additional label-encoded copy named ``f"{c}2"``.
    * A categorical column ``c`` is overwritten in place with its label codes.

    ``df`` is modified in place and also returned.

    Args:
        df: DataFrame holding all columns named in ``NUMS`` and ``CATS``.
        CATS: names of the categorical columns.
        NUMS: names of the numeric columns.

    Returns:
        Tuple ``(df, CATS1, SIZES)``: the frame, the names of the new
        ``<num>2`` label-encoded columns, and a dict mapping every
        label-encoded column name to its number of distinct values.
    """
    encoded_numeric_names = []   # names of the "<num>2" label-encoded copies
    cardinality = {}             # label-encoded column name -> distinct count

    for col in NUMS + CATS:
        is_numeric = col in NUMS
        # Numeric columns are encoded into a new column, categoricals in place
        target = f"{col}2" if is_numeric else col
        if is_numeric:
            encoded_numeric_names.append(target)

        codes, levels = df[col].factorize()
        df[target] = codes
        cardinality[target] = len(levels)

        # Store both the source column and the codes as int32
        df[col] = df[col].astype('int32')
        df[target] = df[target].astype('int32')

    return df, encoded_numeric_names, cardinality

# Apply to the combined frame (competition train/test + external data).
# CATS1 (list of names) and SIZES (dict of cardinalities) are both produced by
# the call, so no placeholders are needed.
all_df, CATS1, SIZES = preprocess2(all_df, CATS, NUMS)

In [None]:
from itertools import combinations

# Build one combined column per pair of label-encoded columns
def preprocess3(df, CATS, CATS1, SIZES):
    """Add a pairwise interaction column for every 2-combination of columns.

    For each pair ``(c1, c2)`` taken from ``CATS + CATS1`` the new column is
    ``df[c1] * SIZES[c2] + df[c2]`` and is named after the two source names,
    sorted and joined with ``_``.

    Args:
        df: DataFrame holding the label-encoded columns.
        CATS: names of the categorical columns.
        CATS1: names of the label-encoded numeric columns.
        SIZES: dict mapping a column name to its number of distinct values.

    Returns:
        Tuple ``(df, CATS2)``: a frame with the new columns appended (the
        input frame itself when no pair exists) and the list of new names.
    """
    pair_names = []
    pair_values = {}

    for left, right in combinations(CATS + CATS1, 2):
        key = "_".join(sorted([left, right]))
        pair_names.append(key)
        pair_values[key] = df[left] * SIZES[right] + df[right]

    # Nothing to append when fewer than two columns were supplied
    if not pair_values:
        return df, pair_names

    # Append all new columns with a single concat
    return pd.concat([df, pd.DataFrame(pair_values)], axis=1), pair_names

# Apply to the combined frame (competition train/test + external data).
# CATS2 is produced by the call, so no placeholder is needed.
all_df, CATS2 = preprocess3(all_df, CATS, CATS1, SIZES)

In [None]:
# Count encoding
def preprocess4(df, CATS, CATS1, CATS2):
    """Append a count-encoded ``CE_<col>`` column for every given column.

    For each column in ``CATS + CATS1 + CATS2`` the rows are grouped by that
    column, ``y`` is counted per group, and the int32 counts are left-merged
    back onto ``df`` as ``CE_<col>``.

    Args:
        df: DataFrame holding the listed columns and a ``y`` column.
        CATS: names of the categorical columns.
        CATS1: names of the label-encoded numeric columns.
        CATS2: names of the pairwise interaction columns.

    Returns:
        Tuple ``(df, CE)``: the frame with the ``CE_*`` columns appended and
        the list of the new column names, in processing order.
    """
    CC = CATS+CATS1+CATS2
    # The name list is local, so the function does not depend on a
    # notebook-level ``CE`` existing and repeated calls do not accumulate names
    CE = []

    print(f"Processing {len(CC)} columns... ",end="")
    for i,c in enumerate(CC):
        if i%10==0: print(f"{i}, ",end="")
        # NOTE(review): count() only tallies rows whose y is not null, so rows
        # without a target do not contribute -- confirm this is intended
        # (size() would count every row).
        counts = df.groupby(c).y.count().astype('int32')
        counts.name = f"CE_{c}"
        CE.append(counts.name)
        # The Series is matched on its index (named after the column c)
        df = df.merge(counts, on=c, how='left')
    print()
    return df, CE

# Apply to the combined frame (competition train/test + external data).
# CE collects the names of the count-encoded columns returned by preprocess4.
CE = []
all_df, CE = preprocess4(all_df, CATS, CATS1, CATS2)

In [None]:
# Cast the target to int32 (the original note says y is int64 and that the cast
# is needed for GPU use).
# NOTE(review): presumably the test rows carry no target after the concat --
# confirm that this cast behaves as intended for missing values.
all_df["y"] = all_df["y"].astype("int32")

# Split the combined frame back into its parts. The rows were concatenated in
# the order train, test, external, so all three slices are taken by position.
n_train = len(train_df)
n_test = len(test_df)
train1 = all_df.iloc[:n_train]
test1 = all_df.iloc[n_train:n_train+n_test]

# External rows: everything after train and test. An explicit start offset is
# used because a negative-length slice would select the whole frame if the
# external data were empty.
orig = all_df.iloc[n_train+n_test:]

In [None]:
def preprocess5(train, test, CATS, CATS1, CATS2, orig):
    """Add target-encoding columns computed from the external data only.

    For each column in ``CATS + CATS1 + CATS2`` the mean of ``y`` per group is
    computed on ``orig`` and left-merged onto ``train`` and ``test`` as a
    float32 column named ``TE_ORIG_<col>``. Groups that do not occur in
    ``orig`` end up as missing values because of the left merge.

    Side effect: the generated column names are appended to the
    notebook-level ``TE_ORIG`` list (created if it does not exist yet).

    Args:
        train: training frame holding the listed columns.
        test: test frame holding the listed columns.
        CATS: names of the categorical columns.
        CATS1: names of the label-encoded numeric columns.
        CATS2: names of the pairwise interaction columns.
        orig: external frame holding the listed columns and ``y``.

    Returns:
        Tuple ``(train, test)`` with the ``TE_ORIG_*`` columns appended.
    """
    CC = CATS+CATS1+CATS2
    # Callers read the names from the notebook-level TE_ORIG list; fetch it
    # (or create it) explicitly instead of failing with a NameError when the
    # list was not defined before the call
    te_names = globals().setdefault("TE_ORIG", [])

    print(f"Processing {len(CC)} columns... ",end="")
    for i,c in enumerate(CC):
        if i%10==0: print(f"{i}, ",end="")
        group_mean = orig.groupby(c).y.mean().astype('float32')
        group_mean.name = f"TE_ORIG_{c}"
        te_names.append(group_mean.name)
        train = train.merge(group_mean, on=c, how='left')
        test = test.merge(group_mean, on=c, how='left')
    # Terminate the progress line; this has to happen before the return
    print()
    return train, test

# Names of the TE_ORIG_* columns; preprocess5 appends to this notebook-level list
TE_ORIG = []
# train2 / test2 = train1 / test1 plus target encodings taken from the external data
train2, test2 = preprocess5(train1, test1, CATS, CATS1, CATS2, orig)

### 【Light GBM + Optuna】

In [None]:
# ###########################################
# ############ Light GBM + Optuna ###########
# ###########################################
# import optuna
# import numpy as np
# import pandas as pd
# import lightgbm as lgb
# from sklearn.metrics import roc_auc_score
# from sklearn.model_selection import StratifiedKFold, KFold
# # from category_encoders import TargetEncoder # CPU版
# from cuml.preprocessing import TargetEncoder
# import cudf
# import warnings
# import gc
# warnings.filterwarnings("ignore")

# # 入力データ
# X = train1.drop(["id","y"], axis=1).copy()
# y = train1["y"].copy()

# def objective(trial):

#     # 整数は±50%、確率系は±0.1〜0.2、正則化は対数で±1〜2
#     # LightGBMパラメータ
#     lgbm_params = {
#         'objective': 'binary',
#         'device': 'gpu',
#         'metric': 'auc',
#         'boosting_type': 'gbdt',
#         "num_leaves": trial.suggest_int("num_leaves", 52, 116),
#         "max_depth": trial.suggest_int("max_depth", 10, 15),
#         "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 15, 35),
#         "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0.0, 0.38),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
#         "feature_fraction": trial.suggest_float("feature_fraction", 0.55, 1.0),
#         "bagging_fraction": trial.suggest_float("bagging_fraction", 0.73, 1.0),
#         "bagging_freq": trial.suggest_int("bagging_freq", 1, 5),
#         "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 10),
#         "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 8),
#         "min_sum_hessian_in_leaf": trial.suggest_float("min_sum_hessian_in_leaf", 1e-3, 4),
#         'verbosity': -1
#     }

#     pred_lgb = np.zeros(len(X))
#     fold_scores = []

#     skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     # kf = KFold(n_splits=5, shuffle=True, random_state=42)

#     # Fold分割し格納
#     for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
#         print("#"*25)
#         print(f"### Fold {fold+1}")
#         print("#"*25)

#         X_train = X.iloc[train_idx,:].copy()
#         y_train = y.iloc[train_idx].copy()
#         X_valid = X.iloc[valid_idx,:].copy()
#         y_valid = y.iloc[valid_idx].copy()

#         # ターゲットエンコーディング
#         CC = CATS1_in+CATS2_in
#         print(f"Target encoding {len(CC)} features... ",end="")

#         # グローバル平均
#         # global_mean = y_train.mean()
    
#         for i,c in enumerate(CC):
#             if i%10==0: print(f"{i}, ",end="")
            
#             TE0 = TargetEncoder(n_folds=5, smooth=10, 
#                                 split_method='random', stat='mean')
#             X_train[c] = TE0.fit_transform(X_train[c],y_train).astype('float32')
#             # X_valid[c] = np.where(np.isnan(X_valid[c]), global_mean, X_valid[c])
#             X_valid[c] = TE0.transform(X_valid[c]).astype('float32')
        
#         print()

#         # TE0を明示的に削除
#         del TE0
#         gc.collect()

#         # CC以外はカテゴリ型に変換
#         for c in CATS_in:
#             if c not in CC:  
#                 X_train[c] = X_train[c].astype('category')
#                 X_valid[c] = X_valid[c].astype('category')

#         # データセット
#         lgb_train = lgb.Dataset(
#             X_train,y_train,categorical_feature=CATS_in)

#         lgb_valid = lgb.Dataset(
#             X_valid,y_valid,categorical_feature=CATS_in)
#         # --------------------------
#         # 学習
#         # --------------------------
#         model_lgb = lgb.train(
#             lgbm_params,
#             lgb_train,
#             num_boost_round=2000,
#             valid_sets=[lgb_train, lgb_valid],
#             valid_names=["train", "valid"],
#             callbacks=[
#                 lgb.early_stopping(stopping_rounds=100, verbose=False),
#                 lgb.log_evaluation(500),
#             ])

#         # 予測
#         pred_lgb[valid_idx] = model_lgb.predict(
#             X_valid, num_iteration=model_lgb.best_iteration)

#         # AUCスコア算出
#         fold_scores.append(roc_auc_score(y_valid,pred_lgb[valid_idx]))

#     # 各Foldの平均値
#     score = np.mean(fold_scores)
    
#     return score

# # 最適化実行
# study = optuna.create_study(direction="maximize",
#                            pruner=optuna.pruners.MedianPruner())
# study.optimize(objective, n_trials=12)

In [None]:
# # 最もスコアが高かった Trial
# best_trial = study.best_trial
# print("Best AUC:", best_trial.value)
# print("Best parameters:")
# for key, val in best_trial.params.items():
#     print(f"  {key}: {val}")

In [None]:
# # Trialをスコア順にソートして上位5件を表示
# trials_df = study.trials_dataframe()
# trials_df = trials_df.sort_values('value', ascending=False)
# print(trials_df.head(5))

In [None]:
# optuna.visualization.plot_param_importances(study)  # パラメータ重要度
# optuna.visualization.plot_optimization_history(study)  # スコア推移

In [None]:
# for i, trial in enumerate(study.best_trials):
#   print(trial.params)
#   print([j for j in trial.values])

### 【① Light GBM (内部データセットのみ)】

In [None]:
############################################################
############ Light GBM with OOF Target Encoding ############
############################################################
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
# from category_encoders import TargetEncoder # CPU版
from cuml.preprocessing import TargetEncoder
import cudf
import warnings
import gc
warnings.filterwarnings("ignore")

# Out-of-fold predictions for the training rows, fold-averaged predictions for
# the test rows, and the per-fold models
pred_lgb1 = np.zeros(len(train1))
pred_lgb_test1 = np.zeros(len(test1))
models_lgb1 = []

# Model inputs: every column except the row id and the target
X = train1.drop(["id","y"], axis=1).copy()
y = train1["y"].copy()
test_ = test1.drop(["id","y"], axis=1).copy()

# LightGBM parameters
lgbm_params = {
    'objective': 'binary',
    'device': 'gpu',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    "num_leaves": 77,
    "max_depth": 15,
    "min_data_in_leaf": 23,
    "min_gain_to_split": 0.17931836655003727,
    "learning_rate": 0.019740808893122665,
    "feature_fraction": 0.7499258780711098,
    "bagging_fraction": 0.9392312065171743,
    "bagging_freq": 3,
    "lambda_l1": 0.13817541814163015,
    "lambda_l2": 5.987592011786754,
    "min_sum_hessian_in_leaf": 2.7742348039686524,
    'verbosity': -1
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Fold count taken from the splitter, so the test-prediction average below
# cannot drift out of sync with n_splits
n_splits = skf.get_n_splits()

# Columns that are target-encoded inside every fold (identical for all folds)
CC = CATS1+CATS2

# Imported once up front instead of on every fold iteration
import numba.cuda as cuda

# Cross-validation loop
for fold, (train_idx, valid_idx) in enumerate(skf.split(X,y)):
    print("#"*25)
    print(f"### Fold {fold+1}")
    print("#"*25)

    X_train = X.iloc[train_idx,:].copy()
    y_train = y.iloc[train_idx].copy()
    X_valid = X.iloc[valid_idx,:].copy()
    y_valid = y.iloc[valid_idx].copy()
    X_test = test_.copy()

    # Target encoding: the encoder is fitted on this fold's training part only
    # and then applied to the validation part and to the test rows
    print(f"Target encoding {len(CC)} features... ",end="")

    for i,c in enumerate(CC):
        if i%10==0: print(f"{i}, ",end="")

        TE0 = TargetEncoder(n_folds=5, smooth=10, split_method='random', stat='mean')
        X_train[c] = TE0.fit_transform(X_train[c],y_train).astype('float32')
        X_valid[c] = TE0.transform(X_valid[c]).astype('float32')
        X_test[c] = TE0.transform(X_test[c]).astype('float32')

    print()

    # Drop the last encoder explicitly; guarded because TE0 only exists when
    # at least one column was encoded
    if CC:
        del TE0
    gc.collect()

    # Columns that were not target-encoded are passed as pandas categoricals
    for c in CATS:
        if c not in CC:
            X_train[c] = X_train[c].astype('category')
            X_valid[c] = X_valid[c].astype('category')
            X_test[c]  = X_test[c].astype('category')

    # LightGBM datasets
    lgb_train = lgb.Dataset(
        X_train,y_train,categorical_feature=CATS)

    lgb_valid = lgb.Dataset(
        X_valid,y_valid,categorical_feature=CATS)
    # --------------------------
    # Training
    # --------------------------
    model_lgb = lgb.train(
        lgbm_params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=["train", "valid"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(100),
        ]
    )

    # Out-of-fold prediction for this fold's validation rows
    pred_lgb1[valid_idx] = model_lgb.predict(
        X_valid, num_iteration=model_lgb.best_iteration)

    # Test prediction, averaged over the folds
    pred_lgb_test1 += model_lgb.predict(
        X_test, num_iteration=model_lgb.best_iteration)/n_splits

    # Keep the fitted model
    models_lgb1.append(model_lgb)

    # Free the per-fold frames
    del X_train, X_valid
    gc.collect()

    # Clear numba's pending deallocation queue (presumably to release GPU
    # memory between folds -- confirm)
    cuda.current_context().deallocations.clear()

In [None]:
# Out-of-fold AUC on the training rows (pred_lgb1 is filled fold by fold with
# validation predictions in the CV loop)
# NOTE(review): f1_score and plt are imported here but not used in this cell
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

AUC_lgb1 = roc_auc_score(y,pred_lgb1)
print(f"LGB1: AUC score = {AUC_lgb1}")

### 【② Light GBM (外部データセットのTE追加)】

In [None]:
############################################################
############ Light GBM with OOF Target Encoding ############
############################################################
# Out-of-fold predictions for the training rows, fold-averaged predictions for
# the test rows, and the per-fold models (frames with the external-data
# target encodings)
pred_lgb2 = np.zeros(len(train2))
pred_lgb_test2 = np.zeros(len(test2))
models_lgb2 = []

# Model inputs: every column except the row id and the target
X = train2.drop(["id","y"], axis=1).copy()
y = train2["y"].copy()
test_ = test2.drop(["id","y"], axis=1).copy()

# LightGBM parameters
lgbm_params = {
    'objective': 'binary',
    'device': 'gpu',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    "num_leaves": 77,
    "max_depth": 15,
    "min_data_in_leaf": 23,
    "min_gain_to_split": 0.17931836655003727,
    "learning_rate": 0.019740808893122665,
    "feature_fraction": 0.7499258780711098,
    "bagging_fraction": 0.9392312065171743,
    "bagging_freq": 3,
    "lambda_l1": 0.13817541814163015,
    "lambda_l2": 5.987592011786754,
    "min_sum_hessian_in_leaf": 2.7742348039686524,
    'verbosity': -1
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Fold count taken from the splitter, so the test-prediction average below
# cannot drift out of sync with n_splits
n_splits = skf.get_n_splits()

# Columns that are target-encoded inside every fold (identical for all folds)
CC = CATS1+CATS2

# Imported once up front instead of on every fold iteration
import numba.cuda as cuda

# Cross-validation loop
for fold, (train_idx, valid_idx) in enumerate(skf.split(X,y)):
    print("#"*25)
    print(f"### Fold {fold+1}")
    print("#"*25)

    X_train = X.iloc[train_idx,:].copy()
    y_train = y.iloc[train_idx].copy()
    X_valid = X.iloc[valid_idx,:].copy()
    y_valid = y.iloc[valid_idx].copy()
    X_test = test_.copy()

    # Target encoding: the encoder is fitted on this fold's training part only
    # and then applied to the validation part and to the test rows
    print(f"Target encoding {len(CC)} features... ",end="")

    for i,c in enumerate(CC):
        if i%10==0: print(f"{i}, ",end="")

        TE0 = TargetEncoder(n_folds=5, smooth=10, split_method='random', stat='mean')
        X_train[c] = TE0.fit_transform(X_train[c],y_train).astype('float32')
        X_valid[c] = TE0.transform(X_valid[c]).astype('float32')
        X_test[c] = TE0.transform(X_test[c]).astype('float32')

    print()

    # Drop the last encoder explicitly; guarded because TE0 only exists when
    # at least one column was encoded
    if CC:
        del TE0
    gc.collect()

    # Columns that were not target-encoded are passed as pandas categoricals
    for c in CATS:
        if c not in CC:
            X_train[c] = X_train[c].astype('category')
            X_valid[c] = X_valid[c].astype('category')
            X_test[c]  = X_test[c].astype('category')

    # LightGBM datasets
    lgb_train = lgb.Dataset(
        X_train,y_train,categorical_feature=CATS)

    lgb_valid = lgb.Dataset(
        X_valid,y_valid,categorical_feature=CATS)
    # --------------------------
    # Training
    # --------------------------
    model_lgb = lgb.train(
        lgbm_params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=["train", "valid"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(100),
        ]
    )

    # Out-of-fold prediction for this fold's validation rows
    pred_lgb2[valid_idx] = model_lgb.predict(
        X_valid, num_iteration=model_lgb.best_iteration)

    # Test prediction, averaged over the folds
    pred_lgb_test2 += model_lgb.predict(
        X_test, num_iteration=model_lgb.best_iteration)/n_splits

    # Keep the fitted model
    models_lgb2.append(model_lgb)

    # Free the per-fold frames
    del X_train, X_valid
    gc.collect()

    # Clear numba's pending deallocation queue (presumably to release GPU
    # memory between folds -- confirm)
    cuda.current_context().deallocations.clear()

In [None]:
# Out-of-fold AUC on the training rows (pred_lgb2 is filled fold by fold with
# validation predictions in the CV loop)
# NOTE(review): f1_score and plt are imported here but not used in this cell
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

AUC_lgb2 = roc_auc_score(y,pred_lgb2)
print(f"LGB2: AUC score = {AUC_lgb2}")

### 【XGBoost + Optuna】

In [None]:
# ###########################################
# ############ XGBoost + Optuna #############
# ###########################################
# import optuna
# import numpy as np
# import pandas as pd
# import xgboost as xgb
# from sklearn.metrics import roc_auc_score
# from sklearn.model_selection import StratifiedKFold, KFold
# # from category_encoders import TargetEncoder # CPU版
# from cuml.preprocessing import TargetEncoder
# import cudf
# import warnings
# import gc
# warnings.filterwarnings("ignore")

# # 入力データ
# X = train1.drop(["id","y"], axis=1).copy()
# y = train1["y"].copy()

# # 評価履歴を保存する辞書
# evals_result_xgb = {}

# def objective(trial):

#     # 整数は±50%、確率系は±0.1〜0.2、正則化は対数で±1〜2
#     # XGBoostパラメータ
#     xgb_params = {
#         "objective": "binary:logistic",
#         "eval_metric": "auc",
#         "tree_method": "gpu_hist",
#         "device": "cuda",
#         "grow_policy": "lossguide",

#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#         "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10.0, log=True),

#         "subsample": trial.suggest_float("subsample", 0.5, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
#         "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),

#         "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
#         "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
    
#         "gamma": trial.suggest_float("gamma", 0.0, 5.0),
#         "max_leaves": trial.suggest_int("max_leaves", 16, 256),
#     }

#     pred_xgb = np.zeros(len(X))
#     fold_scores = []

#     skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     # kf = KFold(n_splits=5, shuffle=True, random_state=42)

#     # Fold分割し格納
#     for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
#         print("#"*25)
#         print(f"### Fold {fold+1}")
#         print("#"*25)

#         X_train = X.iloc[train_idx,:].copy()
#         y_train = y.iloc[train_idx].copy()
#         X_valid = X.iloc[valid_idx,:].copy()
#         y_valid = y.iloc[valid_idx].copy()

#         # ターゲットエンコーディング
#         CC = CATS1_in+CATS2_in
#         print(f"Target encoding {len(CC)} features... ",end="")

#         for i,c in enumerate(CC):
#             if i%10==0: print(f"{i}, ",end="")
            
#             TE0 = TargetEncoder(n_folds=5, smooth=10, 
#                                 split_method='random', stat='mean')
#             X_train[c] = TE0.fit_transform(X_train[c],y_train).astype('float32')
#             X_valid[c] = TE0.transform(X_valid[c]).astype('float32')
        
#         print()

#         # TE0を明示的に削除
#         del TE0
#         gc.collect()

#         # CC以外はカテゴリ型に変換
#         for c in CATS_in:
#             if c not in CC:  
#                 X_train[c] = X_train[c].astype('category')
#                 X_valid[c] = X_valid[c].astype('category')

#         # DMatrixに変換
#         dtrain = xgb.DMatrix(X_train,label=y_train,enable_categorical=True)
#         dvalid = xgb.DMatrix(X_valid,label=y_valid,enable_categorical=True)

#         # 学習
#         model_xgb = xgb.train(
#             xgb_params,
#             dtrain,
#             num_boost_round=2000,
#             evals=[(dtrain,"train"),(dvalid,"valid")],
#             early_stopping_rounds=100,
#             evals_result=evals_result_xgb,
#             verbose_eval=500,
#         )

#         # 各foldでのバリデーション予測
#         pred_xgb[valid_idx] = model_xgb.predict(
#             dvalid, iteration_range=(0,model_xgb.best_iteration+1))

#         # AUCスコア算出
#         fold_scores.append(roc_auc_score(y_valid,pred_xgb[valid_idx]))

#     # 各Foldの平均値
#     score = np.mean(fold_scores)
    
#     return score

# # 最適化実行
# study = optuna.create_study(direction="maximize",
#                            pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=10))
# study.optimize(objective, n_trials=100)

### 【③ XGBoost (内部データセットのみ)】

In [None]:
#################################################
############ XGBoost ############################
#################################################
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# Out-of-fold predictions for the training rows, fold-averaged predictions for
# the test rows, and the per-fold models
pred_xgb1 = np.zeros(len(train1))
pred_xgb_test1 = np.zeros(len(test1))
models_xgb1 = []

# Model inputs: every column except the row id and the target
X = train1.drop(["id","y"],axis=1).copy()
y = train1["y"].copy()
test_ = test1.drop(["id","y"],axis=1).copy()

# Dict that receives the evaluation history (overwritten by each fold's training)
evals_result_xgb = {}

# XGBoost parameters
# NOTE(review): recent XGBoost releases deprecate tree_method="gpu_hist" in
# favour of tree_method="hist" together with device="cuda" -- confirm against
# the installed version before changing it.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "tree_method": "gpu_hist",
    "device": "cuda",
    "grow_policy": "lossguide",
    "learning_rate": 0.020187734867721113,
    "max_depth": 10,
    "min_child_weight": 0.0015137166209180514,
    "subsample": 0.6786153011677415,
    "colsample_bytree": 0.7917555828184474,
    "colsample_bylevel": 0.5539530181906183,
    "reg_alpha": 5.3805307261170965,
    "reg_lambda": 1.2434258141601598e-08,
    "gamma": 3.715076866606369,
    "max_leaves": 91,
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Fold count taken from the splitter, so the test-prediction average below
# cannot drift out of sync with n_splits
n_splits = skf.get_n_splits()

# Columns that are target-encoded inside every fold (identical for all folds)
CC = CATS1+CATS2

for fold,(train_idx,valid_idx) in enumerate(skf.split(X,y)):

    print("#"*25)
    print(f"### Fold {fold+1}")
    print("#"*25)

    # Per-fold training / validation data and a fresh copy of the test rows
    X_train = X.iloc[train_idx,:].copy()
    y_train = y.iloc[train_idx].copy()
    X_valid = X.iloc[valid_idx,:].copy()
    y_valid = y.iloc[valid_idx].copy()
    X_test = test_.copy()

    # Target encoding: the encoder is fitted on this fold's training part only
    # and then applied to the validation part and to the test rows
    print(f"Target encoding {len(CC)} features... ",end="")
    for i,c in enumerate(CC):
        if i%10==0: print(f"{i}, ",end="")
        TE0 = TargetEncoder(n_folds=5, smooth=0, split_method='random', stat='mean')
        # y_train is spelled with an ASCII "y" (the name previously started
        # with a fullwidth character that only looked like "y")
        X_train[c] = TE0.fit_transform(X_train[c],y_train).astype('float32')
        X_valid[c] = TE0.transform(X_valid[c]).astype('float32')
        X_test[c] = TE0.transform(X_test[c]).astype('float32')
    print()

    # Columns that were not target-encoded are passed as pandas categoricals.
    # The test frame is converted as well, so all three DMatrix objects carry
    # the same feature types.
    # NOTE(review): astype('category') derives the categories from each
    # frame's own values -- assumes every level occurs in each frame; confirm.
    for c in CATS:
        if c not in CC:
            X_train[c] = X_train[c].astype('category')
            X_valid[c] = X_valid[c].astype('category')
            X_test[c] = X_test[c].astype('category')

    # Convert to DMatrix
    dtrain = xgb.DMatrix(X_train,label=y_train,enable_categorical=True)
    dvalid = xgb.DMatrix(X_valid,label=y_valid,enable_categorical=True)
    dtest = xgb.DMatrix(X_test,enable_categorical=True)

    # Training
    model_xgb = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=5000,
        evals=[(dtrain,"train"),(dvalid,"valid")],
        early_stopping_rounds=100,
        evals_result=evals_result_xgb,
        verbose_eval=100,
    )

    # Out-of-fold prediction for this fold's validation rows
    pred_xgb1[valid_idx] = model_xgb.predict(
        dvalid, iteration_range=(0,model_xgb.best_iteration+1))

    # Test prediction, averaged over the folds
    pred_xgb_test1 += model_xgb.predict(
        dtest, iteration_range=(0,model_xgb.best_iteration+1))/n_splits

    # Keep the fitted model
    models_xgb1.append(model_xgb)

In [None]:
# Out-of-fold AUC on the training rows (pred_xgb1 is filled fold by fold with
# validation predictions in the CV loop)
# NOTE(review): f1_score is imported here but not used in this cell
from sklearn.metrics import f1_score

AUC_xgb1 = roc_auc_score(y,pred_xgb1)
print(f"XGB1: AUC score = {AUC_xgb1}")

### 【④ XGBoost (外部データセットのTE追加)】

In [None]:
#################################################
############ XGBoost ############################
#################################################
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# Out-of-fold predictions for the training rows, fold-averaged predictions for
# the test rows, and the per-fold models (frames with the external-data
# target encodings)
pred_xgb2 = np.zeros(len(train2))
pred_xgb_test2 = np.zeros(len(test2))
models_xgb2 = []

# Model inputs: every column except the row id and the target
X = train2.drop(["id","y"],axis=1).copy()
y = train2["y"].copy()
test_ = test2.drop(["id","y"],axis=1).copy()

# Dict that receives the evaluation history (overwritten by each fold's training)
evals_result_xgb = {}

# XGBoost parameters
# NOTE(review): recent XGBoost releases deprecate tree_method="gpu_hist" in
# favour of tree_method="hist" together with device="cuda" -- confirm against
# the installed version before changing it.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "tree_method": "gpu_hist",
    "device": "cuda",
    "grow_policy": "lossguide",
    "learning_rate": 0.020187734867721113,
    "max_depth": 10,
    "min_child_weight": 0.0015137166209180514,
    "subsample": 0.6786153011677415,
    "colsample_bytree": 0.7917555828184474,
    "colsample_bylevel": 0.5539530181906183,
    "reg_alpha": 5.3805307261170965,
    "reg_lambda": 1.2434258141601598e-08,
    "gamma": 3.715076866606369,
    "max_leaves": 91,
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Fold count taken from the splitter, so the test-prediction average below
# cannot drift out of sync with n_splits
n_splits = skf.get_n_splits()

# Columns that are target-encoded inside every fold (identical for all folds)
CC = CATS1+CATS2

for fold,(train_idx,valid_idx) in enumerate(skf.split(X,y)):

    print("#"*25)
    print(f"### Fold {fold+1}")
    print("#"*25)

    # Per-fold training / validation data and a fresh copy of the test rows
    X_train = X.iloc[train_idx,:].copy()
    y_train = y.iloc[train_idx].copy()
    X_valid = X.iloc[valid_idx,:].copy()
    y_valid = y.iloc[valid_idx].copy()
    X_test = test_.copy()

    # Target encoding: the encoder is fitted on this fold's training part only
    # and then applied to the validation part and to the test rows
    print(f"Target encoding {len(CC)} features... ",end="")
    for i,c in enumerate(CC):
        if i%10==0: print(f"{i}, ",end="")
        TE0 = TargetEncoder(n_folds=5, smooth=0, split_method='random', stat='mean')
        # y_train is spelled with an ASCII "y" (the name previously started
        # with a fullwidth character that only looked like "y")
        X_train[c] = TE0.fit_transform(X_train[c],y_train).astype('float32')
        X_valid[c] = TE0.transform(X_valid[c]).astype('float32')
        X_test[c] = TE0.transform(X_test[c]).astype('float32')
    print()

    # Columns that were not target-encoded are passed as pandas categoricals.
    # The test frame is converted as well, so all three DMatrix objects carry
    # the same feature types.
    # NOTE(review): astype('category') derives the categories from each
    # frame's own values -- assumes every level occurs in each frame; confirm.
    for c in CATS:
        if c not in CC:
            X_train[c] = X_train[c].astype('category')
            X_valid[c] = X_valid[c].astype('category')
            X_test[c] = X_test[c].astype('category')

    # Convert to DMatrix
    dtrain = xgb.DMatrix(X_train,label=y_train,enable_categorical=True)
    dvalid = xgb.DMatrix(X_valid,label=y_valid,enable_categorical=True)
    dtest = xgb.DMatrix(X_test,enable_categorical=True)

    # Training
    model_xgb = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=5000,
        evals=[(dtrain,"train"),(dvalid,"valid")],
        early_stopping_rounds=100,
        evals_result=evals_result_xgb,
        verbose_eval=100,
    )

    # Out-of-fold prediction for this fold's validation rows
    pred_xgb2[valid_idx] = model_xgb.predict(
        dvalid, iteration_range=(0,model_xgb.best_iteration+1))

    # Test prediction, averaged over the folds
    pred_xgb_test2 += model_xgb.predict(
        dtest, iteration_range=(0,model_xgb.best_iteration+1))/n_splits

    # Keep the fitted model
    models_xgb2.append(model_xgb)

In [None]:
# Out-of-fold AUC on the training rows (pred_xgb2 is filled fold by fold with
# validation predictions in the CV loop)
# NOTE(review): f1_score is imported here but not used in this cell
from sklearn.metrics import f1_score

AUC_xgb2 = roc_auc_score(y,pred_xgb2)
print(f"XGB2: AUC score = {AUC_xgb2}")

### 【CatBoost】

In [None]:
# #################################################
# ############ CatBoost (Classifier版) ############
# #################################################
# from catboost import CatBoostClassifier, Pool
# from sklearn.model_selection import StratifiedKFold
# import numpy as np

# # 学習、バリデーションデータ
# pred_cb = np.zeros(len(train_df))
# pred_cb_test = np.zeros(len(test_df))
# models_cb = []
# cb_auc_valid = []  # foldごとのAUC履歴

# # 入力データ
# X = train.drop(["id","y"],axis=1)
# y = train["y"]
# X_test = test.drop(["id","y"],axis=1)

# # X = X_train_enc
# # y = y_train
# # X_test = X_test_enc

# # CatBoostパラメータ
# cat_params = {
#     "loss_function": "Logloss",
#     "eval_metric": "AUC",
#     "depth": 8,                   # 6〜10
#     "learning_rate": 0.05,        # 0.03〜0.1
#     # "iterations": 1,          # 大きめ＋ES
#     "iterations": 2000,          # 大きめ＋ES
#     "bootstrap_type": "Bayesian", # 精度安定
#     "boosting_type": "Ordered",   # 多カテゴリに強い
#     "random_strength": 1.0,       # 0.5〜2.0で微調整
#     "task_type": "GPU",
#     # "task_type": "CPU",           # このデータ規模ならCPUの方が速い/安定なこと多い
#     "verbose": 100,
# }

# # Stratified KFold
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

#     print("#" * 25)
#     print(f"### Fold {fold+1}")
#     print("#" * 25)

#     # データ分割
#     X_train_kf = X.iloc[train_idx, :]
#     y_train_kf = y.iloc[train_idx]
#     X_valid_kf = X.iloc[valid_idx, :]
#     y_valid_kf = y.iloc[valid_idx]

#     # object型をカテゴリ型に変換
#     for col in cat_col:
#         X_train_kf.loc[:, col] = X_train_kf.loc[:, col].astype("category")
#         X_valid_kf.loc[:, col] = X_valid_kf.loc[:, col].astype("category")

#     # Poolを作成
#     train_pool = Pool(X_train_kf, y_train_kf, cat_features=cat_col)
#     valid_pool = Pool(X_valid_kf, y_valid_kf, cat_features=cat_col)

#     # モデル作成 & 学習
#     model_cb = CatBoostClassifier(**cat_params)
#     model_cb.fit(
#         train_pool,
#         eval_set=valid_pool,
#         early_stopping_rounds=100,
#         use_best_model=True
#     )

#     # バリデーション予測
#     pred_cb[valid_idx] = model_cb.predict_proba(X_valid_kf)[:, 1]

#     # モデル保存
#     models_cb.append(model_cb)

#     # foldごとのベストスコアを保存
#     cb_auc_valid.append(model_cb.get_best_score()["validation"]["AUC"])

# # テスト予測
# for model in models_cb:
#     pred_cb_test += model.predict_proba(X_test)[:, 1]

# # FOLD数で割って平均化
# pred_cb_test = pred_cb_test / skf.n_splits

# print("各foldのAUC:", cb_auc_valid)
# print("平均AUC:", np.mean(cb_auc_valid))

In [None]:
# from sklearn.metrics import f1_score

# # スコア表示
# AUC_cb = roc_auc_score(y,pred_cb)
# F1_cb = f1_score(y,np.round(pred_cb,0))
# print(f"CB: AUC score = {AUC_cb}, F1 = {F1_cb}")

# # # 学習履歴を一番短いfoldに揃える
# # min_len = min(len(m) for m in cb_auc_valid)
# # cb_auc_score = [m[:min_len] for m in cb_auc_valid]

# # # foldごとの結果を平均する
# # cb_auc_score = np.average(cb_auc_score,axis=0)

In [None]:
# # 提出データ作成
# sample_submission = pd.read_csv("/kaggle/input/playground-series-s5e8/sample_submission.csv")

# sample_submission['y'] = pred_cb_test
# sample_submission.to_csv('submission.csv', index=False)
# print('Submission file saved.')

In [None]:
# import  matplotlib.pyplot as plt
# # 履歴の可視化
# plt.plot(cb_auc_score, label='Validation')
# plt.xlabel('Iteration')
# plt.ylabel('AUC')
# plt.grid()
# plt.legend()
# plt.title("CatBoost AUC")
# plt.show()

### Stacking①③

In [None]:
# from sklearn.ensemble import StackingClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import cross_val_score, StratifiedKFold
# from sklearn.metrics import roc_auc_score

# print("# STACKING ENSEMBLE IMPLEMENTATION")
# print("# " + "="*50)
# print("# Combining top 2 models: LightGBM①, XGBoost①")
# print("# Meta-learner: Logistic Regression")
# print("# " + "="*50)

# y = train1["y"].copy()

# stacking_train = pd.DataFrame({
#     'lgb1': pred_lgb1,
#     'xgb1': pred_xgb1, 
# })

# stacking_test = pd.DataFrame({
#     'lgb1': pred_lgb_test1,
#     'xgb1': pred_xgb_test1,
# })

# print(f"# Stacking train shape: {stacking_train.shape}")
# print(f"# Stacking test shape: {stacking_test.shape}")

# print("\n# METHOD 1: WEIGHTED AVERAGE")
# print("# " + "-"*30)

# scores = [AUC_lgb1, AUC_xgb1]  
# total_score = sum(scores)
# weights = [score/total_score for score in scores]

# print(f"# Model weights:")
# print(f"# LightGBM1: {weights[0]:.4f}")
# print(f"# XGBoost1:  {weights[1]:.4f}")

# weighted_oof = (
#     stacking_train['lgb1'] * weights[0] + 
#     stacking_train['xgb1'] * weights[1] 
# )

# weighted_test = (
#     stacking_test['lgb1'] * weights[0] + 
#     stacking_test['xgb1'] * weights[1] 
# ) 

# weighted_score = roc_auc_score(y, weighted_oof)
# print(f"# Weighted Average ROC AUC: {weighted_score:.6f}")

# print("\n# METHOD 2: LOGISTIC REGRESSION META-LEARNER")
# print("# " + "-"*40)

# meta_learner = LogisticRegression(random_state=42, max_iter=1000)
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# cv_scores = cross_val_score(meta_learner, stacking_train, y, 
#                            cv=skf, scoring='roc_auc', n_jobs=1)

# print(f"# Meta-learner CV scores: {[f'{score:.6f}' for score in cv_scores]}")
# print(f"# Meta-learner mean CV: {cv_scores.mean():.6f} ± {cv_scores.std():.6f}")

# meta_learner.fit(stacking_train, y)
# meta_oof = meta_learner.predict_proba(stacking_train)[:, 1]
# meta_test = meta_learner.predict_proba(stacking_test)[:, 1]
# meta_score = roc_auc_score(y, meta_oof)

# print(f"# Meta-learner ROC AUC: {meta_score:.6f}")

# coefficients = meta_learner.coef_[0]
# print(f"# Meta-learner coefficients:")
# print(f"# LightGBM1: {coefficients[0]:.4f}")
# print(f"# XGBoost1:  {coefficients[1]:.4f}")
# print(f"# Intercept: {meta_learner.intercept_[0]:.4f}")

# print("\n# METHOD 3: SIMPLE AVERAGE (BASELINE)")
# print("# " + "-"*35)

# simple_oof = (
#     stacking_train['lgb1'] + 
#     stacking_train['xgb1'] 
#     ) / 2
# simple_test = (
#     stacking_test['lgb1'] + 
#     stacking_test['xgb1'] 
#     ) / 2
# simple_score = roc_auc_score(y, simple_oof)

# print(f"# Simple Average ROC AUC: {simple_score:.6f}")

# print("\n# ENSEMBLE METHODS COMPARISON")
# print("# " + "="*40)
# ensemble_results = [
#     ('Individual LightGBM1', AUC_lgb1),
#     ('Individual XGBoost1', AUC_xgb1),
#     ('Weighted Average', weighted_score),
#     ('Meta-learner (LogReg)', meta_score),
#     ('Simple Average', simple_score)
# ]

# ensemble_results.sort(key=lambda x: x[1], reverse=True)

# for i, (method, score) in enumerate(ensemble_results, 1):
#     print(f"# {i}. {method:<25}: {score:.6f}")

# best_method, best_score = ensemble_results[0]
# print(f"\n# BEST ENSEMBLE METHOD: {best_method}")
# print(f"# BEST ENSEMBLE SCORE: {best_score:.6f}")

# if 'Meta-learner' in best_method:
#     final_oof1 = meta_oof
#     final_test1 = meta_test
#     print("# Using Meta-learner predictions for final submission")
# elif 'Weighted' in best_method:
#     final_oof1 = weighted_oof
#     final_test1 = weighted_test
#     print("# Using Weighted Average predictions for final submission")
# else:
#     final_oof1 = simple_oof
#     final_test1 = simple_test
#     print("# Using Simple Average predictions for final submission")

# print("\n# STACKING ENSEMBLE COMPLETED!")
# print("# " + "="*50)

### Stacking②④

In [None]:
# from sklearn.ensemble import StackingClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import cross_val_score, StratifiedKFold
# from sklearn.metrics import roc_auc_score

# print("# STACKING ENSEMBLE IMPLEMENTATION")
# print("# " + "="*50)
# print("# Combining top 2 models: LightGBM②, XGBoost②")
# print("# Meta-learner: Logistic Regression")
# print("# " + "="*50)

# y = train2["y"].copy()

# stacking_train = pd.DataFrame({
#     'lgb2': pred_lgb2,
#     'xgb2': pred_xgb2, 
# })

# stacking_test = pd.DataFrame({
#     'lgb2': pred_lgb_test2,
#     'xgb2': pred_xgb_test2,
# })

# print(f"# Stacking train shape: {stacking_train.shape}")
# print(f"# Stacking test shape: {stacking_test.shape}")

# print("\n# METHOD 1: WEIGHTED AVERAGE")
# print("# " + "-"*30)

# scores = [AUC_lgb2, AUC_xgb2]  
# total_score = sum(scores)
# weights = [score/total_score for score in scores]

# print(f"# Model weights:")
# print(f"# LightGBM2: {weights[0]:.4f}")
# print(f"# XGBoost2:  {weights[1]:.4f}")

# weighted_oof = (
#     stacking_train['lgb2'] * weights[0] + 
#     stacking_train['xgb2'] * weights[1] 
# )

# weighted_test = (
#     stacking_test['lgb2'] * weights[0] + 
#     stacking_test['xgb2'] * weights[1] 
# ) 

# weighted_score = roc_auc_score(y, weighted_oof)
# print(f"# Weighted Average ROC AUC: {weighted_score:.6f}")

# print("\n# METHOD 2: LOGISTIC REGRESSION META-LEARNER")
# print("# " + "-"*40)

# meta_learner = LogisticRegression(random_state=42, max_iter=1000)
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# cv_scores = cross_val_score(meta_learner, stacking_train, y, 
#                            cv=skf, scoring='roc_auc', n_jobs=1)

# print(f"# Meta-learner CV scores: {[f'{score:.6f}' for score in cv_scores]}")
# print(f"# Meta-learner mean CV: {cv_scores.mean():.6f} ± {cv_scores.std():.6f}")

# meta_learner.fit(stacking_train, y)
# meta_oof = meta_learner.predict_proba(stacking_train)[:, 1]
# meta_test = meta_learner.predict_proba(stacking_test)[:, 1]
# meta_score = roc_auc_score(y, meta_oof)

# print(f"# Meta-learner ROC AUC: {meta_score:.6f}")

# coefficients = meta_learner.coef_[0]
# print(f"# Meta-learner coefficients:")
# print(f"# LightGBM2: {coefficients[0]:.4f}")
# print(f"# XGBoost2:  {coefficients[1]:.4f}")
# print(f"# Intercept: {meta_learner.intercept_[0]:.4f}")

# print("\n# METHOD 3: SIMPLE AVERAGE (BASELINE)")
# print("# " + "-"*35)

# simple_oof = (
#     stacking_train['lgb2'] + 
#     stacking_train['xgb2'] 
#     ) / 2
# simple_test = (
#     stacking_test['lgb2'] + 
#     stacking_test['xgb2'] 
#     ) / 2
# simple_score = roc_auc_score(y, simple_oof)

# print(f"# Simple Average ROC AUC: {simple_score:.6f}")

# print("\n# ENSEMBLE METHODS COMPARISON")
# print("# " + "="*40)
# ensemble_results = [
#     ('Individual LightGBM2', AUC_lgb2),
#     ('Individual XGBoost2', AUC_xgb2),
#     ('Weighted Average', weighted_score),
#     ('Meta-learner (LogReg)', meta_score),
#     ('Simple Average', simple_score)
# ]

# ensemble_results.sort(key=lambda x: x[1], reverse=True)

# for i, (method, score) in enumerate(ensemble_results, 1):
#     print(f"# {i}. {method:<25}: {score:.6f}")

# best_method, best_score = ensemble_results[0]
# print(f"\n# BEST ENSEMBLE METHOD: {best_method}")
# print(f"# BEST ENSEMBLE SCORE: {best_score:.6f}")

# if 'Meta-learner' in best_method:
#     final_oof2 = meta_oof
#     final_test2 = meta_test
#     print("# Using Meta-learner predictions for final submission")
# elif 'Weighted' in best_method:
#     final_oof2 = weighted_oof
#     final_test2 = weighted_test
#     print("# Using Weighted Average predictions for final submission")
# else:
#     final_oof2 = simple_oof
#     final_test2 = simple_test
#     print("# Using Simple Average predictions for final submission")

# print("\n# STACKING ENSEMBLE COMPLETED!")
# print("# " + "="*50)

In [None]:
# Build the submission file: equal-weight average of the four test-prediction
# arrays, written into the 'y' column of the competition's sample submission.
blended_test_pred = (pred_lgb_test1 + pred_lgb_test2 + pred_xgb_test1 + pred_xgb_test2) / 4

sample_submission = pd.read_csv("/kaggle/input/playground-series-s5e8/sample_submission.csv")
sample_submission['y'] = blended_test_pred
sample_submission.to_csv('submission.csv', index=False)
print('Submission file saved.')

### 【Neural Net】

In [None]:
# import random
# import os
# import pandas as pd
# import numpy as np
# from tqdm.notebook import tqdm
# import matplotlib.pyplot as plt

# import torch
# import torch.nn as nn
# from torch.utils.data import Dataset, DataLoader

# from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import f1_score

# # pytorch実装
# import torch # Tensorの作成や操作
# import torch.nn as nn # ニューラルネットワーク
# import torch.nn.functional as F # 関数をメソッドとして提供
# import torch.optim as optim # オプティマイザ
# from torch.utils.data import Dataset, DataLoader
# from torch.autograd import Variable

# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import roc_auc_score
# from tqdm.notebook import tqdm
# # from tqdm import tqdm
# import matplotlib.pyplot as plt
# import time

# # GPUの使用状況確認
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# print(device)

# all_df = all_df_NN
# all_df = all_df.drop(["id","y"],axis=1)
# y = train_df["y"]

# # 設定
# SEED = 42
# TARGET = "y"

# CATEGORICAL = cat_col
# NUMERICAL = num_col
# USE = CATEGORICAL + NUMERICAL
# # df_train = train_df.drop("id",axis=1)
# # df_test = test_df.drop("id",axis=1)

In [None]:
# # (1) 住宅ローン + ローン
# all_df["housing_loan"] = all_df["housing"].astype(str) + "_" + all_df["loan"].astype(str)

# # (2) コンタクト時間 x 年齢
# all_df["duration_x_age"] = all_df["duration"] * all_df["age"]

# # (3) sin,cos(コンタクト時間)
# all_df['duration_sin'] = np.sin(2*np.pi * all_df['duration'] / 400)
# all_df['duration_cos'] = np.cos(2*np.pi * all_df['duration'] / 400)

# # (4) monthを数値に直し周期的に使う
# month_map = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
#     'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
#     'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
# all_df['month_num'] = all_df['month'].map(month_map).astype('int')
# all_df['month_sin'] = np.sin(2 * np.pi * all_df['month_num'] / 12)
# all_df['month_cos'] = np.cos(2 * np.pi * all_df['month_num'] / 12)

# # (5) コンタクト時間をカテゴリ化
# all_df['duration_bin'] = pd.cut(
#     all_df['duration'],
#     bins=[0, 60, 300, 600, 900, float('inf')],
#     labels=['short', 'medium', 'long', 'very_long', 'extreme'],
#     right=False)
# all_df['duration_bin'] = all_df['duration_bin'].astype("object")

# # (6) 連絡手段 + 年齢
# all_df['age_group'] = pd.cut(
#     all_df['age'],
#     bins=[0, 30, 45, 60, 100],
#     labels=['young', 'mid', 'senior', 'elder'])
# all_df["contact_age"] = all_df["contact"].astype(str) + "_" + all_df["age_group"].astype(str)
# all_df = all_df.drop("age_group",axis=1)

# # (7) sin,cos(pdays)
# all_df['pdays_sin'] = np.sin(2*np.pi * all_df['pdays'] / 90)
# all_df['pdays_cos'] = np.cos(2*np.pi * all_df['pdays'] / 90)

In [None]:
# # 標準化 + ラベルエンコード
# def preprocessing(all_df, cat_cols=CATEGORICAL, num_cols=NUMERICAL, target=TARGET):

#     # 訓練データとテストデータに分離
#     train = all_df[:len(train_df)]
#     test = all_df[len(train_df):]

#     # y = train[target]
#     # train = train.drop("y",axis=1)
#     train_len = len(train)

#     # 訓練データ + テストデータ
#     # df = pd.concat([train.drop(columns=target), test])
#     # y = train[target]
#     # train_len = len(train)
    
#     # 欠損埋め
#     # df[cat_cols] = df[cat_cols].fillna('None')
#     # df[num_cols] = df[num_cols].fillna(0)

#     # train = df[:train_len]
#     # test = df[train_len:]

#     # 標準化
#     scaler = StandardScaler()

#     # フィッティング
#     # scaler.fit(df[num_cols])
#     scaler.fit(train[num_cols])

#     # 適用
#     train[num_cols] = scaler.transform(train[num_cols])
#     test[num_cols] = scaler.transform(test[num_cols])
#     df = pd.concat([train, test])
    
#     # ラベルエンコーダ
#     for col in df.columns:
#         if col in cat_cols:
#             df[col] = LabelEncoder().fit_transform(df[col])
#             df[col]= df[col].astype('category')
            
#     return pd.concat([df.iloc[:train_len], y], axis=1), df.iloc[train_len:]

In [None]:
# # 前処理の実施
# df_train, df_test = preprocessing(all_df)

In [None]:
# # データセット関数
# class CustomDataset(Dataset):

#     # オブジェクト定義
#     def __init__(self, df, target, cat_cols=CATEGORICAL):
#         self.df_cat = df[cat_cols]
#         self.df_num = df.drop(cat_cols, axis=1)
#         self.X_cats = self.df_cat.values.astype(np.int64)
#         self.X_nums = self.df_num.values.astype(np.float32)
#         self.target = target.values.astype(np.int64)

#     # データセットのサイズを返す
#     def __len__(self):
#         return len(self.target)

#     # 指定したインデックスのデータとラベルを返す
#     def __getitem__(self, idx):
#         return [self.X_cats[idx], self.X_nums[idx], self.target[idx]]

In [None]:
# # NNモデル作成
# class NN_Model(nn.Module):

#     # ネットワーク構造の定義
#     def __init__(self, embedding_sizes, n_num):
#         super().__init__()
#         self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories, size in embedding_sizes])
#         n_emb = sum(e.embedding_dim for e in self.embeddings)
#         self.n_emb, self.n_num = n_emb, n_num
#         self.lin1 = nn.Linear(self.n_emb + self.n_num, 100)
#         self.lin2 = nn.Linear(100, 70)
#         self.lin3 = nn.Linear(70, 2)
#         self.bn1 = nn.BatchNorm1d(self.n_num)
#         self.bn2 = nn.BatchNorm1d(100)
#         self.bn3 = nn.BatchNorm1d(70)
#         self.emb_drop = nn.Dropout(0.6)
#         self.drops = nn.Dropout(0.3)
 
#     # 順伝播
#     def forward(self,x_cat,x_num):
#         x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
#         x = torch.cat(x, dim=1)
#         x = self.emb_drop(x)
#         x2 = self.bn1(x_num)
#         x = torch.cat([x, x2], dim=1)
#         x = F.relu(self.lin1(x))
#         x = self.drops(x)
#         x = self.bn2(x)
#         x = F.relu(self.lin2(x))
#         x = self.drops(x)
#         x = self.bn3(x)
#         x = self.lin3(x)
#         return x

In [None]:
# # ラベルエンコード済みカテゴリ変数の埋め込み
# # 各カテゴリ列の変数の種類
# cat_sizes = [len(df_train[col].cat.categories) for col in CATEGORICAL]

# # (入力サイズ, 50と割る2の小さい方)でエンコード
# emb_sizes = [(size, min(50, (size+1)//2)) for size in cat_sizes]

In [None]:
# # 記録用
# hist = {
#     'train_loss': [], 'train_auc': [],
#     'val_loss': [], 'val_auc': []
# }

# # パラメータ
# bs = 64 # バッチサイズ
# EPOCHS = 5 # エポック
# save_every = 1
# FOLDS = 5 # FOLD数
# LR=1e-3 # 学習率

# patience = 3

# # stratified KFoldの宣言
# skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# fold_results = []

# val_results = []
# test_results = []

# # SKFによるデータ分割
# for fold, (train_idx, val_idx) in enumerate(skf.split(df_train.drop(columns=TARGET), df_train[TARGET])):
    
#     print(f"\n========== Fold {fold+1} ==========")

#     # 学習データ
#     X_train = df_train.drop(columns=TARGET).iloc[train_idx] 
#     y_train = df_train[TARGET].iloc[train_idx]

#     # バリデーションデータ
#     X_val = df_train.drop(columns=TARGET).iloc[val_idx]
#     y_val = df_train[TARGET].iloc[val_idx]

#     # Datasetの作成
#     train_dataset = CustomDataset(X_train, y_train)
#     val_dataset = CustomDataset(X_val, y_val)
    
#     # DataLoaderの作成
#     train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True, num_workers=0)
#     val_loader = DataLoader(val_dataset, batch_size=bs, shuffle=False, num_workers=0)

#     # モデル構築
#     model = NN_Model(emb_sizes, len(NUMERICAL)).to(device)

#     # 最適化設定
#     optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-5)
#     # optimizer = torch.optim.Adam(model.parameters(), lr=LR)

#     # 損失関数
#     criterion = nn.CrossEntropyLoss()

#     hist = {"train_auc": [], "val_auc": []}
#     best_val_auc = 0
#     counter = 0

#     # 学習・予測エポックのループ
#     for epoch in range(EPOCHS):

#         # 開始時間
#         start_time = time.time()

#         # 学習モード
#         model.train()

#         # ラベル、予測値の保存場所
#         y_true_train, y_pred_train = [], []

#         # プログレスバー
#         train_iter = tqdm(train_loader, desc=f"<Train> Epoch {epoch+1}", leave=False)
        
#         for i, (cat_data, num_data, target) in enumerate(train_iter):

#             # DataLoaderから取り出した、カテゴリ、数値、ターゲット
#             cat_data, num_data, target = cat_data.to(device), num_data.to(device), target.to(device)

#             # パラメータの勾配を初期化
#             optimizer.zero_grad()

#             # 予測値の算出
#             output = model(cat_data, num_data)

#             # ラベルと予測値とのロス計算
#             loss = criterion(output, target)

#             # 各パラメータの勾配を算出
#             loss.backward()

#             # パラメータ更新
#             optimizer.step()

#             # ソフトマックスの分類結果を格納
#             probs = torch.softmax(output, dim=1)[:, 1].detach().cpu().numpy()
#             y_pred_train.extend(probs)

#             # ラベルの格納
#             y_true_train.extend(target.cpu().numpy())

#             # プログレスバーの後ろにロス値を表示
#             if i % 10 == 0:
#                 train_iter.set_postfix(loss=loss.item())

#         # チェックポイント保存
#         if (epoch + 1) % save_every == 0:
#             torch.save(model.state_dict(), f"model_epoch{epoch+1}.pt")        
    
#         # histに残すAUCスコア
#         train_auc = roc_auc_score(y_true_train, y_pred_train)

#         # 評価モード
#         model.eval()

#         # ラベル、予測値の保存場所        
#         y_true_val, y_pred_val = [], []

#         # プログレスバー
#         val_iter = tqdm(val_loader, desc=f"<Val> Epoch {epoch+1}", leave=False)

#         # 勾配を更新しない
#         with torch.no_grad():
            
#             for cat_data, num_data, target in val_iter:
    
#                 # DataLoaderから取り出した、カテゴリ、数値、ターゲット
#                 cat_data, num_data, target = cat_data.to(device), num_data.to(device), target.to(device)

#                 # 予測値の算出
#                 output = model(cat_data, num_data)

#                 # ソフトマックスの分類結果を格納
#                 probs = torch.softmax(output, dim=1)[:, 1].cpu().numpy()
#                 y_pred_val.extend(probs)

#                 # ラベルの格納
#                 y_true_val.extend(target.cpu().numpy())

#                 # プログレスバーの後ろにロス値を表示
#                 val_iter.set_postfix(loss=criterion(output, target).item())

#         # histに残すAUCスコア        
#         val_auc = roc_auc_score(y_true_val, y_pred_val)

#         # 差分時刻
#         elapsed = time.time() - start_time

#         # 履歴追加
#         hist["train_auc"].append(train_auc)
#         hist["val_auc"].append(val_auc)

#         # 進捗
#         print(f"Epoch {epoch+1}/{EPOCHS} - TrainAUC: {train_auc:.4f} | ValAUC: {val_auc:.4f} | Time: {elapsed:.1f}s")

#         # チェックポイント
#         if (epoch + 1) % save_every == 0:
#             torch.save(model.state_dict(), f"model_fold{fold+1}_epoch{epoch+1}.pth")

#         # EarlyStopping判定
#         if val_auc > best_val_auc:
#             best_val_auc = val_auc
#             counter = 0
#             torch.save(model.state_dict(), f"best_model_fold{fold+1}.pth")
#         else:
#             counter += 1
#             if counter >= patience:
#                 print(f"Early stopping at epoch {epoch+1}")
#                 break

    
#     # foldごとに保存
#     torch.save(model.state_dict(), f"model_fold{fold+1}.pth")
    
#     # ヒストグラムの更新
#     fold_results.append(hist)

#     # foldごとにテストデータ計算
#     model.eval()
#     with torch.no_grad():
#         X_val_cat = torch.from_numpy(df_train[CATEGORICAL].values.astype(np.int64)).to(device)
#         X_val_num = torch.from_numpy(df_train[NUMERICAL].values.astype(np.float32)).to(device)

#         # 予測
#         preds = torch.softmax(model(X_val_cat, X_val_num),dim=1)[:,1].cpu().numpy()
#         val_results.append(preds)
        
#         X_test_cat = torch.from_numpy(df_test[CATEGORICAL].values.astype(np.int64)).to(device)
#         X_test_num = torch.from_numpy(df_test[NUMERICAL].values.astype(np.float32)).to(device)

#         # 予測
#         preds = torch.softmax(model(X_test_cat, X_test_num),dim=1)[:,1].cpu().numpy()
#         # preds = torch.softmax(model(X_test_cat, X_test_num).squeeze()).cpu().numpy()
#         test_results.append(preds)
    
# # shape = (n_folds, n_test_samples) → 平均化
# val_results = np.mean(val_results, axis=0)        
# test_results = np.mean(test_results, axis=0)        

In [None]:
# pred_NN = val_results
# pred_NN_test = test_results
# AUC_NN = np.average(hist["val_auc"])

In [None]:
# # ======================
# # FoldごとのAUCをプロット
# # ======================
# import matplotlib.pyplot as plt
# plt.figure(figsize=(10,5))
# for i, hist in enumerate(fold_results):
#     plt.plot(hist["val_auc"], label=f"Fold {i+1} Val AUC")
# plt.xlabel("Epoch")
# plt.ylabel("AUC")
# plt.legend()
# plt.title("Validation AUC per Fold")
# plt.show() 

In [None]:
# from sklearn.ensemble import StackingClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import cross_val_score, StratifiedKFold
# from sklearn.metrics import roc_auc_score

# print("# STACKING ENSEMBLE IMPLEMENTATION")
# print("# " + "="*50)
# print("# Combining top 4 models: LightGBM, XGBoost, CatBoost, NN")
# print("# Meta-learner: Logistic Regression")
# print("# " + "="*50)

# stacking_train = pd.DataFrame({
#     'lgb': pred_lgb,
#     'xgb': pred_xgb, 
#     'cat': pred_cb,
#     'NN': pred_NN,
# })

# stacking_test = pd.DataFrame({
#     'lgb': pred_lgb_test,
#     'xgb': pred_xgb_test,
#     'cat': pred_cb_test,
#     'NN': pred_NN_test,
# })

# print(f"# Stacking train shape: {stacking_train.shape}")
# print(f"# Stacking test shape: {stacking_test.shape}")

# print("\n# METHOD 1: WEIGHTED AVERAGE")
# print("# " + "-"*30)

# scores = [AUC_lgb, AUC_xgb, AUC_cb, AUC_NN]  
# total_score = sum(scores)
# weights = [score/total_score for score in scores]

# print(f"# Model weights:")
# print(f"# LightGBM: {weights[0]:.4f}")
# print(f"# XGBoost:  {weights[1]:.4f}")
# print(f"# CatBoost: {weights[2]:.4f}")
# print(f"# NN: {weights[3]:.4f}")

# weighted_oof = (stacking_train['lgb'] * weights[0] + 
#                 stacking_train['xgb'] * weights[1] + 
#                 stacking_train['cat'] * weights[2] +
#                 stacking_train['NN'] * weights[3])

# weighted_test = (stacking_test['lgb'] * weights[0] + 
#                  stacking_test['xgb'] * weights[1] + 
#                  stacking_test['cat'] * weights[2] +
#                  stacking_test['NN'] * weights[3])

# weighted_score = roc_auc_score(y, weighted_oof)
# print(f"# Weighted Average ROC AUC: {weighted_score:.6f}")

# print("\n# METHOD 2: LOGISTIC REGRESSION META-LEARNER")
# print("# " + "-"*40)

# meta_learner = LogisticRegression(penalty="l2",random_state=42, max_iter=1000)
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# cv_scores = cross_val_score(meta_learner, stacking_train, y, 
#                            cv=skf, scoring='roc_auc', n_jobs=-1)

# print(f"# Meta-learner CV scores: {[f'{score:.6f}' for score in cv_scores]}")
# print(f"# Meta-learner mean CV: {cv_scores.mean():.6f} ± {cv_scores.std():.6f}")

# meta_learner.fit(stacking_train, y)
# meta_oof = meta_learner.predict_proba(stacking_train)[:, 1]
# meta_test = meta_learner.predict_proba(stacking_test)[:, 1]
# meta_score = roc_auc_score(y, meta_oof)

# print(f"# Meta-learner ROC AUC: {meta_score:.6f}")

# coefficients = meta_learner.coef_[0]
# print(f"# Meta-learner coefficients:")
# print(f"# LightGBM: {coefficients[0]:.4f}")
# print(f"# XGBoost:  {coefficients[1]:.4f}")
# print(f"# CatBoost: {coefficients[2]:.4f}")
# print(f"# NN: {coefficients[3]:.4f}")
# print(f"# Intercept: {meta_learner.intercept_[0]:.4f}")

# print("\n# METHOD 3: SIMPLE AVERAGE (BASELINE)")
# print("# " + "-"*35)

# simple_oof = (stacking_train['lgb'] + stacking_train['xgb'] + stacking_train['cat'] + stacking_train['NN']) / 4
# simple_test = (stacking_test['lgb'] + stacking_test['xgb'] + stacking_test['cat'] + stacking_test['NN']) / 4
# simple_score = roc_auc_score(y, simple_oof)

# print(f"# Simple Average ROC AUC: {simple_score:.6f}")

# print("\n# ENSEMBLE METHODS COMPARISON")
# print("# " + "="*40)
# ensemble_results = [
#     ('Individual LightGBM', AUC_lgb),
#     ('Individual XGBoost', AUC_xgb),
#     ('Individual CatBoost', AUC_cb),
#     ('Individual NN', AUC_NN),
#     ('Weighted Average', weighted_score),
#     ('Meta-learner (LogReg)', meta_score),
#     ('Simple Average', simple_score)
# ]

# ensemble_results.sort(key=lambda x: x[1], reverse=True)

# for i, (method, score) in enumerate(ensemble_results, 1):
#     print(f"# {i}. {method:<25}: {score:.6f}")

# best_method, best_score = ensemble_results[0]
# print(f"\n# BEST ENSEMBLE METHOD: {best_method}")
# print(f"# BEST ENSEMBLE SCORE: {best_score:.6f}")

# if 'Meta-learner' in best_method:
#     final_oof = meta_oof
#     final_test = meta_test
#     print("# Using Meta-learner predictions for final submission")
# elif 'Weighted' in best_method:
#     final_oof = weighted_oof
#     final_test = weighted_test
#     print("# Using Weighted Average predictions for final submission")
# else:
#     final_oof = simple_oof
#     final_test = simple_test
#     print("# Using Simple Average predictions for final submission")

# print("\n# STACKING ENSEMBLE COMPLETED!")
# print("# " + "="*50)

In [None]:
# # 提出データ作成
# sample_submission = pd.read_csv("/kaggle/input/playground-series-s5e8/sample_submission.csv")

# sample_submission['y'] = test_results
# sample_submission.to_csv('submission.csv', index=False)
# print('Submission file saved.')