In [1]:
import os

# Process-level environment variables, set before the heavy libraries are imported.
# PYTHONIOENCODING: intended to switch the default I/O encoding to UTF-8.
#   NOTE(review): Python reads this variable at interpreter start-up, so setting it
#   here presumably only affects child processes - confirm this is the intent.
# LOKY_MAX_CPU_COUNT: presumably caps the core count seen by the loky/joblib backend - TODO confirm.
os.environ.update({
    'PYTHONIOENCODING': 'utf8',
    'LOKY_MAX_CPU_COUNT': '4',
})
# os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# 📌Import Module & Load Data

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import torch
from tqdm import tqdm

# preprocessing
from sklearn.preprocessing import FunctionTransformer, StandardScaler, MinMaxScaler, RobustScaler

# cv
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import cross_val_predict

# model
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

# pipeline & ensemble
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import VotingClassifier

# parameter
import optuna

In [3]:
# Directory that holds the competition CSV files.
# Override it with the S3E24_DATA_DIR environment variable to run on another machine;
# the default is the original author's local Kaggle download location.
DATA_DIR = os.environ.get(
    'S3E24_DATA_DIR',
    'C:/Users/Gyeom/.kaggle/input/playground-series-s3e24',
)

# Raw train / test frames exactly as read from disk.
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')

---

# ⚙️Setting

In [4]:
# Keep the test ids for the submission file, then remove the 'id' column from both
# frames so it is not fed to the model as a feature.
# Note: this cell cannot be re-run on the same kernel - once 'id' has been dropped,
# df_test['id'] raises KeyError.
ids_test = df_test['id']
df_train, df_test = (frame.drop(columns=['id']) for frame in (df_train, df_test))

In [5]:
# Split the given data into a feature matrix (X) and a target vector (y).
def generateXY(trainData, target='smoking'):
    """Separate a labelled frame into features and target.

    Parameters
    ----------
    trainData : pandas.DataFrame
        Frame that contains the feature columns plus the target column.
    target : str, default 'smoking'
        Name of the target column.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is ``trainData`` without the target column and
        ``y`` is the target column as a Series. ``trainData`` is not modified.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``trainData``.
    """
    x = trainData.drop(columns=target)
    y = trainData[target]
    return x, y

In [6]:
# import warnings
# warnings.filterwarnings("ignore")

# def objectiveTab(trial):
#     # TabNet parameters to optimize
#     n_d_a = trial.suggest_int('n_d_a', 8, 64, 4)
#     clf_params = {
#         'mask_type': trial.suggest_categorical("mask_type", ["sparsemax", "entmax"]),
#         'n_d': n_d_a,
#         'n_a': n_d_a,
#         'n_steps': trial.suggest_int('n_steps', 3, 10),
#         'gamma': trial.suggest_float('gamma', 1.0, 2.0),
#         'n_shared': trial.suggest_int("n_shared", 1, 3),
#         'lambda_sparse': trial.suggest_float('lambda_sparse', 1e-5, 1e-1, log=True),
#         'optimizer_fn': torch.optim.Adam,
#         'optimizer_params': dict(lr=trial.suggest_float('lr',1e-3 ,3e-2, log=True)),
#         'verbose': 0,
#     }
    
#     # Data setup
#     X, y = generateXY(df_train)
#     str_kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=42)
    
#     # Initialize the list that stores the results
#     scores_list = []
#     for i, (train_index, valid_index) in enumerate(str_kf.split(X, y)):
#         X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[valid_index]
#         y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[valid_index]
        
#         scaler = StandardScaler()
#         X_train_scaled = scaler.fit_transform(X_train_fold.values)
#         X_val_scaled = scaler.transform(X_val_fold.values)

#         clf = TabNetClassifier(**clf_params)
#         clf.fit(
#             X_train_scaled,
#             y_train_fold.values,
#             eval_set=[(X_val_scaled,y_val_fold.values)],
#             eval_metric=['auc'],
#             max_epochs = 100,
#             # early stopping options - stop training if metric does not improve for 10 epochs.
#             patience=10 
#         )
#         scores_list.append(clf.best_cost)
# #         preds_val = clf.predict_proba(X_val_fold.values)
# #         score_val_auc = roc_auc_score(y_true=y_val_fold.values, y_score=preds_val[:,1])
# #         print(f'TabNetClassifier has ROC AUC Score(Avg) = {score_val_auc}')

# #         scores_list.append(score_val_auc)
#     print(f'TabNetClassifier has ROC AUC Score(Avg) = {np.mean(scores_list):.7f} +/- {np.std(scores_list) * 2:.5f}')
#     print()
#     return np.mean(scores_list)

In [7]:
# study = optuna.create_study(direction='maximize', study_name='TabNet optimization')
# study.optimize(objectiveTab, n_trials=30)

# best_trial = study.best_trial

# print(f"Best Trial: {best_trial.number}")
# print(f"Best Value: {best_trial.value}")
# print("Best Parameters:")
# for key, value in best_trial.params.items():
#     print(f"{key}: {value}")

---

# 📈Train

In [8]:
# Candidate TabNet hyperparameter sets, used as keyword arguments for TabNetClassifier.
# Presumably these come from the Optuna search that is kept (disabled) in this notebook - TODO confirm.
# All three share the Adam optimizer and silent training (verbose=0).
opt_tabn_params = dict(
    n_d=8,
    n_a=8,
    mask_type='entmax',
    n_steps=4,
    gamma=1.243456559159334,
    n_shared=3,
    lambda_sparse=0.0003086035231373375,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': 0.025040231307553586},
    verbose=0,
)
opt_tabn_params2 = dict(
    n_d=32,
    n_a=32,
    mask_type='entmax',
    n_steps=5,
    gamma=1.281864812982242,
    n_shared=2,
    lambda_sparse=0.00035724765382634565,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': 0.014849401156317103},
    verbose=0,
)
opt_tabn_params3 = dict(
    n_d=20,
    n_a=20,
    mask_type='sparsemax',
    n_steps=4,
    gamma=1.989597966500972,
    n_shared=1,
    lambda_sparse=5.2615185452059766e-05,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': 0.01989272121849772},
    verbose=0,
)

# Prepare the dataset: feature matrix X and target vector y.
X, y = generateXY(df_train)

# Validation ROC AUC of each fold, collected for the summary line below.
scores_list = []

# Stratified 5-fold split, shuffled with a fixed random_state.
str_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): the disabled Optuna objective in this notebook standardizes the features
# (StandardScaler) before fitting, whereas this loop feeds the raw values - confirm the
# tuned parameters are meant to be used on unscaled inputs.
for i, (train_index, valid_index) in enumerate(str_kf.split(X, y)):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[valid_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[valid_index]

    # A fresh model for every fold, built from the second parameter set.
    clf = TabNetClassifier(**opt_tabn_params2)
    clf.fit(
        X_train=X_train_fold.values,
        y_train=y_train_fold.values,
        eval_set=[(X_val_fold.values, y_val_fold.values)],
        eval_metric=['auc'],
        max_epochs=100,
        patience=10  # early stopping options - stop training if metric does not improve for 10 epochs.
    )

    # Score the held-out part of this fold using the positive-class probability.
    preds_val = clf.predict_proba(X_val_fold.values)
    score_val_auc = roc_auc_score(y_true=y_val_fold.values, y_score=preds_val[:, 1])
    # This value belongs to a single fold, so it is labelled with the fold number
    # instead of "(Avg)"; the average is printed once after the loop.
    print(f'TabNetClassifier fold {i + 1} ROC AUC Score = {score_val_auc}')

    scores_list.append(score_val_auc)

# Mean over the folds; the spread is reported as two standard deviations.
# After the loop, `clf` still refers to the model fitted on the last fold.
print(f'★TabNetClassifier has ROC AUC Score(Avg) = {np.mean(scores_list):.7f} +/- {np.std(scores_list) * 2:.5f}')

  0%|          | 0/5 [00:00<?, ?it/s]


Early stopping occurred at epoch 48 with best_epoch = 38 and best_val_0_auc = 0.85852


 20%|██        | 1/5 [06:44<26:58, 404.52s/it]

TabNetClassifier has ROC AUC Score(Avg) = 0.8585188631121607

Early stopping occurred at epoch 23 with best_epoch = 13 and best_val_0_auc = 0.86096


 40%|████      | 2/5 [10:02<14:09, 283.18s/it]

TabNetClassifier has ROC AUC Score(Avg) = 0.8609559463029524

Early stopping occurred at epoch 24 with best_epoch = 14 and best_val_0_auc = 0.85905


 60%|██████    | 3/5 [13:30<08:17, 248.75s/it]

TabNetClassifier has ROC AUC Score(Avg) = 0.8590512944217419

Early stopping occurred at epoch 33 with best_epoch = 23 and best_val_0_auc = 0.85724


 80%|████████  | 4/5 [18:04<04:18, 258.73s/it]

TabNetClassifier has ROC AUC Score(Avg) = 0.8572407583045842

Early stopping occurred at epoch 25 with best_epoch = 15 and best_val_0_auc = 0.86135


100%|██████████| 5/5 [21:36<00:00, 259.27s/it]

TabNetClassifier has ROC AUC Score(Avg) = 0.8613545478955196
★TabNetClassifier has ROC AUC Score(Avg) = 0.8594243 +/- 0.00307





In [None]:
# opt_tabn_params ★TabNetClassifier has ROC AUC Score(Avg) = 0.8594234 +/- 0.00176
# opt_tabn_params3 ★TabNetClassifier has ROC AUC Score(Avg) = 0.8594243 +/- 0.00307

In [9]:
# Predict on the test set with `clf`, which at this point is the model left over
# from the last cross-validation fold, and keep the probability of class 1.
class_proba = clf.predict_proba(df_test.values)
preds = class_proba[:, 1]

# Submission frame: one row per test id with its predicted smoking probability.
df_submit = pd.DataFrame(data={'id': ids_test, 'smoking': preds})
df_submit.head()

Unnamed: 0,id,smoking
0,159256,0.637591
1,159257,0.209604
2,159258,0.398783
3,159259,0.020589
4,159260,0.698793


In [11]:
# Write the submission to the working directory without the pandas index column.
df_submit.to_csv(path_or_buf='opt_tabnet_submission.csv', index=False)

In [12]:
# !kaggle competitions submit -c playground-series-s3e24 -f opt_tabnet_submission.csv -m "최적화 TabNet 모델"

Successfully submitted to Binary Prediction of Smoker Status using Bio-Signals



  0%|          | 0.00/1.91M [00:00<?, ?B/s]
  0%|          | 8.00k/1.91M [00:00<00:48, 41.0kB/s]
 19%|█▉        | 368k/1.91M [00:00<00:01, 1.42MB/s] 
 54%|█████▎    | 1.02M/1.91M [00:00<00:00, 3.36MB/s]
 77%|███████▋  | 1.46M/1.91M [00:00<00:00, 3.75MB/s]
 98%|█████████▊| 1.88M/1.91M [00:01<00:00, 1.74MB/s]
100%|██████████| 1.91M/1.91M [00:02<00:00, 674kB/s] 
