### Context

#### Experiment Tools
- Weights & Biases (wandb)

In [1]:
# Sweep definition handed to wandb.sweep(): a Bayesian search over the three
# XGBoost hyper-parameters read in train(), minimising the logged "cv_loss".
sweep_config = dict(
    name="mdc_sweep",
    method="bayes",
    parameters=dict(
        # Integer-valued tree depth.
        max_depth=dict(distribution="int_uniform", min=2, max=15),
        # Row and column sampling ratios share the same continuous range.
        subsample=dict(distribution="uniform", min=0.5, max=1.0),
        colsample_bytree=dict(distribution="uniform", min=0.5, max=1.0),
    ),
    metric=dict(name="cv_loss", goal="minimize"),
)


In [3]:
import os
from os.path import join

import multiprocessing
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import wandb

# Authenticate only. The sweep agent opens its own run for every trial inside
# train(), so calling wandb.init() here would just leave an extra, never-finished
# run behind.
wandb.login()

# Register the sweep described by sweep_config; the returned id is what
# wandb.agent() needs to pull trials from it.
sweep_id = wandb.sweep(sweep_config, project="medici wandb test")

# Number of CPUs reported by the OS; train() uses it to size XGBoost's n_jobs.
n_cpus = multiprocessing.cpu_count()

wandb: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ········


wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\medici/.netrc




Create sweep with ID: 9hyhq5rj
Sweep URL: https://wandb.ai/chanwoong/medici%20wandb%20test/sweeps/9hyhq5rj


In [4]:
# Every input CSV lives under one directory, relative to the notebook.
BASE_DIR = './data'

train_path, test_path, submission_path = (
    os.path.join(BASE_DIR, file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Read the training and test tables, in that order.
data, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Keep a handle on the target column before the feature frame is trimmed.
label = data['credit']

In [5]:
# Remove unneeded columns: 'index' from both frames, plus the target 'credit'
# from the training frame (it is already held separately in `label`).
data = data.drop(columns=['index', 'credit'])
test = test.drop(columns=['index'])

In [6]:
# Split the feature names by dtype: object-typed columns are treated as
# categorical, every remaining column as numeric.
cat_columns = [name for name, dtype in data.dtypes.items() if dtype == 'O']
num_columns = [name for name in data.columns if name not in cat_columns]

print('Categorical Columns: \n{}\n'.format(cat_columns))
print('Numeric Columns: \n{}'.format(num_columns))

Categorical Columns: 
['gender', 'car', 'reality', 'income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type']

Numeric Columns: 
['child_num', 'income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'family_size', 'begin_month']


#### 라벨 데이터 인코딩

In [7]:
# Cast the target Series to integers (presumably the CSV stores the class ids as floats — confirm).
label = label.astype(int)

#### 전처리 프로세스 함수로 작성

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def preprocess(x_train, x_valid, x_test):
    """Impute, scale and one-hot encode one train/validation/test split.

    Every transformer is fitted on ``x_train`` only and then applied to the
    other two frames, so no statistics come from the validation or test data.
    The column lists are taken from the notebook-level ``cat_columns`` and
    ``num_columns``.

    Parameters
    ----------
    x_train, x_valid, x_test : pandas.DataFrame
        Raw feature frames containing every column named in ``cat_columns``
        and ``num_columns``. The inputs themselves are not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, valid, test)``, each with a fresh positional index: the
        non-categorical columns (the numeric ones standardised) followed by
        the one-hot columns, which are labelled with consecutive integers.
    """
    # reset_index returns new frames, so the callers' data stays untouched.
    # A positional index on all three frames (the test frame included) is what
    # lets the one-hot frames built below line up row-for-row in pd.concat.
    frames = [df.reset_index(drop=True) for df in (x_train, x_valid, x_test)]
    fit_frame = frames[0]

    # Missing categorical values -> most frequent category of the training fold.
    imputer = SimpleImputer(strategy='most_frequent').fit(fit_frame[cat_columns])
    for df in frames:
        df[cat_columns] = imputer.transform(df[cat_columns])

    # Standardise numeric columns with the training fold's mean / std.
    scaler = StandardScaler().fit(fit_frame[num_columns])
    for df in frames:
        df[num_columns] = scaler.transform(df[num_columns])

    # One-hot encoding. handle_unknown='ignore' encodes a category that never
    # occurred in the training fold as all zeros instead of raising an error.
    try:
        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # scikit-learn < 1.2 only knows the older ``sparse`` keyword.
        ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
    ohe.fit(fit_frame[cat_columns])

    # Swap the raw categorical columns for their one-hot representation.
    encoded = []
    for df in frames:
        one_hot = pd.DataFrame(ohe.transform(df[cat_columns]))
        encoded.append(pd.concat([df.drop(columns=cat_columns), one_hot], axis=1))

    return tuple(encoded)

### Ensemble

In [9]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from xgboost import XGBClassifier

def train():
    """Run one sweep trial: 5-fold stratified CV of an XGBoost classifier.

    The hyper-parameters ``max_depth``, ``subsample`` and ``colsample_bytree``
    are read from the W&B run config. The mean validation log-loss over the
    folds is logged as ``cv_loss``, the metric named in ``sweep_config``.
    Reads the notebook-level ``data``, ``label``, ``test`` and ``n_cpus``.
    """
    with wandb.init() as run:
        params = run.config

        n_splits = 5
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

        val_scores = []
        for fold, (trn_idx, val_idx) in enumerate(skf.split(data, label)):
            x_train, y_train = data.iloc[trn_idx], label.iloc[trn_idx]
            x_valid, y_valid = data.iloc[val_idx], label.iloc[val_idx]

            # Preprocessing is fitted on the training fold only. The transformed
            # test frame is not needed for scoring, so it is discarded.
            x_train, x_valid, _ = preprocess(x_train, x_valid, test)

            # Model definition.
            model = XGBClassifier(n_estimators=1000,
                                  max_depth=params['max_depth'],
                                  subsample=params['subsample'],
                                  colsample_bytree=params['colsample_bytree'],
                                  # tree_method='gpu_hist',  # GPU training, left disabled
                                  n_jobs=n_cpus - 1)

            # Fit with early stopping monitored on the validation fold (the last
            # entry of eval_set).
            # NOTE(review): newer xgboost releases expect eval_metric and
            # early_stopping_rounds on the constructor — confirm before upgrading.
            model.fit(x_train, y_train,
                      eval_metric='mlogloss',
                      eval_set=[(x_train, y_train), (x_valid, y_valid)],
                      early_stopping_rounds=100,
                      verbose=100)

            # Log-loss on the training and validation folds.
            trn_logloss = log_loss(y_train, model.predict_proba(x_train))
            val_logloss = log_loss(y_valid, model.predict_proba(x_valid))
            print('{} Fold, train logloss : {:.4f}, validation logloss : {:.4f}'.format(fold, trn_logloss, val_logloss))

            val_scores.append(val_logloss)

        # Single summary value the sweep optimises.
        run.log({"cv_loss": np.mean(val_scores)})
# Number of sweep trials this agent runs before it stops.
count = 5
# Start the agent: it calls train() once per trial of the sweep identified by sweep_id.
wandb.agent(sweep_id, function=train, count=count)

wandb: Agent Starting Run: k04hggzp with config:
wandb: 	colsample_bytree: 0.7905667116083781
wandb: 	max_depth: 7
wandb: 	subsample: 0.9980462160605172


[0]	validation_0-mlogloss:0.97212	validation_1-mlogloss:0.97695
[100]	validation_0-mlogloss:0.50743	validation_1-mlogloss:0.72907
[200]	validation_0-mlogloss:0.37660	validation_1-mlogloss:0.72269
[274]	validation_0-mlogloss:0.31588	validation_1-mlogloss:0.73665
0 Fold, train logloss : 0.40004, validation logloss : 0.7213
[0]	validation_0-mlogloss:0.97104	validation_1-mlogloss:0.97829
[100]	validation_0-mlogloss:0.50158	validation_1-mlogloss:0.73756
[200]	validation_0-mlogloss:0.36749	validation_1-mlogloss:0.73987
[222]	validation_0-mlogloss:0.34652	validation_1-mlogloss:0.74247
1 Fold, train logloss : 0.46644, validation logloss : 0.7339
[0]	validation_0-mlogloss:0.99904	validation_1-mlogloss:1.00361
[100]	validation_0-mlogloss:0.50782	validation_1-mlogloss:0.73889
[200]	validation_0-mlogloss:0.38069	validation_1-mlogloss:0.74159
[221]	validation_0-mlogloss:0.35857	validation_1-mlogloss:0.74604
2 Fold, train logloss : 0.46894, validation logloss : 0.7353
[0]	validation_0-mlogloss:0.999

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.73058
_runtime,52.0
_timestamp,1628321146.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


wandb: Agent Starting Run: 7c7hct7j with config:
wandb: 	colsample_bytree: 0.7879712600398464
wandb: 	max_depth: 6
wandb: 	subsample: 0.8861332993279486
wandb: Currently logged in as: chanwoong (use `wandb login --relogin` to force relogin)


[0]	validation_0-mlogloss:0.97468	validation_1-mlogloss:0.97688
[100]	validation_0-mlogloss:0.55388	validation_1-mlogloss:0.72879
[200]	validation_0-mlogloss:0.43780	validation_1-mlogloss:0.71815
[300]	validation_0-mlogloss:0.35618	validation_1-mlogloss:0.72764
0 Fold, train logloss : 0.43674, validation logloss : 0.7177
[0]	validation_0-mlogloss:0.97376	validation_1-mlogloss:0.97940
[100]	validation_0-mlogloss:0.55424	validation_1-mlogloss:0.74607
[200]	validation_0-mlogloss:0.43079	validation_1-mlogloss:0.74180
[300]	validation_0-mlogloss:0.34797	validation_1-mlogloss:0.75651
[304]	validation_0-mlogloss:0.34520	validation_1-mlogloss:0.75665
1 Fold, train logloss : 0.42544, validation logloss : 0.7415
[0]	validation_0-mlogloss:1.00160	validation_1-mlogloss:1.00597
[100]	validation_0-mlogloss:0.55427	validation_1-mlogloss:0.74272
[200]	validation_0-mlogloss:0.43338	validation_1-mlogloss:0.73969
[250]	validation_0-mlogloss:0.38973	validation_1-mlogloss:0.74626
2 Fold, train logloss : 0.

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.73146
_runtime,54.0
_timestamp,1628321209.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


wandb: Agent Starting Run: g3evbf7q with config:
wandb: 	colsample_bytree: 0.9229554981220864
wandb: 	max_depth: 8
wandb: 	subsample: 0.6410399369689497


[0]	validation_0-mlogloss:0.96959	validation_1-mlogloss:0.97586
[100]	validation_0-mlogloss:0.38526	validation_1-mlogloss:0.73154
[168]	validation_0-mlogloss:0.26995	validation_1-mlogloss:0.76700
0 Fold, train logloss : 0.46434, validation logloss : 0.7245
[0]	validation_0-mlogloss:0.96994	validation_1-mlogloss:0.97910
[100]	validation_0-mlogloss:0.37578	validation_1-mlogloss:0.75351
[168]	validation_0-mlogloss:0.26767	validation_1-mlogloss:0.79165
1 Fold, train logloss : 0.45544, validation logloss : 0.7485
[0]	validation_0-mlogloss:0.99786	validation_1-mlogloss:1.00452
[100]	validation_0-mlogloss:0.37273	validation_1-mlogloss:0.75511
[163]	validation_0-mlogloss:0.26580	validation_1-mlogloss:0.79552
2 Fold, train logloss : 0.47144, validation logloss : 0.7442
[0]	validation_0-mlogloss:0.99674	validation_1-mlogloss:1.00308
[100]	validation_0-mlogloss:0.38197	validation_1-mlogloss:0.73712
[170]	validation_0-mlogloss:0.26511	validation_1-mlogloss:0.77782
3 Fold, train logloss : 0.44984, 

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.73689
_runtime,51.0
_timestamp,1628321270.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


wandb: Agent Starting Run: j8q3mlh1 with config:
wandb: 	colsample_bytree: 0.6383242015013462
wandb: 	max_depth: 7
wandb: 	subsample: 0.9866775129350097


[0]	validation_0-mlogloss:0.97310	validation_1-mlogloss:0.97730
[100]	validation_0-mlogloss:0.51512	validation_1-mlogloss:0.72076
[200]	validation_0-mlogloss:0.38612	validation_1-mlogloss:0.72102
[226]	validation_0-mlogloss:0.35750	validation_1-mlogloss:0.72517
0 Fold, train logloss : 0.47564, validation logloss : 0.7166
[0]	validation_0-mlogloss:0.97177	validation_1-mlogloss:0.97860
[100]	validation_0-mlogloss:0.51277	validation_1-mlogloss:0.73633
[200]	validation_0-mlogloss:0.38614	validation_1-mlogloss:0.73430
[255]	validation_0-mlogloss:0.33748	validation_1-mlogloss:0.74248
1 Fold, train logloss : 0.43644, validation logloss : 0.7306
[0]	validation_0-mlogloss:1.00046	validation_1-mlogloss:1.00595
[100]	validation_0-mlogloss:0.50699	validation_1-mlogloss:0.73064
[200]	validation_0-mlogloss:0.38308	validation_1-mlogloss:0.73040
[235]	validation_0-mlogloss:0.35079	validation_1-mlogloss:0.73653
2 Fold, train logloss : 0.45164, validation logloss : 0.7254
[0]	validation_0-mlogloss:0.999

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.72421
_runtime,49.0
_timestamp,1628321355.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


wandb: Agent Starting Run: oxnd0n5e with config:
wandb: 	colsample_bytree: 0.5725564757511616
wandb: 	max_depth: 7
wandb: 	subsample: 0.9689280453272853


[0]	validation_0-mlogloss:0.97361	validation_1-mlogloss:0.97653
[100]	validation_0-mlogloss:0.52229	validation_1-mlogloss:0.72092
[200]	validation_0-mlogloss:0.39180	validation_1-mlogloss:0.71667
[246]	validation_0-mlogloss:0.35181	validation_1-mlogloss:0.71867
0 Fold, train logloss : 0.45584, validation logloss : 0.7141
[0]	validation_0-mlogloss:0.97208	validation_1-mlogloss:0.97876
[100]	validation_0-mlogloss:0.51413	validation_1-mlogloss:0.73508
[200]	validation_0-mlogloss:0.38482	validation_1-mlogloss:0.72969
[263]	validation_0-mlogloss:0.33190	validation_1-mlogloss:0.74245
1 Fold, train logloss : 0.42354, validation logloss : 0.7271
[0]	validation_0-mlogloss:1.00037	validation_1-mlogloss:1.00541
[100]	validation_0-mlogloss:0.51586	validation_1-mlogloss:0.73227
[200]	validation_0-mlogloss:0.39055	validation_1-mlogloss:0.73565
[212]	validation_0-mlogloss:0.37837	validation_1-mlogloss:0.73759
2 Fold, train logloss : 0.49234, validation logloss : 0.7314
[0]	validation_0-mlogloss:0.999

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.72504
_runtime,47.0
_timestamp,1628321411.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


In [None]:
# submit.to_csv('oof_first_submit.csv', index=False)