# Week 3
### Context

#### Experiment Tools
- WandB (Weights & Biases)

In [None]:
# Show the GPU attached to this runtime; train() below requests
# tree_method='gpu_hist', so a CUDA device is expected to be listed here.
!nvidia-smi

Sat Aug  7 07:34:21 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive at /content/drive so the dataset under BASE_DIR
# (defined in a later cell) can be read from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.11.2-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 8.4 MB/s 
[?25hCollecting configparser>=3.8.1
  Downloading configparser-5.0.2-py3-none-any.whl (19 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.18-py3-none-any.whl (170 kB)
[K     |████████████████████████████████| 170 kB 37.0 MB/s 
[?25hCollecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.3.1-py2.py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 40.5 MB/s 
[?25hCollecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting urllib3>=1.26.5
  Downloading urllib3-1.26.6-py2.py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 40.0 MB/s 
Collecting subprocess32>=3.5.3
  Downloading subprocess32-3.5.4.tar.gz (97 kB)
[K     |████████████████████████████████| 97 kB 7.7 MB/s 
[?25hCollect

In [None]:
# Sweep definition handed to wandb.sweep(): Bayesian search over three
# XGBoost hyperparameters, minimising the "cv_loss" metric that train() logs.
sweep_config = dict(
    name="t-academy_sweep",
    method="bayes",
    parameters=dict(
        # Integer tree depth, sampled uniformly from [2, 15].
        max_depth=dict(distribution="int_uniform", min=2, max=15),
        # Row and column sampling ratios, sampled uniformly from [0.5, 1.0].
        subsample=dict(distribution="uniform", min=0.5, max=1.0),
        colsample_bytree=dict(distribution="uniform", min=0.5, max=1.0),
    ),
    metric=dict(name="cv_loss", goal="minimize"),
)


In [None]:
import os
from os.path import join

import multiprocessing
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import wandb

# Logical CPU count of this runtime; train() derives XGBoost's n_jobs from it.
n_cpus = multiprocessing.cpu_count()

# Register the sweep under the given W&B project. The returned id is what
# wandb.agent() uses to attach to this sweep.
sweep_id = wandb.sweep(sweep_config, project="t-academy wandb demo")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Create sweep with ID: a20hskps
Sweep URL: https://wandb.ai/ilevk/t-academy%20wandb%20demo/sweeps/a20hskps


In [None]:
# Root folder of the course material on the mounted Google Drive.
BASE_DIR = '/content/drive/MyDrive/[2021_07]_T아카데미'

# Both CSV files sit in the same dataset folder under BASE_DIR.
train_path, test_path = (
    join(BASE_DIR, 'data', 'MDC14', csv_name)
    for csv_name in ('train.csv', 'test.csv')
)
data, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Keep a handle on the target column before the feature frame is trimmed.
label = data['credit']

In [None]:
# Remove columns that are not features: the `index` column from both frames
# and the target `credit` (already kept separately in `label`) from the
# training frame. Reassigning with errors='ignore' instead of dropping in place
# keeps this cell safe to re-run after the columns are already gone.
data = data.drop(columns=['index', 'credit'], errors='ignore')
test = test.drop(columns=['index'], errors='ignore')

In [None]:
# Split the feature names by dtype: object-typed columns are treated as
# categorical, every other column as numeric.
cat_columns = [name for name, dtype in data.dtypes.items() if dtype == 'O']
num_columns = [name for name in data.columns if name not in cat_columns]

print('Categorical Columns: \n{}\n'.format(cat_columns))
print('Numeric Columns: \n{}'.format(num_columns))

Categorical Columns: 
['gender', 'car', 'reality', 'income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type']

Numeric Columns: 
['child_num', 'income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'family_size', 'begin_month']


#### 라벨 데이터 인코딩

In [None]:
# Cast the target to int so it can be used as class labels by the classifier.
label = label.astype(int)

#### 전처리 프로세스 함수로 작성

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def preprocess(x_train, x_valid, x_test):
    """Impute, scale and one-hot encode the three frames of one CV split.

    Every transformer is fitted on ``x_train`` only and then applied to
    ``x_valid`` and ``x_test``. The inputs are not modified. Relies on the
    module-level ``cat_columns`` and ``num_columns`` lists.

    Parameters
    ----------
    x_train, x_valid, x_test : pandas.DataFrame
        Feature frames containing every column named in ``cat_columns`` and
        ``num_columns``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, valid, test)``. Each frame has a fresh 0..n-1 index and
        holds the non-categorical columns followed by the one-hot columns,
        which are labelled with integer positions.
    """
    # reset_index returns a new frame, so the caller's data stays untouched.
    # The one-hot frames built below carry a default RangeIndex and
    # pd.concat(axis=1) aligns on index labels, so all three frames -- the
    # test frame included -- need a 0..n-1 index to avoid misaligned rows.
    tmp_x_train = x_train.reset_index(drop=True)
    tmp_x_valid = x_valid.reset_index(drop=True)
    tmp_x_test = x_test.reset_index(drop=True)

    # Missing values: fill categorical gaps with the training mode.
    imputer = SimpleImputer(strategy='most_frequent')
    tmp_x_train[cat_columns] = imputer.fit_transform(tmp_x_train[cat_columns])
    tmp_x_valid[cat_columns] = imputer.transform(tmp_x_valid[cat_columns])
    tmp_x_test[cat_columns] = imputer.transform(tmp_x_test[cat_columns])

    # Scaling: standardise numeric columns with training statistics.
    scaler = StandardScaler()
    tmp_x_train[num_columns] = scaler.fit_transform(tmp_x_train[num_columns])
    tmp_x_valid[num_columns] = scaler.transform(tmp_x_valid[num_columns])
    tmp_x_test[num_columns] = scaler.transform(tmp_x_test[num_columns])

    # Encoding. handle_unknown='ignore' encodes a category that is missing
    # from the training fold as all zeros instead of raising. The dense-output
    # keyword was renamed across scikit-learn releases (sparse ->
    # sparse_output), so the default sparse result is densified explicitly.
    ohe = OneHotEncoder(handle_unknown='ignore')
    ohe.fit(tmp_x_train[cat_columns])

    def _with_one_hot(frame):
        # Swap the raw categorical columns for their one-hot representation.
        dense = ohe.transform(frame[cat_columns]).toarray()
        return pd.concat([frame.drop(columns=cat_columns), pd.DataFrame(dense)], axis=1)

    return (_with_one_hot(tmp_x_train),
            _with_one_hot(tmp_x_valid),
            _with_one_hot(tmp_x_test))

### Out-of-fold(OOF) Ensemble

In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from xgboost import XGBClassifier

def train():
    """Run one sweep trial: stratified 5-fold CV of an XGBoost classifier.

    Intended as the ``function`` callback of ``wandb.agent``. Reads
    ``max_depth``, ``subsample`` and ``colsample_bytree`` from
    ``wandb.config``, fits one model per fold with early stopping, and logs
    the mean validation log loss as ``cv_loss`` (the metric named in
    ``sweep_config``).

    Uses the module-level ``data``, ``label``, ``test`` and ``n_cpus``.
    Returns nothing; the result is reported through ``wandb.log``.
    """
    with wandb.init() as run:
        params = wandb.config

        val_scores = list()
        n_splits = 5

        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

        for i, (trn_idx, val_idx) in enumerate(skf.split(data, label)):
            x_train, y_train = data.iloc[trn_idx, :], label.iloc[trn_idx,]
            x_valid, y_valid = data.iloc[val_idx, :], label.iloc[val_idx,]

            # Preprocessing, fitted on this fold's training part only. The
            # transformed test frame is not needed for scoring the trial.
            x_train, x_valid, _ = preprocess(x_train, x_valid, test)

            # Model definition
            model = XGBClassifier(n_estimators=1000,
                                  max_depth=params['max_depth'],
                                  subsample=params['subsample'],
                                  colsample_bytree=params['colsample_bytree'],
                                  tree_method='gpu_hist',
                                  n_jobs=n_cpus-1)

            # Model training. The validation fold is listed last in eval_set.
            # NOTE(review): newer XGBoost releases appear to expect
            # eval_metric / early_stopping_rounds on the constructor rather
            # than on fit() -- confirm against the installed version.
            model.fit(x_train, y_train,
                      eval_metric='mlogloss',
                      eval_set=[[x_train, y_train], [x_valid, y_valid]],
                      early_stopping_rounds=100,
                      verbose=100)

            # Report log loss on the training and validation folds.
            trn_logloss = log_loss(y_train, model.predict_proba(x_train))
            val_logloss = log_loss(y_valid, model.predict_proba(x_valid))
            print('{} Fold, train logloss : {:.4f}, validation logloss : {:.4f}'.format(i, trn_logloss, val_logloss))

            val_scores.append(val_logloss)

        # One scalar per trial: the objective the sweep minimises.
        metrics = {"cv_loss": np.mean(val_scores)}
        wandb.log(metrics)
# Start a sweep agent in this process: it calls train() once per
# configuration received for the sweep, for at most `count` runs.
count = 5
wandb.agent(sweep_id, function=train, count=count)

[34m[1mwandb[0m: Agent Starting Run: 0fj57bb3 with config:
[34m[1mwandb[0m: 	colsample_bytree: 0.8387914008925683
[34m[1mwandb[0m: 	max_depth: 15
[34m[1mwandb[0m: 	subsample: 0.6186116973904416
[34m[1mwandb[0m: Currently logged in as: [33milevk[0m (use `wandb login --relogin` to force relogin)


[0]	validation_0-mlogloss:1.04148	validation_1-mlogloss:1.05192
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 100 rounds.
[100]	validation_0-mlogloss:0.249419	validation_1-mlogloss:0.732797
Stopping. Best iteration:
[52]	validation_0-mlogloss:0.38921	validation_1-mlogloss:0.714112

0 Fold, train logloss : 0.38924, validation logloss : 0.7141
[0]	validation_0-mlogloss:1.04226	validation_1-mlogloss:1.05285
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 100 rounds.
[100]	validation_0-mlogloss:0.247036	validation_1-mlogloss:0.757528
Stopping. Best iteration:
[49]	validation_0-mlogloss:0.404841	validation_1-mlogloss:0.727561

1 Fold, train logloss : 0.40484, validation logloss : 0.7276
[0]	validation_0-mlogloss:1.04094	validation_1-mlogloss:1.05273
Multiple eval metrics have 

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.72251
_runtime,119.0
_timestamp,1628322697.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


[34m[1mwandb[0m: Agent Starting Run: zrqmtxgu with config:
[34m[1mwandb[0m: 	colsample_bytree: 0.5630725873190232
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	subsample: 0.9415557607603176


[0]	validation_0-mlogloss:1.06143	validation_1-mlogloss:1.06126
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 100 rounds.
[100]	validation_0-mlogloss:0.784758	validation_1-mlogloss:0.79497
[200]	validation_0-mlogloss:0.764558	validation_1-mlogloss:0.787236
[300]	validation_0-mlogloss:0.748229	validation_1-mlogloss:0.781361
[400]	validation_0-mlogloss:0.733982	validation_1-mlogloss:0.776776
[500]	validation_0-mlogloss:0.721956	validation_1-mlogloss:0.773117
[600]	validation_0-mlogloss:0.711037	validation_1-mlogloss:0.770044
[700]	validation_0-mlogloss:0.700818	validation_1-mlogloss:0.766874
[800]	validation_0-mlogloss:0.691404	validation_1-mlogloss:0.764665
[900]	validation_0-mlogloss:0.682798	validation_1-mlogloss:0.762462
[999]	validation_0-mlogloss:0.675044	validation_1-mlogloss:0.761029
0 Fold, train logloss : 0.67514, validation logloss : 0.7610
[0]	validation_0-mlogloss:1.

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.76579
_runtime,90.0
_timestamp,1628322792.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


[34m[1mwandb[0m: Agent Starting Run: 761fjght with config:
[34m[1mwandb[0m: 	colsample_bytree: 0.9285175248130326
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	subsample: 0.8436485901088797


[0]	validation_0-mlogloss:1.05449	validation_1-mlogloss:1.05486
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 100 rounds.
[100]	validation_0-mlogloss:0.724808	validation_1-mlogloss:0.77349
[200]	validation_0-mlogloss:0.66049	validation_1-mlogloss:0.753305
[300]	validation_0-mlogloss:0.611931	validation_1-mlogloss:0.741654
[400]	validation_0-mlogloss:0.572155	validation_1-mlogloss:0.73479
[500]	validation_0-mlogloss:0.537422	validation_1-mlogloss:0.730452
[600]	validation_0-mlogloss:0.508967	validation_1-mlogloss:0.727422
[700]	validation_0-mlogloss:0.483654	validation_1-mlogloss:0.727549
Stopping. Best iteration:
[615]	validation_0-mlogloss:0.505095	validation_1-mlogloss:0.726859

0 Fold, train logloss : 0.50514, validation logloss : 0.7269
[0]	validation_0-mlogloss:1.05407	validation_1-mlogloss:1.05547
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be use

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.73587
_runtime,83.0
_timestamp,1628322879.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


[34m[1mwandb[0m: Agent Starting Run: 7bxzvm88 with config:
[34m[1mwandb[0m: 	colsample_bytree: 0.9406544662955061
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	subsample: 0.8223017927643013


[0]	validation_0-mlogloss:1.05438	validation_1-mlogloss:1.05466
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 100 rounds.
[100]	validation_0-mlogloss:0.724194	validation_1-mlogloss:0.773566
[200]	validation_0-mlogloss:0.659517	validation_1-mlogloss:0.752578
[300]	validation_0-mlogloss:0.61154	validation_1-mlogloss:0.741747
[400]	validation_0-mlogloss:0.571195	validation_1-mlogloss:0.736043
[500]	validation_0-mlogloss:0.537233	validation_1-mlogloss:0.73236
[600]	validation_0-mlogloss:0.507631	validation_1-mlogloss:0.729285
[700]	validation_0-mlogloss:0.482064	validation_1-mlogloss:0.728777
Stopping. Best iteration:
[625]	validation_0-mlogloss:0.500917	validation_1-mlogloss:0.728234

0 Fold, train logloss : 0.50094, validation logloss : 0.7282
[0]	validation_0-mlogloss:1.05408	validation_1-mlogloss:1.05554
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be us

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.73735
_runtime,83.0
_timestamp,1628322971.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


[34m[1mwandb[0m: Agent Starting Run: jaku202w with config:
[34m[1mwandb[0m: 	colsample_bytree: 0.8113843816799957
[34m[1mwandb[0m: 	max_depth: 15
[34m[1mwandb[0m: 	subsample: 0.7076145805064211


[0]	validation_0-mlogloss:1.03885	validation_1-mlogloss:1.05076
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 100 rounds.
[100]	validation_0-mlogloss:0.245812	validation_1-mlogloss:0.731015
Stopping. Best iteration:
[50]	validation_0-mlogloss:0.388743	validation_1-mlogloss:0.711666

0 Fold, train logloss : 0.38874, validation logloss : 0.7117
[0]	validation_0-mlogloss:1.04165	validation_1-mlogloss:1.05276
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 100 rounds.
[100]	validation_0-mlogloss:0.23846	validation_1-mlogloss:0.753631
Stopping. Best iteration:
[53]	validation_0-mlogloss:0.375268	validation_1-mlogloss:0.724076

1 Fold, train logloss : 0.37534, validation logloss : 0.7241
[0]	validation_0-mlogloss:1.04057	validation_1-mlogloss:1.05264
Multiple eval metrics have 

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.72179
_runtime,128.0
_timestamp,1628323108.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁
