<a href="https://colab.research.google.com/github/godlejr/PytorchSkeleton/blob/main/Machine_Learning_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data Preprocessing

In [None]:
import xgboost as xgb ## XGBoost 불러오기
from xgboost import plot_importance ## Feature Importance를 불러오기 위함

import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
from sklearn.metrics import classification_report

import argparse
from copy import deepcopy # Add Deepcopy for args

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the Wisconsin breast cancer dataset through the scikit-learn loader.
dataset = load_breast_cancer()
X_features, y_label = dataset.data, dataset.target

# Assemble a frame laid out as column1, column2, ..., target.
cancer_df = pd.DataFrame(X_features, columns=dataset.feature_names).assign(target=y_label)

cancer_df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0


In [None]:
# Show the class names, then the number of samples per target value.
# NOTE(review): the earlier comment called malignant the positive class, but the
# printed names/counts suggest target 0 = malignant and 1 = benign, i.e. the
# positive label (1) is benign -- confirm against the scikit-learn dataset docs.
print(dataset.target_names, cancer_df['target'].value_counts(), sep='\n')


['malignant' 'benign']
1    357
0    212
Name: target, dtype: int64


In [None]:
# Hold out 20% of the samples for testing and keep the other 80% for training.
# The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_label,
    test_size=0.2,
    random_state=42,
)

print(*(part.shape for part in (X_train, X_test)))

(455, 30) (114, 30)


### Train & Test function

In [None]:
def train(params, dtrain, num_rounds, wlist, early_stopping_rounds=None):
    """Fit a booster through the native ``xgb.train`` API.

    Args:
        params: Booster parameter dict, forwarded as ``params``.
        dtrain: Training data, forwarded as ``dtrain``.
        num_rounds: Number of boosting rounds, forwarded as ``num_boost_round``.
        wlist: Evaluation sets, forwarded as ``evals``.
        early_stopping_rounds: Optional value forwarded as
            ``early_stopping_rounds``. Early stopping is an argument of
            ``xgb.train`` rather than an entry of ``params``. The default
            ``None`` leaves it disabled, which matches calls that pass only
            the first four arguments.

    Returns:
        The object returned by ``xgb.train`` (the trained model).
    """
    return xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=num_rounds,
        evals=wlist,
        early_stopping_rounds=early_stopping_rounds,
    )


### Experiment

In [None]:
def get_clf_eval(y_test, y_pred, max_depth, sub_sample, pred_proba=None):
    """Print classification metrics for one experiment run.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted class labels, used for accuracy, precision, recall
            and F1.
        max_depth: Echoed in the report to identify the run; not used in any
            computation.
        sub_sample: Echoed in the report to identify the run; not used in any
            computation.
        pred_proba: Optional predicted scores/probabilities for the positive
            class. When given, AUC is computed from these scores. When
            omitted, AUC falls back to ``y_pred`` as before, which only
            reflects the single thresholded operating point.

    Returns:
        None. The metrics are printed.
    """
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)

    # ROC AUC is a ranking metric, so prefer continuous scores when available.
    auc_scores = y_pred if pred_proba is None else pred_proba
    AUC = roc_auc_score(y_test, auc_scores)

    print('max_depth:\n', max_depth)
    print('sub_sample:\n', sub_sample)
    print('\n정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('F1: {:.4f}'.format(F1))
    print('AUC: {:.4f}'.format(AUC))

In [None]:
def experiment(args):
    """Train one XGBoost binary classifier from ``args`` and print its test metrics.

    The data is not passed in: ``X_train``, ``X_test``, ``y_train`` and
    ``y_test`` are read from module scope.

    Args:
        args: Namespace-like object providing ``lr``, ``min_child_weight``,
            ``max_depth``, ``gamma``, ``sub_sample``, ``alpha``,
            ``colsample_bytree``, ``clambda``, ``scale_pos_weight`` and
            ``num_boost_round``.

    Returns:
        None. Metrics are printed by ``get_clf_eval``.
    """
    # ======= Hyperparameters ======= #
    params = {
        'eta': args.lr,
        'min_child_weight': args.min_child_weight,
        'max_depth': args.max_depth,
        'gamma': args.gamma,
        # XGBoost spells the row-sampling parameter 'subsample'. The previous
        # 'sub_sample' key is not a recognised parameter, so the value of
        # args.sub_sample never reached the booster.
        'subsample': args.sub_sample,
        'alpha': args.alpha,
        'colsample_bytree': args.colsample_bytree,
        'lambda': args.clambda,
        'scale_pos_weight': args.scale_pos_weight,
        # Fixed for every run of the experiment.
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
    }
    # NOTE: the former 'early_stoppings' entry was dropped. It is not an
    # XGBoost parameter and had no effect. Early stopping is requested through
    # the ``early_stopping_rounds`` argument of ``xgb.train``, which is not
    # set by the ``train`` call below, so all ``args.num_boost_round`` rounds run.

    # =========== Train & Test ============= #
    # Wrap the NumPy train/test arrays in DMatrix objects for the native API.
    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dtest = xgb.DMatrix(data=X_test, label=y_test)
    # Both sets are passed as evaluation sets, labelled 'train' and 'eval'.
    wlist = [(dtrain, 'train'), (dtest, 'eval')]

    xgb_model = train(params, dtrain, args.num_boost_round, wlist)
    pred_probs = xgb_model.predict(dtest)

    # =========== Evaluation =========== #
    # Predict 1 when the predicted probability is above 0.5, otherwise 0.
    preds = np.where(pred_probs > 0.5, 1, 0)
    get_clf_eval(y_test, preds, args.max_depth, args.sub_sample)
    


### Hyper Parameter

eta (0.3)	
- GBM의 learning rate와 같은 파라미터
- 범위: 0 ~ 1

num_boost_round (10)
- 생성할 weak learner의 수

min_child_weight (1) 
- GBM의 min_samples_leaf와 유사
- 관측치에 대한 가중치 합의 최소를 말하지만 GBM에서는 관측치 수에 대한 최소를 의미
- 과적합 조절 용도
- 범위: 0 ~ ∞

gamma (0)	
- 리프노드의 추가분할을 결정할 최소손실 감소값
- 해당값보다 손실이 크게 감소할 때 분리
- 값이 클수록 과적합 감소효과
- 범위: 0 ~ ∞

max_depth (6)	
- 트리 기반 알고리즘의 max_depth와 동일
- 0을 지정하면 깊이의 제한이 없음
- 너무 크면 과적합(통상 3~10정도 적용)
- 범위: 0 ~ ∞

subsample (1)
- GBM의 subsample과 동일
- 데이터 샘플링 비율 지정(과적합 제어)
- 일반적으로 0.5~1 사이의 값을 사용
- 범위: 0 ~ 1

colsample_bytree (1)	
- GBM의 max_features와 유사
- 트리 생성에 필요한 피처의 샘플링에 사용
- 피처가 많을 때 과적합 조절에 사용
- 범위: 0 ~ 1

lambda (1)
- L2 Regularization 적용 값
- 피처 개수가 많을 때 적용을 검토
- 클수록 과적합 감소 효과

alpha (0)	
- L1 Regularization 적용 값
- 피처 개수가 많을 때 적용을 검토
- 클수록 과적합 감소 효과

scale_pos_weight(1)	
- 불균형 데이터셋의 균형을 유지

In [None]:


seed = 42
np.random.seed(seed)

parser = argparse.ArgumentParser()
# No options are registered, so parsing an empty argument list simply yields
# an empty Namespace to hang the experiment settings on.
args = parser.parse_args([])

args.exp_name = "exp1_lr"

# Settings held constant across the sweep. They are attached in this order so
# that the printed Namespace keeps the same attribute order.
vars(args).update(
    # ====== Model Capacity ===== #
    scale_pos_weight=1,   # balances an imbalanced dataset
    # ====== Regularization ======= #
    min_child_weight=1,   # minimum sum of instance weights in a child
    gamma=0,              # minimum loss reduction to split a leaf further (0 ~ inf)
    colsample_bytree=1,   # fraction of features sampled per tree
    clambda=1,            # L2 regularization (XGBoost 'lambda'); larger = less overfitting
    alpha=0,              # L1 regularization; larger = less overfitting
    # ====== Experiment Variable ====== #
    num_boost_round=400,  # number of weak learners / boosting rounds
    sub_sample=1,         # row sampling ratio
)

# Tuning notes (presumably levers for reducing overfitting):
#   - lower eta (0.01 ~ 0.1); when eta goes down, raise
#     num_boost_round (n_estimators) to compensate
#   - lower max_depth
#   - raise min_child_weight
#   - raise gamma
#   - lower subsample and colsample_bytree

# Swept variables: the attribute name on args and the values to try for each.
name_var1 = 'lr'
name_var2 = 'max_depth'
list_var1 = [0.01, 0.001, 0.0001]  # learning rate (passed to XGBoost as eta)
list_var2 = [3, 5, 7]  # max_depth

# ====== Grid Sweep ====== #
# Run one experiment per (lr, max_depth) pair, with lr as the outer loop.
# Each run receives its own deep copy of args.
for var1, var2 in ((outer, inner) for outer in list_var1 for inner in list_var2):
    setattr(args, name_var1, var1)
    setattr(args, name_var2, var2)
    print(args)
    experiment(deepcopy(args))


Namespace(alpha=0, clambda=1, colsample_bytree=1, exp_name='exp1_lr', gamma=0, lr=0.01, max_depth=3, min_child_weight=1, num_boost_round=400, scale_pos_weight=1, sub_sample=1)
[0]	train-logloss:0.684412	eval-logloss:0.684888
[1]	train-logloss:0.675844	eval-logloss:0.676793
[2]	train-logloss:0.667439	eval-logloss:0.66887
[3]	train-logloss:0.659191	eval-logloss:0.66109
[4]	train-logloss:0.651124	eval-logloss:0.653205
[5]	train-logloss:0.643193	eval-logloss:0.645784
[6]	train-logloss:0.635417	eval-logloss:0.638184
[7]	train-logloss:0.627767	eval-logloss:0.631169
[8]	train-logloss:0.620257	eval-logloss:0.624297
[9]	train-logloss:0.612888	eval-logloss:0.617081
[10]	train-logloss:0.605799	eval-logloss:0.610401
[11]	train-logloss:0.598684	eval-logloss:0.603759
[12]	train-logloss:0.591699	eval-logloss:0.596899
[13]	train-logloss:0.584968	eval-logloss:0.590571
[14]	train-logloss:0.57821	eval-logloss:0.584265
[15]	train-logloss:0.57157	eval-logloss:0.577756
[16]	train-logloss:0.565173	eval-loglo