In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
import shap
import lime
import lime.lime_tabular

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [3]:
df = pd.read_csv('dataset/heloc.csv')

## Train Model

In [4]:
# 특징과 라벨 분리 (헬로크 데이터에서 y값을 'RiskPerformance'로 가정)
X = df.drop(columns=['RiskPerformance'])  # 입력 변수
y = df['RiskPerformance'].apply(lambda x: 1 if x == 'Bad' else 0)  # 'Bad'를 1로, 'Good'을 0으로 변환

# 학습/테스트 데이터 분리
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost용 데이터 형식 변환
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)

# 기본 하이퍼파라미터 설정
param = {'silent':True, 'objective':'binary:logistic', "eta":0.05, 'eval_metric': 'rmse',
         'monotone_constraints':"(1,1,1,1,-1,-1,1,0,0,-1,-1,-1,0,-1,0,1,1)"}

# Cross-validation으로 적절한 boosting round 찾기
bst_cv = xgb.cv(param, dtrain, 500, nfold=10, early_stopping_rounds=10)

# 모델 훈련
evals_result = {}
evallist  = [(dtrain, 'train'), (dtest, 'eval')]
bst = xgb.train(param, dtrain, num_boost_round=bst_cv.shape[0], evals_result=evals_result, evals=evallist, verbose_eval=False)

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.



## Save Trained Model

In [6]:
model_path = 'model/xgb.model'
bst.save_model(model_path)

