In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import shap
from sklearn.metrics import accuracy_score, confusion_matrix

In [4]:
# Load the training table and the evaluation table from CSV.
train = pd.read_csv('../data/train/train_gs_2306_m3_20%.csv')
test = pd.read_csv('../data/remove_zero_gs_2306_m3.csv')

# Build the XGBoost DMatrix objects: every column except 'target' is a
# feature, and the 'target' column supplies the label.
dtrain, dtest = (
    xgb.DMatrix(frame.drop(columns='target'), label=frame[['target']])
    for frame in (train, test)
)

In [7]:
# Show the column labels of the evaluation table (feature columns plus
# 'target'); pandas abbreviates the middle of a long Index with '...'.
print(test.columns)


Index(['band 1', 'band 2', 'band 3', 'band 4', 'band 5', 'band 6', 'band 7',
       'band 8', 'band 9', 'band 10',
       ...
       'nx141', 'nx142', 'nx143', 'nx144', 'nx145', 'nx146', 'nx147', 'nx148',
       'nx149', 'nx150'],
      dtype='object', length=301)


In [5]:
# Hyperparameters for the booster.
params = dict(
    objective='binary:logistic',  # logistic loss for binary classification
    eta=0.1,
    max_depth=6,
    eval_metric='logloss',  # report log loss as the evaluation metric
)

# Datasets whose metric is logged after every boosting round.
w_list = list(zip((dtrain, dtest), ('train', 'test')))

# Train the model for a fixed number of boosting rounds, then persist it.
num_round = 100
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=w_list,
)
bst.save_model('../model/gs_m3_20%_2306_boo.model')

# Predict class-1 probabilities for the evaluation set.
pred_probs = bst.predict(dtest)

# Probability cut-off: a sample is labelled positive (1) only when its
# predicted probability is strictly greater than this value.
PRED_THRESHOLD = 0.8

# Vectorised thresholding: a single NumPy comparison instead of a
# Python-level loop over every prediction.
pred = (np.asarray(pred_probs) > PRED_THRESHOLD).astype(int)

# Ground-truth labels as a 1-D Series (the metric functions expect 1-D input).
y_true = test['target']

# Accuracy
accuracy = accuracy_score(y_true, pred)
print(f"Accuracy: {accuracy:.4f}")

# Confusion matrix (scikit-learn convention: rows are true classes,
# columns are predicted classes).
cm = confusion_matrix(y_true, pred)
print("Confusion Matrix:")
print(cm)

# Compute SHAP values for every row of the evaluation DMatrix.
explainer = shap.TreeExplainer(bst)
# tree_limit=100 presumably mirrors the 100 boosting rounds used for
# training -- keep the two in sync if the round count changes.
shap_values = explainer.shap_values(dtest, tree_limit=100)

# Feature groups to plot, each saved to its own image file.
columns1 = ['nx96','nx97','nx98','nx101']
columns2 = ['nx76', 'nx78', 'nx79', 'nx80', 'nx81']

# Column order of the DMatrix features: the DMatrix was built from `test`
# with 'target' dropped, so positions are looked up in that same
# feature-only index. (Subtracting 1 from the position in `test.columns`
# is only correct for columns that come after 'target'.)
feature_columns = test.columns.drop('target')

for columns, output_path in (
    (columns1, "gs_06_summary_plot_pos.png"),
    (columns2, "gs_06_summary_plot_ne.png"),
):
    # Positions of the selected features inside the SHAP value matrix.
    indices = [feature_columns.get_loc(col) for col in columns]

    # Slice the SHAP values for the selected features and draw the summary plot.
    shap_values_selected = shap_values[:, indices]
    shap.summary_plot(shap_values_selected, test[columns], feature_names=columns, max_display=50, color_bar=False, auto_size_plot=True, show=False)

    # Save the figure to disk, then close it so the next plot starts clean.
    plt.savefig(output_path, bbox_inches="tight")
    plt.close()

[0]	train-logloss:0.63461	test-logloss:0.63590
[1]	train-logloss:0.58547	test-logloss:0.58837
[2]	train-logloss:0.54481	test-logloss:0.54956
[3]	train-logloss:0.50838	test-logloss:0.51537
[4]	train-logloss:0.47704	test-logloss:0.48648
[5]	train-logloss:0.45097	test-logloss:0.46245
[6]	train-logloss:0.42769	test-logloss:0.44164
[7]	train-logloss:0.40556	test-logloss:0.42160
[8]	train-logloss:0.38883	test-logloss:0.40663
[9]	train-logloss:0.37289	test-logloss:0.39246
[10]	train-logloss:0.35727	test-logloss:0.37854
[11]	train-logloss:0.34355	test-logloss:0.36646
[12]	train-logloss:0.33175	test-logloss:0.35621
[13]	train-logloss:0.32143	test-logloss:0.34726
[14]	train-logloss:0.31201	test-logloss:0.33936
[15]	train-logloss:0.30355	test-logloss:0.33207
[16]	train-logloss:0.29616	test-logloss:0.32599
[17]	train-logloss:0.28811	test-logloss:0.31895
[18]	train-logloss:0.28121	test-logloss:0.31307
[19]	train-logloss:0.27561	test-logloss:0.30833
[20]	train-logloss:0.27065	test-logloss:0.30428
[2



Accuracy: 0.9566
Confusion Matrix:
[[6397173  264891]
 [  32821  165078]]
