In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from xgboost import XGBClassifier

import shap
import pickle
import joblib
import os


# 인코더 생성

In [28]:
df = pd.read_csv("data/model_data.csv")

# 1. 로그 변환을 적용할 특성 정의
log_feature = 'income'

# 2. 원핫 인코딩을 적용할 특성 정의
categorical_features = ['education', 'home_ownership', 'intent']

# 3. OneHotEncoder 객체 생성 및 학습
# handle_unknown='ignore'로 새로운 범주에 대한 오류를 방지
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(df[categorical_features])

# 4. 학습된 인코더 객체를 파일로 저장
if not os.path.exists("model"):
    os.makedirs("model")

joblib.dump(encoder, os.path.join("model", "ohe_encoder.pkl"))
print("OneHotEncoder 객체가 성공적으로 저장되었습니다.")

# 5. SHAP Explainer를 위한 데이터셋 준비
# 원본 데이터프레임을 복사하여 변환 작업 수행
df_temp = df.copy()

# 로그 변환을 수행
df_temp['income'] = np.log1p(df_temp[log_feature])

# 원핫 인코딩 수행
encoded_features = encoder.transform(df_temp[categorical_features])
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_features)
)

other_features = [col for col in df_temp.columns if col not in [log_feature] + categorical_features]
df_final = pd.concat([df_temp[other_features], df_temp[[log_feature]], encoded_df], axis=1)

df_final.to_csv('data/model_data_encoded.csv', index=False)
print("데이터가 성공적으로 저장되었습니다.")


OneHotEncoder 객체가 성공적으로 저장되었습니다.
데이터가 성공적으로 저장되었습니다.


In [29]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44921 entries, 0 to 44920
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       44921 non-null  float64
 1   gender                    44921 non-null  int64  
 2   emp_exp                   44921 non-null  int64  
 3   amount                    44921 non-null  float64
 4   int_rate                  44921 non-null  float64
 5   loan_percent_income       44921 non-null  float64
 6   cred_hist_length          44921 non-null  float64
 7   credit_score              44921 non-null  int64  
 8   previous_loan_defaults    44921 non-null  int64  
 9   loan_status               44921 non-null  int64  
 10  income                    44921 non-null  float64
 11  education_Associate       44921 non-null  float64
 12  education_Bachelor        44921 non-null  float64
 13  education_Doctorate       44921 non-null  float64
 14  educat

# log처리만 한 데이터

In [30]:
df = pd.read_csv("data/model_data_encoded.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44921 entries, 0 to 44920
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       44921 non-null  float64
 1   gender                    44921 non-null  int64  
 2   emp_exp                   44921 non-null  int64  
 3   amount                    44921 non-null  float64
 4   int_rate                  44921 non-null  float64
 5   loan_percent_income       44921 non-null  float64
 6   cred_hist_length          44921 non-null  float64
 7   credit_score              44921 non-null  int64  
 8   previous_loan_defaults    44921 non-null  int64  
 9   loan_status               44921 non-null  int64  
 10  income                    44921 non-null  float64
 11  education_Associate       44921 non-null  float64
 12  education_Bachelor        44921 non-null  float64
 13  education_Doctorate       44921 non-null  float64
 14  educat

In [21]:
# df['person_gender'] = df['person_gender'].replace({'female': 1, 'male': 0})
# df['previous_loan_defaults_on_file'] = df['previous_loan_defaults_on_file'].replace({'Yes': 1, 'No': 0})

# # 2. 범주형 데이터 원-핫 인코딩
# df = pd.get_dummies(df, columns=['person_education','person_home_ownership', 'loan_intent'], ) #drop_first=True
# df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44921 entries, 0 to 44920
Data columns (total 26 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      44921 non-null  float64
 1   person_gender                   44921 non-null  int64  
 2   person_emp_exp                  44921 non-null  int64  
 3   loan_amnt                       44921 non-null  float64
 4   loan_int_rate                   44921 non-null  float64
 5   loan_percent_income             44921 non-null  float64
 6   cb_person_cred_hist_length      44921 non-null  float64
 7   credit_score                    44921 non-null  int64  
 8   previous_loan_defaults_on_file  44921 non-null  int64  
 9   loan_status                     44921 non-null  int64  
 10  person_income_log               44921 non-null  float64
 11  person_education_Associate      44921 non-null  bool   
 12  person_education_Bachelor       

In [31]:
# 3. 데이터 분할 (Feature와 Target 나누기)
X = df.drop('loan_status', axis=1) # 피처 데이터
y = df['loan_status'] # 타겟(종속) 변수

# 4. 학습용/평가용 데이터셋 분리 (80% 학습, 20% 평가)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# 5. XGBoost 모델 생성 및 학습
# XGBClassifier 객체를 생성합니다.
# n_estimators: 학습할 트리의 개수
# learning_rate: 학습률
# model = XGBClassifier(n_estimators=500, learning_rate=0.1, random_state=42,
#                       early_stopping_rounds=15, eval_metric="logloss")

model = XGBClassifier(
    n_estimators=10000, 
    learning_rate=0.01, 
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    random_state=42,
    early_stopping_rounds=20, 
    eval_metric="logloss"
)

# 모델을 학습 데이터에 맞춰 훈련시킵니다.
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=True
    )

# 6. 모델 예측
y_pred = model.predict(X_test)

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"모델의 정확도: {accuracy:.4f}")

# F1-Score 계산 (추가된 코드)
f1 = f1_score(y_test, y_pred)
print(f"모델의 F1-Score: {f1:.4f}")

# 예측 결과와 실제 값 비교
print("\n예측값:", y_pred[:5])
print("실제값:", y_test[:5].tolist())

[0]	validation_0-logloss:0.51569
[1]	validation_0-logloss:0.51074
[2]	validation_0-logloss:0.50513
[3]	validation_0-logloss:0.50046
[4]	validation_0-logloss:0.49593
[5]	validation_0-logloss:0.49155
[6]	validation_0-logloss:0.48680
[7]	validation_0-logloss:0.48279
[8]	validation_0-logloss:0.47868
[9]	validation_0-logloss:0.47481
[10]	validation_0-logloss:0.47019
[11]	validation_0-logloss:0.46570
[12]	validation_0-logloss:0.46135
[13]	validation_0-logloss:0.45710
[14]	validation_0-logloss:0.45417
[15]	validation_0-logloss:0.45087
[16]	validation_0-logloss:0.44683
[17]	validation_0-logloss:0.44312
[18]	validation_0-logloss:0.44004
[19]	validation_0-logloss:0.43626
[20]	validation_0-logloss:0.43281
[21]	validation_0-logloss:0.43020
[22]	validation_0-logloss:0.42684
[23]	validation_0-logloss:0.42408
[24]	validation_0-logloss:0.42113
[25]	validation_0-logloss:0.41797
[26]	validation_0-logloss:0.41558
[27]	validation_0-logloss:0.41322
[28]	validation_0-logloss:0.41091
[29]	validation_0-loglos

In [32]:
# 모델 객체를 'xgb_model.pkl' 파일로 저장
with open('model/xgb_model_log.pkl', 'wb') as f:
    pickle.dump(model, f)
print("모델이 'xgb_model.pkl' 파일로 저장되었습니다.")

모델이 'xgb_model.pkl' 파일로 저장되었습니다.


In [33]:
# 저장된 모델을 다시 불러오기
with open('model/xgb_model_log.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
print("모델이 성공적으로 불러와졌습니다.")
print("-" * 30)

모델이 성공적으로 불러와졌습니다.
------------------------------


In [34]:
# ----------------------------------------------------
# 4. 불러온 모델로 예측 및 평가
# ----------------------------------------------------
# 불러온 모델을 사용하여 예측
y_pred_loaded = loaded_model.predict(X_test)

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred_loaded)
print(f"불러온 모델의 정확도: {accuracy:.4f}")

불러온 모델의 정확도: 0.9355


# capped 처리 한 데이터

In [15]:
df_capped = pd.read_csv("data/model_data_capped.csv")

df_capped['person_gender'] = df_capped['person_gender'].replace({'female': 1, 'male': 0})
df_capped['previous_loan_defaults_on_file'] = df_capped['previous_loan_defaults_on_file'].replace({'Yes': 1, 'No': 0})

# 2. 범주형 데이터 원-핫 인코딩
df_capped = pd.get_dummies(df_capped, columns=['person_education','person_home_ownership', 'loan_intent'], drop_first=True)

  df_capped['person_gender'] = df_capped['person_gender'].replace({'female': 1, 'male': 0})
  df_capped['previous_loan_defaults_on_file'] = df_capped['previous_loan_defaults_on_file'].replace({'Yes': 1, 'No': 0})


In [16]:
df_capped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44921 entries, 0 to 44920
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      44921 non-null  float64
 1   person_gender                   44921 non-null  int64  
 2   person_emp_exp                  44921 non-null  int64  
 3   loan_amnt                       44921 non-null  float64
 4   loan_int_rate                   44921 non-null  float64
 5   loan_percent_income             44921 non-null  float64
 6   cb_person_cred_hist_length      44921 non-null  float64
 7   credit_score                    44921 non-null  int64  
 8   previous_loan_defaults_on_file  44921 non-null  int64  
 9   loan_status                     44921 non-null  int64  
 10  person_income_log_capped        44921 non-null  float64
 11  person_education_Bachelor       44921 non-null  bool   
 12  person_education_Doctorate      

In [17]:
# 3. 데이터 분할 (Feature와 Target 나누기)
X = df_capped.drop('loan_status', axis=1) # 피처 데이터
y = df_capped['loan_status'] # 타겟(종속) 변수

# 4. 학습용/평가용 데이터셋 분리 (80% 학습, 20% 평가)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [18]:
X_train

Unnamed: 0,person_age,person_gender,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,person_income_log_capped,...,person_education_High School,person_education_Master,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
25664,27.0,1,5,10000.0,7.51,0.09,9.0,678,0,11.591422,...,True,False,False,False,False,False,False,False,True,False
37022,33.0,0,13,12000.0,13.57,0.23,7.0,665,0,10.847491,...,False,False,False,False,True,False,False,False,True,False
14993,24.0,1,2,15000.0,13.22,0.24,3.0,578,1,11.051620,...,False,False,False,False,True,False,False,True,False,False
5244,22.0,1,1,6000.0,7.90,0.11,2.0,673,0,10.912029,...,False,False,False,False,False,False,False,False,True,False
16663,26.0,0,2,6400.0,14.35,0.16,4.0,461,1,10.574364,...,False,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2960,21.0,0,0,3175.0,6.03,0.09,4.0,494,1,10.484585,...,False,False,False,False,True,True,False,False,False,False
35091,30.0,1,7,3600.0,12.69,0.08,7.0,684,1,10.664387,...,False,False,False,True,False,False,False,False,True,False
39922,24.0,0,1,1936.0,10.56,0.06,4.0,646,0,10.430462,...,False,False,False,False,False,False,False,False,True,False
37926,31.0,1,6,24214.0,11.01,0.11,8.0,606,1,12.035459,...,True,False,False,False,False,False,False,False,False,True


In [19]:
# 5. XGBoost 모델 생성 및 학습
# XGBClassifier 객체를 생성합니다.
# n_estimators: 학습할 트리의 개수
# learning_rate: 학습률
model_capped = XGBClassifier(n_estimators=500, learning_rate=0.1, random_state=42,
                      early_stopping_rounds=15, eval_metric="logloss")

# 모델을 학습 데이터에 맞춰 훈련시킵니다.
model_capped.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=True
    )

# 6. 모델 예측
# 훈련된 모델로 평가 데이터의 결과를 예측합니다.
y_pred = model_capped.predict(X_test)

# 7. 모델 성능 평가
# 예측값과 실제값의 정확도를 계산합니다.
accuracy = accuracy_score(y_test, y_pred)
print(f"모델의 정확도: {accuracy:.4f}")

# 예측 결과와 실제 값 비교
print("\n예측값:", y_pred[:5])
print("실제값:", y_test[:5].tolist())

[0]	validation_0-logloss:0.46359
[1]	validation_0-logloss:0.42166
[2]	validation_0-logloss:0.38877
[3]	validation_0-logloss:0.36225
[4]	validation_0-logloss:0.34000
[5]	validation_0-logloss:0.32129
[6]	validation_0-logloss:0.30519
[7]	validation_0-logloss:0.29116
[8]	validation_0-logloss:0.27921
[9]	validation_0-logloss:0.26843
[10]	validation_0-logloss:0.25885
[11]	validation_0-logloss:0.25058
[12]	validation_0-logloss:0.24326
[13]	validation_0-logloss:0.23652
[14]	validation_0-logloss:0.23050
[15]	validation_0-logloss:0.22513
[16]	validation_0-logloss:0.22040
[17]	validation_0-logloss:0.21617
[18]	validation_0-logloss:0.21209
[19]	validation_0-logloss:0.20844
[20]	validation_0-logloss:0.20514
[21]	validation_0-logloss:0.20220
[22]	validation_0-logloss:0.19952
[23]	validation_0-logloss:0.19724
[24]	validation_0-logloss:0.19523
[25]	validation_0-logloss:0.19315
[26]	validation_0-logloss:0.19131
[27]	validation_0-logloss:0.18972
[28]	validation_0-logloss:0.18786
[29]	validation_0-loglos

In [20]:
# 모델 객체를 'xgb_model.pkl' 파일로 저장
with open('model/xgb_model_capped.pkl', 'wb') as f:
    pickle.dump(model_capped, f)
print("모델이 'xgb_model.pkl' 파일로 저장되었습니다.")

모델이 'xgb_model.pkl' 파일로 저장되었습니다.


In [21]:
# 저장된 모델을 다시 불러오기
with open('model/xgb_model_capped.pkl', 'rb') as f:
    loaded_model2 = pickle.load(f)
print("모델이 성공적으로 불러와졌습니다.")
print("-" * 30)

# ----------------------------------------------------
# 4. 불러온 모델로 예측 및 평가
# ----------------------------------------------------
# 불러온 모델을 사용하여 예측
y_pred_loaded = loaded_model2.predict(X_test)

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred_loaded)
print(f"불러온 모델의 정확도: {accuracy:.4f}")

모델이 성공적으로 불러와졌습니다.
------------------------------


불러온 모델의 정확도: 0.9337


In [22]:
X_test.iloc[:,:]

Unnamed: 0,person_age,person_gender,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,person_income_log_capped,...,person_education_High School,person_education_Master,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
44742,29.0,0,5,8000.0,11.04,0.27,9.0,679,0,10.301324,...,True,False,False,False,True,True,False,False,False,False
43415,24.0,1,2,19553.0,15.62,0.21,4.0,581,0,11.455741,...,True,False,False,False,False,False,True,False,False,False
41214,32.0,1,8,10000.0,7.60,0.27,8.0,607,1,10.515994,...,False,False,False,False,False,True,False,False,False,False
23326,29.0,1,8,5000.0,10.38,0.07,10.0,689,1,11.224256,...,False,False,False,True,False,False,False,False,False,True
29814,37.0,1,12,8000.0,8.49,0.17,13.0,617,1,10.785973,...,False,True,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31743,36.0,0,14,2500.0,14.91,0.02,14.0,581,0,11.908892,...,True,False,False,False,False,False,False,False,False,False
29056,29.0,1,5,3575.0,11.49,0.06,6.0,676,1,11.017661,...,False,False,False,False,False,False,True,False,False,False
28338,34.0,1,12,17500.0,6.17,0.34,8.0,660,1,10.838129,...,False,False,False,False,False,False,False,False,False,False
44356,27.0,1,3,5000.0,17.51,0.15,6.0,545,0,10.440887,...,True,False,False,False,True,False,False,False,False,True


In [23]:
y_test.iloc[:]

44742    1
43415    1
41214    0
23326    0
29814    0
        ..
31743    1
29056    0
28338    0
44356    1
10351    0
Name: loan_status, Length: 4493, dtype: int64

In [24]:
X_test.iloc[0,:]

person_age                             29.0
person_gender                             0
person_emp_exp                            5
loan_amnt                            8000.0
loan_int_rate                         11.04
loan_percent_income                    0.27
cb_person_cred_hist_length              9.0
credit_score                            679
previous_loan_defaults_on_file            0
person_income_log_capped          10.301324
person_education_Bachelor             False
person_education_Doctorate            False
person_education_High School           True
person_education_Master               False
person_home_ownership_OTHER           False
person_home_ownership_OWN             False
person_home_ownership_RENT             True
loan_intent_EDUCATION                  True
loan_intent_HOMEIMPROVEMENT           False
loan_intent_MEDICAL                   False
loan_intent_PERSONAL                  False
loan_intent_VENTURE                   False
Name: 44742, dtype: object

In [25]:
# SHAP Explainer 생성
# 학습된 모델을 explainer에 전달합니다.
# TreeExplainer는 트리 기반 모델에 최적화된 SHAP Explainer입니다.
explainer = shap.TreeExplainer(model)

# SHAP 값 계산
# 모델 예측에 대한 각 특성의 기여도(SHAP 값)를 계산합니다.
# 보통 테스트 데이터셋을 사용합니다.
shap_values = explainer.shap_values(X_test)

# --- SHAP 시각화 ---

# 1. 전반적인 특성 중요도 요약 (Summary Plot)
# 모델 예측에 가장 큰 영향을 미치는 특성을 보여줍니다.
# 각 점은 데이터셋의 인스턴스(행)를 나타내며, 색상은 특성 값의 크기(붉은색: 높음, 푸른색: 낮음)를 의미합니다.
# shap.summary_plot(shap_values, X_test)
# 위 코드를 사용하면 아래와 같은 그림이 나옵니다.


# 2. 개별 예측 분석 (Force Plot)
# 특정 데이터 샘플 하나에 대한 예측을 분석합니다.
# 예측값(f(x))이 기준값(E[f(x)])에서 어떻게 변했는지 각 특성의 기여도를 시각적으로 보여줍니다.
# 'Force Plot'의 쿼리는 'shap force plot example'로 명시할 수 있습니다.
# 예를 들어, X_test의 첫 번째 샘플(인스턴스)에 대한 예측을 분석합니다.
shap.initjs() # 주피터 노트북 환경에서 시각화를 위해 필요
shap.force_plot(explainer.expected_value, shap_values[2,:], X_test.iloc[2,:])
# 위 코드를 사용하면 아래와 같은 그림이 나옵니다.


# 3. 특성 의존성 분석 (Dependence Plot)
# 특정 특성이 예측 결과에 어떻게 영향을 미치는지 보여줍니다.
# x축은 특성 값, y축은 SHAP 값으로, 특성 값이 변함에 따라 예측에 대한 영향력이 어떻게 달라지는지 알 수 있습니다.
# shap.dependence_plot("person_age", shap_values, X_test) # 'age' 특성에 대한 의존성 분석
# 위 코드를 사용하면 아래와 같은 그림이 나옵니다.