In [3]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from pycaret.classification import *
import warnings
warnings.filterwarnings("ignore")

# 데이터 로드
train_df = pd.read_csv("open/train.csv").drop(columns=['UID'])
test_df = pd.read_csv("open/test.csv").drop(columns=['UID'])

# 타겟 변수 확인
print(train_df['채무 불이행 여부'].value_counts(normalize=True))


채무 불이행 여부
0    0.6588
1    0.3412
Name: proportion, dtype: float64


In [4]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# X, y 분리
X = train_df.drop(columns=['채무 불이행 여부'])
y = train_df['채무 불이행 여부']

# 범주형 변수 처리 (One-Hot Encoding)
categorical_cols = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=categorical_cols)

# 훈련/검증 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# SMOTE 적용 (데이터 불균형 해결)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# 확인
print("Before SMOTE:", np.bincount(y_train))
print("After SMOTE:", np.bincount(y_train_smote))


Before SMOTE: [5270 2730]
After SMOTE: [5270 5270]


In [5]:
# PyCaret 설정
clf = setup(data=pd.concat([X_train_smote, y_train_smote], axis=1), 
            target='채무 불이행 여부',
            normalize=True,
            use_gpu=True,
            session_id=42)

# 여러 모델 비교 후 가장 성능이 좋은 모델 5개 선택
top_models = compare_models(sort='F1', n_select=5, fold=5)


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recomp

Unnamed: 0,Description,Value
0,Session id,42
1,Target,채무 불이행 여부
2,Target type,Binary
3,Original data shape,"(10540, 44)"
4,Transformed data shape,"(10540, 44)"
5,Transformed train set shape,"(7377, 44)"
6,Transformed test set shape,"(3163, 44)"
7,Numeric features,12
8,Preprocess,True
9,Imputation type,simple


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.782,0.8588,0.7425,0.8064,0.773,0.5641,0.5659,0.18
ada,Ada Boost Classifier,0.7686,0.8492,0.759,0.7741,0.7664,0.5372,0.5375,0.286
lightgbm,Light Gradient Boosting Machine,0.7748,0.8537,0.7354,0.7986,0.7656,0.5497,0.5515,0.698
gbc,Gradient Boosting Classifier,0.7696,0.8526,0.7403,0.7865,0.7626,0.5391,0.5402,1.202
xgboost,Extreme Gradient Boosting,0.7636,0.8442,0.7308,0.7822,0.7556,0.5272,0.5284,0.256
et,Extra Trees Classifier,0.7677,0.85,0.7078,0.8045,0.7528,0.5353,0.5395,0.138
lr,Logistic Regression,0.7668,0.8466,0.6701,0.831,0.7418,0.5337,0.5441,0.05
ridge,Ridge Classifier,0.7602,0.8423,0.6454,0.838,0.7291,0.5204,0.5348,0.03
lda,Linear Discriminant Analysis,0.7602,0.8423,0.6454,0.838,0.7291,0.5204,0.5348,0.282
knn,K Neighbors Classifier,0.7217,0.7848,0.6788,0.7429,0.7092,0.4434,0.4452,0.106


In [6]:
# 최상의 모델 중 하나를 선택하여 튜닝 (예: LightGBM 선택)
best_model = top_models[0]  # 가장 성능이 좋은 모델 선택
tuned_model = tune_model(best_model, optimize='F1', fold=5)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.752,0.8406,0.7344,0.7612,0.7476,0.5041,0.5044
1,0.7629,0.8443,0.7466,0.7717,0.759,0.5257,0.526
2,0.7532,0.8337,0.7449,0.7572,0.751,0.5064,0.5065
3,0.7702,0.8473,0.7615,0.7752,0.7683,0.5403,0.5404
4,0.7498,0.8323,0.7534,0.7483,0.7508,0.4997,0.4997
Mean,0.7576,0.8397,0.7482,0.7627,0.7553,0.5152,0.5154
Std,0.0077,0.0058,0.009,0.0098,0.0075,0.0154,0.0154


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [7]:
best_model


In [9]:
# 범주형 변수 처리 (One-Hot Encoding)
categorical_cols = test_df.select_dtypes(include=['object']).columns
test_df = pd.get_dummies(test_df, columns=categorical_cols)

In [11]:
# 채무 불이행 '확률'을 예측합니다.
preds = best_model.predict_proba(test_df)[:,1]
submit = pd.read_csv('open/sample_submission.csv')

# 결과 저장
submit['채무 불이행 확률'] = preds
submit.to_csv('smote_pycaret1.csv', encoding='UTF-8-sig', index=False)

tuning

In [None]:
# AUC 기준으로 RandomForestClassifier 최적화
tuned_rf_auc = tune_model(best_model, optimize='AUC', fold=10)
# 최종 모델 학습
final_rf = finalize_model(tuned_rf_auc)

In [None]:
# 채무 불이행 '확률'을 예측합니다.
preds = final_rf.predict_proba(test_df)[:,1]
submit = pd.read_csv('open/sample_submission.csv')

# 결과 저장
submit['채무 불이행 확률'] = preds
submit.to_csv('smote_pycaret2.csv', encoding='UTF-8-sig', index=False)