In [17]:
import warnings
warnings.filterwarnings('ignore')

# 데이터 처리
import pandas as pd
import numpy as np

# 데이터 전처리
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold

# 불균형 데이터 처리
from imblearn.over_sampling import SMOTE

# 모델 평가
from sklearn.metrics import roc_auc_score

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# 학습/평가 데이터 로드
train_df = pd.read_csv('open/train.csv').drop(columns=['UID'])
test_df = pd.read_csv('open/test.csv').drop(columns=['UID'])

In [5]:


# X, y 분리
X = train_df.drop(columns=["채무 불이행 여부"])
y = train_df["채무 불이행 여부"]

In [6]:
categorical_cols = ["주거 형태", "현재 직장 근속 연수", "대출 목적", "대출 상환 기간"]

# 라벨 인코딩 (현재 직장 근속 연수)
label_enc = LabelEncoder()
X["현재 직장 근속 연수"] = label_enc.fit_transform(X["현재 직장 근속 연수"])
test_df["현재 직장 근속 연수"] = label_enc.transform(test_df["현재 직장 근속 연수"])

# 원-핫 인코딩 적용
X = pd.get_dummies(X, columns=["주거 형태", "대출 목적", "대출 상환 기간"], drop_first=True)
test_df = pd.get_dummies(test_df, columns=["주거 형태", "대출 목적", "대출 상환 기간"], drop_first=True)

In [7]:
# 로그 변환
log_columns = ["현재 미상환 신용액", "월 상환 부채액", "현재 대출 잔액"]
for col in log_columns:
    X[col] = np.log1p(X[col])
    test_df[col] = np.log1p(test_df[col])

# "마지막 연체 이후 경과 개월 수"가 0이면 "연체 없음" 컬럼 추가
X["연체 없음"] = (X["마지막 연체 이후 경과 개월 수"] == 0).astype(int)
test_df["연체 없음"] = (test_df["마지막 연체 이후 경과 개월 수"] == 0).astype(int)

In [8]:
imputer = SimpleImputer(strategy="median")
X_imputed = imputer.fit_transform(X)
test_imputed = imputer.transform(test_df)

In [9]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
test_scaled = scaler.transform(test_imputed)

In [10]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

In [11]:
X_train, X_val, y_train, y_val = train_test_split(
    X_resampled, 
    y_resampled, 
    test_size=0.2, 
    random_state=42
)

In [12]:
# AdaBoost 모델 생성
base_model = DecisionTreeClassifier()
ada = AdaBoostClassifier(estimator=base_model, random_state=42)

In [37]:
# RandomizedSearchCV에서 사용할 파라미터 범위
param_dist = {
    'n_estimators': np.arange(50, 301, 50),
    'learning_rate': np.linspace(0.001, 1.0, 10),
    'algorithm': ['SAMME', 'SAMME.R'],
    'estimator__max_depth': np.arange(1, 11, 1),
    'estimator__min_samples_split': np.arange(2, 11, 2),
    'estimator__min_samples_leaf': np.arange(1, 5)
}

# RandomizedSearchCV 실행 (n_iter=20이면 20개 조합만 탐색)
random_search = RandomizedSearchCV(
    estimator=ada,
    param_distributions=param_dist,
    n_iter=20,  # 탐색할 조합 수 (GridSearch보다 빠름)
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# 모델 학습
random_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터 출력
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Accuracy: {random_search.best_score_:.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits




[CV] END algorithm=SAMME, estimator__max_depth=1, estimator__min_samples_leaf=3, estimator__min_samples_split=10, learning_rate=0.334, n_estimators=150; total time=   1.5s
[CV] END algorithm=SAMME, estimator__max_depth=1, estimator__min_samples_leaf=3, estimator__min_samples_split=10, learning_rate=0.334, n_estimators=150; total time=   1.6s
[CV] END algorithm=SAMME, estimator__max_depth=1, estimator__min_samples_leaf=3, estimator__min_samples_split=10, learning_rate=0.334, n_estimators=150; total time=   1.6s
[CV] END algorithm=SAMME, estimator__max_depth=1, estimator__min_samples_leaf=3, estimator__min_samples_split=10, learning_rate=0.334, n_estimators=150; total time=   1.6s
[CV] END algorithm=SAMME, estimator__max_depth=1, estimator__min_samples_leaf=3, estimator__min_samples_split=10, learning_rate=0.334, n_estimators=150; total time=   1.5s
[CV] END algorithm=SAMME.R, estimator__max_depth=4, estimator__min_samples_leaf=1, estimator__min_samples_split=8, learning_rate=0.223, n_es



[CV] END algorithm=SAMME, estimator__max_depth=5, estimator__min_samples_leaf=2, estimator__min_samples_split=10, learning_rate=0.889, n_estimators=150; total time=   6.4s




[CV] END algorithm=SAMME, estimator__max_depth=5, estimator__min_samples_leaf=2, estimator__min_samples_split=10, learning_rate=0.889, n_estimators=150; total time=   6.4s
[CV] END algorithm=SAMME.R, estimator__max_depth=10, estimator__min_samples_leaf=3, estimator__min_samples_split=10, learning_rate=0.223, n_estimators=150; total time=  11.1s
[CV] END algorithm=SAMME.R, estimator__max_depth=10, estimator__min_samples_leaf=3, estimator__min_samples_split=10, learning_rate=0.223, n_estimators=150; total time=  11.2s




[CV] END algorithm=SAMME.R, estimator__max_depth=10, estimator__min_samples_leaf=3, estimator__min_samples_split=10, learning_rate=0.223, n_estimators=150; total time=  11.3s
[CV] END algorithm=SAMME.R, estimator__max_depth=10, estimator__min_samples_leaf=3, estimator__min_samples_split=10, learning_rate=0.223, n_estimators=150; total time=  11.4s
[CV] END algorithm=SAMME.R, estimator__max_depth=10, estimator__min_samples_leaf=3, estimator__min_samples_split=10, learning_rate=0.223, n_estimators=150; total time=  11.4s
[CV] END algorithm=SAMME, estimator__max_depth=5, estimator__min_samples_leaf=2, estimator__min_samples_split=10, learning_rate=0.889, n_estimators=150; total time=   6.3s
[CV] END algorithm=SAMME, estimator__max_depth=5, estimator__min_samples_leaf=2, estimator__min_samples_split=10, learning_rate=0.889, n_estimators=150; total time=   6.5s
[CV] END algorithm=SAMME, estimator__max_depth=5, estimator__min_samples_leaf=2, estimator__min_samples_split=10, learning_rate=0.8



[CV] END algorithm=SAMME, estimator__max_depth=10, estimator__min_samples_leaf=4, estimator__min_samples_split=10, learning_rate=0.445, n_estimators=50; total time=   4.0s




[CV] END algorithm=SAMME.R, estimator__max_depth=9, estimator__min_samples_leaf=2, estimator__min_samples_split=8, learning_rate=0.667, n_estimators=150; total time=  10.6s




[CV] END algorithm=SAMME.R, estimator__max_depth=8, estimator__min_samples_leaf=4, estimator__min_samples_split=10, learning_rate=0.556, n_estimators=300; total time=  19.3s
[CV] END algorithm=SAMME.R, estimator__max_depth=8, estimator__min_samples_leaf=4, estimator__min_samples_split=10, learning_rate=0.556, n_estimators=300; total time=  19.4s




[CV] END algorithm=SAMME.R, estimator__max_depth=9, estimator__min_samples_leaf=2, estimator__min_samples_split=8, learning_rate=0.667, n_estimators=150; total time=  10.6s
[CV] END algorithm=SAMME.R, estimator__max_depth=8, estimator__min_samples_leaf=4, estimator__min_samples_split=10, learning_rate=0.556, n_estimators=300; total time=  19.4s
[CV] END algorithm=SAMME.R, estimator__max_depth=8, estimator__min_samples_leaf=4, estimator__min_samples_split=10, learning_rate=0.556, n_estimators=300; total time=  19.3s
[CV] END algorithm=SAMME.R, estimator__max_depth=8, estimator__min_samples_leaf=4, estimator__min_samples_split=10, learning_rate=0.556, n_estimators=300; total time=  20.1s
[CV] END algorithm=SAMME, estimator__max_depth=10, estimator__min_samples_leaf=2, estimator__min_samples_split=8, learning_rate=0.001, n_estimators=250; total time=  18.2s
[CV] END algorithm=SAMME, estimator__max_depth=10, estimator__min_samples_leaf=2, estimator__min_samples_split=8, learning_rate=0.001



[CV] END algorithm=SAMME, estimator__max_depth=6, estimator__min_samples_leaf=1, estimator__min_samples_split=10, learning_rate=0.445, n_estimators=100; total time=   5.0s




[CV] END algorithm=SAMME, estimator__max_depth=10, estimator__min_samples_leaf=2, estimator__min_samples_split=8, learning_rate=0.001, n_estimators=250; total time=  17.9s
[CV] END algorithm=SAMME, estimator__max_depth=10, estimator__min_samples_leaf=2, estimator__min_samples_split=8, learning_rate=0.001, n_estimators=250; total time=  18.2s




[CV] END algorithm=SAMME, estimator__max_depth=10, estimator__min_samples_leaf=2, estimator__min_samples_split=8, learning_rate=0.001, n_estimators=250; total time=  18.3s




[CV] END algorithm=SAMME, estimator__max_depth=6, estimator__min_samples_leaf=1, estimator__min_samples_split=10, learning_rate=0.445, n_estimators=100; total time=   5.0s
[CV] END algorithm=SAMME, estimator__max_depth=6, estimator__min_samples_leaf=1, estimator__min_samples_split=10, learning_rate=0.445, n_estimators=100; total time=   5.3s
[CV] END algorithm=SAMME.R, estimator__max_depth=9, estimator__min_samples_leaf=2, estimator__min_samples_split=8, learning_rate=0.667, n_estimators=150; total time=  10.5s
[CV] END algorithm=SAMME, estimator__max_depth=6, estimator__min_samples_leaf=1, estimator__min_samples_split=10, learning_rate=0.445, n_estimators=100; total time=   5.4s
[CV] END algorithm=SAMME, estimator__max_depth=6, estimator__min_samples_leaf=1, estimator__min_samples_split=10, learning_rate=0.445, n_estimators=100; total time=   5.0s
[CV] END algorithm=SAMME.R, estimator__max_depth=9, estimator__min_samples_leaf=2, estimator__min_samples_split=8, learning_rate=0.667, n_e



[CV] END algorithm=SAMME.R, estimator__max_depth=9, estimator__min_samples_leaf=2, estimator__min_samples_split=8, learning_rate=0.667, n_estimators=150; total time=  10.8s




[CV] END algorithm=SAMME.R, estimator__max_depth=5, estimator__min_samples_leaf=1, estimator__min_samples_split=2, learning_rate=0.889, n_estimators=150; total time=   6.4s




[CV] END algorithm=SAMME.R, estimator__max_depth=5, estimator__min_samples_leaf=1, estimator__min_samples_split=2, learning_rate=0.889, n_estimators=150; total time=   6.4s
[CV] END algorithm=SAMME.R, estimator__max_depth=5, estimator__min_samples_leaf=1, estimator__min_samples_split=2, learning_rate=0.889, n_estimators=150; total time=   6.2s




[CV] END algorithm=SAMME.R, estimator__max_depth=5, estimator__min_samples_leaf=1, estimator__min_samples_split=2, learning_rate=0.889, n_estimators=150; total time=   6.4s




[CV] END algorithm=SAMME.R, estimator__max_depth=5, estimator__min_samples_leaf=1, estimator__min_samples_split=2, learning_rate=0.889, n_estimators=150; total time=   6.4s




[CV] END algorithm=SAMME, estimator__max_depth=4, estimator__min_samples_leaf=3, estimator__min_samples_split=8, learning_rate=0.778, n_estimators=250; total time=   8.6s




[CV] END algorithm=SAMME, estimator__max_depth=4, estimator__min_samples_leaf=3, estimator__min_samples_split=8, learning_rate=0.778, n_estimators=250; total time=   8.7s




[CV] END algorithm=SAMME, estimator__max_depth=4, estimator__min_samples_leaf=3, estimator__min_samples_split=8, learning_rate=0.778, n_estimators=250; total time=   8.6s
[CV] END algorithm=SAMME, estimator__max_depth=4, estimator__min_samples_leaf=3, estimator__min_samples_split=8, learning_rate=0.778, n_estimators=250; total time=   8.5s




[CV] END algorithm=SAMME, estimator__max_depth=4, estimator__min_samples_leaf=3, estimator__min_samples_split=8, learning_rate=0.778, n_estimators=250; total time=   8.7s
[CV] END algorithm=SAMME.R, estimator__max_depth=9, estimator__min_samples_leaf=2, estimator__min_samples_split=4, learning_rate=0.001, n_estimators=150; total time=  10.0s
[CV] END algorithm=SAMME.R, estimator__max_depth=9, estimator__min_samples_leaf=2, estimator__min_samples_split=4, learning_rate=0.001, n_estimators=150; total time=   9.9s
[CV] END algorithm=SAMME.R, estimator__max_depth=3, estimator__min_samples_leaf=1, estimator__min_samples_split=2, learning_rate=0.334, n_estimators=300; total time=   8.0s
[CV] END algorithm=SAMME.R, estimator__max_depth=3, estimator__min_samples_leaf=1, estimator__min_samples_split=2, learning_rate=0.334, n_estimators=300; total time=   7.9s




[CV] END algorithm=SAMME.R, estimator__max_depth=9, estimator__min_samples_leaf=2, estimator__min_samples_split=4, learning_rate=0.001, n_estimators=150; total time=  10.0s




[CV] END algorithm=SAMME.R, estimator__max_depth=9, estimator__min_samples_leaf=2, estimator__min_samples_split=4, learning_rate=0.001, n_estimators=150; total time=  10.0s




[CV] END algorithm=SAMME.R, estimator__max_depth=9, estimator__min_samples_leaf=2, estimator__min_samples_split=4, learning_rate=0.001, n_estimators=150; total time=  10.3s




[CV] END algorithm=SAMME.R, estimator__max_depth=3, estimator__min_samples_leaf=1, estimator__min_samples_split=2, learning_rate=0.334, n_estimators=300; total time=   7.9s




[CV] END algorithm=SAMME.R, estimator__max_depth=3, estimator__min_samples_leaf=1, estimator__min_samples_split=2, learning_rate=0.334, n_estimators=300; total time=   8.0s
[CV] END algorithm=SAMME.R, estimator__max_depth=3, estimator__min_samples_leaf=1, estimator__min_samples_split=2, learning_rate=0.334, n_estimators=300; total time=   7.9s
[CV] END algorithm=SAMME.R, estimator__max_depth=4, estimator__min_samples_leaf=2, estimator__min_samples_split=6, learning_rate=0.001, n_estimators=200; total time=   6.7s
[CV] END algorithm=SAMME, estimator__max_depth=7, estimator__min_samples_leaf=4, estimator__min_samples_split=8, learning_rate=0.778, n_estimators=50; total time=   2.8s
[CV] END algorithm=SAMME.R, estimator__max_depth=4, estimator__min_samples_leaf=2, estimator__min_samples_split=6, learning_rate=0.001, n_estimators=200; total time=   6.8s
[CV] END algorithm=SAMME.R, estimator__max_depth=4, estimator__min_samples_leaf=2, estimator__min_samples_split=6, learning_rate=0.001, n_



[CV] END algorithm=SAMME, estimator__max_depth=1, estimator__min_samples_leaf=3, estimator__min_samples_split=6, learning_rate=0.889, n_estimators=100; total time=   1.0s




[CV] END algorithm=SAMME, estimator__max_depth=2, estimator__min_samples_leaf=2, estimator__min_samples_split=8, learning_rate=0.001, n_estimators=300; total time=   5.5s




[CV] END algorithm=SAMME, estimator__max_depth=2, estimator__min_samples_leaf=2, estimator__min_samples_split=8, learning_rate=0.001, n_estimators=300; total time=   5.7s
[CV] END algorithm=SAMME, estimator__max_depth=2, estimator__min_samples_leaf=2, estimator__min_samples_split=8, learning_rate=0.001, n_estimators=300; total time=   5.5s




[CV] END algorithm=SAMME, estimator__max_depth=2, estimator__min_samples_leaf=2, estimator__min_samples_split=8, learning_rate=0.001, n_estimators=300; total time=   5.6s
[CV] END algorithm=SAMME, estimator__max_depth=2, estimator__min_samples_leaf=2, estimator__min_samples_split=8, learning_rate=0.001, n_estimators=300; total time=   5.5s
[CV] END algorithm=SAMME, estimator__max_depth=10, estimator__min_samples_leaf=2, estimator__min_samples_split=10, learning_rate=0.334, n_estimators=300; total time=  22.6s
[CV] END algorithm=SAMME, estimator__max_depth=3, estimator__min_samples_leaf=1, estimator__min_samples_split=2, learning_rate=0.556, n_estimators=200; total time=   5.2s
[CV] END algorithm=SAMME, estimator__max_depth=3, estimator__min_samples_leaf=1, estimator__min_samples_split=2, learning_rate=0.556, n_estimators=200; total time=   5.2s
[CV] END algorithm=SAMME, estimator__max_depth=10, estimator__min_samples_leaf=2, estimator__min_samples_split=10, learning_rate=0.334, n_estim

In [38]:
# 최적의 모델 평가
best_ada = random_search.best_estimator_

In [39]:
# 최적의 모델 평가
best_ada = random_search.best_estimator_

In [40]:
# 채무 불이행 '확률'을 예측합니다.
preds = best_ada.predict_proba(test_scaled)[:,1]

In [41]:
submit = pd.read_csv('open/sample_submission.csv')

# 결과 저장
submit['채무 불이행 확률'] = preds
submit.to_csv('ada5.csv', encoding='UTF-8-sig', index=False)

In [33]:
# GBC 모델 생성
gbc = GradientBoostingClassifier(random_state=42)
# RandomizedSearchCV에서 사용할 파라미터 범위
param_dist = {
    'n_estimators': np.arange(50, 501, 10),
    'learning_rate': np.linspace(0.001, 0.3, 300),
    'max_depth': np.arange(7, 25, 2),
    'min_samples_split': np.arange(2, 11, 2),
    'min_samples_leaf': np.arange(1, 7),
    'subsample': np.linspace(0.6, 1.0, 5),
    # 'max_features': ['sqrt', 'log2', None]
}

# RandomizedSearchCV 실행 (n_iter=20이면 20개 조합만 탐색)
random_search = RandomizedSearchCV(
    estimator=gbc,
    param_distributions=param_dist,
    n_iter=20,  # 탐색할 조합 수 (GridSearch보다 빠름)
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# 모델 학습
random_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터 출력
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Accuracy: {random_search.best_score_:.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END learning_rate=0.036000000000000004, max_depth=23, min_samples_leaf=6, min_samples_split=2, n_estimators=120, subsample=1.0; total time=  14.0s
[CV] END learning_rate=0.036000000000000004, max_depth=23, min_samples_leaf=6, min_samples_split=2, n_estimators=120, subsample=1.0; total time=  15.3s
[CV] END learning_rate=0.215, max_depth=13, min_samples_leaf=5, min_samples_split=4, n_estimators=370, subsample=0.8; total time=  24.4s
[CV] END learning_rate=0.215, max_depth=13, min_samples_leaf=5, min_samples_split=4, n_estimators=370, subsample=0.8; total time=  24.7s
[CV] END learning_rate=0.036000000000000004, max_depth=23, min_samples_leaf=6, min_samples_split=2, n_estimators=120, subsample=1.0; total time=  12.3s
[CV] END learning_rate=0.215, max_depth=13, min_samples_leaf=5, min_samples_split=4, n_estimators=370, subsample=0.8; total time=  28.8s
[CV] END learning_rate=0.215, max_depth=13, min_samples_leaf=5, min_sam

In [34]:
best_gbc = random_search.best_estimator_


In [35]:
# 채무 불이행 '확률'을 예측합니다.
preds = best_gbc.predict_proba(test_df)[:,1]

In [36]:
submit = pd.read_csv('open/sample_submission.csv')

# 결과 저장
submit['채무 불이행 확률'] = preds
submit.to_csv('gbc5_somte.csv', encoding='UTF-8-sig', index=False)

In [42]:
from lightgbm import LGBMClassifier

# ✅ LightGBM 모델 생성
model = LGBMClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.15,
    random_state=42,
    early_stopping_rounds=10
)



In [43]:
# ✅ 학습 및 Validation 성능 모니터링
eval_set = [(X_train, y_train), (X_val, y_val)]

# ✅ LightGBM 학습 (early_stopping_rounds 지원됨)
model.fit(
    X_train, y_train,
    eval_set=eval_set,
    eval_metric="auc",  # 평가 지표 설정
    # verbose_eval=True,  # 학습 로그 출력
      # 10번 동안 개선 없으면 학습 종료
)




[LightGBM] [Info] Number of positive: 5255, number of negative: 5285
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002058 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3056
[LightGBM] [Info] Number of data points in the train set: 10540, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498577 -> initscore=-0.005693
[LightGBM] [Info] Start training from score -0.005693
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[78]	training's auc: 0.918784	training's binary_logloss: 0.394047	valid_1's auc: 0.861189	valid_1's binary_logloss: 0.46504


In [44]:
# 채무 불이행 '확률'을 예측합니다.
preds = model.predict_proba(test_df)[:,1]

In [45]:
submit = pd.read_csv('open/sample_submission.csv')

# 결과 저장
submit['채무 불이행 확률'] = preds
submit.to_csv('submission_base4.csv', encoding='UTF-8-sig', index=False)