## 1. Import

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings(action='ignore')

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/jinushin/dacon/.venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <89AD948E-E564-3266-867D-7AF89D6488F0> /Users/jinushin/dacon/.venv/lib/python3.10/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file)"]


## 2. Data Load

In [None]:
# 학습/평가 데이터 로드
train_df = pd.read_csv('open/train.csv').drop(columns=['UID'])
test_df = pd.read_csv('open/test.csv').drop(columns=['UID'])

## 3. Pre-processing (전처리)

In [None]:
categorical_col = [
    '주거 형태',
    '현재 직장 근속 연수',
    '대출 목적',
    '대출 상환 기간'
]

# OneHotEncoder 초기화
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# 훈련 데이터에 대해 인코더 학습
encoder.fit(train_df[categorical_col])

# 훈련 데이터와 테스트 데이터 변환
train_encoded = encoder.transform(train_df[categorical_col])
test_encoded = encoder.transform(test_df[categorical_col])

# One-hot encoding 결과를 데이터프레임으로 변환
train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(categorical_col))
test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(categorical_col))

# 인코딩된 결과를 원래 데이터에 적용
train_df = pd.concat([train_df.drop(columns=categorical_col).reset_index(drop=True), train_encoded_df], axis=1)
test_df = pd.concat([test_df.drop(columns=categorical_col).reset_index(drop=True), test_encoded_df], axis=1)

## 4. Train / Validation Split (학습 데이터 분할)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(columns=['채무 불이행 여부']), 
    train_df['채무 불이행 여부'], 
    test_size=0.2, 
    random_state=42
)

## 5. Model Training (모델 학습)

In [53]:
# XGBoost 모델 학습
model = XGBClassifier(
    n_estimators=10,  # 트리 개수
    max_depth=5,       # 최대 깊이
    learning_rate=0.15, # 학습률
    random_state=42,
    use_label_encoder=False,  # 경고 방지
    eval_metric="auc",        # 평가 지표 설정
    early_stopping_rounds=10 
)



In [54]:
# 학습 및 Validation 성능 모니터링
eval_set = [(X_train, y_train), (X_val, y_val)]
model.fit(
    X_train, y_train,
    eval_set=eval_set,
    verbose=True,  # 학습 로그 출력
     # 5번의 학습 동안 성능 개선 없을 시 조기 종료
)

[0]	validation_0-auc:0.73056	validation_1-auc:0.67897
[1]	validation_0-auc:0.74434	validation_1-auc:0.69985
[2]	validation_0-auc:0.75222	validation_1-auc:0.70013
[3]	validation_0-auc:0.75962	validation_1-auc:0.70342
[4]	validation_0-auc:0.76341	validation_1-auc:0.70646
[5]	validation_0-auc:0.76785	validation_1-auc:0.70622
[6]	validation_0-auc:0.77482	validation_1-auc:0.71070
[7]	validation_0-auc:0.78105	validation_1-auc:0.71547
[8]	validation_0-auc:0.78692	validation_1-auc:0.71940
[9]	validation_0-auc:0.78952	validation_1-auc:0.72130


In [31]:
from xgboost import XGBClassifier, callback
# 조기 종료 콜백 설정
early_stopping = callback.EarlyStopping(
    rounds=10,  # 10번 동안 개선이 없으면 중지
    metric_name="auc",  # 모니터링할 평가 지표
    data_name="validation"  # validation set 기준
)

In [48]:

# 학습 및 Validation 성능 모니터링
eval_set = [(X_train, y_train), (X_val, y_val)]
model.fit(
    X_train, y_train,
    eval_set=eval_set,
    verbose=True,  # 학습 로그 출력
    # early_stopping_rounds=10  # 5번의 학습 동안 성능 개선 없을 시 조기 종료
    
)

[0]	validation_0-auc:0.85771	validation_1-auc:0.65670
[1]	validation_0-auc:0.89544	validation_1-auc:0.67326
[2]	validation_0-auc:0.90981	validation_1-auc:0.68084
[3]	validation_0-auc:0.92602	validation_1-auc:0.68967
[4]	validation_0-auc:0.93630	validation_1-auc:0.68469
[5]	validation_0-auc:0.94508	validation_1-auc:0.68863
[6]	validation_0-auc:0.95141	validation_1-auc:0.69159
[7]	validation_0-auc:0.95644	validation_1-auc:0.69489
[8]	validation_0-auc:0.96268	validation_1-auc:0.69589
[9]	validation_0-auc:0.96669	validation_1-auc:0.69660
[10]	validation_0-auc:0.97014	validation_1-auc:0.69824
[11]	validation_0-auc:0.97353	validation_1-auc:0.69934
[12]	validation_0-auc:0.97556	validation_1-auc:0.70011
[13]	validation_0-auc:0.97897	validation_1-auc:0.70063
[14]	validation_0-auc:0.98106	validation_1-auc:0.70174
[15]	validation_0-auc:0.98339	validation_1-auc:0.70115
[16]	validation_0-auc:0.98481	validation_1-auc:0.70062
[17]	validation_0-auc:0.98599	validation_1-auc:0.69946
[18]	validation_0-au

## 6. Prediction

In [49]:
# 채무 불이행 '확률'을 예측합니다.
preds = model.predict_proba(test_df)[:,1]

## 7. Submission (제출 파일 생성)

In [50]:
submit = pd.read_csv('open/sample_submission.csv')

# 결과 저장
submit['채무 불이행 확률'] = preds
submit.to_csv('submission_base2.csv', encoding='UTF-8-sig', index=False)