In [1]:
import warnings

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

warnings.filterwarnings(action='ignore')

In [2]:
# Load the training and evaluation CSVs (training file first, then test).
# The 'UID' column is dropped from each frame immediately after reading.
train_df, test_df = (
    pd.read_csv(csv_path).drop(columns=['UID'])
    for csv_path in ('open/train.csv', 'open/test.csv')
)

In [3]:
# Columns that are one-hot encoded.
categorical_col = ['주거 형태', '현재 직장 근속 연수', '대출 목적', '대출 상환 기간']

# The encoder learns its categories from the training frame only and is then applied to
# the test frame. handle_unknown='ignore' is set so categories unseen during fit do not
# raise at transform time.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
train_encoded = encoder.fit_transform(train_df[categorical_col])
test_encoded = encoder.transform(test_df[categorical_col])

# Wrap the dense arrays in DataFrames that share the generated feature names.
encoded_columns = encoder.get_feature_names_out(categorical_col)
train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_columns)
test_encoded_df = pd.DataFrame(test_encoded, columns=encoded_columns)

# Swap the raw categorical columns for their encoded counterparts. The index is reset on
# the left-hand frame so the column-wise concat lines up row by row with the encoded frame.
# NOTE: train_df / test_df are reassigned here, so re-running this cell without re-running
# the load cell fails (the categorical columns have already been dropped).
train_df = pd.concat(
    [train_df.drop(columns=categorical_col).reset_index(drop=True), train_encoded_df],
    axis=1,
)
test_df = pd.concat(
    [test_df.drop(columns=categorical_col).reset_index(drop=True), test_encoded_df],
    axis=1,
)

In [4]:
# Separate the label column from the features, then hold out 20% of the rows for
# validation. random_state is fixed so the split is reproducible.
features = train_df.drop(columns=['채무 불이행 여부'])
target = train_df['채무 불이행 여부']

X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [21]:
# Build the LightGBM classifier.
# LGBMClassifier is imported in the notebook's top import cell rather than here, so every
# dependency is visible in one place and a fresh "Restart & Run All" resolves it up front.
model = LGBMClassifier(
    n_estimators=100,           # upper bound on the number of boosting rounds
    max_depth=5,
    learning_rate=0.15,
    random_state=42,            # fixed seed for reproducibility
    early_stopping_rounds=10,   # stop once validation scores fail to improve for 10 rounds
)



In [22]:
# Fit the model while monitoring both the training and the validation split.
# In the training log the first entry is reported as "training" and the second as "valid_1".
eval_set = [
    (X_train, y_train),
    (X_val, y_val),
]

# Early stopping is driven by `early_stopping_rounds` set on the estimator itself, so no
# stopping argument is passed here. AUC is requested as the evaluation metric.
# NOTE(review): the training log shows binary_logloss being tracked alongside auc on the
# validation set; confirm which metric early stopping is meant to follow (LightGBM exposes
# `first_metric_only` for this).
model.fit(X_train, y_train, eval_set=eval_set, eval_metric="auc")




[LightGBM] [Info] Number of positive: 2682, number of negative: 5318
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000877 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1879
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.335250 -> initscore=-0.684535
[LightGBM] [Info] Start training from score -0.684535
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[33]	training's auc: 0.83814	training's binary_logloss: 0.478829	valid_1's auc: 0.737436	valid_1's binary_logloss: 0.576181


In [12]:
# NOTE(review): leftover exploratory call. It only prints the signature/docstring of the
# estimator's fit method and does not affect the pipeline. Its execution count (In [12]) is
# lower than the neighbouring cells, i.e. it was run out of order; consider deleting this
# cell before sharing the notebook.
help(model.fit)


Help on method fit in module lightgbm.sklearn:

fit(X: Union[lightgbm.compat.dt_DataTable, List[Union[List[float], List[int]]], numpy.ndarray, pandas.core.frame.DataFrame, scipy.sparse._matrix.spmatrix], y: Union[List[float], List[int], numpy.ndarray, pandas.core.series.Series, pandas.core.frame.DataFrame, lightgbm.compat.pa_Array, lightgbm.compat.pa_ChunkedArray], sample_weight: Union[List[float], List[int], numpy.ndarray, pandas.core.series.Series, lightgbm.compat.pa_Array, lightgbm.compat.pa_ChunkedArray, NoneType] = None, init_score: Union[List[float], List[List[float]], numpy.ndarray, pandas.core.series.Series, pandas.core.frame.DataFrame, lightgbm.compat.pa_Table, lightgbm.compat.pa_Array, lightgbm.compat.pa_ChunkedArray, NoneType] = None, eval_set: Optional[List[Tuple[Union[lightgbm.compat.dt_DataTable, List[Union[List[float], List[int]]], numpy.ndarray, pandas.core.frame.DataFrame, scipy.sparse._matrix.spmatrix], Union[List[float], List[int], numpy.ndarray, pandas.core.series.S

In [23]:
# Predict the default *probability* for every test row: column 1 of predict_proba, i.e. the
# positive class of this binary target.
# The test features are selected by name in the training column order, so the model sees
# the columns in the same order it was fit on. A column missing from test_df raises a
# KeyError here instead of risking a positional mismatch between train and test features.
preds = model.predict_proba(test_df[X_train.columns])[:, 1]

In [24]:
# Fill the sample submission's probability column with the predictions and write the result.
# NOTE(review): assumes the rows of sample_submission.csv are in the same order as test.csv —
# confirm, since no key is used to align them.
probability_column = '채무 불이행 확률'
submit = pd.read_csv('open/sample_submission.csv').assign(**{probability_column: preds})

# Save the result (UTF-8 with BOM, no index column).
submit.to_csv('submission_base3.csv', encoding='UTF-8-sig', index=False)