# LGBM + bayesian-optimization

In [None]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6


In [None]:
import gc
import os
import pickle
import re
import warnings
from collections import Counter
from itertools import product

import joblib
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Random seed shared by the train/validation split, SMOTE, LightGBM and the Bayesian optimizer.
SEED = 42

In [None]:
from google.colab import drive
import pandas as pd

# Mount Google Drive so the dataset files under /content/drive can be read
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Dataset locations on the mounted Drive
file_path = '/content/drive/My Drive/데이콘/train.csv'
file_path_2 = '/content/drive/My Drive/데이콘/test.csv'

# Read the train and test CSV files into DataFrames
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (file_path, file_path_2))

In [None]:
# Summary statistics of the training columns (sanity check of ranges and scales)
train_data.describe()

Unnamed: 0,대출금액,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
count,96294.0,96294.0,96294.0,96294.0,96294.0,96294.0,96294.0,96294.0,96294.0
mean,18304000.0,93926720.0,19.37959,25.304827,0.345681,822503.5,428228.2,54.380584,0.005805
std,10329080.0,99568710.0,33.569559,12.088566,0.919119,1027745.0,440211.1,1414.769218,0.079966
min,1200000.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
25%,10200000.0,57600000.0,12.65,17.0,0.0,307572.0,134616.0,0.0,0.0
50%,16800000.0,78000000.0,18.74,24.0,0.0,597696.0,287004.0,0.0,0.0
75%,24000000.0,112800000.0,25.54,32.0,0.0,1055076.0,570216.0,0.0,0.0
max,42000000.0,10800000000.0,9999.0,169.0,30.0,41955940.0,5653416.0,75768.0,4.0


In [None]:
# Engineered features: repayment ratios plus the loan term as a number.
# Exact zeros are swapped for this tiny value so the ratios never divide by zero.
# A zero denominator therefore produces a very large ratio rather than inf/NaN.
ZERO_GUARD = 0.00000001


def add_engineered_features(df):
    """Return a copy of ``df`` with the ratio features and the numeric loan term added.

    The same transformation is applied to the train and test frames, so it lives in
    one place. ``DataFrame.assign`` returns a new frame and overwrites columns that
    already exist, which keeps the cell safe to re-run.

    Args:
        df: frame holding the columns '총상환이자', '총상환원금', '대출금액',
            '연간소득' and '대출기간'.

    Returns:
        A new DataFrame with five extra columns appended in the order listed below.
    """
    def guarded(column):
        # Column with exact zeros replaced by ZERO_GUARD.
        return df[column].replace(0, ZERO_GUARD)

    return df.assign(**{
        '원금_대비_이자_비율': guarded('총상환이자') / guarded('총상환원금'),
        '대출_대비_원금_비율': guarded('총상환원금') / df['대출금액'],
        '대출_대비_이자_비율': guarded('총상환이자') / df['대출금액'],
        '연간소득_대비_원금_비율': guarded('총상환원금') / guarded('연간소득'),
        # Loan term as an integer, e.g. '36 months' -> 36. The raw string avoids the
        # invalid-escape warning for '\d'; expand=False yields a Series, not a frame.
        '대출기간_숫자': df['대출기간'].str.extract(r'(\d+)', expand=False).astype(int),
    })


train_data = add_engineered_features(train_data)
test_data = add_engineered_features(test_data)

In [None]:
conti_col = ['대출기간_숫자' , '원금_대비_이자_비율', '대출_대비_원금_비율', '대출_대비_이자_비율', '총상환이자','총상환원금' ]

# Degree-2 polynomial expansion of the continuous columns to capture pairwise
# interactions. The transformer is fitted on the training frame only and then
# reused unchanged for the test frame.
poly = PolynomialFeatures(degree=2)
p_train_data = train_data[conti_col]
poly_features = poly.fit_transform(p_train_data)

# Give the generated columns string names. A bare ndarray would get integer labels
# (0, 1, 2, ...), leaving the frame with mixed int/str column names, which
# scikit-learn-style feature-name validation rejects.
poly_columns = ['poly_{}'.format(i) for i in range(poly_features.shape[1])]

# Pass the source index explicitly so the column-wise concat aligns row for row
# even if the frame does not carry a default RangeIndex.
polyed = pd.DataFrame(poly_features, columns=poly_columns, index=train_data.index)
train_data = pd.concat([train_data, polyed], axis=1)

p_test_data = test_data[conti_col]
poly_features_test = poly.transform(p_test_data)
polyed_test = pd.DataFrame(poly_features_test, columns=poly_columns, index=test_data.index)
test_data = pd.concat([test_data, polyed_test], axis=1)

In [None]:
# Columns that are not fed to the model
unused_columns = ['ID', '근로기간', '연체계좌수', '대출기간']

# Keep the test IDs for the submission file before the column is removed
test_ids = test_data['ID']

train_data = train_data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)

In [None]:
# Separate the target (Y) from the predictors (X)
target_column = '대출등급'
X = train_data.drop(columns=[target_column])
Y = train_data[target_column]

In [None]:
# Outlier detection on the two loan-relative repayment ratios
numerical_features = ['대출_대비_원금_비율', '대출_대비_이자_비율']

# Local Outlier Factor with 20 neighbours and a contamination of 0.01
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.01)

# One label per row; rows labelled 1 are the ones kept below
lof_labels = lof.fit_predict(X[numerical_features])

# Filter with the label array directly instead of a temporary 'LOF_Outlier' column
X = X[lof_labels == 1]

X.describe()

Unnamed: 0,대출금액,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,원금_대비_이자_비율,대출_대비_원금_비율,...,18,19,20,21,22,23,24,25,26,27
count,95331.0,95331.0,95331.0,95331.0,95331.0,95331.0,95331.0,95331.0,95331.0,95331.0,...,95331.0,95331.0,95331.0,95331.0,95331.0,95331.0,95331.0,95331.0,95331.0,95331.0
mean,18389340.0,94109410.0,19.391955,25.321962,0.3459,825252.8,430315.8,51.641376,1965850000.0,0.04775235,...,0.004586476,0.001287743,22743.46,76592.33,0.0007529069,14872.189596,22743.46,379865800000.0,542022700000.0,1744648000000.0
std,10316810.0,99931240.0,33.728229,12.093855,0.919222,1031318.0,441243.9,1394.836463,469888500000.0,0.04802304,...,0.03195708,0.001609679,35631.74,646905.8,0.001042758,26388.05875,35631.74,892174700000.0,1171931000000.0,19171550000000.0
min,1200000.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,8.333333e-16,2.380952e-16,...,5.668934e-32,5.668934e-32,0.0,0.0,5.668934e-32,0.0,0.0,0.0,0.0,0.0
25%,10470000.0,57600000.0,12.65,17.0,0.0,308184.0,135822.0,0.0,0.2998103,0.02349294,...,0.0005519183,0.0002233705,3651.849,7748.429,0.0001158923,1640.44864,3651.849,18447620000.0,46360480000.0,94977380000.0
50%,16800000.0,78000000.0,18.75,24.0,0.0,598944.0,288912.0,0.0,0.4651914,0.043575,...,0.001898781,0.000877264,12648.15,24235.22,0.0003799381,5545.978446,12648.15,83470140000.0,183057400000.0,358733900000.0
75%,24000000.0,112800000.0,25.56,32.0,0.0,1056648.0,572712.0,0.0,0.8937165,0.068998,...,0.004760724,0.001936246,30114.46,61226.13,0.0009603077,16388.02752,30114.46,327999000000.0,581166700000.0,1116505000000.0
max,42000000.0,10800000000.0,9999.0,169.0,30.0,41955940.0,5653416.0,75768.0,135465600000000.0,1.0,...,1.0,0.0594557,2140405.0,41911920.0,0.03293004,887808.679696,2140405.0,31961110000000.0,77054590000000.0,1760301000000000.0


In [None]:
# Outlier detection on annual income
numerical_features = ['연간소득']

# Local Outlier Factor with 20 neighbours and a contamination of 0.01
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.01)

# One label per row; rows labelled 1 are the ones kept below
lof_labels = lof.fit_predict(X[numerical_features])

# Filter with the label array directly instead of a temporary 'LOF_Outlier' column
X = X[lof_labels == 1]

X.describe()

Unnamed: 0,대출금액,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,원금_대비_이자_비율,대출_대비_원금_비율,...,18,19,20,21,22,23,24,25,26,27
count,94377.0,94377.0,94377.0,94377.0,94377.0,94377.0,94377.0,94377.0,94377.0,94377.0,...,94377.0,94377.0,94377.0,94377.0,94377.0,94377.0,94377.0,94377.0,94377.0,94377.0
mean,18298140.0,92866530.0,19.425403,25.265605,0.345529,820200.8,428513.4,52.163387,1985721000.0,0.04774912,...,0.004579007,0.001288448,22629.13,75592.01,0.0007537262,14826.735984,22629.13,377074400000.0,535999600000.0,1696339000000.0
std,10261560.0,99126300.0,33.885438,12.063728,0.918896,1011741.0,439832.6,1401.858878,472257400000.0,0.04794845,...,0.03180539,0.001608499,35182.69,622646.3,0.001044467,26363.468793,35182.69,889332500000.0,1144731000000.0,17599030000000.0
min,1200000.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,8.333333e-16,2.380952e-16,...,5.668934e-32,5.668934e-32,0.0,0.0,5.668934e-32,0.0,0.0,0.0,0.0,0.0
25%,10290000.0,57240000.0,12.69,17.0,0.0,307848.0,135348.0,0.0,0.299815,0.02349273,...,0.0005519082,0.0002233687,3639.337,7702.845,0.000115896,1634.616919,3639.337,18319080000.0,45945410000.0,94770390000.0
50%,16800000.0,78000000.0,18.78,24.0,0.0,596544.0,287652.0,0.0,0.465144,0.043575,...,0.001898781,0.0008772465,12569.25,24224.75,0.0003799381,5516.072284,12569.25,82743670000.0,181304800000.0,355864700000.0
75%,24000000.0,110400000.0,25.6,32.0,0.0,1051068.0,570180.0,0.0,0.8974023,0.068998,...,0.004760724,0.001939134,29992.56,61014.67,0.0009608177,16310.108268,29992.56,325105200000.0,573815700000.0,1104744000000.0
max,42000000.0,10800000000.0,9999.0,169.0,30.0,36721160.0,5653416.0,75768.0,135465600000000.0,1.0,...,1.0,0.0594557,2140405.0,35550000.0,0.03293004,887808.679696,2140405.0,31961110000000.0,77054590000000.0,1348444000000000.0


In [None]:
# Outlier detection on the debt-to-income ratio
numerical_features = ['부채_대비_소득_비율']

# Local Outlier Factor with 20 neighbours; this pass uses a contamination of 0.05
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)

# One label per row; rows labelled 1 are the ones kept below
lof_labels = lof.fit_predict(X[numerical_features])

# Filter with the label array directly instead of a temporary 'LOF_Outlier' column
X = X[lof_labels == 1]

X.describe()

Unnamed: 0,대출금액,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,원금_대비_이자_비율,대출_대비_원금_비율,...,18,19,20,21,22,23,24,25,26,27
count,89658.0,89658.0,89658.0,89658.0,89658.0,89658.0,89658.0,89658.0,89658.0,89658.0,...,89658.0,89658.0,89658.0,89658.0,89658.0,89658.0,89658.0,89658.0,89658.0,89658.0
mean,18304740.0,92893840.0,19.17506,25.258616,0.346472,821579.0,427593.9,50.123268,2090236000.0,0.04781004,...,0.004594122,0.001286598,22605.66,75720.6,0.0007499684,14757.813195,22605.66,375504200000.0,535699200000.0,1699172000000.0
std,10264040.0,99852500.0,8.488509,12.04297,0.923047,1012023.0,438941.7,1384.519693,484526200000.0,0.04804527,...,0.03190799,0.001601414,35031.19,623106.4,0.001039151,26263.494953,35031.19,887330800000.0,1142231000000.0,17665020000000.0
min,1200000.0,6000000.0,0.0,4.0,0.0,0.0,0.0,0.0,8.333333e-16,2.380952e-16,...,5.668934e-32,5.668934e-32,0.0,0.0,5.668934e-32,0.0,0.0,0.0,0.0,0.0
25%,10350000.0,57600000.0,12.86,17.0,0.0,308184.0,135336.0,0.0,0.2998048,0.023493,...,0.000551921,0.0002233523,3639.337,7747.872,0.0001155195,1628.580183,3639.337,18315830000.0,45883410000.0,94977380000.0
50%,16800000.0,78000000.0,18.64,24.0,0.0,598680.0,287088.0,0.0,0.4639654,0.04357992,...,0.001899209,0.0008772267,12555.34,24235.22,0.0003799158,5504.9046,12555.34,82419520000.0,181319400000.0,358417700000.0
75%,24000000.0,110400000.0,25.03,32.0,0.0,1054812.0,568404.0,0.0,0.8927937,0.069,...,0.004761,0.001920842,29970.37,61136.11,0.0009602252,16228.003549,29970.37,323083100000.0,573581100000.0,1112628000000.0
max,42000000.0,10800000000.0,39.96,169.0,30.0,36721160.0,5653416.0,75768.0,135465600000000.0,1.0,...,1.0,0.0594557,2140405.0,35550000.0,0.03293004,887808.679696,2140405.0,31961110000000.0,77054590000000.0,1348444000000000.0


In [None]:
# Outlier detection on the total number of accounts
numerical_features = ['총계좌수']

# Local Outlier Factor with 20 neighbours and a contamination of 0.01
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.01)

# One label per row; rows labelled 1 are the ones kept below
lof_labels = lof.fit_predict(X[numerical_features])

# Filter with the label array directly instead of a temporary 'LOF_Outlier' column
X = X[lof_labels == 1]

X.describe()


Unnamed: 0,대출금액,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,원금_대비_이자_비율,대출_대비_원금_비율,...,18,19,20,21,22,23,24,25,26,27
count,89557.0,89557.0,89557.0,89557.0,89557.0,89557.0,89557.0,89557.0,89557.0,89557.0,...,89557.0,89557.0,89557.0,89557.0,89557.0,89557.0,89557.0,89557.0,89557.0,89557.0
mean,18298910.0,92855140.0,19.171889,25.186663,0.34645,821059.0,427327.9,49.643825,2092594000.0,0.04780426,...,0.004586686,0.001286238,22586.47,75450.31,0.0007496665,14744.856344,22586.47,375013000000.0,535002600000.0,1690585000000.0
std,10261960.0,99882580.0,8.48887,11.844671,0.923075,1008196.0,438641.0,1376.00159,484799300000.0,0.04797358,...,0.03179977,0.001599466,34929.27,617601.1,0.001038797,26231.671536,34929.27,885604200000.0,1139068000000.0,17488610000000.0
min,1200000.0,6000000.0,0.0,4.0,0.0,0.0,0.0,0.0,8.333333e-16,2.380952e-16,...,5.668934e-32,5.668934e-32,0.0,0.0,5.668934e-32,0.0,0.0,0.0,0.0,0.0
25%,10350000.0,57600000.0,12.85,17.0,0.0,308172.0,135204.0,0.0,0.2998048,0.023493,...,0.000551921,0.0002233503,3639.337,7740.701,0.0001155095,1627.92081,3639.337,18280120000.0,45883410000.0,94969980000.0
50%,16800000.0,78000000.0,18.63,24.0,0.0,598152.0,286860.0,0.0,0.4639567,0.04358,...,0.001899216,0.0008772225,12553.06,24235.22,0.0003799158,5495.82135,12553.06,82288660000.0,181172100000.0,357785800000.0
75%,24000000.0,110400000.0,25.03,32.0,0.0,1054368.0,568008.0,0.0,0.8927895,0.069,...,0.004761,0.001918231,29963.2,61136.11,0.0009602148,16197.963471,29963.2,322633100000.0,572721700000.0,1111692000000.0
max,42000000.0,10800000000.0,39.96,92.0,30.0,36721160.0,5653416.0,75768.0,135465600000000.0,1.0,...,1.0,0.0594557,2140405.0,35550000.0,0.03293004,887808.679696,2140405.0,31961110000000.0,77054590000000.0,1348444000000000.0


In [None]:
# Split the column names into categorical (object dtype) and numeric lists
categorical_list = [name for name, dtype in X.dtypes.items() if dtype == 'O']
numerical_list = [name for name, dtype in X.dtypes.items() if not dtype == 'O']

print("categorical_list :", categorical_list)
print("numerical_list :", numerical_list)

categorical_list : ['주택소유상태', '대출목적']
numerical_list : ['대출금액', '연간소득', '부채_대비_소득_비율', '총계좌수', '최근_2년간_연체_횟수', '총상환원금', '총상환이자', '총연체금액', '원금_대비_이자_비율', '대출_대비_원금_비율', '대출_대비_이자_비율', '연간소득_대비_원금_비율', '대출기간_숫자', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]


In [None]:
# One LabelEncoder per categorical column, fitted on the outlier-filtered training
# frame X. LabelEncoder.fit returns the encoder itself, so it can be stored directly.
encoders = {col: LabelEncoder().fit(X[col]) for col in categorical_list}

In [None]:
# Rows that survived the outlier filters; restrict the target to the same rows
remaining_indices = X.index
Y_filtered = Y.loc[remaining_indices]

# Split the index labels 70/30, then look both X and Y up by label so they stay paired
train_idx, valid_idx = train_test_split(remaining_indices, test_size=0.3, random_state=SEED)

x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
y_train, y_valid = Y_filtered.loc[train_idx], Y_filtered.loc[valid_idx]

In [None]:
# Select the training columns in training order. The explicit copy makes X_test an
# independent frame, so the in-place categorical encoding applied to it later never
# writes into (or warns about) a selection of test_data.
X_test = test_data[x_train.columns].copy()

In [None]:
def safe_transform(col, encoder, default_val=-1):
    """Encode ``col`` with a fitted label encoder, sending unseen labels to ``default_val``.

    Args:
        col: pandas Series of raw category values.
        encoder: fitted encoder exposing ``classes_``. A label's code is taken to be
            its position in ``classes_``, which is how the LabelEncoder instances
            built in this notebook number their classes.
        default_val: code assigned to values that are absent from ``classes_``
            (missing values included).

    Returns:
        Series aligned with ``col`` holding the integer codes.
    """
    # Build the label -> code table once and use a single vectorised lookup, instead
    # of calling encoder.transform and scanning a list of labels for every row.
    code_of = {label: code for code, label in enumerate(encoder.classes_)}
    return col.map(code_of).fillna(default_val).astype(int)

# Replace every categorical column with its integer code, using the encoder that was
# fitted for that column.
for col in categorical_list:
    fitted = encoders[col]
    x_train[col] = fitted.transform(x_train[col])
    x_valid[col] = fitted.transform(x_valid[col])
    # The test frame goes through safe_transform, which maps labels the encoder
    # has not seen to a default code instead of raising.
    X_test[col] = safe_transform(X_test[col], fitted)


In [None]:
# Convert the training frame to a plain ndarray. np.asarray also accepts an array
# that has already been converted, so re-running this cell no longer fails on
# `.values` being missing from an ndarray.
x_train = np.asarray(x_train)

# Oversampling strategy passed to SMOTE
sampling_strategy = "auto"
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=SEED)
# NOTE(review): the label-encoded categorical columns are resampled like any other
# numeric column here; a categorical-aware variant such as SMOTENC may be more
# appropriate — confirm before relying on the synthetic rows.
x_train, y_train = smote.fit_resample(x_train, y_train)

# Class distribution after resampling
print("SMOTE class distribution:", Counter(y_train))

SMOTE class distribution: Counter({'A': 18692, 'E': 18692, 'C': 18692, 'B': 18692, 'D': 18692, 'F': 18692, 'G': 18692})


In [None]:
from bayes_opt import BayesianOptimization

# Objective function for the Bayesian optimizer
def lgbm_eval(max_depth, n_estimators):
    """Train a LightGBM classifier with the proposed settings and score it.

    The optimizer proposes real-valued points, so both arguments are rounded to the
    nearest integer before being handed to LightGBM. The training and validation
    data (x_train, y_train, x_valid, y_valid) are read from the notebook namespace.

    Args:
        max_depth: proposed maximum tree depth (rounded to int).
        n_estimators: proposed number of boosting rounds (rounded to int).

    Returns:
        Macro-averaged F1 score on the validation split.
    """
    params = {
        'n_estimators': int(round(n_estimators)),
        'learning_rate': 0.05,
        'max_depth': int(round(max_depth)),
        'num_leaves': 31,
        'reg_alpha': 0.3,
        'objective': 'multiclassova',
        'random_state': SEED,
        # Silence LightGBM's per-fit log lines so they do not interleave with the
        # optimizer's progress table.
        'verbose': -1,
    }

    clf = LGBMClassifier(**params)
    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_valid)
    return f1_score(y_valid, y_pred, average='macro')

# Bayesian optimization over tree depth and the number of boosting rounds
search_bounds = {'max_depth': (7, 15), 'n_estimators': (100, 600)}
optimizer = BayesianOptimization(f=lgbm_eval, pbounds=search_bounds, random_state=SEED, verbose=2)

# 5 initial points, then 20 optimization iterations
optimizer.maximize(init_points=5, n_iter=20)


|   iter    |  target   | max_depth | n_esti... |
-------------------------------------------------
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10709
[LightGBM] [Info] Number of data points in the train set: 130844, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]

KeyboardInterrupt: 

In [None]:
# Best point found by the optimizer. When no evaluation finished (for example the
# maximize() call was interrupted), there is no best result to read, so fail with an
# explicit message instead of an opaque lookup error.
best_result = optimizer.max
if not best_result:
    raise RuntimeError("The optimizer has no completed evaluations; run optimizer.maximize(...) to completion first.")

# Raw real-valued proposals; lgbm_eval rounds them to integers before training.
best_params = best_result['params']
print(best_params)


In [None]:
# Candidate values for each hyperparameter
n_tree = [399, 400]   # n_estimators
l_rate = [0.05]       # learning_rate
m_depth = [12, 13]    # max_depth
L1_norm = [0.3]       # reg_alpha
num_lvs = [31]        # num_leaves

# Parallel records: entry i of every list describes the i-th combination tried.
save_n = []
save_l = []
save_m = []
save_L1 = []
save_num_lv = []
f1_score_ = []

cnt = 0

# product() walks the grid in the same order as nested loops would, with the
# right-most list varying fastest.
for n, l, m, L1, nl in product(n_tree, l_rate, m_depth, L1_norm, num_lvs):
    print(">>> {} <<<".format(cnt))
    cnt += 1
    print("n_estimators : {}, learning_rate : {}, max_depth : {}, reg_alpha : {}, num_leaves : {}".format(n, l, m, L1, nl))
    # random_state matches the other model constructions in this notebook;
    # verbose=-1 keeps LightGBM's log lines out of the printed metrics.
    model = LGBMClassifier(n_estimators=n, learning_rate=l, max_depth=m, reg_alpha=L1, num_leaves=nl,
                           n_jobs=-1, objective='multiclassova', random_state=SEED, verbose=-1)
    model.fit(x_train, y_train)

    # Metrics on the (resampled) training data
    y_pre_train = model.predict(x_train)
    cm_train = confusion_matrix(y_train, y_pre_train)
    print("Train Confusion Matrix")
    print(cm_train)
    print("Train Acc : {}".format(np.diag(cm_train).sum()/cm_train.sum()))
    print("Train F1-Score : {}".format(f1_score(y_train, y_pre_train, average= 'macro')))

    # Metrics on the held-out validation split (labelled "Test" in the printout)
    y_pre_test = model.predict(x_valid)
    cm_test = confusion_matrix(y_valid, y_pre_test)
    # Compute the validation macro-F1 once and reuse it for printing and recording.
    valid_f1 = f1_score(y_valid, y_pre_test, average='macro')
    print("Test Confusion Matrix")
    print(cm_test)
    print("Test Acc : {}".format(np.diag(cm_test).sum()/cm_test.sum()))
    print("Test F1-Score : {}".format(valid_f1))
    print("-----------------------------------------------------------------------")
    print("-----------------------------------------------------------------------")
    save_n.append(n)
    save_l.append(l)
    save_m.append(m)
    save_L1.append(L1)
    save_num_lv.append(nl)
    f1_score_.append(valid_f1)

>>> 0 <<<
n_estimators : 599, learning_rate : 0.09, max_depth : 13, reg_alpha : 0.2, num_leaves : 31
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.129515 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10709
[LightGBM] [Info] Number of data points in the train set: 130844, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142857 -> initscore=-1.791759
[LightGBM] [Info] Start 

In [None]:
# Position of the grid-search run with the highest validation macro-F1
best_idx = np.argmax(f1_score_)

summary_template = ">>> {} <<<\nBest Test f1-score : {}\nBest n_estimators : {}\nBest Learning Rate : {}\nBest Max_depth : {}\nBest L1-norm : {}\n Best num_leaves : {}"
print(summary_template.format(
    best_idx,
    f1_score_[best_idx],
    save_n[best_idx],
    save_l[best_idx],
    save_m[best_idx],
    save_L1[best_idx],
    save_num_lv[best_idx],
))

>>> 0 <<<
Best Test f1-score : 0.9161603150324922
Best n_estimators : 599
Best Learning Rate : 0.09
Best Max_depth : 13
Best L1-norm : 0.2
 Best num_leaves : 31


In [None]:
# Refit the best grid-search configuration and report its metrics.
best_idx = np.argmax(f1_score_)
best_model = LGBMClassifier(
    n_estimators=save_n[best_idx],
    learning_rate=save_l[best_idx],
    max_depth=save_m[best_idx],
    reg_alpha=save_L1[best_idx],
    num_leaves=save_num_lv[best_idx],
    # Same lowercase spelling as in the search cells, so the refit uses the
    # objective the configuration was selected under.
    # NOTE(review): LightGBM's scikit-learn wrapper appears to match the
    # one-vs-all aliases case-sensitively, in which case 'Multiclassova' would
    # silently fall back to 'multiclass' — confirm against the installed version.
    objective='multiclassova',
    random_state=SEED,
)
best_model.fit(x_train, y_train)

# Metrics on the (resampled) training data
y_pre_train = best_model.predict(x_train)
cm_train = confusion_matrix(y_train, y_pre_train)
print("Train Confusion Matrix")
print(cm_train)
print("Train Acc : {}".format(np.diag(cm_train).sum()/cm_train.sum()))
print("Train F1-Score : {}".format(f1_score(y_train, y_pre_train, average='macro')))

# Metrics on the held-out validation split (labelled "Test" in the printout)
y_pre_test = best_model.predict(x_valid)
cm_test = confusion_matrix(y_valid, y_pre_test)
print("Test Confusion Matrix")
print(cm_test)
print("Test Acc : {}".format(np.diag(cm_test).sum()/cm_test.sum()))
print("Test F1-Score : {}".format(f1_score(y_valid, y_pre_test, average='macro')))

[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Number of positive: 18692, number of negative: 112152
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021324 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10709
[LightGBM] [Info] Number of data points in the train set: 130844, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142857 -> initscore=-1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightG

In [None]:
import joblib

# Save the fitted model to a file on the mounted Drive
model_path = '/content/drive/My Drive/데이콘/best_model.joblib'
joblib.dump(best_model, model_path)


['/content/drive/My Drive/데이콘/best_model.joblib']

In [None]:
# Predict the target class for every test row with the refitted best model
predictions = best_model.predict(X_test)



In [None]:
# Pair each test ID with its predicted class
submission = test_ids.to_frame(name='ID')
submission['대출등급'] = predictions

# Write the submission file without the index column
submission.to_csv('submission.csv', index=False)

In [None]:
# Display the submission table as a final visual check
submission

Unnamed: 0,ID,대출등급
0,TEST_00000,B
1,TEST_00001,B
2,TEST_00002,A
3,TEST_00003,C
4,TEST_00004,C
...,...,...
64192,TEST_64192,D
64193,TEST_64193,D
64194,TEST_64194,D
64195,TEST_64195,C


In [None]:
from google.colab import files
# Hand the written submission file to the Colab download helper
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>