* https://docs.pola.rs/user-guide/getting-started/#filter

In [1]:
# [1] 설정/임포트
import pandas as pd
import numpy as np
import polars as pl

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

SEED = 42
SAMPLE_N = 200_000  # 전체 데이터 사용 시 None
TEST_SIZE = 0.2

In [2]:
df = pd.read_csv('./data/restaurant_20_25.csv', encoding='cp949')

  df = pd.read_csv('./data/restaurant_20_25.csv', encoding='cp949')


In [3]:
le = LabelEncoder()

In [4]:
y_label = le.fit_transform(df['영업상태명'])

In [5]:
set(df['영업상태명'])

{'영업/정상', '폐업'}

In [6]:
# num_data = df.select(pl.col(pl.Float64, pl.Int64))
# print(num_data.columns)

num_data = df.select_dtypes(include=['float64', 'int64'])
print(num_data.columns)

Index(['번호', '개방자치단체코드', '인허가취소일자', '영업상태구분코드', '상세영업상태코드', '휴업시작일자', '휴업종료일자',
       '재개업일자', '소재지면적', '도로명우편번호', '좌표정보x(epsg5174)', '좌표정보y(epsg5174)',
       '남성종사자수', '여성종사자수', '총직원수', '본사직원수', '공장사무직직원수', '공장판매직직원수', '공장생산직직원수',
       '보증액', '월세액', '시설총규모', '홈페이지', 'Unnamed: 47'],
      dtype='object')


In [7]:
num_data = num_data[['번호', '개방자치단체코드', '인허가취소일자', '영업상태구분코드', '상세영업상태코드', '휴업시작일자', '휴업종료일자',
       '재개업일자', '소재지면적', '도로명우편번호', '남성종사자수', '여성종사자수', '총직원수', '본사직원수', '공장사무직직원수', '공장판매직직원수', '공장생산직직원수',
       '보증액', '월세액', '시설총규모', '홈페이지']]
print(num_data.columns)

Index(['번호', '개방자치단체코드', '인허가취소일자', '영업상태구분코드', '상세영업상태코드', '휴업시작일자', '휴업종료일자',
       '재개업일자', '소재지면적', '도로명우편번호', '남성종사자수', '여성종사자수', '총직원수', '본사직원수',
       '공장사무직직원수', '공장판매직직원수', '공장생산직직원수', '보증액', '월세액', '시설총규모', '홈페이지'],
      dtype='object')


In [8]:

y = y_label
X = num_data

In [9]:
# [4] 데이터 분할(Stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print("y_train ratio (pos):", np.mean(y_train).round(4))

Train: (1781984, 21), Test: (445496, 21)
y_train ratio (pos): 0.6915


In [10]:
# [5] LightGBM 학습(얼리 스토핑, 불균형 처리)
clf = lgb.LGBMClassifier(
    objective="binary",
    metric="auc",
    learning_rate=0.05,
    n_estimators=2000,
    num_leaves=31,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    class_weight="balanced",
    random_state=SEED,
    n_jobs=-1
)

In [11]:

fit_kwargs = dict(
    X=X_train, y=y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_metric="auc",
    # early_stopping_rounds=100,   # 주석 해제
    # verbose=100
)

clf.fit(**fit_kwargs)

[LightGBM] [Info] Number of positive: 1232306, number of negative: 549678
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072459 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1315
[LightGBM] [Info] Number of data points in the train set: 1781984, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,2000
,subsample_for_bin,200000
,objective,'binary'
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [12]:
# [6] 평가 및 중요도
from sklearn.metrics import roc_curve, precision_recall_curve

pred_proba = clf.predict_proba(X_test)[:, 1]
pred_label = (pred_proba >= 0.5).astype(int)

roc_auc = roc_auc_score(y_test, pred_proba)
pr_auc = average_precision_score(y_test, pred_proba)

print(f"ROC-AUC: {roc_auc:.4f}")
print(f"PR-AUC : {pr_auc:.4f}")
print("\nConfusion Matrix (th=0.5):")
print(confusion_matrix(y_test, pred_label))
print("\nClassification Report (th=0.5):")
print(classification_report(y_test, pred_label, digits=4))

# 상위 중요도 30개
importances = clf.booster_.feature_importance(importance_type="gain")
feat_names = clf.booster_.feature_name()
imp_series = pd.Series(importances, index=feat_names).sort_values(ascending=False)
print("\nTop-30 Feature Importances (gain):")
print(imp_series.head(30))

ROC-AUC: 1.0000
PR-AUC : 1.0000

Confusion Matrix (th=0.5):
[[137420      0]
 [     0 308076]]

Classification Report (th=0.5):
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000    137420
           1     1.0000    1.0000    1.0000    308076

    accuracy                         1.0000    445496
   macro avg     1.0000    1.0000    1.0000    445496
weighted avg     1.0000    1.0000    1.0000    445496


Top-30 Feature Importances (gain):
영업상태구분코드    1.619146e+07
상세영업상태코드    3.939743e+06
번호          1.354774e+05
총직원수        4.134526e+02
도로명우편번호     6.527021e+01
개방자치단체코드    1.524236e+01
여성종사자수      1.799785e+00
소재지면적       7.811656e-01
본사직원수       5.800411e-01
시설총규모       8.607441e-03
공장사무직직원수    1.696686e-07
남성종사자수      3.092421e-09
월세액         4.919539e-12
공장생산직직원수    1.001870e-12
공장판매직직원수    5.234211e-13
보증액         2.009500e-14
인허가취소일자     0.000000e+00
휴업종료일자      0.000000e+00
휴업시작일자      0.000000e+00
재개업일자       0.000000e+00
홈페이지     

In [13]:
filename = 'lgbm_model.txt'

In [16]:
import pickle
with open(filename, 'wb') as f:
    pickle.dump(clf, f)