* https://docs.pola.rs/user-guide/getting-started/#filter

In [1]:
# [1] 설정/임포트
import pandas as pd
import numpy as np
import polars as pl

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, classification_report

import lightgbm as lgb

SEED = 42
SAMPLE_N = 200_000  # 전체 데이터 사용 시 None
TEST_SIZE = 0.2

In [2]:
df = pl.read_csv('./data/restaurant_20_25.csv', encoding='cp949')

In [3]:
# [2] 타깃 생성 + 누수/식별 컬럼 제거 + (옵션)샘플링 + pandas 변환
# 사전: 노트북 앞 셀에서 df(polars.DataFrame)가 이미 로드되어 있다고 가정
assert isinstance(df, pl.DataFrame), "df(polars.DataFrame)가 먼저 로드되어 있어야 합니다."

# 타깃: 폐업(1) vs 기타(0)
df_s = df.with_columns([
    (pl.col("영업상태명") == "폐업").cast(pl.Int8).alias("target")
])

# (옵션) 빠른 실험을 위한 층화 샘플
if SAMPLE_N is not None and SAMPLE_N < df_s.height:
    # 층화 샘플링을 위해 클래스별 비율 유지
    frac_closed = df_s.filter(pl.col("target") == 1).height / df_s.height
    n_closed = int(SAMPLE_N * frac_closed)
    n_open = SAMPLE_N - n_closed
    df_closed = df_s.filter(pl.col("target") == 1).sample(n=min(n_closed, df_s.filter(pl.col("target")==1).height), seed=SEED)
    df_open = df_s.filter(pl.col("target") == 0).sample(n=min(n_open, df_s.filter(pl.col("target")==0).height), seed=SEED)
    df_s = pl.concat([df_closed, df_open]).sample(fraction=1.0, shuffle=True, seed=SEED)
else:
    # 전체 데이터 사용 시 셔플만
    df_s = df_s.sample(fraction=1.0, shuffle=True, seed=SEED)

# 누수/식별/고카디널리티 텍스트 컬럼 제거
leak_cols = {
    "영업상태명","영업상태구분코드","상세영업상태코드","상세영업상태명",
    "폐업일자","인허가취소일자","휴업시작일자","휴업종료일자","재개업일자"
}
id_text_cols = {
    "번호","관리번호","개방서비스명","개방서비스아이디","사업장명",
    "소재지전화","홈페이지","소재지전체주소","도로명전체주소"
}

drop_cols = [c for c in df_s.columns if c in leak_cols or c in id_text_cols]

use_cols = [c for c in df_s.columns if c not in drop_cols]  # target 포함됨
df_s = df_s.select(use_cols)

# pandas 변환(메모리 절약 옵션)
# pandas 변환(파이애로우 확장 끄기: LightGBM가 싫어함)
pdf = df_s.to_pandas()  # use_pyarrow_extension_array 제거
y = pdf["target"].astype("int8").values
X = pdf.drop(columns=["target"])
del pdf

In [4]:
# 모델에 바로 쓰지 않을 원시 문자열/날짜/공백 이름 컬럼 추가 드롭
non_model_raw_cols = {
    "인허가일자", "최종수정시점", "데이터갱신일자", "데이터갱신구분", "전통업소지정번호", ""
}
drop_more = [c for c in X.columns if c in non_model_raw_cols or (isinstance(c, str) and c.strip() == "")]
if drop_more:
    X = X.drop(columns=drop_more)

In [5]:
# 수치/범주 컬럼 식별
num_candidates = {
    "소재지면적","좌표정보x(epsg5174)","좌표정보y(epsg5174)","총직원수","남성종사자수","여성종사자수",
    "본사직원수","공장사무직직원수","공장판매직직원수","공장생산직직원수","보증액","월세액","시설총규모"
}
cat_candidates = {
    "업태구분명","위생업태명","영업장주변구분명","등급구분명","급수시설구분명","건물소유구분명",
    "다중이용업소여부","전통업소주된음식","개방자치단체코드","도로명우편번호","소재지우편번호"
}

num_cols = [c for c in X.columns if c in num_candidates]
cat_cols = [c for c in X.columns if c in cat_candidates]

# 숫자: coercion → 결측 대치 → numpy float로 고정
for c in num_cols:
    X[c] = pd.to_numeric(X[c], errors="coerce")
for c in num_cols:
    if c in X.columns:
        med = X[c].median(skipna=True)
        X[c] = X[c].fillna(med).astype("float32")  # numpy dtype

# 범주: 문자열 → 결측 "미상" → category로 고정
for c in cat_cols:
    X[c] = X[c].astype("string").fillna("미상").astype("category")

# 남은 object 계열 중 저카디널리티만 category로 흡수(선택)
other_obj = [c for c in X.columns if c not in num_cols and c not in cat_cols and X[c].dtype == "object"]
for c in other_obj:
    if X[c].nunique(dropna=True) <= 50:
        X[c] = X[c].astype("string").fillna("미상").astype("category")
        cat_cols.append(c)

In [6]:
# [3] 수치/범주 컬럼 식별 및 최소 전처리(결측 대치, dtype 캐스팅)
# 후보 목록(교집합으로 안전 처리)
num_candidates = {
    "소재지면적","좌표정보x(epsg5174)","좌표정보y(epsg5174)","총직원수","남성종사자수","여성종사자수",
    "본사직원수","공장사무직직원수","공장판매직직원수","공장생산직직원수","보증액","월세액","시설총규모"
}
cat_candidates = {
    "업태구분명","위생업태명","영업장주변구분명","등급구분명","급수시설구분명","건물소유구분명","다중이용업소여부","전통업소주된음식",
    "개방자치단체코드","도로명우편번호","소재지우편번호"  # 숫자처럼 보여도 범주로 취급
}

num_cols = [c for c in X.columns if c in num_candidates]
cat_cols = [c for c in X.columns if c in cat_candidates]

# 숫자 컬럼: 문자열 포함 가능 → 수치로 강제 변환
for c in num_cols:
    X[c] = pd.to_numeric(X[c], errors="coerce")

# 범주 컬럼: 결측 -> "미상", category 캐스팅
for c in cat_cols:
    X[c] = X[c].astype("string").fillna("미상").astype("category")

# 남은 object/string 컬럼 중 카디널리티 낮은 것만 범주로 흡수(선택)
other_obj = [c for c in X.columns if c not in num_cols and c not in cat_cols and X[c].dtype == "object"]
for c in other_obj:
    if X[c].nunique(dropna=True) <= 50:
        X[c] = X[c].astype("string").fillna("미상").astype("category")
        cat_cols.append(c)

# 수치 결측: 중앙값 대치
for c in num_cols:
    if c in X.columns:
        med = X[c].median(skipna=True)
        X[c] = X[c].fillna(med)

In [7]:
# [4] 데이터 분할(Stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print("y_train ratio (pos):", np.mean(y_train).round(4))

Train: (160000, 24), Test: (40000, 24)
y_train ratio (pos): 0.6915


In [8]:
# [5] LightGBM 학습(얼리 스토핑, 불균형 처리)
clf = lgb.LGBMClassifier(
    objective="binary",
    metric="auc",
    learning_rate=0.05,
    n_estimators=2000,
    num_leaves=31,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    class_weight="balanced",
    random_state=SEED,
    n_jobs=-1
)
fit_kwargs = dict(
    X=X_train, y=y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_metric="auc",
    early_stopping_rounds=100,   # 주석 해제
    verbose=100
)
if len(cat_cols) > 0:
    fit_kwargs["categorical_feature"] = cat_cols

clf.fit(**fit_kwargs)

TypeError: LGBMClassifier.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [None]:
# [6] 평가 및 중요도
from sklearn.metrics import roc_curve, precision_recall_curve

pred_proba = clf.predict_proba(X_test)[:, 1]
pred_label = (pred_proba >= 0.5).astype(int)

roc_auc = roc_auc_score(y_test, pred_proba)
pr_auc = average_precision_score(y_test, pred_proba)

print(f"ROC-AUC: {roc_auc:.4f}")
print(f"PR-AUC : {pr_auc:.4f}")
print("\nConfusion Matrix (th=0.5):")
print(confusion_matrix(y_test, pred_label))
print("\nClassification Report (th=0.5):")
print(classification_report(y_test, pred_label, digits=4))

# 상위 중요도 30개
importances = clf.booster_.feature_importance(importance_type="gain")
feat_names = clf.booster_.feature_name()
imp_series = pd.Series(importances, index=feat_names).sort_values(ascending=False)
print("\nTop-30 Feature Importances (gain):")
print(imp_series.head(30))