* https://docs.pola.rs/user-guide/getting-started/#filter

In [6]:
# -*- coding: utf-8 -*-
# pandas + LightGBM 폐업예측 파이프라인 (견고한 버전)
import os, json, pickle, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, roc_auc_score, average_precision_score,
    precision_recall_curve, f1_score, confusion_matrix
)
from lightgbm import LGBMClassifier
import lightgbm as lgb  # callbacks용

# ======================
# 설정
# ======================
CSV_PATH = "./data/restaurant_20_25.csv"  # 경로 확인
TARGET_CANDIDATES = ["영업상태명"]
ID_LIKE_PATTERNS = ["사업자", "business", "등록번호", "id", "아이디", "uuid"]
RARE_MIN_COUNT = 20
TEST_SIZE = 0.2
RANDOM_STATE = 42
ARTIFACT_DIR = "./data/lgbm_closure"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# ======================
# 유틸
# ======================
def read_csv_safely(path: str) -> pd.DataFrame:
    """
    1) UTF-8 시도 → 실패시 cp949로 재시도
    2) 크고 지저분한 파일 대비 on_bad_lines='skip'
    """
    try:
        return pd.read_csv(path, encoding="utf-8", on_bad_lines="skip", low_memory=False)
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding="cp949", on_bad_lines="skip", low_memory=False)

def find_target(colnames):
    low = [c.lower() for c in colnames]
    for t in TARGET_CANDIDATES:
        if t.lower() in low:
            return colnames[low.index(t.lower())]
    raise ValueError(f"타깃 컬럼을 찾지 못했습니다. 후보: {TARGET_CANDIDATES}")

def looks_like_id(name: str) -> bool:
    n = name.lower()
    if n in ("index",):
        return True
    for pat in ID_LIKE_PATTERNS:
        if pat.lower() in n:
            return True
    return False

def compress_rare_categories(s: pd.Series, min_count: int) -> pd.Series:
    vc = s.value_counts(dropna=False)
    rare_values = set(vc[vc < min_count].index)
    if not rare_values:
        return s
    return s.where(~s.isin(rare_values), "__RARE__")

def auto_datetime_parse(df: pd.DataFrame) -> pd.DataFrame:
    """
    이름 패턴 기반으로 날짜 후보 컬럼들을 errors='coerce'로 유연 파싱(NaT 처리)
    """
    date_like_keywords = ("일자", "date", "날짜", "개업", "등록", "인허가")
    for c in df.columns:
        cl = c.lower()
        if any(k in cl for k in date_like_keywords):
            # 'YYYY-MM-DD...' 앞 10자리만 남기기
            df[c] = df[c].astype(str).str.slice(0, 10)
            df[c] = pd.to_datetime(df[c], errors="coerce", format="%Y-%m-%d")
    return df

def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    now_year = datetime.now().year
    # 개업연도 → 영업연수
    if "개업연도" in df.columns:
        # 숫자형만 처리
        if pd.api.types.is_numeric_dtype(df["개업연도"]):
            df["영업연수"] = (now_year - df["개업연도"].astype(float)).astype(float)
    # 개업일자/인허가일자 → 영업연수
    for date_col in ["개업일자", "인허가일자"]:
        if date_col in df.columns and pd.api.types.is_datetime64_any_dtype(df[date_col]):
            df["영업연수"] = (now_year - df[date_col].dt.year).astype(float)
    return df

def auto_log1p(df: pd.DataFrame, num_cols, skew_thr: float = 1.0):
    """
    양의 왜도 큰(> skew_thr) & 최소값>=0 인 수치에 log1p 파생
    """
    to_log = []
    for c in num_cols:
        col = pd.to_numeric(df[c], errors="coerce")
        if col.notna().sum() == 0:
            continue
        mn = col.min()
        sk = col.skew()
        if pd.notna(sk) and sk > skew_thr and pd.notna(mn) and mn >= 0:
            to_log.append(c)
    for c in to_log:
        df[f"{c}_log1p"] = np.log1p(pd.to_numeric(df[c], errors="coerce")).astype(float)
    return df

def infer_time_split_key(df: pd.DataFrame):
    """
    날짜/연도 관련 컬럼 중 하나를 분할 기준으로 선택
    """
    for c in df.columns:
        cl = c.lower()
        if pd.api.types.is_datetime64_any_dtype(df[c]) or any(k in cl for k in ["년", "연도", "date", "날짜", "개업", "open", "인허가"]):
            return c
    return None

# ======================
# 1) 로드 & 타깃 식별
# ======================
df = read_csv_safely(CSV_PATH)
print(f"[INFO] shape={df.shape}, columns={list(df.columns)[:8]} ...(+{len(df.columns)-8} more)" if len(df.columns)>8 else f"[INFO] shape={df.shape}, columns={list(df.columns)}")

TARGET = find_target(df.columns)
df = df[~df[TARGET].isna()].copy()

# 타깃을 0/1로 보정 (bool/문자형 모두 커버)
if df[TARGET].dtype == "bool":
    df[TARGET] = df[TARGET].astype(int)
elif pd.api.types.is_numeric_dtype(df[TARGET]):
    df[TARGET] = (df[TARGET].astype(float) > 0).astype(int)  # 0/1/… → 0/1
else:
    # 문자열 매핑
    pos_values = {"1", "true", "t", "y", "yes", "폐업", "closed"}
    df[TARGET] = df[TARGET].astype(str).str.lower().isin(pos_values).astype(int)

# ======================
# 2) 날짜 파싱 & 파생
# ======================
df = auto_datetime_parse(df)
df = add_derived_features(df)

# ======================
# 3) 컬럼 타입 분류/정리
# ======================
# ID/고유키 제거 (정보누설 방지)
drop_cols = [c for c in df.columns if looks_like_id(c) and c != TARGET]
if drop_cols:
    df.drop(columns=drop_cols, inplace=True, errors="ignore")

# 범주/수치 후보
cat_cols = [c for c in df.columns if c != TARGET and (df[c].dtype == "object" or pd.api.types.is_string_dtype(df[c]))]
num_cols = [c for c in df.columns if c != TARGET and pd.api.types.is_numeric_dtype(df[c])]

# 결측치 처리
# - 수치: 중앙값
for c in num_cols:
    med = pd.to_numeric(df[c], errors="coerce").median()
    if pd.isna(med):
        med = 0.0
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(med)

# - 범주: "__MISSING__" + 희귀 카테고리 묶기
for c in cat_cols:
    df[c] = df[c].astype("string").fillna("__MISSING__")
    df[c] = compress_rare_categories(df[c], RARE_MIN_COUNT)

# 왜도 큰 수치 log1p 파생
df = auto_log1p(df, num_cols, skew_thr=1.0)

# pandas category 부여 (LightGBM 네이티브 카테고리 처리)
for c in cat_cols:
    if c in df.columns:
        df[c] = df[c].astype("category")

# ======================
# 4) 분할 방식 결정(시간 분할 우선)
# ======================
split_key = infer_time_split_key(df)
use_time_split = False
if split_key is not None:
    # 연도형 문자열/숫자라면 숫자로 변환 시도
    if not pd.api.types.is_datetime64_any_dtype(df[split_key]):
        # 연도로 보이면 숫자 변환
        df["_split_key_num_"] = pd.to_numeric(df[split_key], errors="coerce")
        key_series = df["_split_key_num_"]
    else:
        key_series = pd.to_datetime(df[split_key], errors="coerce").view("int64")  # NaT→NaN 예방용 int64
    # NaN 비율이 과하면 시간분할 포기
    if key_series.notna().mean() >= 0.7:
        use_time_split = True
    else:
        split_key = None
        if "_split_key_num_" in df.columns:
            df.drop(columns=["_split_key_num_"], inplace=True)

# ======================
# 5) 특징/타깃 분리
# ======================
X_cols = [c for c in df.columns if c != TARGET]
X = df[X_cols].copy()
y = df[TARGET].astype(int).copy()

# ======================
# 6) 학습/검증 분할
# ======================
if use_time_split:
    if pd.api.types.is_datetime64_any_dtype(df[split_key]):
        order = df[split_key].argsort(kind="mergesort")  # 안정 정렬
    elif "_split_key_num_" in df.columns:
        order = df["_split_key_num_"].argsort(kind="mergesort")
    else:
        order = X.index.to_series().argsort(kind="mergesort")
    cutoff = int(len(df) * (1.0 - TEST_SIZE))
    idx_train = order.iloc[:cutoff].index
    idx_test  = order.iloc[cutoff:].index
    X_train, X_test = X.loc[idx_train], X.loc[idx_test]
    y_train, y_test = y.loc[idx_train], y.loc[idx_test]
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

# ======================
# 7) LightGBM 학습 (불균형 보정 + 조기종료)
# ======================
categorical_features = [c for c in cat_cols if c in X_train.columns]
lgbm = LGBMClassifier(
    n_estimators=1500,
    learning_rate=0.04,
    num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    class_weight="balanced",
    random_state=RANDOM_STATE,
    n_jobs=-1
)

lgbm.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="auc",
    callbacks=[lgb.early_stopping(stopping_rounds=80, verbose=True)],
    categorical_feature=categorical_features  # pandas category 처리
)

# ======================
# 8) 평가 (ROC-AUC, PR-AUC, 최적 threshold)
# ======================
proba = lgbm.predict_proba(X_test)[:, 1]
pred_default = (proba >= 0.5).astype(int)

roc = roc_auc_score(y_test, proba)
pr_auc = average_precision_score(y_test, proba)

prec, rec, thr = precision_recall_curve(y_test, proba)
f1s = 2 * (prec * rec) / (prec + rec + 1e-12)
best_idx = int(np.nanargmax(f1s))
best_thr = 0.5 if best_idx >= len(thr) else float(thr[best_idx])
pred_best = (proba >= best_thr).astype(int)

print("\n[기본 threshold=0.5]")
print(classification_report(y_test, pred_default))
print("ROC-AUC:", roc)
print("PR-AUC :", pr_auc)
print("Confusion Matrix:\n", confusion_matrix(y_test, pred_default))

print("\n[최적 F1 threshold=%.4f]" % best_thr)
print(classification_report(y_test, pred_best))
print("Confusion Matrix:\n", confusion_matrix(y_test, pred_best))

# ======================
# 9) 중요 변수 상위 20 (gain)
# ======================
booster = lgbm.booster_
gain_importance = booster.feature_importance(importance_type="gain")
features = booster.feature_name()
order = np.argsort(gain_importance)[::-1]
topk = min(20, len(features))
print("\n[Feature Importance - gain 기준 TOP %d]" % topk)
for i in range(topk):
    j = order[i]
    print(f"{i+1:2d}. {features[j]}  gain={gain_importance[j]:.2f}")

# ======================
# 10) 아티팩트 저장 (모델/메타/임계값)
# ======================
with open(os.path.join(ARTIFACT_DIR, "model_lgbm.pkl"), "wb") as f:
    pickle.dump(lgbm, f)

meta = {
    "target": TARGET,
    "categorical_features": categorical_features,
    "dropped_id_cols": [c for c in drop_cols if c in X_cols],
    "time_split_used": use_time_split,
    "time_col": split_key,
    "best_threshold": best_thr,
    "columns_final": X_cols
}
with open(os.path.join(ARTIFACT_DIR, "preprocess_meta.json"), "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print(f"\n[저장 완료] 모델: {os.path.join(ARTIFACT_DIR,'model_lgbm.pkl')}")
print(f"[저장 완료] 메타 : {os.path.join(ARTIFACT_DIR,'preprocess_meta.json')}")


[INFO] shape=(2227480, 48), columns=['번호', '개방서비스명', '개방서비스아이디', '개방자치단체코드', '관리번호', '인허가일자', '인허가취소일자', '영업상태구분코드'] ...(+40 more)


DTypePromotionError: The DType <class 'numpy.dtypes.DateTime64DType'> could not be promoted by <class 'numpy.dtypes.Float64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int16DType'>, <class 'numpy.dtypes.Int16DType'>, <class 'numpy.dtypes.Int16DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int16DType'>, <class 'numpy.dtypes.Int16DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float32DType'>)