## Main model

### 라이브러리 import

In [32]:
import warnings

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import BorderlineSMOTE, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    VotingClassifier,
)

from sklearn.metrics import (
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

### 데이터 처리

In [33]:
# 데이터 불러오기
df = pd.read_csv("final.csv")

# 정답 라벨 생성
df["label"] = (df["wildfire_count"] > 0).astype(int)

### 추가 feature 생성

In [34]:
# 1. 시간 및 시즌 정보
df["datetime"] = pd.to_datetime(df["datetime"])
df["month"] = df["datetime"].dt.month
df["weekday"] = df["datetime"].dt.weekday
df["hour"] = df["datetime"].dt.hour
df["is_weekend"] = df["weekday"].isin([5, 6]).astype(int)

# 2. Lag·Rolling 추가
df = df.sort_values(["region", "datetime"])
df["temp_lag1"] = df.groupby("region")["temp"].shift(1)
df["temp_3day_mean"] = df.groupby("region")["temp"].transform(
    lambda x: x.rolling(window=72, min_periods=1).mean()
)

# 3. VPD (증기압 결핍량)
df["VPD"] = (
    0.6108
    * np.exp((17.27 * df["temp"]) / (df["temp"] + 237.3))
    * (1 - df["humidity"] / 100)
)

# 4. rain_presence 컬럼 생성
df["rain_presence"] = (df["rain_indicator"] == 0).astype(int)

# datetime 정렬 및 누적 계산
df["datetime"] = pd.to_datetime(df["datetime"])
df = df.sort_values(["region", "datetime"])

# 7일(=168시간) rolling 합계
df["rain_presence_7day_sum"] = df.groupby("region")["rain_presence"].transform(
    lambda x: x.rolling(window=168, min_periods=1).sum()
)

df["label"] = (df["wildfire_count"] > 0).astype(int)
df.fillna(0, inplace=True)  # Lag나 rolling 후 NaN 처리

# 학습용 컬럼 선택
feature_cols = [
    "temp",
    "wind",
    "humidity",
    "VPD",
    "rain_presence_7day_sum",
    "month",
    "weekday",
    "hour",
    "is_weekend",
    "temp_lag1",
    "temp_3day_mean",
]

X = df[feature_cols]
y = df["label"]

# 샘플링
under = RandomUnderSampler(sampling_strategy={0: 10000}, random_state=42)
over = BorderlineSMOTE(sampling_strategy={1: 10000}, random_state=42, k_neighbors=3)

### 기본 모델별 파이프라인 - 하이퍼파라미터 튜닝 전

In [35]:
models = {
    "RandomForest": Pipeline(
        [
            ("under", under),
            ("over", over),
            (
                "clf",
                RandomForestClassifier(
                    random_state=42,
                ),
            ),
        ]
    ),
    "XGBoost": Pipeline(
        [
            ("under", under),
            ("over", over),
            (
                "clf",
                XGBClassifier(
                    random_state=42,
                ),
            ),
        ]
    ),
    "LightGBM": Pipeline(
        [
            ("under", under),
            ("over", over),
            ("clf", LGBMClassifier(random_state=42, verbose=-1)),
        ]
    ),
    "GradientBoosting": Pipeline(
        [
            ("under", under),
            ("over", over),
            ("gb", GradientBoostingClassifier(random_state=42)),
        ]
    ),
}

### 튜닝 전 결과

In [36]:
# 모델별 최적 k 딕셔너리
best_k_dict = {
    "LightGBM": 10,
    "RandomForest": 5,
    "XGBoost": 10,
    "GradientBoosting": 5,
}

# 최적 k 기준으로 교차 검증
for model_name, k in best_k_dict.items():
    print(f"\n--- {model_name} ---")

    pipe = models[model_name]
    cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    scores = cross_validate(
        pipe,
        X,
        y,
        cv=cv,
        scoring=["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"],
        return_train_score=False,
    )

    for metric in ["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"]:
        mean = scores[f"test_{metric}"].mean()
        std = scores[f"test_{metric}"].std()
        print(f"{metric:<20}: {mean:.4f} ± {std:.4f}")


--- LightGBM ---
balanced_accuracy   : 0.7498 ± 0.0261
f1_macro            : 0.4764 ± 0.0008
precision_macro     : 0.5011 ± 0.0001
recall_macro        : 0.7498 ± 0.0261

--- RandomForest ---
balanced_accuracy   : 0.7209 ± 0.0167
f1_macro            : 0.4830 ± 0.0011
precision_macro     : 0.5012 ± 0.0001
recall_macro        : 0.7209 ± 0.0167

--- XGBoost ---
balanced_accuracy   : 0.7292 ± 0.0285
f1_macro            : 0.4802 ± 0.0006
precision_macro     : 0.5011 ± 0.0001
recall_macro        : 0.7292 ± 0.0285

--- GradientBoosting ---
balanced_accuracy   : 0.7953 ± 0.0121
f1_macro            : 0.4610 ± 0.0015
precision_macro     : 0.5009 ± 0.0000
recall_macro        : 0.7953 ± 0.0121


## Hyperparameter 튜닝

### 1. RandomForest test

In [37]:
rf_pipeline = Pipeline(
    [
        ("under", under),
        ("over", over),
        (
            "clf",
            RandomForestClassifier(
                n_estimators=300,
                max_depth=3,
                min_samples_split=10,
                min_samples_leaf=4,
                max_features="sqrt",
                random_state=42,
            ),
        ),
    ]
)

cv_rf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores_rf = cross_validate(
    rf_pipeline,
    X,
    y,
    cv=cv_rf,
    scoring=["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"],
    return_train_score=False,
)

print("\n--- RandomForest ---")
for metric in ["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"]:
    mean = scores_rf[f"test_{metric}"].mean()
    std = scores_rf[f"test_{metric}"].std()
    print(f"{metric:<20}: {mean:.4f} ± {std:.4f}")


--- RandomForest ---
balanced_accuracy   : 0.8012 ± 0.0104
f1_macro            : 0.4356 ± 0.0022
precision_macro     : 0.5007 ± 0.0000
recall_macro        : 0.8012 ± 0.0104


### 2. XGBoost test

In [38]:
xgb_pipeline = Pipeline(
    [
        ("under", under),
        ("over", over),
        (
            "clf",
            XGBClassifier(
                n_estimators=300,
                learning_rate=0.005,
                max_depth=3,
                subsample=0.6,
                colsample_bytree=0.9,
                eval_metric="auc",
                random_state=42,
            ),
        ),
    ]
)

cv_xgb = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

scores_xgb = cross_validate(
    xgb_pipeline,
    X,
    y,
    cv=cv_xgb,
    scoring=["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"],
    return_train_score=False,
)

print("\n--- XGBoost ---")
for metric in ["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"]:
    mean = scores_xgb[f"test_{metric}"].mean()
    std = scores_xgb[f"test_{metric}"].std()
    print(f"{metric:<20}: {mean:.4f} ± {std:.4f}")


--- XGBoost ---
balanced_accuracy   : 0.8020 ± 0.0162
f1_macro            : 0.4379 ± 0.0020
precision_macro     : 0.5007 ± 0.0000
recall_macro        : 0.8020 ± 0.0162


### 3. LightGBM test

In [39]:
lgbm_pipeline = Pipeline(
    [
        ("under", under),
        ("over", over),
        (
            "clf",
            LGBMClassifier(
                n_estimators=500,
                learning_rate=0.01,
                max_depth=7,
                num_leaves=50,
                min_child_samples=30,
                min_child_weight=1e-2,
                subsample=0.8,
                colsample_bytree=0.7,
                reg_alpha=0.1,  # L1 정규화
                reg_lambda=1.0,  # L2 정규화
                random_state=42,
                verbose=-1,
            ),
        ),
    ]
)

cv_lgbm = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

scores_lgbm = cross_validate(
    lgbm_pipeline,
    X,
    y,
    cv=cv_lgbm,
    scoring=["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"],
    return_train_score=False,
)

print("\n--- LightGBM ---")
for metric in ["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"]:
    mean = scores_lgbm[f"test_{metric}"].mean()
    std = scores_lgbm[f"test_{metric}"].std()
    print(f"{metric:<20}: {mean:.4f} ± {std:.4f}")


--- LightGBM ---
balanced_accuracy   : 0.7818 ± 0.0219
f1_macro            : 0.4674 ± 0.0007
precision_macro     : 0.5010 ± 0.0001
recall_macro        : 0.7818 ± 0.0219


### 4. Gradient Boosting test

In [40]:
gb_pipeline = Pipeline(
    [
        ("under", under),
        ("over", over),
        (
            "clf",
            GradientBoostingClassifier(
                learning_rate=0.012287721406968568,
                max_depth=2,
                max_features=1.0,
                min_samples_leaf=7,
                n_estimators=340,
                subsample=0.7257423924305306,
                random_state=42,
            ),
        ),
    ]
)

cv_gb = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores_gb = cross_validate(
    gb_pipeline,
    X,
    y,
    cv=cv_gb,
    scoring=[
        "balanced_accuracy",
        "f1_macro",
        "precision_macro",
        "recall_macro",
    ],
    return_train_score=False,
)

print("\n--- Gradient Boosting ---")
for metric in ["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"]:
    mean = scores_gb[f"test_{metric}"].mean()
    std = scores_gb[f"test_{metric}"].std()
    print(f"{metric:<20}: {mean:.4f} ± {std:.4f}")


--- Gradient Boosting ---
balanced_accuracy   : 0.8055 ± 0.0075
f1_macro            : 0.4431 ± 0.0020
precision_macro     : 0.5007 ± 0.0000
recall_macro        : 0.8055 ± 0.0075


### 5. Ensemble test

In [41]:
# Ensemble 용 샘플링 정의
under = RandomUnderSampler(sampling_strategy={0: 7000}, random_state=42)
over = SMOTE(sampling_strategy={1: 10000}, random_state=42)

# 개별 분류기 정의
gb_clf = GradientBoostingClassifier(
    n_estimators=340,
    learning_rate=0.0123,
    max_depth=2,
    min_samples_leaf=7,
    max_features=1.0,
    subsample=0.726,
    random_state=42,
)

xgb_clf = XGBClassifier(
    n_estimators=300,
    learning_rate=0.02,
    max_depth=3,
    subsample=0.8,
    eval_metric="logloss",
    random_state=42,
)

rf_clf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)

# VotingClassifier 구성
voting_clf = VotingClassifier(
    estimators=[
        ("gb", gb_clf),
        ("xgb", xgb_clf),
        ("rf", rf_clf),
    ],
    voting="soft",
    n_jobs=-1,
)

# 교차 검증
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
ba_list, pm_list, rm_list, fm_list = [], [], [], []

for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X, y), start=1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # 샘플링 수행
    X_resampled, y_resampled = under.fit_resample(X_train, y_train)
    X_resampled, y_resampled = over.fit_resample(X_resampled, y_resampled)

    # 학습 및 예측
    voting_clf.fit(X_resampled, y_resampled)
    y_pred = voting_clf.predict(X_test)

    # 평가
    ba = balanced_accuracy_score(y_test, y_pred)
    pm = precision_score(y_test, y_pred, average="macro", zero_division=0)
    rm = recall_score(y_test, y_pred, average="macro", zero_division=0)
    fm = f1_score(y_test, y_pred, average="macro", zero_division=0)

    ba_list.append(ba)
    pm_list.append(pm)
    rm_list.append(rm)
    fm_list.append(fm)

# 최종 요약
print("\n--- Ensemble ---")
print(f"balanced_accuracy   : {np.mean(ba_list):.4f} ± {np.std(ba_list):.4f}")
print(f"f1_macro            : {np.mean(fm_list):.4f} ± {np.std(fm_list):.4f}")
print(f"precision_macro     : {np.mean(pm_list):.4f} ± {np.std(pm_list):.4f}")
print(f"recall_macro        : {np.mean(rm_list):.4f} ± {np.std(rm_list):.4f}")


--- Ensemble ---
balanced_accuracy   : 0.8076 ± 0.0091
f1_macro            : 0.4488 ± 0.0011
precision_macro     : 0.5008 ± 0.0000
recall_macro        : 0.8076 ± 0.0091


### 논샘플링 버전 모델

In [42]:
nosampling_models = {
    "RandomForest": Pipeline(
        [
            (
                "clf",
                RandomForestClassifier(
                    n_estimators=300,
                    max_depth=15,
                    min_samples_split=10,
                    min_samples_leaf=4,
                    random_state=42,
                ),
            ),
        ]
    ),
    "XGBoost": Pipeline(
        [
            (
                "clf",
                XGBClassifier(
                    n_estimators=500,
                    learning_rate=0.05,
                    max_depth=6,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    use_label_encoder=False,
                    eval_metric="logloss",
                    random_state=42,
                ),
            ),
        ]
    ),
    "LightGBM": Pipeline(
        [
            ("clf", LGBMClassifier(random_state=42, verbose=-1)),
        ]
    ),
}

### 검증

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate

# 모델별 최적 k 딕셔너리
best_k_dict = {
    "LightGBM": 10,
    "RandomForest": 3,
    "XGBoost": 10,
}

# 최적 k 기준으로 다시 교차 검증
for model_name, k in best_k_dict.items():
    print(f"\n--- {model_name} (k={k}) ---")

    pipe = nosampling_models[model_name]
    cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    scores = cross_validate(
        pipe,
        X,
        y,
        cv=cv,
        scoring=["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"],
        return_train_score=False,
    )

    for metric in ["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"]:
        mean = scores[f"test_{metric}"].mean()
        std = scores[f"test_{metric}"].std()
        print(f"{metric:<20}: {mean:.4f} ± {std:.4f}")