## Main model

### 라이브러리 import

In [4]:
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

### 샘플링

In [5]:
# Load the raw dataset.
df = pd.read_csv("final.csv")

# Binary target: 1 when wildfire_count is positive, otherwise 0.
df["label"] = df["wildfire_count"].gt(0).astype(int)

# --- Derived features ---

# VPD (vapor pressure deficit): a saturation-vapor-pressure term computed from
# temperature, multiplied by the unsaturated fraction (1 - humidity / 100).
# NOTE(review): the constants look like the Tetens formula, which would imply
# temp in degrees Celsius and humidity in percent -- confirm against the data.
saturation_vp = 0.6108 * np.exp((17.27 * df["temp"]) / (df["temp"] + 237.3))
df["VPD"] = saturation_vp * (1 - df["humidity"] / 100)

# rain_presence is 1 on rows where rain_indicator equals 0.
# NOTE(review): confirm that 0 in rain_indicator really encodes "rain occurred";
# if it encodes "no rain", this flag is inverted.
df["rain_presence"] = df["rain_indicator"].eq(0).astype(int)

# Parse timestamps and order rows by region, then time, so that the rolling
# window below runs over chronologically ordered rows within each region.
df["datetime"] = pd.to_datetime(df["datetime"])
df = df.sort_values(by=["region", "datetime"])


def _rolling_rain_sum(series):
    """Sum the current row and up to 167 preceding rows of one region."""
    return series.rolling(window=168, min_periods=1).sum()


# Rolling sum of rain_presence over the last 168 rows of each region.
# NOTE(review): 168 rows equal 7 days only if every region has exactly one row
# per hour with no gaps -- verify, or switch to a time-based window.
df["rain_presence_7day_sum"] = df.groupby("region")["rain_presence"].transform(
    _rolling_rain_sum
)

# Columns used as model inputs, and the target.
feature_cols = ["temp", "wind", "humidity", "VPD", "rain_presence_7day_sum"]
X, y = df[feature_cols], df["label"]

# Resampling steps shared by the pipelines below. A dict sampling_strategy maps
# a class label to the number of samples requested for that class: class 0 is
# reduced to 10000 rows, then class 1 is synthesized up to 10000 rows.
under = RandomUnderSampler(random_state=42, sampling_strategy={0: 10000})
over = BorderlineSMOTE(k_neighbors=3, random_state=42, sampling_strategy={1: 10000})

### 기본 모델별 파이프라인 - 하이퍼파라미터 튜닝 전

In [8]:
def _sampled_pipeline(classifier):
    """Wrap ``classifier`` in the shared under-sample -> over-sample pipeline.

    Every pipeline built here exposes the estimator under the step name
    "clf", so parameter paths such as ``clf__n_estimators`` work uniformly
    for all models.
    """
    return Pipeline(
        [
            ("under", under),
            ("over", over),
            ("clf", classifier),
        ]
    )


# Baseline pipelines with default hyperparameters (before tuning), one per
# model family. The GradientBoosting entry previously named its final step
# "gb" while every other pipeline in this notebook uses "clf"; it is now "clf"
# as well so all pipelines share one naming convention.
models = {
    "RandomForest": _sampled_pipeline(RandomForestClassifier(random_state=42)),
    "XGBoost": _sampled_pipeline(XGBClassifier(random_state=42)),
    "LightGBM": _sampled_pipeline(LGBMClassifier(random_state=42, verbose=-1)),
    "GradientBoosting": _sampled_pipeline(
        GradientBoostingClassifier(random_state=42)
    ),
}

### 튜닝 전 결과

In [10]:
from sklearn.model_selection import StratifiedKFold, cross_validate

# Metrics computed on every test fold and reported per model.
metrics = ["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"]

# Number of stratified folds to use for each model.
# NOTE(review): the original comment says these were picked from earlier
# results; that experiment is not part of this section.
best_k_dict = {
    "LightGBM": 10,
    "RandomForest": 5,
    "XGBoost": 10,
    "GradientBoosting": 5,
}

# Cross-validate each baseline pipeline with its own fold count and print the
# mean and standard deviation of every metric across folds.
for model_name, k in best_k_dict.items():
    print(f"\n--- {model_name} ---")

    pipe = models[model_name]
    cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    scores = cross_validate(
        pipe, X, y, cv=cv, scoring=metrics, return_train_score=False
    )

    for metric in metrics:
        fold_scores = scores[f"test_{metric}"]
        mean, std = fold_scores.mean(), fold_scores.std()
        print(f"{metric:<20}: {mean:.4f} ± {std:.4f}")


--- LightGBM ---
balanced_accuracy   : 0.7464 ± 0.0144
f1_macro            : 0.4700 ± 0.0015
precision_macro     : 0.5009 ± 0.0001
recall_macro        : 0.7464 ± 0.0144

--- RandomForest ---
balanced_accuracy   : 0.6834 ± 0.0108
f1_macro            : 0.4770 ± 0.0004
precision_macro     : 0.5008 ± 0.0000
recall_macro        : 0.6834 ± 0.0108

--- XGBoost ---
balanced_accuracy   : 0.7297 ± 0.0189
f1_macro            : 0.4680 ± 0.0014
precision_macro     : 0.5008 ± 0.0001
recall_macro        : 0.7297 ± 0.0189

--- GradientBoosting ---
balanced_accuracy   : 0.7853 ± 0.0028
f1_macro            : 0.4446 ± 0.0022
precision_macro     : 0.5007 ± 0.0000
recall_macro        : 0.7853 ± 0.0028


## Hyperparameter 튜닝

### 1. 랜덤포레스트 test

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from imblearn.pipeline import Pipeline

# Random forest with explicitly chosen hyperparameters (shallow trees,
# minimum split/leaf sizes, sqrt feature sampling).
rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=3,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features="sqrt",
    random_state=42,
)

# Same resampling steps as the baseline, followed by the configured forest.
rf_pipeline = Pipeline(steps=[("under", under), ("over", over), ("clf", rf_clf)])

# 5-fold stratified CV with shuffling and a fixed seed.
cv_rf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf_metrics = ["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"]
scores_rf = cross_validate(
    rf_pipeline, X, y, cv=cv_rf, scoring=rf_metrics, return_train_score=False
)

# Report mean and standard deviation across folds for each metric.
print("\n--- RandomForest ---")
for metric in rf_metrics:
    fold_scores = scores_rf[f"test_{metric}"]
    mean, std = fold_scores.mean(), fold_scores.std()
    print(f"{metric:<20}: {mean:.4f} ± {std:.4f}")


--- RandomForest ---
balanced_accuracy   : 0.7925 ± 0.0032
f1_macro            : 0.4352 ± 0.0046
precision_macro     : 0.5006 ± 0.0000
recall_macro        : 0.7925 ± 0.0032


### 2. XGBoost test

In [69]:
from xgboost import XGBClassifier

# XGBoost with explicitly chosen hyperparameters: many shallow trees with a
# small learning rate plus row/column subsampling.
xgb_params = dict(
    n_estimators=300,
    learning_rate=0.005,
    max_depth=3,
    subsample=0.6,
    colsample_bytree=0.9,
    eval_metric="auc",
    random_state=42,
)

# Resample first (under, then over), then fit the classifier.
xgb_pipeline = Pipeline(
    [
        ("under", under),
        ("over", over),
        ("clf", XGBClassifier(**xgb_params)),
    ]
)

# 10-fold stratified CV with shuffling and a fixed seed.
cv_xgb = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

xgb_metrics = ["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"]
scores_xgb = cross_validate(
    xgb_pipeline, X, y, cv=cv_xgb, scoring=xgb_metrics, return_train_score=False
)

# Report mean and standard deviation across folds for each metric.
print("\n--- XGBoost ---")
for metric in xgb_metrics:
    per_fold = scores_xgb[f"test_{metric}"]
    mean = per_fold.mean()
    std = per_fold.std()
    print(f"{metric:<20}: {mean:.4f} ± {std:.4f}")


--- XGBoost ---
balanced_accuracy   : 0.7928 ± 0.0133
f1_macro            : 0.4343 ± 0.0018
precision_macro     : 0.5006 ± 0.0000
recall_macro        : 0.7928 ± 0.0133


### 3. LightGBM test

In [10]:
from lightgbm import LGBMClassifier

# LightGBM with explicitly chosen hyperparameters.
# NOTE(review): per the LightGBM parameter docs, row subsampling (`subsample`,
# alias of bagging_fraction) only takes effect when `subsample_freq` is > 0.
# It is not set here, so subsample=0.8 is presumably a no-op -- confirm, and
# re-tune if bagging is actually wanted.
lgbm_clf = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=7,
    num_leaves=50,
    min_child_samples=30,
    min_child_weight=1e-2,
    subsample=0.8,
    colsample_bytree=0.7,
    reg_alpha=0.1,  # L1 regularization
    reg_lambda=1.0,  # L2 regularization
    random_state=42,
    verbose=-1,
)

# Resample first (under, then over), then fit the classifier.
lgbm_pipeline = Pipeline(
    [
        ("under", under),
        ("over", over),
        ("clf", lgbm_clf),
    ]
)

# 10-fold stratified CV with shuffling and a fixed seed.
cv_lgbm = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

lgbm_metrics = ["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"]
scores_lgbm = cross_validate(
    lgbm_pipeline,
    X,
    y,
    cv=cv_lgbm,
    scoring=lgbm_metrics,
    return_train_score=False,
)

# Report mean and standard deviation across folds for each metric.
print("\n--- LightGBM ---")
for metric in lgbm_metrics:
    fold_values = scores_lgbm[f"test_{metric}"]
    mean, std = fold_values.mean(), fold_values.std()
    print(f"{metric:<20}: {mean:.4f} ± {std:.4f}")


--- LightGBM ---
balanced_accuracy   : 0.7772 ± 0.0155
f1_macro            : 0.4502 ± 0.0019
precision_macro     : 0.5007 ± 0.0000
recall_macro        : 0.7772 ± 0.0155


### 4. Gradient Boosting test

In [16]:
# Gradient boosting evaluated with a fixed set of "best" hyperparameters.
# NOTE(review): the unrounded learning_rate / subsample values look like the
# output of a randomized or Bayesian search that is not shown here -- confirm.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold

gb_clf = GradientBoostingClassifier(
    learning_rate=0.012287721406968568,
    max_depth=2,
    max_features=1.0,
    min_samples_leaf=7,
    n_estimators=340,
    subsample=0.7257423924305306,
    random_state=42,
)

# Resample first (under, then over), then fit the classifier.
gb_pipeline = Pipeline([("under", under), ("over", over), ("clf", gb_clf)])

# 5-fold stratified CV with shuffling and a fixed seed.
cv_gb = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

gb_metrics = ["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"]
scores_gb = cross_validate(
    gb_pipeline, X, y, cv=cv_gb, scoring=gb_metrics, return_train_score=False
)

# Report mean and standard deviation across folds for each metric.
print("\n--- Gradient Boosting ---")
for metric in gb_metrics:
    per_fold = scores_gb[f"test_{metric}"]
    mean, std = per_fold.mean(), per_fold.std()
    print(f"{metric:<20}: {mean:.4f} ± {std:.4f}")


--- Gradient Boosting ---
balanced_accuracy   : 0.7923 ± 0.0049
f1_macro            : 0.4389 ± 0.0035
precision_macro     : 0.5007 ± 0.0000
recall_macro        : 0.7923 ± 0.0049


### 논샘플링 버전 모델

In [66]:
# Pipelines WITHOUT any resampling step, used to compare against the
# under/over-sampled pipelines. Each classifier is still wrapped in a
# single-step Pipeline under the name "clf" so these objects can be passed to
# the same cross-validation code as the sampled pipelines.
nosampling_models = {
    "RandomForest": Pipeline(
        [
            (
                "clf",
                RandomForestClassifier(
                    n_estimators=300,
                    max_depth=15,
                    min_samples_split=10,
                    min_samples_leaf=4,
                    random_state=42,
                ),
            ),
        ]
    ),
    "XGBoost": Pipeline(
        [
            (
                "clf",
                # `use_label_encoder=False` was dropped: the argument is
                # deprecated in XGBoost and ignored by current releases (it
                # only produces an "unused parameter" warning). The tuned
                # XGBoost pipeline in this notebook does not pass it either.
                XGBClassifier(
                    n_estimators=500,
                    learning_rate=0.05,
                    max_depth=6,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    eval_metric="logloss",
                    random_state=42,
                ),
            ),
        ]
    ),
    "LightGBM": Pipeline(
        [
            ("clf", LGBMClassifier(random_state=42, verbose=-1)),
        ]
    ),
}

### 검증

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate

# Metrics computed on every test fold and reported per model.
nosampling_metrics = [
    "balanced_accuracy",
    "f1_macro",
    "precision_macro",
    "recall_macro",
]

# Number of stratified folds to use for each no-sampling model.
best_k_dict = {
    "LightGBM": 10,
    "RandomForest": 3,
    "XGBoost": 10,
}

# Cross-validate each no-sampling pipeline with its own fold count and print
# the mean and standard deviation of every metric across folds.
for model_name, k in best_k_dict.items():
    print(f"\n--- {model_name} (k={k}) ---")

    pipe = nosampling_models[model_name]
    cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    scores = cross_validate(
        pipe, X, y, cv=cv, scoring=nosampling_metrics, return_train_score=False
    )

    for metric in nosampling_metrics:
        fold_scores = scores[f"test_{metric}"]
        mean, std = fold_scores.mean(), fold_scores.std()
        print(f"{metric:<20}: {mean:.4f} ± {std:.4f}")