In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('/Users/daehokim/Desktop/머신러닝 Project/Data/KNN_imputed.csv')
df.head()

Unnamed: 0,datetime,region,temp,rain,wind,humidity,wildfire_count,rain_indicator
0,2020-01-01 00:00:00,속초,-2.6,0.0,3.1,30.0,0.0,1.0
1,2020-01-01 01:00:00,속초,-2.5,0.0,2.1,32.0,0.0,1.0
2,2020-01-01 02:00:00,속초,-2.5,0.0,1.0,38.0,0.0,1.0
3,2020-01-01 03:00:00,속초,-2.0,0.0,2.0,39.0,0.0,1.0
4,2020-01-01 04:00:00,속초,-1.7,0.175,2.6,40.0,0.0,1.0


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_validate
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from scipy.stats import randint, uniform
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 정답 라벨 생성
df["label"] = (df["wildfire_count"] > 0).astype(int)

# 추가 feature 생성

# VPD (증기압 결핍량)
df["VPD"] = (
    0.6108
    * np.exp((17.27 * df["temp"]) / (df["temp"] + 237.3))
    * (1 - df["humidity"] / 100)
)

# rain_presence 컬럼 생성
df["rain_presence"] = (df["rain_indicator"] == 0).astype(int)

# datetime 정렬 및 누적 계산
df["datetime"] = pd.to_datetime(df["datetime"])
df = df.sort_values(["region", "datetime"])

# 7일(=168시간) rolling 합계
df["rain_presence_7day_sum"] = df.groupby("region")["rain_presence"].transform(
    lambda x: x.rolling(window=168, min_periods=1).sum()
)

# 학습용 컬럼 선택
feature_cols = ["temp", "wind", "humidity", "VPD", "rain_presence_7day_sum"]
X = df[feature_cols]
y = df["label"]

# 샘플링
under = RandomUnderSampler(sampling_strategy={0: 10000}, random_state=42)
over = BorderlineSMOTE(sampling_strategy={1: 10000}, random_state=42, k_neighbors=3)

In [4]:
import warnings

# 모든 FutureWarning 무시 (sklearn 포함)
warnings.filterwarnings("ignore", category=FutureWarning)

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from imblearn.pipeline import Pipeline

rf_pipeline = Pipeline(
    [
        ("under", under),
        ("over", over),
        (
            "clf",
            RandomForestClassifier(
                n_estimators=300,
                max_depth=3,
                min_samples_split=10,
                min_samples_leaf=4,
                max_features="sqrt",
                random_state=42,
            ),
        ),
    ]
)

cv_rf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores_rf = cross_validate(
    rf_pipeline,
    X,
    y,
    cv=cv_rf,
    scoring=["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"],
    return_train_score=False,
)

print("\n--- RandomForest ---")
for metric in ["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"]:
    mean = scores_rf[f"test_{metric}"].mean()
    std = scores_rf[f"test_{metric}"].std()
    print(f"{metric:<20}: {mean:.4f} ± {std:.4f}")


--- RandomForest ---
balanced_accuracy   : 0.7973 ± 0.0090
f1_macro            : 0.4366 ± 0.0017
precision_macro     : 0.5007 ± 0.0000
recall_macro        : 0.7973 ± 0.0090


In [6]:
from xgboost import XGBClassifier

xgb_pipeline = Pipeline(
    [
        ("under", under),
        ("over", over),
        (
            "clf",
            XGBClassifier(
                n_estimators=300,
                learning_rate=0.005,
                max_depth=3,
                subsample=0.6,
                colsample_bytree=0.9,
                eval_metric="auc",
                random_state=42,
            ),
        ),
    ]
)

cv_xgb = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

scores_xgb = cross_validate(
    xgb_pipeline,
    X,
    y,
    cv=cv_xgb,
    scoring=["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"],
    return_train_score=False,
)

print("\n--- XGBoost ---")
for metric in ["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"]:
    mean = scores_xgb[f"test_{metric}"].mean()
    std = scores_xgb[f"test_{metric}"].std()
    print(f"{metric:<20}: {mean:.4f} ± {std:.4f}")


--- XGBoost ---
balanced_accuracy   : 0.7982 ± 0.0170
f1_macro            : 0.4382 ± 0.0014
precision_macro     : 0.5007 ± 0.0000
recall_macro        : 0.7982 ± 0.0170


In [7]:
from lightgbm import LGBMClassifier

lgbm_pipeline = Pipeline(
    [
        ("under", under),
        ("over", over),
        (
            "clf",
            LGBMClassifier(
                n_estimators=500,
                learning_rate=0.01,
                max_depth=7,
                num_leaves=50,
                min_child_samples=30,
                min_child_weight=1e-2,
                subsample=0.8,
                colsample_bytree=0.7,
                reg_alpha=0.1,  # L1 정규화
                reg_lambda=1.0,  # L2 정규화
                random_state=42,
                verbose=-1,
            ),
        ),
    ]
)

cv_lgbm = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

scores_lgbm = cross_validate(
    lgbm_pipeline,
    X,
    y,
    cv=cv_lgbm,
    scoring=["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"],
    return_train_score=False,
)

print("\n--- LightGBM ---")
for metric in ["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"]:
    mean = scores_lgbm[f"test_{metric}"].mean()
    std = scores_lgbm[f"test_{metric}"].std()
    print(f"{metric:<20}: {mean:.4f} ± {std:.4f}")


--- LightGBM ---
balanced_accuracy   : 0.7740 ± 0.0193
f1_macro            : 0.4597 ± 0.0012
precision_macro     : 0.5008 ± 0.0001
recall_macro        : 0.7740 ± 0.0193


In [8]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from imblearn.pipeline import Pipeline

gb_pipeline = Pipeline(
    [
        ("under", under),
        ("over", over),
        (
            "clf",
            GradientBoostingClassifier(
                n_estimators=340,
                learning_rate=0.012287721406968568,
                max_depth=2,
                max_features=1.0,
                min_samples_leaf=7,
                subsample=0.7257423924305306,
                random_state=42,
                verbose=0,
            ),
        ),
    ]
)

cv_gb = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

scores_gb = cross_validate(
    gb_pipeline,
    X,
    y,
    cv=cv_gb,
    scoring=["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"],
    return_train_score=False,
)

print("\n--- Gradient Boosting ---")
for metric in ["balanced_accuracy", "f1_macro", "precision_macro", "recall_macro"]:
    mean = scores_gb[f"test_{metric}"].mean()
    std = scores_gb[f"test_{metric}"].std()
    print(f"{metric:<20}: {mean:.4f} ± {std:.4f}")


--- Gradient Boosting ---
balanced_accuracy   : 0.7994 ± 0.0153
f1_macro            : 0.4366 ± 0.0011
precision_macro     : 0.5007 ± 0.0000
recall_macro        : 0.7994 ± 0.0153
