In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import joblib
import warnings
warnings.filterwarnings("ignore")


In [2]:
df=pd.read_csv('forest_fire_synthetic_updated.csv')
df.head(10)

Unnamed: 0,Temperature_C,Humidity_pct,Smoke_ppm,Fire_Sensor,Wind_kmh,ForestFire
0,38.5,70.2,12.0,0,3.9,0
1,34.0,61.6,27.0,0,7.1,1
2,39.5,46.1,39.0,0,3.2,0
3,45.7,33.4,247.0,0,6.2,0
4,33.4,57.6,85.0,0,0.0,1
5,33.4,52.1,19.0,0,9.3,0
6,46.1,61.1,81.0,0,8.0,0
7,40.4,56.4,16.0,0,3.1,0
8,31.7,63.9,130.0,1,12.0,1
9,38.8,35.4,37.0,0,13.6,0


In [3]:
df.drop(columns=['Fire_Sensor','Wind_kmh'])

Unnamed: 0,Temperature_C,Humidity_pct,Smoke_ppm,ForestFire
0,38.5,70.2,12.0,0
1,34.0,61.6,27.0,1
2,39.5,46.1,39.0,0
3,45.7,33.4,247.0,0
4,33.4,57.6,85.0,1
...,...,...,...,...
1495,81.2,9.1,2039.0,1
1496,95.0,46.3,2879.0,1
1497,64.7,11.2,393.0,1
1498,118.5,47.8,360.0,1


In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
import joblib
import sys

In [5]:
# ---------------------------------------------------------------
# STEP 4: Feature Engineering (new ratio features)
# ---------------------------------------------------------------
df["Temp_Humidity_Ratio"] = df["Temperature_C"] / (df["Humidity_pct"] + 1)
df["Temp_Smoke_Ratio"] = df["Temperature_C"] / (df["Smoke_ppm"] + 1)
df["Smoke_Humidity_Ratio"] = df["Smoke_ppm"] / (df["Humidity_pct"] + 1)

features = [
    "Temperature_C", "Humidity_pct", "Smoke_ppm",
    "Temp_Humidity_Ratio", "Temp_Smoke_Ratio", "Smoke_Humidity_Ratio"
]
target = "ForestFire"

X = df[features].astype(float)
y = df[target].astype(int)

print("âœ… Features created successfully!")


âœ… Features created successfully!


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("âœ… Train-test split done")
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

âœ… Train-test split done
Train shape: (1200, 6) Test shape: (300, 6)


In [7]:
# ---------------------------------------------------------------
# STEP 6: Handle class imbalance with SMOTE
# ---------------------------------------------------------------
from collections import Counter
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

print("âœ… After SMOTE balancing:", Counter(y_train_bal))


âœ… After SMOTE balancing: Counter({1: 750, 0: 750})


In [8]:
# ======================================
# ðŸ”¥ XGBoost Grid Search & Evaluation
# ======================================

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_auc_score

# -------------------------------
# Hyperparameter grid
# -------------------------------
params = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# -------------------------------
# Initialize XGBClassifier (latest version)
# -------------------------------
xgb = XGBClassifier(
    eval_metric='logloss',   # required to avoid warning
    random_state=42
)

# -------------------------------
# Grid Search CV
# -------------------------------
grid = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    cv=3,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1
)
grid.fit(X_train_scaled, y_train)

best_model = grid.best_estimator_

print("\nâœ… Best Hyperparameters:", grid.best_params_)

# -------------------------------
# Evaluate Model
# -------------------------------
y_pred = best_model.predict(X_test_scaled)
y_prob = best_model.predict_proba(X_test_scaled)[:, 1]  # for ROC-AUC

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print(f"\nâœ… Accuracy: {accuracy:.4f}")
print(f"âœ… F1 Score: {f1:.4f}")
print(f"âœ… ROC-AUC: {roc_auc:.4f}")

print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Fitting 3 folds for each of 72 candidates, totalling 216 fits

âœ… Best Hyperparameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}

âœ… Accuracy: 0.7167
âœ… F1 Score: 0.8009
âœ… ROC-AUC: 0.7624

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.39      0.51       112
           1       0.72      0.91      0.80       188

    accuracy                           0.72       300
   macro avg       0.72      0.65      0.65       300
weighted avg       0.72      0.72      0.69       300


Confusion Matrix:
 [[ 44  68]
 [ 17 171]]


In [9]:
joblib.dump(best_model, "forest_fire_model.pkl")
joblib.dump(scaler, "scaler.pkl")

print("\nðŸ’¾ Model and Scaler saved successfully!")


ðŸ’¾ Model and Scaler saved successfully!
