In [93]:
# data was accessed from kaggle at the following link on 3-28-25
# https://www.kaggle.com/datasets/rtatman/188-million-us-wildfires
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import label_binarize, StandardScaler
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
np.set_printoptions(precision=10, suppress=True)


os.chdir("c:/Users/AVILA/OneDrive/Documents/GitHub/Forest-fire-prediction")

df = pd.read_csv("forest_fires.csv", low_memory = False)

In [122]:
#preparing dataset similar to how it was prepared in decision_tree_random_forest.ipynb
# normally we would have to scale our data before fitting an SVM model, but since all of the variables in this dataset are binary, we skip this step

fires = pd.DataFrame({"doy": df["DISCOVERY_DOY"],
            "year": df["FIRE_YEAR"],
            "cause_code": df["STAT_CAUSE_CODE"],
            "cause": df["STAT_CAUSE_DESCR"],
            "fire_size": df["FIRE_SIZE"],
            "state": df["STATE"],
            "fire_size_cat": df["FIRE_SIZE_CLASS"]})

fires["season"] = np.where(fires["doy"] < 91, "Q1", "Q2")
fires.loc[(fires["doy"] >= 91*2) & (fires["doy"] < 91*3), "season"] = "Q3"
fires.loc[(fires["doy"] >= 91*3), "season"] = "Q4"
fires = pd.get_dummies(data = fires, columns = ["year", "cause", "state", "season"], drop_first = True)

fires = fires.drop(["doy"], axis = 1)

fires_subset = pd.DataFrame()

for category, group in fires.groupby("fire_size_cat"):
    subset = group.sample(n = 1000, random_state  = 20)
    fires_subset = pd.concat([fires_subset, subset])

fires = fires_subset

print(fires.shape)


(7000, 92)


In [131]:
pca = PCA(n_components = 5)
features = pca.fit_transform(fires.loc[:, "year_1993": "season_Q4"])

X_train, X_test, y_train, y_test = train_test_split(fires.loc[:, "year_1993": "season_Q4"],
                                                    fires.loc[:, "fire_size_cat"],
                                                    test_size = .3,
                                                    random_state = 20,
                                                    shuffle = True)

parameters = {"kernel": ["linear", "rbf"],
                        "C": list(np.arange(7, 15, 1)),
                        "gamma": list(np.arange(7, 15, 1))} # acknowledging some unecessary calculations took place here with linear + gamma

svc = SVC(decision_function_shape = "ovr", class_weight = "balanced")
kfcv = KFold(shuffle = True, random_state = 20, n_splits = 3)
grid_search = GridSearchCV(svc, parameters, cv = kfcv, n_jobs = -1)
grid_search.fit(X_train, y_train)
grid_search.best_estimator_

In [132]:
grid = pd.DataFrame({"params": grid_search.cv_results_["params"], 
              "mean_test_score": grid_search.cv_results_["mean_test_score"],
              "rank_test_score": grid_search.cv_results_["rank_test_score"]})

grid.sort_values("rank_test_score")[0:10]

Unnamed: 0,params,mean_test_score,rank_test_score
40,"{'C': 9, 'gamma': 11, 'kernel': 'linear'}",0.298571,1
42,"{'C': 9, 'gamma': 12, 'kernel': 'linear'}",0.298571,1
46,"{'C': 9, 'gamma': 14, 'kernel': 'linear'}",0.298571,1
44,"{'C': 9, 'gamma': 13, 'kernel': 'linear'}",0.298571,1
36,"{'C': 9, 'gamma': 9, 'kernel': 'linear'}",0.298571,1
38,"{'C': 9, 'gamma': 10, 'kernel': 'linear'}",0.298571,1
34,"{'C': 9, 'gamma': 8, 'kernel': 'linear'}",0.298571,1
32,"{'C': 9, 'gamma': 7, 'kernel': 'linear'}",0.298571,1
0,"{'C': 7, 'gamma': 7, 'kernel': 'linear'}",0.298367,9
2,"{'C': 7, 'gamma': 8, 'kernel': 'linear'}",0.298367,9


In [133]:
svc = SVC(kernel = "linear", C = 9)
svc.fit(X_train, y_train)
y_pred_train = svc.predict(X_train)

y_pred_test = svc.predict(X_test)

# print(confusion_matrix(y_train, y_pred_train))
pd.DataFrame(confusion_matrix(y_test, y_pred_test))
print(f"accuracy score train: {round(accuracy_score(y_train, y_pred_train), 2)}")
print(f"accuracy score test: {round(accuracy_score(y_test, y_pred_test), 2)}")

accuracy score train: 0.35
accuracy score test: 0.29


In [126]:
new_validation = pd.DataFrame({"doy": df["DISCOVERY_DOY"],
            "year": df["FIRE_YEAR"],
            "cause_code": df["STAT_CAUSE_CODE"],
            "cause": df["STAT_CAUSE_DESCR"],
            "fire_size": df["FIRE_SIZE"],
            "state": df["STATE"],
            "fire_size_cat": df["FIRE_SIZE_CLASS"]})

new_validation["season"] = np.where(new_validation["doy"] < 91, "Q1", "Q2")
new_validation.loc[(new_validation["doy"] >=91*2) & (new_validation["doy"] < 91*3), "season"] = "Q3"
new_validation.loc[(new_validation["doy"] >= 91*3), "season"] = "Q4"
new_validation = pd.get_dummies(data = new_validation, columns = ["year", "cause", "state", "season"], drop_first = True)

new_validation_subset = pd.DataFrame()

for category, group in new_validation.groupby("fire_size_cat"):
    subset = group.sample(n = 2500, random_state  = 20)
    new_validation_subset = pd.concat([new_validation_subset, subset])

new_validation = new_validation_subset

new_validation = new_validation.drop(["doy"], axis = 1)
print(new_validation["fire_size_cat"].value_counts())
new_validation.shape

fire_size_cat
A    2500
B    2500
C    2500
D    2500
E    2500
F    2500
G    2500
Name: count, dtype: int64


(17500, 92)

In [135]:
features = new_validation.loc[:, "year_1993": "season_Q4"]
features_pca = pca.fit_transform(features)
labels = new_validation["fire_size_cat"]

y_pred_nv = svc.predict(features)
print(f"accuracy score on resampled new data: {round(accuracy_score(labels, y_pred_nv), 2)}")
pd.DataFrame(confusion_matrix(labels, y_pred_nv))


accuracy score on resampled new data: 0.32


Unnamed: 0,0,1,2,3,4,5,6
0,1062,355,156,108,164,264,391
1,548,712,521,218,205,142,154
2,263,407,871,335,317,139,168
3,304,202,483,432,456,276,347
4,283,120,250,332,559,378,578
5,280,65,131,179,399,506,940
6,269,39,37,62,234,406,1453
