In [46]:
import sys
sys.path.append('/home/sarju/.local/lib/python3.12/site-packages')

In [61]:
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from scipy.stats import uniform, randint
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import time

In [97]:
CAFire = pd.read_csv("cleanedCAFire.csv", index_col="_id", low_memory=False)

In [98]:
# Get the data and clean if removing any blank and na values
CAFire = pd.read_csv("cleanedCAFire.csv", index_col="_id", low_memory=False)
clean_CAFire = CAFire.replace([' ', 'Unknown'], np.nan)

In [50]:
CAFire.columns

Index(['OBJECTID', '* Damage', '* Street Number', '* Street Name',
       '* Street Type (e.g. road, drive, lane, etc.)', '* City', 'Zip Code',
       '* CAL FIRE Unit', 'County', '* Incident Name',
       'Incident Number (e.g. CAAEU 123456)', 'Incident Start Date',
       'If Affected 1-9% - Where did fire start?',
       'If Affected 1-9% - What started fire?',
       'Structure Defense Actions Taken', '* Structure Type',
       'Structure Category', '# Units in Structure (if multi unit)',
       '# of Damaged Outbuildings < 120 SQFT',
       '# of Non Damaged Outbuildings < 120 SQFT', '* Roof Construction',
       '* Eaves', '* Vent Screen', '* Exterior Siding', '* Window Pane',
       '* Deck/Porch On Grade', '* Deck/Porch Elevated',
       '* Patio Cover/Carport Attached to Structure',
       '* Fence Attached to Structure', 'Distance - Propane Tank to Structure',
       'Distance - Residence to Utility/Misc Structure &gt; 120 SQFT',
       'Fire Name (Secondary)', 'GLOBALID', 'L

In [99]:
CAFire["* Damage"] = CAFire["* Damage"].astype("category").cat.codes

#Does not create column for null values
X = pd.get_dummies(CAFire.loc[:, slice('Structure Defense Actions Taken', 'Distance - Residence to Utility/Misc Structure &gt; 120 SQFT')])
y = CAFire["* Damage"]

#Clean for proper usage of XGBoost
X.columns = X.columns.str.replace('[', '').str.replace(']', '').str.replace('<', 'LT').str.replace('>', 'GT')


In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [82]:
# Computationally expensive (takes a long time)

# Tested max_depth from 5-15 (Best: 10)
# Tested learning_rate from 0.01 - 0.1 (Best: 0.05)
# Tested n_estimators from 150 - 250 (Best: 200)

param_grid = {
    'max_depth': [5, 10, 15],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 150, 200]
}

grid_search = GridSearchCV(estimator=xgb.XGBRegressor(), param_grid=param_grid, scoring='neg_mean_squared_error', cv=3)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

Best parameters: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 150}


In [104]:
#Fit the model with the optimized parameters

#Ai prompt: How to check model runtime
start_time = time.time()

model = xgb.XGBClassifier(n_estimators=150, learning_rate=0.1, max_depth=10)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

#Test accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

end_time = time.time()

runtime = end_time - start_time
print(f"Runtime: {runtime:.3f} seconds")

Accuracy: 0.8872
Runtime: 5.579 seconds


In [67]:
# Create dataframe that shows features to importance
df_features = pd.DataFrame({"Features": X.columns, "Importance": model.feature_importances_})


# Ai prompt: How to sort dataframe
df_features = df_features.sort_values(by="Importance", ascending=False)
df_features

Unnamed: 0,Features,Importance
104,* Fence Attached to Structure_Unknown,0.199622
69,* Exterior Siding_Ignition Resistant,0.103504
55,* Eaves_Unknown,0.084554
67,* Exterior Siding_Combustible,0.084350
43,* Roof Construction_Fire Resistant,0.074804
...,...,...
58,* Vent Screen_GT30,0.000000
57,* Vent Screen_21-30,0.000000
59,* Vent Screen_Deck Elevated,0.000000
105,Distance - Propane Tank to Structure_,0.000000


In [84]:
# Find level of importance by columns in original dataframe
df_features['Category'] = df_features['Features'].str.split('_').str[0]

category_importance = df_features.groupby('Category')['Importance'].sum().reset_index()

category_importance = category_importance.sort_values(by='Importance', ascending=False)

print(category_importance)

                                             Category  Importance
6                                   * Exterior Siding    0.220644
7                       * Fence Attached to Structure    0.205315
5                                             * Eaves    0.161369
9                                 * Roof Construction    0.146974
4                               * Deck/Porch On Grade    0.049607
13               Distance - Propane Tank to Structure    0.043412
16                    Structure Defense Actions Taken    0.035373
8         * Patio Cover/Carport Attached to Structure    0.032712
11                                      * Vent Screen    0.019711
10                                   * Structure Type    0.019249
15                                 Structure Category    0.018932
3                               * Deck/Porch Elevated    0.017796
12                                      * Window Pane    0.008375
1               # of Damaged Outbuildings LT 120 SQFT    0.007238
2         

In [85]:
#Retest with only categories that are important

CAFire["* Damage"] = CAFire["* Damage"].astype("category").cat.codes

df_relevant = category_importance[category_importance["Importance"] > 0.1]

X = pd.get_dummies(CAFire[df_relevant["Category"].tolist()])
y = CAFire["* Damage"]

#Clean for proper usage of XGBoost
X.columns = X.columns.str.replace('[', '').str.replace(']', '').str.replace('<', 'LT').str.replace('>', 'GT')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [76]:
param_grid = {
    'max_depth': [5, 10, 15],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 150]
}

grid_search = GridSearchCV(estimator=xgb.XGBRegressor(), param_grid=param_grid, scoring='neg_mean_squared_error', cv=3)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

Best parameters: {'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 100}


In [96]:
start_time = time.time()

model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.05, max_depth=10)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

#Test accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

end_time = time.time()

runtime = end_time - start_time
print(f"Runtime: {runtime:.3f} seconds")

Accuracy: 0.8589
Runtime: 2.072 seconds
