In [12]:
''' Dependencies '''
import pandas as pd
import numpy as np
import statsmodels.tools.tools as stattools
from sklearn.tree import DecisionTreeClassifier, export_graphviz

''' Suppress Warnings '''
import warnings
warnings.filterwarnings('ignore')

In [13]:
# Read the cleaned wildfire table from the local data folder, then preview its
# first five rows as the cell output.
wildfires = pd.read_csv(filepath_or_buffer=r'./data/df_Cleaned.csv')
wildfires.head(n=5)

Unnamed: 0,OBJECTID,Date_Start,Date_Finish,Acres,FireCause,Lat,Long,DispatchCenterID,PredominantFuelGroup,State,...,tempmin,temp,humidity,precip,windspeed,pressure,visibility,solarradiation,conditions,FireOccured
0,127.0,10/18/2020,3/10/2021,562.913504,Unknown,36.07114,-121.4505,CALPCC,Unknown,US-CA,...,49.9,58.8,67.29,0.0,12.7,1015.8,9.2,515.4,Clear,1
1,128.0,5/1/2020,5/15/2020,0.15168,Unknown,39.55669,-119.5585,NVSFC,Grass,US-NV,...,41.2,60.6,25.39,0.0,18.6,1015.5,9.9,510.1,Clear,1
2,129.0,8/8/2020,8/20/2020,0.3,Human,33.29384,-110.45,AZPHC,Grass-Shrub,US-AZ,...,,,,,,,,,,1
3,130.0,5/8/2020,5/26/2020,44.300517,Human,35.87582,-115.2041,NVLIC,Grass-Shrub,US-NV,...,58.6,76.9,11.12,0.0,6.9,1011.0,9.9,482.7,Clear,1
4,133.0,8/21/2020,8/22/2020,4.0,Human,44.035131,-103.036037,SDGPC,Grass,US-SD,...,33.3,47.4,59.35,0.07,20.1,1017.0,9.8,336.0,"Rain, Partially cloudy",1


In [14]:
# Print each column's non-null count and dtype to see which fields have missing values.
wildfires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21404 entries, 0 to 21403
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   OBJECTID              7125 non-null   float64
 1   Date_Start            21404 non-null  object 
 2   Date_Finish           7125 non-null   object 
 3   Acres                 7125 non-null   float64
 4   FireCause             7125 non-null   object 
 5   Lat                   21404 non-null  float64
 6   Long                  21404 non-null  float64
 7   DispatchCenterID      7122 non-null   object 
 8   PredominantFuelGroup  7125 non-null   object 
 9   State                 21404 non-null  object 
 10  Fire_Duration         6056 non-null   float64
 11  Fire_Duration_Group   6056 non-null   object 
 12  Address               21404 non-null  object 
 13  URLAddress            21404 non-null  object 
 14  tempmax               20874 non-null  float64
 15  tempmin            

In [50]:
''' Subset of wildfires for input and output variables '''
# Predictor columns (temp, humidity, conditions) plus the FireOccured target.
model_columns = ['temp', 'humidity', 'conditions', 'FireOccured']

# Label-based selection raises KeyError when a requested column is missing or
# misspelled, whereas DataFrame.filter would silently leave it out. The explicit
# .copy() gives the later imputation cells an independent frame to assign into,
# so `wildfires` itself is never modified.
wf_sub = wildfires.loc[:, model_columns].copy()
wf_sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21404 entries, 0 to 21403
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   temp         20874 non-null  float64
 1   humidity     20843 non-null  float64
 2   conditions   20874 non-null  object 
 3   FireOccured  21404 non-null  int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 669.0+ KB


### CART Model for Predicting Wildfires without Humidity as a Predictor Variable

In [59]:
''' Output variable and names '''
y = wf_sub[['FireOccured']]
# export_graphviz reads class_names in ascending order of the class labels, so the
# name for FireOccured == 0 ("no") has to come before FireOccured == 1 ("yes").
y_names = ["no", "yes"]

''' Fill missing temperature values with the mean '''
# NOTE(review): the mean is computed over every row, i.e. before the train/test
# split, so held-out rows influence the imputed value.
wf_sub['temp'] = wf_sub['temp'].fillna(value = wf_sub['temp'].mean())

''' Converting categorical variables to dummy variable form '''
con_dum = pd.get_dummies(wf_sub['conditions'])

''' Input variables '''
X = pd.concat((wf_sub[['temp']], con_dum), axis=1)
# Read the feature names off the frame itself. get_dummies decides the order (and
# number) of the indicator columns, and a hand-typed list that disagrees with it
# would attach the wrong label to each split in the exported tree.
X_names = X.columns.tolist()

In [60]:
''' Split the data into training and test sets '''
from sklearn.model_selection import train_test_split as tts
# Hold out 33% of the rows for testing; the fixed seed keeps the partition the
# same on every run.
X_train, X_test, y_train, y_test = tts(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [61]:
''' Run CART algorithm '''
# random_state fixes how ties between equally good splits are broken, so the fitted
# tree is identical on every run. np.ravel flattens the single-column target frame
# into the 1-D array that fit() expects.
fire_cart = DecisionTreeClassifier(
    criterion="gini", max_leaf_nodes=10, random_state=42
).fit(X_train, np.ravel(y_train))

In [62]:
''' Obtain tree structure '''
# Write the fitted tree to a Graphviz .dot file, labelling nodes with the
# predictor names and the class names.
export_graphviz(
    fire_cart,
    out_file="./data/fire_cart.dot",
    class_names=y_names,
    feature_names=X_names,
)

In [74]:
''' Obtain predicted values '''
# Kept live (not commented out) and as the final expression, so re-running the
# cell regenerates the prediction array displayed as its output.
fire_cart.predict(X_train)

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

### CART Model for Predicting Wildfires with Humidity as a Predictor Variable

In [68]:
''' Fill missing humidity values with the mean '''
# NOTE(review): as with temp, the mean is taken over all rows before the split.
wf_sub['humidity'] = wf_sub['humidity'].fillna(value = wf_sub['humidity'].mean())

''' Input variables '''
X2 = pd.concat((wf_sub[['temp', 'humidity']], con_dum), axis=1)
# Take the names from X2 directly so they always agree with its column order and
# count; a hand-typed list can drift from the get_dummies output.
X2_names = X2.columns.tolist()

In [69]:
''' Split the data into training and test sets '''
from sklearn.model_selection import train_test_split as tts
# Same 33% hold-out and seed as the split without humidity.
X2_train, X2_test, y2_train, y2_test = tts(
    X2,
    y,
    random_state=42,
    test_size=0.33,
)

In [70]:
''' Run CART algorithm '''
# random_state makes the fitted tree reproducible across runs; np.ravel turns the
# single-column target frame into the 1-D array that fit() expects.
fire_cart2 = DecisionTreeClassifier(
    criterion="gini", max_leaf_nodes=10, random_state=42
).fit(X2_train, np.ravel(y2_train))

In [72]:
''' Obtain tree structure '''
# Export the humidity model (fire_cart2). Passing fire_cart here would write a copy
# of the first tree under the second tree's file name. Feature names come from the
# columns the model was trained on, so nodes are labelled by predictor rather than
# by anonymous x[i] placeholders.
export_graphviz(
    fire_cart2,
    out_file="./data/fire_cart2.dot",
    feature_names=X2_train.columns.tolist(),
    class_names=y_names,
)

### C5.0 Decision Tree without Humidity as a Predictor Variable

In [73]:
''' Build C5.0 Classifier '''
# NOTE: this is scikit-learn's DecisionTreeClassifier with the entropy criterion,
# used here in place of a C5.0 implementation. random_state makes the fitted tree
# reproducible; np.ravel passes the target as the 1-D array fit() expects.
c50 = DecisionTreeClassifier(
    criterion="entropy", max_leaf_nodes=20, random_state=42
).fit(X_train, np.ravel(y_train))

''' Obtain tree structure '''
export_graphviz(c50, out_file="./data/c50_n.dot", feature_names=X_names, class_names=y_names)

''' Print predicted values '''
# Kept live as the final expression so the prediction array shown under the cell
# is regenerated on re-run (otherwise the bare string above would be the output).
c50.predict(X_train)

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [76]:
''' Build C5.0 Classifier '''
# Entropy-criterion tree on the predictor set that includes humidity. random_state
# makes the fit reproducible; np.ravel passes the target as a 1-D array.
c50_h = DecisionTreeClassifier(
    criterion="entropy", max_leaf_nodes=20, random_state=42
).fit(X2_train, np.ravel(y2_train))

''' Obtain tree structure '''
# Label nodes with the training columns instead of anonymous x[i] placeholders.
export_graphviz(
    c50_h,
    out_file="./data/c50_n10_h.dot",
    feature_names=X2_train.columns.tolist(),
    class_names=y_names,
)

# Predictions are intentionally left disabled in this cell. If re-enabled, they
# must come from c50_h: c50 was fitted without the humidity column and cannot
# score X2_train.
# c50_h.predict(X2_train)