In [1]:
''' Dependencies '''
import pandas as pd
import numpy as np
import statsmodels.tools.tools as stattools
from sklearn.tree import DecisionTreeClassifier, export_graphviz

''' Suppress Warnings '''
import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


In [2]:
wildfires = pd.read_csv(r'./data/df_Cleaned.csv')
wildfires.head()

Unnamed: 0,OBJECTID,Date_Start,Date_Finish,Acres,FireCause,Lat,Long,DispatchCenterID,PredominantFuelGroup,State,...,tempmin,temp,humidity,precip,windspeed,pressure,visibility,solarradiation,conditions,FireOccured
0,127.0,10/18/2020,3/10/2021,562.913504,Unknown,36.07114,-121.4505,CALPCC,Unknown,US-CA,...,49.9,58.8,67.29,0.0,12.7,1015.8,9.2,515.4,Clear,1
1,128.0,5/1/2020,5/15/2020,0.15168,Unknown,39.55669,-119.5585,NVSFC,Grass,US-NV,...,41.2,60.6,25.39,0.0,18.6,1015.5,9.9,510.1,Clear,1
2,129.0,8/8/2020,8/20/2020,0.3,Human,33.29384,-110.45,AZPHC,Grass-Shrub,US-AZ,...,,,,,,,,,,1
3,130.0,5/8/2020,5/26/2020,44.300517,Human,35.87582,-115.2041,NVLIC,Grass-Shrub,US-NV,...,58.6,76.9,11.12,0.0,6.9,1011.0,9.9,482.7,Clear,1
4,133.0,8/21/2020,8/22/2020,4.0,Human,44.035131,-103.036037,SDGPC,Grass,US-SD,...,33.3,47.4,59.35,0.07,20.1,1017.0,9.8,336.0,"Rain, Partially cloudy",1


In [3]:
wildfires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21404 entries, 0 to 21403
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   OBJECTID              7125 non-null   float64
 1   Date_Start            21404 non-null  object 
 2   Date_Finish           7125 non-null   object 
 3   Acres                 7125 non-null   float64
 4   FireCause             7125 non-null   object 
 5   Lat                   21404 non-null  float64
 6   Long                  21404 non-null  float64
 7   DispatchCenterID      7122 non-null   object 
 8   PredominantFuelGroup  7125 non-null   object 
 9   State                 21404 non-null  object 
 10  Fire_Duration         6056 non-null   float64
 11  Fire_Duration_Group   6056 non-null   object 
 12  Address               21404 non-null  object 
 13  URLAddress            21404 non-null  object 
 14  tempmax               20874 non-null  float64
 15  tempmin            

In [4]:
''' Subset of wildfires for input and output variables '''
wf_sub = wildfires.filter(['temp', 'humidity', 'conditions', 'FireOccured'], axis=1)
wf_sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21404 entries, 0 to 21403
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   temp         20874 non-null  float64
 1   humidity     20843 non-null  float64
 2   conditions   20874 non-null  object 
 3   FireOccured  21404 non-null  int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 669.0+ KB


### CART Model for Predicting Wildfires w/o Humidity as a Predictor Variable

In [5]:
''' Output variable and names '''
y = wf_sub[['FireOccured']]
y_names = ["yes", "no"]

''' Fill missing temperature values with the mean '''
wf_sub['temp'] = wf_sub['temp'].fillna(value = wf_sub['temp'].mean())

''' Converting categorical variables to dummy variable form '''
con_dum = pd.get_dummies(wf_sub['conditions'])

''' Input variables '''
X = pd.concat((wf_sub[['temp']], con_dum), axis=1)
X_names = ["temp", "Clear", "Rain, Partially cloudy", "Partially cloudy", "Overcast", "Rain", "Rain, Overcast", 
           "Snow, Partially cloudy", "Snow, Overcast", "Snow"]

In [6]:
''' Split the data into training and test sets '''
from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3, random_state=42)

In [7]:
''' Run CART algortihm '''
fire_cart = DecisionTreeClassifier(criterion="gini", max_leaf_nodes=10).fit(X_train, y_train)

In [8]:
''' Obtain tree structure '''
export_graphviz(fire_cart, out_file="./data/fire_cart.dot", feature_names=X_names, class_names=y_names)

In [25]:
''' View Model '''
fire_cart

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=10,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

### CART Model for Predicting Wildfires w Humidity as a Predictor Variable

In [10]:
''' Fill missing humidity values with the mean '''
wf_sub['humidity'] = wf_sub['humidity'].fillna(value = wf_sub['humidity'].mean())

''' Input variables '''
X2 = pd.concat((wf_sub[['temp', 'humidity']], con_dum), axis=1)
X2_names = ["temp", "humidity", "Clear", "Rain, Partially cloudy", "Partially cloudy", "Overcast", "Rain", 
           "Rain, Overcast", "Snow, Partially cloudy", "Snow, Overcast", "Snow"]

In [11]:
''' Split the data into training and test sets '''
from sklearn.model_selection import train_test_split as tts
X2_train, X2_test, y2_train, y2_test = tts(X2, y, test_size=0.3, random_state=42)

In [12]:
''' Run CART algortihm '''
fire_cart2 = DecisionTreeClassifier(criterion="gini", max_leaf_nodes=10).fit(X2_train, y2_train)

In [13]:
''' Obtain tree structure '''
export_graphviz(fire_cart, out_file="./data/fire_cart2.dot", feature_names=None, class_names=y_names)

In [26]:
''' View Model '''
fire_cart2

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=10,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

### C5.0 Decision Tree w/o Humidity as Predictor Variable

In [35]:
''' Build C5.0 Classifier '''
c50 = DecisionTreeClassifier(criterion="entropy", max_leaf_nodes=10).fit(X_train, y_train)

''' Obtain tree structure '''
export_graphviz(c50, out_file="./data/c50_n10.dot", feature_names=X_names, class_names=y_names)

''' Print predicted values '''
# c50.predict(X_train)

''' View Model '''
c50

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=10,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

### C5.0 Decision Tree w/ Humidity as Predictor Variable

In [36]:
''' Build C5.0 Classifier '''
c50_h = DecisionTreeClassifier(criterion="entropy", max_leaf_nodes=10).fit(X2_train, y2_train)

''' Obtain tree structure '''
export_graphviz(c50_h, out_file="./data/c50_n10_h.dot", feature_names=None, class_names=y_names)

''' Print predicted values '''
# c50.predict(X2_train)

''' View Model '''
c50_h

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=10,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [17]:
''' Create confusion matrix '''
from sklearn.metrics import confusion_matrix

conmat = confusion_matrix(y2_train, c50_h.predict(X2_train))
conmat

array([[9322,  661],
       [2909, 2090]], dtype=int64)

### C5.0 Decision Tree Model Evaluation 

In [33]:
''' Model Evaluation w'''
true_pos = conmat[0][0] # True positives
true_neg = conmat[1][1] # False negatives
f_pos = conmat[0][1] # False positives
f_neg = conmat[1][0] # False negatives

precision = true_pos / (true_pos + f_pos)
recall = f_pos / (f_pos + true_neg)
sensitivity = true_pos / (true_pos + f_neg)
specificity = true_neg / (true_neg + f_pos)

f1 = 2 * (precision * recall) / (precision + recall)
f2 = 5 * (precision * recall) / ((4 * precision) + recall)
f05 = 1.25 * (precision * recall) / ((0.25 * precision) + recall)

accuracy = (true_pos + true_neg) / sum([i + j for (i, j) in zip(conmat[0], conmat[1])])
error = 1 - accuracy
f_pos_rate = f_pos / (f_pos + true_neg)
f_neg_rate = f_neg / (f_neg + true_pos)

In [34]:
''' Model Metrics '''
c50_met = {
    "Accuracy"    : [round(accuracy, 3)],
    "Sensitivity" : [round(sensitivity, 3)],
    "Specificity" : [round(specificity, 3)],
    "F1-Measure"  : [round(f1, 3)]}
c50met_df = pd.DataFrame(data = c50_met)
c50met_df.T

Unnamed: 0,0
Accuracy,0.762
Sensitivity,0.762
Specificity,0.76
F1-Measure,0.382


### CART Decision Tree Model Evaluation

In [20]:
cmat_cart = confusion_matrix(y2_train, fire_cart2.predict(X2_train))
cmat_cart

array([[8904, 1079],
       [2518, 2481]], dtype=int64)

In [31]:
''' Model Evaluation w'''
tp = cmat_cart[0][0] # True positives
tn = cmat_cart[1][1] # False negatives
fp = cmat_cart[0][1] # False positives
fn = cmat_cart[1][0] # False negatives

prec = tp / (tp + fp) # Precision
reca = fp / (fp + tn)  # Recall
sens = tp / (tp + fn) # Sensitivity
spec = tn / (tn + fp) # Specificity

f1 = 2 * (prec * reca) / (prec + reca)
f2 = 5 * (prec * reca) / ((4 * prec) + reca)
f05 = 1.25 * (prec * reca) / ((0.25 * prec) + reca)

acc = (tp + tn) / sum([i + j for (i, j) in zip(cmat_cart[0], cmat_cart[1])])
err = 1 - accuracy
fp_rate = fp / (fp + tn)
fn_rate = fn / (fn + tp)

In [32]:
''' Model Metrics '''
cart_met = {
    "Accuracy"    : [round(acc, 3)],
    "Sensitivity" : [round(sens, 3)],
    "Specificity" : [round(spec, 3)],
    "F1-Measure"  : [round(f1, 3)]}
cartmet_df = pd.DataFrame(data = cart_met)
cartmet_df.T

Unnamed: 0,0
Accuracy,0.76
Sensitivity,0.78
Specificity,0.697
F1-Measure,0.452
