In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf

In [2]:
#  Import and read the all_data_merged.csv
fire_df = pd.read_csv("../all_data_merged.csv")
fire_df.head()

Unnamed: 0,County,Year,Name,County IDs,Latitude,Longitude,Started,Acres Burned,Cal Fire Incident,Major Incident,...,Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Avg Soil Temp (F),MWh,Ocean Proximity,Population,Households,Average Income,Average House Value
0,Humboldt,2019,Pilot Fire,12,40.62,-123.68,2019-01-01 14:14:00+00:00,30,t,f,...,88.760563,50.28169,5.209859,58.569718,791.36,NEAR OCEAN,806,270,30147,94600
1,Humboldt,2019,Pilot Fire,12,40.62,-123.68,2019-01-01 14:14:00+00:00,30,t,f,...,88.760563,50.28169,5.209859,58.569718,791.36,NEAR OCEAN,1194,465,25179,79000
2,Humboldt,2019,Pilot Fire,12,40.62,-123.68,2019-01-01 14:14:00+00:00,30,t,f,...,88.760563,50.28169,5.209859,58.569718,791.36,NEAR OCEAN,907,369,23571,111400
3,Humboldt,2019,Pilot Fire,12,40.62,-123.68,2019-01-01 14:14:00+00:00,30,t,f,...,88.760563,50.28169,5.209859,58.569718,791.36,NEAR OCEAN,434,187,19417,76100
4,Humboldt,2019,Pilot Fire,12,40.62,-123.68,2019-01-01 14:14:00+00:00,30,t,f,...,88.760563,50.28169,5.209859,58.569718,791.36,NEAR OCEAN,1152,435,30806,106700


In [3]:
# Convert the 'f' to False and 't' to True in Major Incident column
fire_df["Major Incident"] = fire_df["Major Incident"].replace('f', 'False')
fire_df["Major Incident"] = fire_df["Major Incident"].replace('t', 'True')
fire_df["Major Incident"]

0         False
1         False
2         False
3         False
4         False
          ...  
514049    False
514050    False
514051    False
514052    False
514053     True
Name: Major Incident, Length: 514054, dtype: object

In [4]:
# Encode Major Incident column so that it is in integer format
fire_df = pd.get_dummies(fire_df, columns=["Major Incident"])
fire_df.head()

Unnamed: 0,County,Year,Name,County IDs,Latitude,Longitude,Started,Acres Burned,Cal Fire Incident,ETo (in),...,Avg Wind Speed (mph),Avg Soil Temp (F),MWh,Ocean Proximity,Population,Households,Average Income,Average House Value,Major Incident_False,Major Incident_True
0,Humboldt,2019,Pilot Fire,12,40.62,-123.68,2019-01-01 14:14:00+00:00,30,t,12.21,...,5.209859,58.569718,791.36,NEAR OCEAN,806,270,30147,94600,1,0
1,Humboldt,2019,Pilot Fire,12,40.62,-123.68,2019-01-01 14:14:00+00:00,30,t,12.21,...,5.209859,58.569718,791.36,NEAR OCEAN,1194,465,25179,79000,1,0
2,Humboldt,2019,Pilot Fire,12,40.62,-123.68,2019-01-01 14:14:00+00:00,30,t,12.21,...,5.209859,58.569718,791.36,NEAR OCEAN,907,369,23571,111400,1,0
3,Humboldt,2019,Pilot Fire,12,40.62,-123.68,2019-01-01 14:14:00+00:00,30,t,12.21,...,5.209859,58.569718,791.36,NEAR OCEAN,434,187,19417,76100,1,0
4,Humboldt,2019,Pilot Fire,12,40.62,-123.68,2019-01-01 14:14:00+00:00,30,t,12.21,...,5.209859,58.569718,791.36,NEAR OCEAN,1152,435,30806,106700,1,0


In [5]:
# Keep the encoded Major Incident column where False=0 and True=1
fire_df = fire_df.drop(columns=["Major Incident_False"])
fire_df.head()

Unnamed: 0,County,Year,Name,County IDs,Latitude,Longitude,Started,Acres Burned,Cal Fire Incident,ETo (in),...,Dew Point (F),Avg Wind Speed (mph),Avg Soil Temp (F),MWh,Ocean Proximity,Population,Households,Average Income,Average House Value,Major Incident_True
0,Humboldt,2019,Pilot Fire,12,40.62,-123.68,2019-01-01 14:14:00+00:00,30,t,12.21,...,50.28169,5.209859,58.569718,791.36,NEAR OCEAN,806,270,30147,94600,0
1,Humboldt,2019,Pilot Fire,12,40.62,-123.68,2019-01-01 14:14:00+00:00,30,t,12.21,...,50.28169,5.209859,58.569718,791.36,NEAR OCEAN,1194,465,25179,79000,0
2,Humboldt,2019,Pilot Fire,12,40.62,-123.68,2019-01-01 14:14:00+00:00,30,t,12.21,...,50.28169,5.209859,58.569718,791.36,NEAR OCEAN,907,369,23571,111400,0
3,Humboldt,2019,Pilot Fire,12,40.62,-123.68,2019-01-01 14:14:00+00:00,30,t,12.21,...,50.28169,5.209859,58.569718,791.36,NEAR OCEAN,434,187,19417,76100,0
4,Humboldt,2019,Pilot Fire,12,40.62,-123.68,2019-01-01 14:14:00+00:00,30,t,12.21,...,50.28169,5.209859,58.569718,791.36,NEAR OCEAN,1152,435,30806,106700,0


In [6]:
# fire_df = fire_df.rename(columns={"Major Incident_True":"Major Incident"}, inplace=True)

In [8]:
# Checking data types
fire_df.dtypes

County                   object
Year                      int64
Name                     object
County IDs                int64
Latitude                float64
Longitude               float64
Started                  object
Acres Burned              int64
Cal Fire Incident        object
ETo (in)                float64
Precip (in)             float64
Sol Rad (Ly/day)        float64
Avg Vap Pres (mBars)    float64
Max Air Temp (F)        float64
Min Air Temp (F)        float64
Avg Air Temp (F)        float64
Max Rel Hum (%)           int64
Min Rel Hum (%)           int64
Avg Rel Hum (%)         float64
Dew Point (F)           float64
Avg Wind Speed (mph)    float64
Avg Soil Temp (F)       float64
MWh                     float64
Ocean Proximity          object
Population                int64
Households                int64
Average Income            int64
Average House Value       int64
Major Incident_True       uint8
dtype: object

In [9]:
# Drop the non-beneficial ID columns, 'Name'.
fire_df = fire_df.drop(columns=["Name", "Started"])

fire_df

Unnamed: 0,County,Year,County IDs,Latitude,Longitude,Acres Burned,Cal Fire Incident,ETo (in),Precip (in),Sol Rad (Ly/day),...,Dew Point (F),Avg Wind Speed (mph),Avg Soil Temp (F),MWh,Ocean Proximity,Population,Households,Average Income,Average House Value,Major Incident_True
0,Humboldt,2019,12,40.62,-123.68,30,t,12.21,12.07,314.394366,...,50.281690,5.209859,58.569718,791.36,NEAR OCEAN,806,270,30147,94600,0
1,Humboldt,2019,12,40.62,-123.68,30,t,12.21,12.07,314.394366,...,50.281690,5.209859,58.569718,791.36,NEAR OCEAN,1194,465,25179,79000,0
2,Humboldt,2019,12,40.62,-123.68,30,t,12.21,12.07,314.394366,...,50.281690,5.209859,58.569718,791.36,NEAR OCEAN,907,369,23571,111400,0
3,Humboldt,2019,12,40.62,-123.68,30,t,12.21,12.07,314.394366,...,50.281690,5.209859,58.569718,791.36,NEAR OCEAN,434,187,19417,76100,0
4,Humboldt,2019,12,40.62,-123.68,30,t,12.21,12.07,314.394366,...,50.281690,5.209859,58.569718,791.36,NEAR OCEAN,1152,435,30806,106700,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514049,San Bernardino,2013,36,34.29,-117.45,110,f,255.90,17.67,474.436845,...,28.968392,4.885777,56.778815,14375.33,INLAND,1015,472,14936,66900,0
514050,San Bernardino,2013,36,34.20,-117.42,200,f,255.90,17.67,474.436845,...,28.968392,4.885777,56.778815,14375.33,INLAND,1015,472,14936,66900,0
514051,San Bernardino,2013,36,34.35,-117.62,243,f,255.90,17.67,474.436845,...,28.968392,4.885777,56.778815,14375.33,INLAND,1015,472,14936,66900,0
514052,San Bernardino,2013,36,34.30,-117.60,413,f,255.90,17.67,474.436845,...,28.968392,4.885777,56.778815,14375.33,INLAND,1015,472,14936,66900,0


In [10]:
# Determine the number of unique values in each column.
fire_df.nunique()

County                    55
Year                       7
County IDs                55
Latitude                 716
Longitude                581
Acres Burned             616
Cal Fire Incident          2
ETo (in)                 253
Precip (in)              255
Sol Rad (Ly/day)         256
Avg Vap Pres (mBars)     256
Max Air Temp (F)         142
Min Air Temp (F)         150
Avg Air Temp (F)         256
Max Rel Hum (%)            6
Min Rel Hum (%)           23
Avg Rel Hum (%)          256
Dew Point (F)            256
Avg Wind Speed (mph)     256
Avg Soil Temp (F)        256
MWh                      313
Ocean Proximity            5
Population              4942
Households              2430
Average Income          9923
Average House Value     5400
Major Incident_True        2
dtype: int64

In [11]:
# Generate our categorical variable lists
fire_cat = fire_df.dtypes[fire_df.dtypes == "object"].index.tolist()

fire_cat

['County', 'Cal Fire Incident', 'Ocean Proximity']

In [12]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(fire_df[fire_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(fire_cat)
encode_df.head()

Unnamed: 0,County_Alameda,County_Alpine,County_Amador,County_Butte,County_Calaveras,County_Colusa,County_Contra Costa,County_El Dorado,County_Fresno,County_Glenn,...,County_Ventura,County_Yolo,County_Yuba,Cal Fire Incident_f,Cal Fire Incident_t,Ocean Proximity_<1H OCEAN,Ocean Proximity_INLAND,Ocean Proximity_ISLAND,Ocean Proximity_NEAR BAY,Ocean Proximity_NEAR OCEAN
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [13]:
# Merge one-hot encoded features and drop the originals
fire_df = fire_df.merge(encode_df,left_index=True, right_index=True)
fire_df = fire_df.drop(fire_cat,1)
fire_df

Unnamed: 0,Year,County IDs,Latitude,Longitude,Acres Burned,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),...,County_Ventura,County_Yolo,County_Yuba,Cal Fire Incident_f,Cal Fire Incident_t,Ocean Proximity_<1H OCEAN,Ocean Proximity_INLAND,Ocean Proximity_ISLAND,Ocean Proximity_NEAR BAY,Ocean Proximity_NEAR OCEAN
0,2019,12,40.62,-123.68,30,12.21,12.07,314.394366,12.729577,78.2,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,2019,12,40.62,-123.68,30,12.21,12.07,314.394366,12.729577,78.2,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,2019,12,40.62,-123.68,30,12.21,12.07,314.394366,12.729577,78.2,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,2019,12,40.62,-123.68,30,12.21,12.07,314.394366,12.729577,78.2,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,2019,12,40.62,-123.68,30,12.21,12.07,314.394366,12.729577,78.2,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514049,2013,36,34.29,-117.45,110,255.90,17.67,474.436845,6.232580,119.1,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
514050,2013,36,34.20,-117.42,200,255.90,17.67,474.436845,6.232580,119.1,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
514051,2013,36,34.35,-117.62,243,255.90,17.67,474.436845,6.232580,119.1,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
514052,2013,36,34.30,-117.60,413,255.90,17.67,474.436845,6.232580,119.1,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [14]:
# Split our preprocessed data into our features and target arrays
y = fire_df["Major Incident_True"].values
X = fire_df.drop(columns=["Major Incident_True"], axis=1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [15]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  20
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 20)                1720      
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 105       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 6         
Total params: 1,831
Trainable params: 1,831
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [18]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [19]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

4017/4017 - 3s - loss: 0.1509 - accuracy: 0.9298
Loss: 0.15091753005981445, Accuracy: 0.9297664165496826


In [None]:
# # get importance
# importance = fit_model.coef_
# # summarize feature importance
# for i,v in enumerate(importance):
# 	print('Feature: %0d, Score: %.5f' % (i,v))
# # plot feature importance
# pyplot.bar([x for x in range(len(importance))], importance)
# pyplot.show()

# Trying out Logistic Regression Model (Just in Case)

In [20]:
y = fire_df["Major Incident_True"]
X = fire_df.drop(columns="Major Incident_True")

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(385540, 85)

In [22]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=1000,
                                random_state=1)

In [23]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=1)

In [24]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,1
1,0,0
2,0,0
3,0,0
4,0,0
5,0,1
6,0,1
7,0,0
8,0,0
9,0,1


In [25]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.8243070793843472


In [26]:
from sklearn.metrics import confusion_matrix, classification_report

In [27]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[104094    626]
 [ 21953   1841]]


In [28]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.83      0.99      0.90    104720
           1       0.75      0.08      0.14     23794

    accuracy                           0.82    128514
   macro avg       0.79      0.54      0.52    128514
weighted avg       0.81      0.82      0.76    128514



In [None]:
# # get importance
# importance = fit_model.coef_
# # summarize feature importance
# for i,v in enumerate(importance):
# 	print('Feature: %0d, Score: %.5f' % (i,v))
# # plot feature importance
# pyplot.bar([x for x in range(len(importance))], importance)
# pyplot.show()