# Machine Learning Attempts

-------------------------------------

### Neural Networking

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

import pandas as pd
from pathlib import Path



In [2]:
# Import our input dataset
data = Path('Resources_k/fire_size_bins_new.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,fire_size,fire_cause,latitude,longitude,state,discovery_month,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,year,putout_time,fire_size_bin,fire_size_bin_no
0,3,1.0,Debris Burning,39.6414,-119.3083,NV,Jun,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,2005,0,Teacup,1
1,24,40.0,Arson,31.435181,-88.999489,MS,Apr,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,1999,0,Mini,3
2,31,1.2,Debris Burning,48.833,-99.7836,ND,Apr,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,2007,0,Teacup,1
3,35,30.18,Debris Burning,31.259,-84.8956,GA,Oct,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,2000,0,Mini,3
4,36,1420.0,Lightning,33.2418,-104.9122,NM,Jul,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,1994,1,Large,5


In [3]:
df=df.drop(columns=["fire_id", "fire_size", "latitude", "longitude", "fire_size_bin", "year", "fire_cause", "putout_time"])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
df[cat].nunique()

discovery_month    12
state              45
dtype: int64

In [4]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Merge one-hot encoded features and drop the originals
df= df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,fire_size_bin_no,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Remove loan status target from features data
target=df.fire_size_bin_no
y=target
# X = df.loc[:, df.columns !=target]
X=df.drop(columns=["fire_size_bin_no", ]).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 180
hidden_nodes_layer2 = 120
# hidden_nodes_layer3 = 11


nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 180)               12060     
                                                                 
 dense_1 (Dense)             (None, 120)               21720     
                                                                 
 dense_2 (Dense)             (None, 1)                 121       
                                                                 
Total params: 33,901
Trainable params: 33,901
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Compile the model
nn.compile(loss='categorical_crossentropy', optimizer="adam", metrics=["accuracy"])

In [9]:
## Import checkpoint dependencies
# import os
# from tensorflow.keras.callbacks import ModelCheckpoint

# # Define the checkpoint path and filenames
# os.makedirs("checkpoints/",exist_ok=True)
# checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

# # Create a callback that saves the model's weights every 5 epoch
# cp_callback = ModelCheckpoint(
#     filepath=checkpoint_path,
#     verbose=5,
#     save_weights_only=True,
#     save_freq='epoch')

# Train the model
# fit_model = nn.fit(X_train_scaled,y_train,epochs=100,callbacks=[cp_callback])
fit_model = nn.fit(X_train_scaled,y_train,epochs=50)

# # Restore the model weights
# nn.load_weights("checkpoints/weights.100.hdf5")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [10]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

103/103 - 0s - loss: 0.0000e+00 - accuracy: 0.4645 - 264ms/epoch - 3ms/step
Loss: 0.0, Accuracy: 0.46453577280044556


---------------------

------------------------------

### Random Forest Classifier

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report

from pathlib import Path


# Import our input dataset
data = Path('Resources_k/fire_size_bins_new.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,fire_size,fire_cause,latitude,longitude,state,discovery_month,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,year,putout_time,fire_size_bin,fire_size_bin_no
0,3,1.0,Debris Burning,39.6414,-119.3083,NV,Jun,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,2005,0,Teacup,1
1,24,40.0,Arson,31.435181,-88.999489,MS,Apr,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,1999,0,Mini,3
2,31,1.2,Debris Burning,48.833,-99.7836,ND,Apr,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,2007,0,Teacup,1
3,35,30.18,Debris Burning,31.259,-84.8956,GA,Oct,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,2000,0,Mini,3
4,36,1420.0,Lightning,33.2418,-104.9122,NM,Jul,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,1994,1,Large,5


In [12]:
df=df.drop(columns=["fire_id", "fire_size", "latitude", "longitude", "fire_size_bin", "year", "fire_cause", "putout_time"])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
df[cat].nunique()

discovery_month    12
state              45
dtype: int64

In [13]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,fire_size_bin_no,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Remove loan status target from features data
target=df.fire_size_bin_no
y=target
#X = df.loc[:, df.columns !=target]
X=df.copy()
X=df.drop(columns=['fire_size_bin_no'])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
model = RandomForestClassifier(n_estimators=100, random_state=78) 

#Fitting the model
model = model.fit(X_train, y_train)

# Making predictions using the testing data.
y_pred = model.predict(X_test)

In [17]:
# Calculating the accuracy score.
acc_score = balanced_accuracy_score(y_test, y_pred)
acc_score

0.3041337607874988

In [18]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual Teacup", "Actual Toy", "Actual Mini", "Actual Medium", "Actual Large", "Actual XL"], columns=["Predicted Teacup", "Predicted Toy", "Predicted Mini", "Predicted Medium", "Predicted Large", "Predicted XL"]
)

In [19]:
# Displaying Easy Ensemble Classifier
print("Random Forest Clasifier")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Forest Clasifier
Confusion Matrix


Unnamed: 0,Predicted Teacup,Predicted Toy,Predicted Mini,Predicted Medium,Predicted Large,Predicted XL
Actual Teacup,1302,18,82,18,74,32
Actual Toy,255,14,29,8,14,9
Actual Mini,330,11,46,13,33,12
Actual Medium,180,8,26,18,32,6
Actual Large,153,6,12,9,134,95
Actual XL,82,2,3,3,84,132


Accuracy Score : 0.3041337607874988
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          1       0.57      0.85      0.43      0.68      0.61      0.38      1526
          2       0.24      0.04      0.98      0.07      0.20      0.04       329
          3       0.23      0.10      0.95      0.14      0.31      0.09       445
          4       0.26      0.07      0.98      0.11      0.26      0.06       270
          5       0.36      0.33      0.92      0.34      0.55      0.28       409
          6       0.46      0.43      0.95      0.45      0.64      0.39       306

avg / total       0.43      0.50      0.71      0.44      0.49      0.27      3285



In [20]:
#Calulate Feature Importances
importances = model.feature_importances_

In [21]:
# We can sort the features by their importance.
sorted(zip(model.feature_importances_, X.columns), reverse=True)

[(0.09734946962964271, 'Hum_pre_30'),
 (0.09721904783368264, 'Hum_pre_7'),
 (0.0953652471209638, 'Hum_pre_15'),
 (0.0879341354735827, 'Temp_pre_30'),
 (0.08744500212784666, 'Temp_pre_7'),
 (0.08725379614202776, 'Temp_pre_15'),
 (0.08415483264604201, 'Wind_pre_30'),
 (0.08361131372711372, 'Wind_pre_7'),
 (0.08315435270824707, 'Wind_pre_15'),
 (0.021511351500400986, 'state_AK'),
 (0.00912115524455081, 'discovery_month_Jun'),
 (0.007683323472355947, 'discovery_month_Jul'),
 (0.0074070878271044445, 'discovery_month_Aug'),
 (0.0069676378676676995, 'state_NY'),
 (0.006916376408601078, 'state_GA'),
 (0.006613972819465441, 'state_ID'),
 (0.006519575222277322, 'discovery_month_Mar'),
 (0.006347736403758375, 'discovery_month_Apr'),
 (0.006002206614099295, 'state_OK'),
 (0.005564395710539236, 'state_CA'),
 (0.005426872652731194, 'discovery_month_Sep'),
 (0.005279539754899029, 'discovery_month_May'),
 (0.004906719031606131, 'state_AZ'),
 (0.004737255685218033, 'state_MS'),
 (0.0046019315979311005,

----------------------------

--------------------------

### Balanced Random Forest Classifier

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder
from pathlib import Path

from imblearn.ensemble import BalancedRandomForestClassifier


# Import our input dataset
data = Path('Resources_k/fire_size_bins_new.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,fire_size,fire_cause,latitude,longitude,state,discovery_month,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,year,putout_time,fire_size_bin,fire_size_bin_no
0,3,1.0,Debris Burning,39.6414,-119.3083,NV,Jun,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,2005,0,Teacup,1
1,24,40.0,Arson,31.435181,-88.999489,MS,Apr,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,1999,0,Mini,3
2,31,1.2,Debris Burning,48.833,-99.7836,ND,Apr,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,2007,0,Teacup,1
3,35,30.18,Debris Burning,31.259,-84.8956,GA,Oct,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,2000,0,Mini,3
4,36,1420.0,Lightning,33.2418,-104.9122,NM,Jul,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,1994,1,Large,5


In [23]:
df=df.drop(columns=["fire_id", "fire_size", "latitude", "longitude", "fire_size_bin", "year", "fire_cause", "putout_time"])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
df[cat].nunique()

discovery_month    12
state              45
dtype: int64

In [24]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,fire_size_bin_no,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Remove loan status target from features data
target=df.fire_size_bin_no
y=target
#X = df.loc[:, df.columns !=target]
X=df.copy()
X=df.drop(columns=['fire_size_bin_no'])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [27]:
model = BalancedRandomForestClassifier(n_estimators=100, random_state=78) 

#Fitting the model
model = model.fit(X_train, y_train)

# Making predictions using the testing data.
y_pred = model.predict(X_test)

In [28]:
# Calculating the accuracy score.
acc_score = balanced_accuracy_score(y_test, y_pred)
acc_score

0.3186823202042197

In [29]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual Teacup", "Actual Toy", "Actual Mini", "Actual Medium", "Actual Large", "Actual XL"], columns=["Predicted Teacup", "Predicted Toy", "Predicted Mini", "Predicted Medium", "Predicted Large", "Predicted XL"]
)

In [30]:
# Displaying Easy Ensemble Classifier
print("Random Forest Clasifier")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Forest Clasifier
Confusion Matrix


Unnamed: 0,Predicted Teacup,Predicted Toy,Predicted Mini,Predicted Medium,Predicted Large,Predicted XL
Actual Teacup,474,321,285,177,167,102
Actual Toy,67,70,76,61,37,18
Actual Mini,75,101,117,68,59,25
Actual Medium,32,62,52,51,55,18
Actual Large,14,20,17,45,168,145
Actual XL,8,9,4,22,102,161


Accuracy Score : 0.3186823202042197
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          1       0.71      0.31      0.89      0.43      0.53      0.26      1526
          2       0.12      0.21      0.83      0.15      0.42      0.17       329
          3       0.21      0.26      0.85      0.23      0.47      0.21       445
          4       0.12      0.19      0.88      0.15      0.41      0.15       270
          5       0.29      0.41      0.85      0.34      0.59      0.34       409
          6       0.34      0.53      0.90      0.42      0.69      0.45       306

avg / total       0.45      0.32      0.87      0.34      0.52      0.26      3285



-------------------------

----------------------------------

### Easy Ensemble Classifier

In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder
from pathlib import Path

from imblearn.ensemble import EasyEnsembleClassifier

# Import our input dataset
data = Path('Resources_k/fire_size_bins_new.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,fire_size,fire_cause,latitude,longitude,state,discovery_month,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,year,putout_time,fire_size_bin,fire_size_bin_no
0,3,1.0,Debris Burning,39.6414,-119.3083,NV,Jun,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,2005,0,Teacup,1
1,24,40.0,Arson,31.435181,-88.999489,MS,Apr,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,1999,0,Mini,3
2,31,1.2,Debris Burning,48.833,-99.7836,ND,Apr,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,2007,0,Teacup,1
3,35,30.18,Debris Burning,31.259,-84.8956,GA,Oct,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,2000,0,Mini,3
4,36,1420.0,Lightning,33.2418,-104.9122,NM,Jul,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,1994,1,Large,5


In [32]:
df=df.drop(columns=["fire_id", "fire_size", "latitude", "longitude", "fire_size_bin", "year", "fire_cause", "putout_time"])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
df[cat].nunique()

discovery_month    12
state              45
dtype: int64

In [33]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,fire_size_bin_no,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Remove loan status target from features data
target=df.fire_size_bin_no
y=target
# X = df.loc[:, df.columns !=target]
X=df.drop(columns=['fire_size_bin_no']).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [36]:
# Train the EasyEnsembleClassifier

model = EasyEnsembleClassifier(n_estimators=100, random_state=78) 

#Fitting the model
model = model.fit(X_train, y_train)

# Making predictions using the testing data.
y_pred = model.predict(X_test)

In [37]:
# Calculated the balanced accuracy score
acc_score = balanced_accuracy_score(y_test, y_pred)
acc_score

0.3278717468354149

In [38]:

# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
cm, index=["Actual Teacup", "Actual Toy", "Actual Mini", "Actual Medium", "Actual Large", "Actual XL"], columns=["Predicted Teacup", "Predicted Toy", "Predicted Mini", "Predicted Medium", "Predicted Large", "Predicted XL"]
)

In [39]:
# Displaying Easy Ensemble Classifier
print("Balanced Random Forest Clasifier")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Balanced Random Forest Clasifier
Confusion Matrix


Unnamed: 0,Predicted Teacup,Predicted Toy,Predicted Mini,Predicted Medium,Predicted Large,Predicted XL
Actual Teacup,564,129,296,178,239,86
Actual Toy,88,34,91,45,36,22
Actual Mini,95,47,150,80,71,34
Actual Medium,40,23,56,63,67,34
Actual Large,13,5,6,38,187,153
Actual XL,14,2,3,18,127,151


Accuracy Score : 0.3278717468354149
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          1       0.69      0.38      0.86      0.49      0.57      0.31      1492
          2       0.14      0.11      0.93      0.12      0.32      0.09       316
          3       0.25      0.31      0.84      0.28      0.51      0.25       477
          4       0.15      0.22      0.88      0.18      0.44      0.18       283
          5       0.26      0.47      0.81      0.33      0.61      0.36       402
          6       0.31      0.48      0.89      0.38      0.65      0.41       315

avg / total       0.44      0.35      0.86      0.37      0.54      0.29      3285



--------------------------------------

---------------------------

### Naive Random Oversampling

In [40]:
# Import our input dataset
data = Path('Resources_k/fire_size_bins_new.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,fire_size,fire_cause,latitude,longitude,state,discovery_month,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,year,putout_time,fire_size_bin,fire_size_bin_no
0,3,1.0,Debris Burning,39.6414,-119.3083,NV,Jun,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,2005,0,Teacup,1
1,24,40.0,Arson,31.435181,-88.999489,MS,Apr,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,1999,0,Mini,3
2,31,1.2,Debris Burning,48.833,-99.7836,ND,Apr,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,2007,0,Teacup,1
3,35,30.18,Debris Burning,31.259,-84.8956,GA,Oct,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,2000,0,Mini,3
4,36,1420.0,Lightning,33.2418,-104.9122,NM,Jul,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,1994,1,Large,5


In [41]:
df=df.drop(columns=["fire_id", "fire_size", "latitude", "longitude", "fire_size_bin", "year", "fire_cause", "putout_time"])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
df[cat].nunique()

discovery_month    12
state              45
dtype: int64

In [42]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,fire_size_bin_no,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Remove loan status target from features data
target=df.fire_size_bin_no
y=target
# X = df.loc[:, df.columns !=target]
X=df.drop(columns=['fire_size_bin_no']).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [45]:
import warnings
warnings.filterwarnings('ignore')

from collections import Counter

# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({2: 4463, 1: 4463, 6: 4463, 3: 4463, 4: 4463, 5: 4463})

In [46]:
# Logistic regression using random oversampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [47]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
acc_score= balanced_accuracy_score(y_test, y_pred)

In [48]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual Teacup", "Actual Toy", "Actual Mini", "Actual Medium", "Actual Large", "Actual XL"], columns=["Predicted Teacup", "Predicted Toy", "Predicted Mini", "Predicted Medium", "Predicted Large", "Predicted XL"]
)

In [49]:
# Displaying Random Oversampling results
print("Naive Random Oversampling")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Naive Random Oversampling
Confusion Matrix


Unnamed: 0,Predicted Teacup,Predicted Toy,Predicted Mini,Predicted Medium,Predicted Large,Predicted XL
Actual Teacup,353,129,557,76,191,186
Actual Toy,58,27,138,19,38,36
Actual Mini,59,60,184,42,70,62
Actual Medium,34,23,105,25,60,36
Actual Large,20,12,28,23,188,131
Actual XL,14,4,17,12,135,133


Accuracy Score : 0.2810009305696014
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          1       0.66      0.24      0.90      0.35      0.46      0.20      1492
          2       0.11      0.09      0.92      0.09      0.28      0.07       316
          3       0.18      0.39      0.70      0.24      0.52      0.26       477
          4       0.13      0.09      0.94      0.10      0.29      0.08       283
          5       0.28      0.47      0.83      0.35      0.62      0.37       402
          6       0.23      0.42      0.85      0.30      0.60      0.34       315

avg / total       0.40      0.28      0.86      0.28      0.47      0.22      3285



### SMOTE Oversampling

In [50]:
# Import our input dataset
data = Path('Resources_k/fire_size_bins_new.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,fire_size,fire_cause,latitude,longitude,state,discovery_month,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,year,putout_time,fire_size_bin,fire_size_bin_no
0,3,1.0,Debris Burning,39.6414,-119.3083,NV,Jun,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,2005,0,Teacup,1
1,24,40.0,Arson,31.435181,-88.999489,MS,Apr,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,1999,0,Mini,3
2,31,1.2,Debris Burning,48.833,-99.7836,ND,Apr,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,2007,0,Teacup,1
3,35,30.18,Debris Burning,31.259,-84.8956,GA,Oct,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,2000,0,Mini,3
4,36,1420.0,Lightning,33.2418,-104.9122,NM,Jul,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,1994,1,Large,5


In [51]:
df=df.drop(columns=["fire_id", "fire_size", "latitude", "longitude", "fire_size_bin", "year", "fire_cause", "putout_time"])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
df[cat].nunique()

discovery_month    12
state              45
dtype: int64

In [52]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

Unnamed: 0,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,fire_size_bin_no,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
# Remove loan status target from features data
target=df.fire_size_bin_no
y=target
# X = df.loc[:, df.columns !=target]
X=df.drop(columns=['fire_size_bin_no']).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [55]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({2: 4463, 1: 4463, 6: 4463, 3: 4463, 4: 4463, 5: 4463})

In [56]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [57]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
acc_score=balanced_accuracy_score(y_test, y_pred)

In [58]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual Teacup", "Actual Toy", "Actual Mini", "Actual Medium", "Actual Large", "Actual XL"], columns=["Predicted Teacup", "Predicted Toy", "Predicted Mini", "Predicted Medium", "Predicted Large", "Predicted XL"]
)

In [59]:
# Displaying SMOTE results
print("SMOTE Oversampling")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

SMOTE Oversampling
Confusion Matrix


Unnamed: 0,Predicted Teacup,Predicted Toy,Predicted Mini,Predicted Medium,Predicted Large,Predicted XL
Actual Teacup,563,149,222,221,157,180
Actual Toy,101,36,62,48,32,37
Actual Mini,106,43,111,97,58,62
Actual Medium,64,40,44,43,54,38
Actual Large,33,13,13,40,158,145
Actual XL,23,8,3,25,117,139


Accuracy Score : 0.2850370712817784
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          1       0.63      0.38      0.82      0.47      0.56      0.29      1492
          2       0.12      0.11      0.91      0.12      0.32      0.10       316
          3       0.24      0.23      0.88      0.24      0.45      0.19       477
          4       0.09      0.15      0.86      0.11      0.36      0.12       283
          5       0.27      0.39      0.86      0.32      0.58      0.32       402
          6       0.23      0.44      0.84      0.30      0.61      0.36       315

avg / total       0.40      0.32      0.85      0.34      0.51      0.25      3285



## Undersampling


In [60]:
# Import our input dataset
data = Path('Resources_k/fire_size_bins_new.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,fire_size,fire_cause,latitude,longitude,state,discovery_month,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,year,putout_time,fire_size_bin,fire_size_bin_no
0,3,1.0,Debris Burning,39.6414,-119.3083,NV,Jun,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,2005,0,Teacup,1
1,24,40.0,Arson,31.435181,-88.999489,MS,Apr,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,1999,0,Mini,3
2,31,1.2,Debris Burning,48.833,-99.7836,ND,Apr,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,2007,0,Teacup,1
3,35,30.18,Debris Burning,31.259,-84.8956,GA,Oct,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,2000,0,Mini,3
4,36,1420.0,Lightning,33.2418,-104.9122,NM,Jul,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,1994,1,Large,5


In [61]:
df=df.drop(columns=["fire_id", "fire_size", "latitude", "longitude", "fire_size_bin", "year", "fire_cause", "putout_time"])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
df[cat].nunique()

discovery_month    12
state              45
dtype: int64

In [62]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

Unnamed: 0,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,fire_size_bin_no,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
# Remove loan status target from features data
target=df.fire_size_bin_no
y=target
# X = df.loc[:, df.columns !=target]
X=df.drop(columns=['fire_size_bin_no']).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [65]:
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)


Counter({1: 732, 2: 732, 3: 732, 4: 732, 5: 732, 6: 732})

In [66]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [67]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
acc_score= balanced_accuracy_score(y_test, y_pred)

In [68]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual Teacup", "Actual Toy", "Actual Mini", "Actual Medium", "Actual Large", "Actual XL"], columns=["Predicted Teacup", "Predicted Toy", "Predicted Mini", "Predicted Medium", "Predicted Large", "Predicted XL"]
)

In [69]:
# Displaying Undersampling results
print("Cluster Centroid Undersampling")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Cluster Centroid Undersampling
Confusion Matrix


Unnamed: 0,Predicted Teacup,Predicted Toy,Predicted Mini,Predicted Medium,Predicted Large,Predicted XL
Actual Teacup,430,388,220,115,139,200
Actual Toy,78,84,63,28,26,37
Actual Mini,60,131,107,68,50,61
Actual Medium,54,57,46,30,57,39
Actual Large,27,37,1,44,126,167
Actual XL,14,17,1,26,101,156


Accuracy Score : 0.2821705324398425
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          1       0.65      0.29      0.87      0.40      0.50      0.24      1492
          2       0.12      0.27      0.79      0.16      0.46      0.20       316
          3       0.24      0.22      0.88      0.23      0.44      0.18       477
          4       0.10      0.11      0.91      0.10      0.31      0.09       283
          5       0.25      0.31      0.87      0.28      0.52      0.26       402
          6       0.24      0.50      0.83      0.32      0.64      0.40       315

avg / total       0.40      0.28      0.86      0.30      0.49      0.23      3285



## Combination (Over and Under) Sampling

In [95]:
# Import our input dataset
data = Path('Resources_k/fire_size_bins_new.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,fire_size,fire_cause,latitude,longitude,state,discovery_month,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,year,putout_time,fire_size_bin,fire_size_bin_no
0,3,1.0,Debris Burning,39.6414,-119.3083,NV,Jun,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,2005,0,Teacup,1
1,24,40.0,Arson,31.435181,-88.999489,MS,Apr,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,1999,0,Mini,3
2,31,1.2,Debris Burning,48.833,-99.7836,ND,Apr,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,2007,0,Teacup,1
3,35,30.18,Debris Burning,31.259,-84.8956,GA,Oct,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,2000,0,Mini,3
4,36,1420.0,Lightning,33.2418,-104.9122,NM,Jul,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,1994,1,Large,5


In [96]:
df=df.drop(columns=["fire_id", "fire_size", "latitude", "longitude", "fire_size_bin", "year", "fire_cause", "putout_time"])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
df[cat].nunique()

discovery_month    12
state              45
dtype: int64

In [97]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [98]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

Unnamed: 0,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,fire_size_bin_no,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [99]:
# Remove loan status target from features data
target=df.fire_size_bin_no
y=target
# X = df.loc[:, df.columns !=target]
X=df.drop(columns=['fire_size_bin_no']).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [100]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({1: 495, 2: 4523, 3: 3706, 4: 4867, 5: 4214, 6: 4637})

In [101]:
# Train the Logistic Regression model using the resampled data
from imblearn.ensemble import EasyEnsembleClassifier
model = EasyEnsembleClassifier(n_estimators=100, random_state=78) 
model.fit(X_resampled, y_resampled)

EasyEnsembleClassifier(n_estimators=100, random_state=78)

In [102]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [103]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
cm_df = pd.DataFrame(
    cm, index=["Actual Teacup", "Actual Toy", "Actual Mini", "Actual Medium", "Actual Large", "Actual XL"], columns=["Predicted Teacup", "Predicted Toy", "Predicted Mini", "Predicted Medium", "Predicted Large", "Predicted XL"]
)

In [104]:
# Displaying SMOTEEN results
print("SMOTEENN Sampling")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

SMOTEENN Sampling
Confusion Matrix


Unnamed: 0,Predicted Teacup,Predicted Toy,Predicted Mini,Predicted Medium,Predicted Large,Predicted XL
Actual Teacup,1355,3,18,12,82,22
Actual Toy,283,1,4,2,19,7
Actual Mini,404,0,17,6,37,13
Actual Medium,218,1,11,4,34,15
Actual Large,160,0,9,3,142,88
Actual XL,96,1,0,2,101,115


Accuracy Score : 0.3235893012604572
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          1       0.60      0.37      0.79      0.46      0.54      0.28      1492
          2       0.12      0.20      0.84      0.15      0.41      0.16       316
          3       0.31      0.31      0.88      0.31      0.52      0.26       477
          4       0.20      0.14      0.95      0.17      0.37      0.13       283
          5       0.27      0.44      0.83      0.33      0.61      0.35       402
          6       0.31      0.47      0.89      0.37      0.65      0.40       315

avg / total       0.41      0.35      0.84      0.36      0.53      0.27      3285



---------------------------------

--------------------------------

### Gradient Boosting Classifier

In [80]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd


In [81]:
# Import our input dataset
data = Path('Resources_k/fire_size_bins_new.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,fire_size,fire_cause,latitude,longitude,state,discovery_month,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,year,putout_time,fire_size_bin,fire_size_bin_no
0,3,1.0,Debris Burning,39.6414,-119.3083,NV,Jun,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,2005,0,Teacup,1
1,24,40.0,Arson,31.435181,-88.999489,MS,Apr,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,1999,0,Mini,3
2,31,1.2,Debris Burning,48.833,-99.7836,ND,Apr,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,2007,0,Teacup,1
3,35,30.18,Debris Burning,31.259,-84.8956,GA,Oct,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,2000,0,Mini,3
4,36,1420.0,Lightning,33.2418,-104.9122,NM,Jul,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,1994,1,Large,5


In [82]:
df=df.drop(columns=["fire_id", "fire_size", "latitude", "longitude", "fire_size_bin", "year", "fire_cause", "putout_time"])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=["discovery_month", "state"]

# Check the number of unique values in each column
df[cat].nunique()

discovery_month    12
state              45
dtype: int64

In [83]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

Unnamed: 0,Temp_pre_30,Temp_pre_15,Temp_pre_7,Wind_pre_30,Wind_pre_15,Wind_pre_7,Hum_pre_30,Hum_pre_15,Hum_pre_7,fire_size_bin_no,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,16.275967,18.996181,18.142564,4.054982,3.398329,3.671282,44.778429,37.140811,35.353846,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13.468619,15.067227,15.60479,2.038268,1.737921,1.775904,57.997207,56.747191,59.614458,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.891635,0.372659,-4.273834,5.800667,6.012852,6.658621,77.575012,75.963981,71.173116,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20.07948,17.722714,18.188679,3.65984,3.366443,2.211429,67.551783,61.733788,60.328571,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31.055859,32.523438,34.893333,4.026367,3.844922,3.695833,28.783203,25.789062,18.208333,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
# Remove loan status target from features data
target=df.fire_size_bin_no
y=target
# X = df.loc[:, df.columns !=target]
X=df.drop(columns=['fire_size_bin_no']).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

  ### Choose best learning rate

In [86]:
from sklearn.ensemble import GradientBoostingClassifier

# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=10,
                                            learning_rate=learning_rate,
                                            max_features='auto',
                                            max_depth=3,
                                            random_state=1)

    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train_scaled,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test_scaled,
            y_test)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.469
Accuracy score (validation): 0.464

Learning rate:  0.1
Accuracy score (training): 0.479
Accuracy score (validation): 0.476

Learning rate:  0.25
Accuracy score (training): 0.518
Accuracy score (validation): 0.491

Learning rate:  0.5
Accuracy score (training): 0.538
Accuracy score (validation): 0.486

Learning rate:  0.75
Accuracy score (training): 0.547
Accuracy score (validation): 0.489

Learning rate:  1
Accuracy score (training): 0.535
Accuracy score (validation): 0.473



  ### Create Gradient Boosting Classifier

In [91]:
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.1,
                                        max_features='auto',
                                        max_depth=3,
                                        random_state=1)

# Fit the model
classifier.fit(X_train_scaled, y_train)

# Make Prediction
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).head(20)

Unnamed: 0,Prediction,Actual
4841,5,1
3257,1,1
3796,1,3
5376,1,1
142,1,2
3013,1,1
12112,1,5
6822,1,1
9282,1,1
7166,1,1


  ### Evaluate the model

In [92]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
# Calculating the accuracy score
acc_score = balanced_accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.27990473086719525


In [93]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   index=["Actual Teacup", "Actual Toy", "Actual Mini", "Actual Medium", "Actual Large", "Actual XL"], columns=["Predicted Teacup", "Predicted Toy", "Predicted Mini", "Predicted Medium", "Predicted Large", "Predicted XL"]
)

In [94]:
# Displaying SMOTEEN results
print("Gradient Boosting Sampling")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Gradient Boosting Sampling
Confusion Matrix


Unnamed: 0,Predicted Teacup,Predicted Toy,Predicted Mini,Predicted Medium,Predicted Large,Predicted XL
Actual Teacup,,,,,,
Actual Toy,,,,,,
Actual Mini,,,,,,
Actual Medium,,,,,,
Actual Large,,,,,,
Actual XL,,,,,,


Accuracy Score : 0.27990473086719525
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          1       0.00      0.00      1.00      0.00      0.00      0.00      1492
          2       0.13      0.54      0.61      0.21      0.57      0.33       316
          3       0.25      0.09      0.95      0.13      0.30      0.08       477
          4       0.15      0.29      0.84      0.20      0.49      0.23       283
          5       0.26      0.28      0.89      0.27      0.50      0.23       402
          6       0.20      0.51      0.79      0.29      0.63      0.39       315

avg / total       0.11      0.17      0.91      0.12      0.26      0.13      3285

