# Supervised Machine Learning Attempts

-------------------------------------

### Neural Networking

In [116]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

import pandas as pd
from pathlib import Path



In [117]:
# Import our input dataset
data = Path('Resources_k/df7_k2.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,state,discovery_month,Temp_pre_7,Wind_pre_7,Hum_pre_7,fire_size_bin_no,medium_plus
0,3,NV,Jun,18.142564,3.671282,35.353846,1,0
1,24,MS,Apr,15.60479,1.775904,59.614458,3,0
2,31,ND,Apr,-4.273834,6.658621,71.173116,1,0
3,35,GA,Oct,18.188679,2.211429,60.328571,3,0
4,36,NM,Jul,34.893333,3.695833,18.208333,5,1


In [118]:
df=df.drop(columns=["fire_id", 'fire_size_bin_no'])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
#df[cat].nunique()
df

Unnamed: 0,state,discovery_month,Temp_pre_7,Wind_pre_7,Hum_pre_7,medium_plus
0,NV,Jun,18.142564,3.671282,35.353846,0
1,MS,Apr,15.604790,1.775904,59.614458,0
2,ND,Apr,-4.273834,6.658621,71.173116,0
3,GA,Oct,18.188679,2.211429,60.328571,0
4,NM,Jul,34.893333,3.695833,18.208333,1
...,...,...,...,...,...,...
13133,TX,Aug,28.719917,3.224274,55.426471,1
13134,TX,Oct,24.221869,1.563817,63.196819,1
13135,OR,Sep,10.734328,1.835821,67.266304,1
13136,MT,Mar,7.678571,3.803571,42.638384,1


In [119]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
# Merge one-hot encoded features and drop the originals
df= df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

Unnamed: 0,Temp_pre_7,Wind_pre_7,Hum_pre_7,medium_plus,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,18.142564,3.671282,35.353846,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15.60479,1.775904,59.614458,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-4.273834,6.658621,71.173116,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18.188679,2.211429,60.328571,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,34.893333,3.695833,18.208333,1,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [121]:
# Remove loan status target from features data
target=df.medium_plus
y=target
# X = df.loc[:, df.columns !=target]
X=df.drop(columns=["medium_plus"]).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [122]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 180
hidden_nodes_layer2 = 120
hidden_nodes_layer3 = 60


nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="sigmoid")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="sigmoid"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 180)               10980     
                                                                 
 dense_5 (Dense)             (None, 120)               21720     
                                                                 
 dense_6 (Dense)             (None, 60)                7260      
                                                                 
 dense_7 (Dense)             (None, 1)                 61        
                                                                 
Total params: 40,021
Trainable params: 40,021
Non-trainable params: 0
_________________________________________________________________


In [123]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer="adam", metrics=["accuracy"])

In [124]:
## Import checkpoint dependencies
# import os
# from tensorflow.keras.callbacks import ModelCheckpoint

# # Define the checkpoint path and filenames
# os.makedirs("checkpoints/",exist_ok=True)
# checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

# # Create a callback that saves the model's weights every 5 epoch
# cp_callback = ModelCheckpoint(
#     filepath=checkpoint_path,
#     verbose=5,
#     save_weights_only=True,
#     save_freq='epoch')

# Train the model
# fit_model = nn.fit(X_train_scaled,y_train,epochs=100,callbacks=[cp_callback])
fit_model = nn.fit(X_train_scaled,y_train,epochs=50)

# # Restore the model weights
# nn.load_weights("checkpoints/weights.100.hdf5")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [125]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

103/103 - 0s - loss: 0.4440 - accuracy: 0.7887 - 272ms/epoch - 3ms/step
Loss: 0.443997859954834, Accuracy: 0.7887367010116577


---------------------

------------------------------

### Random Forest Classifier

In [126]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report

from pathlib import Path

# Import our input dataset
data = Path('Resources_k/df7_k2.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,state,discovery_month,Temp_pre_7,Wind_pre_7,Hum_pre_7,fire_size_bin_no,medium_plus
0,3,NV,Jun,18.142564,3.671282,35.353846,1,0
1,24,MS,Apr,15.60479,1.775904,59.614458,3,0
2,31,ND,Apr,-4.273834,6.658621,71.173116,1,0
3,35,GA,Oct,18.188679,2.211429,60.328571,3,0
4,36,NM,Jul,34.893333,3.695833,18.208333,5,1


In [127]:
df=df.drop(columns=["fire_id", 'fire_size_bin_no'])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
#df[cat].nunique()

In [128]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [129]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

Unnamed: 0,Temp_pre_7,Wind_pre_7,Hum_pre_7,medium_plus,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,18.142564,3.671282,35.353846,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15.60479,1.775904,59.614458,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-4.273834,6.658621,71.173116,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18.188679,2.211429,60.328571,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,34.893333,3.695833,18.208333,1,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [130]:
# Remove loan status target from features data
target=df.medium_plus
y=target
# X = df.loc[:, df.columns !=target]
X=df.copy
X=df.drop(columns=["medium_plus"]).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [131]:
model = RandomForestClassifier(n_estimators=100, random_state=78) 

#Fitting the model
model = model.fit(X_train, y_train)

# Making predictions using the testing data.
y_pred = model.predict(X_test)

In [132]:
# Calculating the accuracy score.
rf_acc_score = balanced_accuracy_score(y_test, y_pred)
rf_acc_score

0.7237530346501876

In [133]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual less than medium", "Actual medium plus"], columns=["Predicted less than medium", "Predicted medium +"])

In [134]:
# Displaying Easy Ensemble Classifier
print("Random Forest Clasifier")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {rf_acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Forest Clasifier
Confusion Matrix


Unnamed: 0,Predicted less than medium,Predicted medium +
Actual less than medium,2045,255
Actual medium plus,435,550


Accuracy Score : 0.7237530346501876
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.82      0.89      0.56      0.86      0.70      0.51      2300
          1       0.68      0.56      0.89      0.61      0.70      0.48       985

avg / total       0.78      0.79      0.66      0.78      0.70      0.50      3285



In [135]:
#Calulate Feature Importances
importances = model.feature_importances_
importances

array([2.10176354e-01, 1.86817057e-01, 2.68537133e-01, 6.97215844e-03,
       1.48015164e-02, 1.86518075e-03, 4.24819813e-03, 1.04274341e-03,
       1.55745571e-02, 1.82889135e-02, 6.69084190e-03, 5.65032395e-03,
       3.36893213e-03, 4.97361932e-03, 6.14870165e-03, 6.44569551e-02,
       3.02977969e-03, 3.79175693e-03, 9.61726866e-03, 8.45566390e-03,
       3.27445822e-03, 3.04096562e-03, 1.63337049e-02, 9.40987680e-06,
       2.11591851e-02, 3.25639472e-04, 4.76859062e-04, 1.13983337e-03,
       2.21218966e-03, 1.01758397e-03, 1.31987757e-05, 2.42294146e-04,
       2.37633691e-04, 9.58732149e-04, 2.50469995e-03, 1.30689884e-03,
       9.79485845e-03, 7.05771699e-03, 1.75648309e-03, 2.24565081e-03,
       2.26806626e-03, 2.45706323e-04, 7.77628607e-03, 1.80085992e-02,
       3.37304196e-03, 5.17149016e-04, 8.67059836e-03, 5.42056671e-03,
       3.25830045e-04, 3.07790137e-04, 1.79620767e-03, 3.85302807e-03,
       1.72857553e-03, 6.15875059e-03, 5.29470603e-03, 2.15379138e-03,
      

----------------------------

--------------------------

### Balanced Random Forest Classifier

In [136]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder
from pathlib import Path

from imblearn.ensemble import BalancedRandomForestClassifier


# Import our input dataset
data = Path('Resources_k/df7_k2.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,state,discovery_month,Temp_pre_7,Wind_pre_7,Hum_pre_7,fire_size_bin_no,medium_plus
0,3,NV,Jun,18.142564,3.671282,35.353846,1,0
1,24,MS,Apr,15.60479,1.775904,59.614458,3,0
2,31,ND,Apr,-4.273834,6.658621,71.173116,1,0
3,35,GA,Oct,18.188679,2.211429,60.328571,3,0
4,36,NM,Jul,34.893333,3.695833,18.208333,5,1


In [137]:
df=df.drop(columns=["fire_id", 'fire_size_bin_no'])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
#df[cat].nunique()

In [138]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [139]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

Unnamed: 0,Temp_pre_7,Wind_pre_7,Hum_pre_7,medium_plus,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,18.142564,3.671282,35.353846,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15.60479,1.775904,59.614458,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-4.273834,6.658621,71.173116,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18.188679,2.211429,60.328571,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,34.893333,3.695833,18.208333,1,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [140]:
# Remove loan status target from features data
target=df.medium_plus
y=target
# X = df.loc[:, df.columns !=target]
X=df.drop(columns=["medium_plus"]).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [141]:
model = BalancedRandomForestClassifier(n_estimators=100, random_state=78) 

#Fitting the model
model = model.fit(X_train, y_train)

# Making predictions using the testing data.
y_pred = model.predict(X_test)

In [142]:
# Calculating the accuracy score.
brf_acc_score = balanced_accuracy_score(y_test, y_pred)
brf_acc_score

0.7541105716177444

In [143]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual less than medium", "Actual medium plus"], columns=["Predicted less than medium", "Predicted medium +"])

In [144]:
# Displaying Easy Ensemble Classifier
print("Random Forest Clasifier")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {brf_acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Forest Clasifier
Confusion Matrix


Unnamed: 0,Predicted less than medium,Predicted medium +
Actual less than medium,1755,545
Actual medium plus,251,734


Accuracy Score : 0.7541105716177444
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.87      0.76      0.75      0.82      0.75      0.57      2300
          1       0.57      0.75      0.76      0.65      0.75      0.57       985

avg / total       0.78      0.76      0.75      0.77      0.75      0.57      3285



-------------------------

----------------------------------

### Easy Ensemble Classifier

In [145]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder
from pathlib import Path

from imblearn.ensemble import EasyEnsembleClassifier

# Import our input dataset
data = Path('Resources_k/df7_k2.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,state,discovery_month,Temp_pre_7,Wind_pre_7,Hum_pre_7,fire_size_bin_no,medium_plus
0,3,NV,Jun,18.142564,3.671282,35.353846,1,0
1,24,MS,Apr,15.60479,1.775904,59.614458,3,0
2,31,ND,Apr,-4.273834,6.658621,71.173116,1,0
3,35,GA,Oct,18.188679,2.211429,60.328571,3,0
4,36,NM,Jul,34.893333,3.695833,18.208333,5,1


In [146]:
df=df.drop(columns=["fire_id", 'fire_size_bin_no'])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
#df[cat].nunique()

In [147]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [148]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

Unnamed: 0,Temp_pre_7,Wind_pre_7,Hum_pre_7,medium_plus,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,18.142564,3.671282,35.353846,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15.60479,1.775904,59.614458,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-4.273834,6.658621,71.173116,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18.188679,2.211429,60.328571,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,34.893333,3.695833,18.208333,1,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [149]:
# Remove loan status target from features data
target=df.medium_plus
y=target
# X = df.loc[:, df.columns !=target]
X=df.drop(columns=["medium_plus"]).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [150]:
# Train the EasyEnsembleClassifier

model = EasyEnsembleClassifier(n_estimators=100, random_state=78) 

#Fitting the model
model = model.fit(X_train, y_train)

# Making predictions using the testing data.
y_pred = model.predict(X_test)

In [151]:
# Calculated the balanced accuracy score
ee_acc_score = balanced_accuracy_score(y_test, y_pred)
ee_acc_score

0.7537486206135511

In [152]:

# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual less than medium", "Actual medium plus"], columns=["Predicted less than medium", "Predicted medium +"])

In [153]:
# Displaying Easy Ensemble Classifier
print("Easy Ensemble Clasifier")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {ee_acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble Clasifier
Confusion Matrix


Unnamed: 0,Predicted less than medium,Predicted medium +
Actual less than medium,1751,549
Actual medium plus,250,735


Accuracy Score : 0.7537486206135511
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.76      0.75      0.81      0.75      0.57      2300
          1       0.57      0.75      0.76      0.65      0.75      0.57       985

avg / total       0.78      0.76      0.75      0.76      0.75      0.57      3285



--------------------------------------

---------------------------

### Naive Random Oversampling

In [154]:
# Import our input dataset
data = Path('Resources_k/df7_k2.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,state,discovery_month,Temp_pre_7,Wind_pre_7,Hum_pre_7,fire_size_bin_no,medium_plus
0,3,NV,Jun,18.142564,3.671282,35.353846,1,0
1,24,MS,Apr,15.60479,1.775904,59.614458,3,0
2,31,ND,Apr,-4.273834,6.658621,71.173116,1,0
3,35,GA,Oct,18.188679,2.211429,60.328571,3,0
4,36,NM,Jul,34.893333,3.695833,18.208333,5,1


In [155]:
df=df.drop(columns=["fire_id", 'fire_size_bin_no'])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
#df[cat].nunique()

In [156]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [157]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

Unnamed: 0,Temp_pre_7,Wind_pre_7,Hum_pre_7,medium_plus,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,18.142564,3.671282,35.353846,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15.60479,1.775904,59.614458,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-4.273834,6.658621,71.173116,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18.188679,2.211429,60.328571,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,34.893333,3.695833,18.208333,1,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [158]:
# Remove loan status target from features data
target=df.medium_plus
y=target
# X = df.loc[:, df.columns !=target]
X=df.drop(columns=["medium_plus"]).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [159]:
import warnings
warnings.filterwarnings('ignore')

from collections import Counter

# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=78)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({0: 6964, 1: 6964})

In [160]:
# Logistic regression using random oversampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=78)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=78)

In [161]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
nr_acc_score= balanced_accuracy_score(y_test, y_pred)

In [162]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual less than medium", "Actual medium plus"], columns=["Predicted less than medium", "Predicted medium +"])

In [163]:
# Displaying Random Oversampling results
print("Naive Random Oversampling")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {nr_acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Naive Random Oversampling
Confusion Matrix


Unnamed: 0,Predicted less than medium,Predicted medium +
Actual less than medium,1714,586
Actual medium plus,218,767


Accuracy Score : 0.7619487971750165
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.75      0.78      0.81      0.76      0.58      2300
          1       0.57      0.78      0.75      0.66      0.76      0.58       985

avg / total       0.79      0.76      0.77      0.76      0.76      0.58      3285



### SMOTE Oversampling

In [164]:
# Import our input dataset
data = Path('Resources_k/df7_k2.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,state,discovery_month,Temp_pre_7,Wind_pre_7,Hum_pre_7,fire_size_bin_no,medium_plus
0,3,NV,Jun,18.142564,3.671282,35.353846,1,0
1,24,MS,Apr,15.60479,1.775904,59.614458,3,0
2,31,ND,Apr,-4.273834,6.658621,71.173116,1,0
3,35,GA,Oct,18.188679,2.211429,60.328571,3,0
4,36,NM,Jul,34.893333,3.695833,18.208333,5,1


In [165]:
df=df.drop(columns=["fire_id", 'fire_size_bin_no'])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
#df[cat].nunique()

In [166]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [167]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

Unnamed: 0,Temp_pre_7,Wind_pre_7,Hum_pre_7,medium_plus,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,18.142564,3.671282,35.353846,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15.60479,1.775904,59.614458,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-4.273834,6.658621,71.173116,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18.188679,2.211429,60.328571,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,34.893333,3.695833,18.208333,1,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [168]:
# Remove loan status target from features data
target=df.medium_plus
y=target
# X = df.loc[:, df.columns !=target]
X=df.drop(columns=["medium_plus"]).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [169]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=78, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({0: 6964, 1: 6964})

In [170]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=78)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=78)

In [171]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
smote_acc_score=balanced_accuracy_score(y_test, y_pred)

In [172]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual less than medium", "Actual medium plus"], columns=["Predicted less than medium", "Predicted medium +"])

In [173]:
# Displaying SMOTE results
print("SMOTE Oversampling")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {smote_acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

SMOTE Oversampling
Confusion Matrix


Unnamed: 0,Predicted less than medium,Predicted medium +
Actual less than medium,1735,565
Actual medium plus,235,750


Accuracy Score : 0.7578845729419554
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.75      0.76      0.81      0.76      0.57      2300
          1       0.57      0.76      0.75      0.65      0.76      0.57       985

avg / total       0.79      0.76      0.76      0.76      0.76      0.57      3285



## Undersampling


In [174]:
# Import our input dataset
data = Path('Resources_k/df7_k2.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,state,discovery_month,Temp_pre_7,Wind_pre_7,Hum_pre_7,fire_size_bin_no,medium_plus
0,3,NV,Jun,18.142564,3.671282,35.353846,1,0
1,24,MS,Apr,15.60479,1.775904,59.614458,3,0
2,31,ND,Apr,-4.273834,6.658621,71.173116,1,0
3,35,GA,Oct,18.188679,2.211429,60.328571,3,0
4,36,NM,Jul,34.893333,3.695833,18.208333,5,1


In [175]:
df=df.drop(columns=["fire_id", 'fire_size_bin_no'])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
#df[cat].nunique()

In [176]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [177]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

Unnamed: 0,Temp_pre_7,Wind_pre_7,Hum_pre_7,medium_plus,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,18.142564,3.671282,35.353846,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15.60479,1.775904,59.614458,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-4.273834,6.658621,71.173116,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18.188679,2.211429,60.328571,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,34.893333,3.695833,18.208333,1,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [178]:
# Remove loan status target from features data
target=df.medium_plus
y=target
# X = df.loc[:, df.columns !=target]
X=df.drop(columns=["medium_plus"]).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [179]:
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)


Counter({0: 2889, 1: 2889})

In [180]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=78)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=78)

In [181]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
cc_acc_score= balanced_accuracy_score(y_test, y_pred)

In [182]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual less than medium", "Actual medium plus"], columns=["Predicted less than medium", "Predicted medium +"])

In [183]:
# Displaying Undersampling results
print("Cluster Centroid Undersampling")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {cc_acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Cluster Centroid Undersampling
Confusion Matrix


Unnamed: 0,Predicted less than medium,Predicted medium +
Actual less than medium,1736,564
Actual medium plus,251,734


Accuracy Score : 0.7499801368351358
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.87      0.75      0.75      0.81      0.75      0.56      2300
          1       0.57      0.75      0.75      0.64      0.75      0.56       985

avg / total       0.78      0.75      0.75      0.76      0.75      0.56      3285



## Combination (Over and Under) Sampling

In [184]:
# Import our input dataset
data = Path('Resources_k/df7_k2.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,state,discovery_month,Temp_pre_7,Wind_pre_7,Hum_pre_7,fire_size_bin_no,medium_plus
0,3,NV,Jun,18.142564,3.671282,35.353846,1,0
1,24,MS,Apr,15.60479,1.775904,59.614458,3,0
2,31,ND,Apr,-4.273834,6.658621,71.173116,1,0
3,35,GA,Oct,18.188679,2.211429,60.328571,3,0
4,36,NM,Jul,34.893333,3.695833,18.208333,5,1


In [185]:
df=df.drop(columns=["fire_id", 'fire_size_bin_no'])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
#df[cat].nunique()

In [186]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [187]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

Unnamed: 0,Temp_pre_7,Wind_pre_7,Hum_pre_7,medium_plus,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,18.142564,3.671282,35.353846,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15.60479,1.775904,59.614458,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-4.273834,6.658621,71.173116,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18.188679,2.211429,60.328571,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,34.893333,3.695833,18.208333,1,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [188]:
# Remove loan status target from features data
target=df.medium_plus
y=target
# X = df.loc[:, df.columns !=target]
X=df.drop(columns=["medium_plus"]).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [189]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=78)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 4802, 1: 7012})

In [190]:
# Train the Logistic Regression model using the resampled data
from imblearn.ensemble import EasyEnsembleClassifier
model = EasyEnsembleClassifier(n_estimators=100, random_state=78) 
model.fit(X_resampled, y_resampled)

EasyEnsembleClassifier(n_estimators=100, random_state=78)

In [191]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
smoteenn_acc_score = balanced_accuracy_score(y_test, y_pred)

In [192]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual less than medium", "Actual medium plus"], columns=["Predicted less than medium", "Predicted medium +"])

In [193]:
# Displaying SMOTEEN results
print("SMOTEENN Sampling")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {smoteenn_acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

SMOTEENN Sampling
Confusion Matrix


Unnamed: 0,Predicted less than medium,Predicted medium +
Actual less than medium,1728,572
Actual medium plus,230,755


Accuracy Score : 0.7589009048775104
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.75      0.77      0.81      0.76      0.57      2300
          1       0.57      0.77      0.75      0.65      0.76      0.58       985

avg / total       0.79      0.76      0.76      0.76      0.76      0.58      3285



---------------------------------

--------------------------------

### Gradient Boosting Classifier

In [194]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd


In [195]:
# Import our input dataset
data = Path('Resources_k/df7_k2.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,state,discovery_month,Temp_pre_7,Wind_pre_7,Hum_pre_7,fire_size_bin_no,medium_plus
0,3,NV,Jun,18.142564,3.671282,35.353846,1,0
1,24,MS,Apr,15.60479,1.775904,59.614458,3,0
2,31,ND,Apr,-4.273834,6.658621,71.173116,1,0
3,35,GA,Oct,18.188679,2.211429,60.328571,3,0
4,36,NM,Jul,34.893333,3.695833,18.208333,5,1


In [196]:
df=df.drop(columns=["fire_id", 'fire_size_bin_no'])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
#df[cat].nunique()

In [197]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [198]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

Unnamed: 0,Temp_pre_7,Wind_pre_7,Hum_pre_7,medium_plus,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,18.142564,3.671282,35.353846,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15.60479,1.775904,59.614458,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-4.273834,6.658621,71.173116,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18.188679,2.211429,60.328571,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,34.893333,3.695833,18.208333,1,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [199]:
# Remove loan status target from features data
target=df.medium_plus
y=target
# X = df.loc[:, df.columns !=target]
X=df.drop(columns=["medium_plus"]).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

  ### Choose best learning rate

In [200]:
from sklearn.ensemble import GradientBoostingClassifier

# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=10,
                                            learning_rate=learning_rate,
                                            max_features='auto',
                                            max_depth=3,
                                            random_state=1)

    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train_scaled,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test_scaled,
            y_test)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.735
Accuracy score (validation): 0.724

Learning rate:  0.1
Accuracy score (training): 0.768
Accuracy score (validation): 0.755

Learning rate:  0.25
Accuracy score (training): 0.799
Accuracy score (validation): 0.778

Learning rate:  0.5
Accuracy score (training): 0.809
Accuracy score (validation): 0.781

Learning rate:  0.75
Accuracy score (training): 0.813
Accuracy score (validation): 0.775

Learning rate:  1
Accuracy score (training): 0.810
Accuracy score (validation): 0.775



  ### Create Gradient Boosting Classifier

In [201]:
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.05,
                                        max_features='auto',
                                        max_depth=3,
                                        random_state=78)

# Fit the model
classifier.fit(X_train_scaled, y_train)

# Make Prediction
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).head(20)

Unnamed: 0,Prediction,Actual
10678,1,1
11312,1,1
3461,0,0
5068,1,0
1688,0,0
1576,0,0
12862,0,1
10709,1,1
11784,1,1
4772,0,0


  ### Evaluate the model

In [202]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
# Calculating the accuracy score
gb_acc_score = balanced_accuracy_score(y_test, predictions)
gb_acc_score

0.7192584418450674

In [203]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual less than medium", "Actual medium plus"], columns=["Predicted less than medium", "Predicted medium +"])

In [204]:
# Displaying SMOTEEN results
print("Gradient Boosting Sampling")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {gb_acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Gradient Boosting Sampling
Confusion Matrix


Unnamed: 0,Predicted less than medium,Predicted medium +
Actual less than medium,1728,572
Actual medium plus,230,755


Accuracy Score : 0.7192584418450674
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.75      0.77      0.81      0.76      0.57      2300
          1       0.57      0.77      0.75      0.65      0.76      0.58       985

avg / total       0.79      0.76      0.76      0.76      0.76      0.58      3285



---------------------------------

-----------------------

### Logistic Regression

In [205]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

In [206]:
# Import our input dataset
data = Path('Resources_k/df7_k2.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,fire_id,state,discovery_month,Temp_pre_7,Wind_pre_7,Hum_pre_7,fire_size_bin_no,medium_plus
0,3,NV,Jun,18.142564,3.671282,35.353846,1,0
1,24,MS,Apr,15.60479,1.775904,59.614458,3,0
2,31,ND,Apr,-4.273834,6.658621,71.173116,1,0
3,35,GA,Oct,18.188679,2.211429,60.328571,3,0
4,36,NM,Jul,34.893333,3.695833,18.208333,5,1


In [207]:
df=df.drop(columns=["fire_id", 'fire_size_bin_no'])

# Generate our categorical variable list
#cat = df.dtypes[df.dtypes == "object"].index.tolist()
cat=['discovery_month', 'state']

# Check the number of unique values in each column
#df[cat].nunique()
df

Unnamed: 0,state,discovery_month,Temp_pre_7,Wind_pre_7,Hum_pre_7,medium_plus
0,NV,Jun,18.142564,3.671282,35.353846,0
1,MS,Apr,15.604790,1.775904,59.614458,0
2,ND,Apr,-4.273834,6.658621,71.173116,0
3,GA,Oct,18.188679,2.211429,60.328571,0
4,NM,Jul,34.893333,3.695833,18.208333,1
...,...,...,...,...,...,...
13133,TX,Aug,28.719917,3.224274,55.426471,1
13134,TX,Oct,24.221869,1.563817,63.196819,1
13135,OR,Sep,10.734328,1.835821,67.266304,1
13136,MT,Mar,7.678571,3.803571,42.638384,1


In [208]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,discovery_month_Jun,discovery_month_Mar,discovery_month_May,discovery_month_Nov,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [209]:
# Merge one-hot encoded features and drop the originals
df= df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(cat,1)
df.head()

Unnamed: 0,Temp_pre_7,Wind_pre_7,Hum_pre_7,medium_plus,discovery_month_Apr,discovery_month_Aug,discovery_month_Dec,discovery_month_Feb,discovery_month_Jan,discovery_month_Jul,...,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_WA,state_WI,state_WV,state_WY
0,18.142564,3.671282,35.353846,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15.60479,1.775904,59.614458,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-4.273834,6.658621,71.173116,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18.188679,2.211429,60.328571,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,34.893333,3.695833,18.208333,1,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [210]:
# Remove loan status target from features data
target=df.medium_plus
y=target
# X = df.loc[:, df.columns !=target]
X=df.drop(columns=["medium_plus"]).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [211]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=78)

In [212]:
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,0,0
3,0,0
4,0,0
5,0,0
6,1,1
7,1,1
8,1,1
9,0,0


In [213]:
from sklearn.metrics import accuracy_score
log_acc_score=accuracy_score(y_test, y_pred)
log_acc_score

0.7796042617960426

In [214]:
print(f'Supervised Machine Learning Models Using OneHot Encoder')
print(f"Neural Networking Accuracy: {model_accuracy}")
print(f"Random Forest Accuracy Score : {rf_acc_score}")
print(f"Balanced Random Forest Accuracy Score : {brf_acc_score}")
print(f"Easy Ensemble Accuracy Score : {ee_acc_score}")
print(f"Naive Random Oversampling Accuracy Score : {nr_acc_score}")
print(f"SMOTE Oversampling Accuracy Score : {smote_acc_score}")
print(f"Centroid Clustering Undersampling Accuracy Score : {cc_acc_score}")
print(f"SMOTEENN Over/Undersampling Accuracy Score : {smoteenn_acc_score}")
print(f"Gradient Boosting Accuracy Score : {gb_acc_score}")
print(f"Logistic Regression Accuracy Score : {log_acc_score}")


Supervised Machine Learning Models Using OneHot Encoder
Neural Networking Accuracy: 0.7887367010116577
Random Forest Accuracy Score : 0.7237530346501876
Balanced Random Forest Accuracy Score : 0.7541105716177444
Easy Ensemble Accuracy Score : 0.7537486206135511
Naive Random Oversampling Accuracy Score : 0.7619487971750165
SMOTE Oversampling Accuracy Score : 0.7578845729419554
Centroid Clustering Undersampling Accuracy Score : 0.7499801368351358
SMOTEENN Over/Undersampling Accuracy Score : 0.7589009048775104
Gradient Boosting Accuracy Score : 0.7192584418450674
Logistic Regression Accuracy Score : 0.7796042617960426
