In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import f1_score
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, RFECV, chi2, f_classif, mutual_info_classif
 
import warnings
warnings.filterwarnings('ignore')
pd.pandas.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('cleaned_fire_data.csv')

In [3]:
conditions = [df['stat_cause_descr'] == 'Missing/Undefined',
 df['stat_cause_descr'] == 'Arson',
 df['stat_cause_descr'] == 'Debris Burning',
 df['stat_cause_descr'] == 'Campfire',
 df['stat_cause_descr'] == 'Miscellaneous',
 df['stat_cause_descr'] == 'Fireworks',
 df['stat_cause_descr'] == 'Lightning',
 df['stat_cause_descr'] == 'Equipment Use',
 df['stat_cause_descr'] == 'Children',
 df['stat_cause_descr'] == 'Smoking',
 df['stat_cause_descr'] == 'Railroad',
 df['stat_cause_descr'] == 'Structure',
 df['stat_cause_descr'] == 'Powerline']
outputs = range(0,(len(conditions)))
df['stat_cause_descr'] = np.select(conditions, outputs)

In [4]:
cols_to_drop = [
    'Unnamed: 0',
    'fire_size',
    'fire_size_class',
    'disc_clean_date'
]

In [5]:
df = df.drop(columns=cols_to_drop,axis=1)

In [6]:
df.shape

(36535, 100)

In [7]:
X = df.drop(columns='target',axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=88)

scaler = StandardScaler()  
scaler.fit(X_train)

X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test)

## Logistic Regression Function

In [None]:
# def LogRegTest(input_x,input_y):
#     lr = LogisticRegression(random_state=1,C=1e9)
#     lr.fit(input_x,input_y)
#     pred_train = lr.predict(input_x)
#     score = f1_score(input_y,pred_train,zero_division=1)
#     return score

## KNN Function

In [None]:
# def KnnTest(input_x,input_y):
#     knn = KNeighborsClassifier(n_neighbors=5)
#     knn.fit(input_x,input_y)
#     pred_train = knn.predict(input_x)
#     score = f1_score(input_y,pred_train,zero_division=1)
#     return score

## Decision Tree Function

In [None]:
# def DTreeTest(input_x,input_y):
#     tree = DecisionTreeClassifier()
#     tree.fit(input_x,input_y)
#     pred_train = tree.predict(input_x)
#     score = f1_score(input_y,pred_train,zero_division=1)
#     return score

### Training Evaluations (F1)

In [None]:
print('LogReg F1: ', LogRegTest(X_train,y_train))
print('Knn F1: ', KnnTest(X_train,y_train))
print('Dtree F1: ', DTreeTest(X_train,y_train))

In [8]:
# Logistic Regression

lr = LogisticRegression(random_state=2,C=1e9,class_weight='balanced',penalty='l2')

lr.fit(X_train,y_train)

pred_train = lr.predict(X_train)

pred_test = lr.predict(X_test)

score_train = f1_score(y_train,pred_train)
score_test = f1_score(y_test,pred_test)

print('Train: ',score_train)
print('Test: ',score_test)

Train:  0.5380921895006401
Test:  0.5264650283553874


In [9]:
# KNN
knn = KNeighborsClassifier(n_neighbors=5,algorithm='auto',weights='uniform')

knn.fit(X_train, y_train)

y_pred_train = knn.predict(X_train)

y_pred_test = knn.predict(X_test)

score_train = f1_score(y_train,pred_train)
score_test = f1_score(y_test,pred_test)

print('Train: ',score_train)
print('Test: ',score_test)

Train:  0.5380921895006401
Test:  0.5264650283553874


In [19]:
#Decision Tree
tree = DecisionTreeClassifier(max_depth=12, min_samples_split=17,criterion='gini',min_samples_leaf=4)

tree = tree.fit(X_train,y_train)

pred_train = tree.predict(X_train)

pred_test = tree.predict(X_test)

score_train = f1_score(y_train,pred_train)
score_test = f1_score(y_test,pred_test)

print('Train: ',score_train)
print('Test: ',score_test)

Train:  0.8418935228644226
Test:  0.8004827031375703


### Decision Tree GridSearchCV

In [22]:
parameters = {'max_depth': range(3,15,1),
              'max_leaf_nodes': range(0,50,2),
              'min_samples_split': range(2,12,1),
              'criterion': ['gini', 'entropy'],
              'class_weight': [None, 'balanced'],
             }

In [23]:
dtc = DecisionTreeClassifier(random_state=88)

grid_tree = GridSearchCV(dtc, parameters, cv=10, scoring='f1', verbose=1, n_jobs=-1)

grid_tree.fit(X_train, y_train)

Fitting 10 folds for each of 12000 candidates, totalling 120000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 336 tasks      | elapsed:   31.8s
[Parallel(n_jobs=-1)]: Done 686 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1036 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1486 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 2036 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 2840 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 3612 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 4462 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 5504 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 6554 tasks      | elapsed: 15.2min
[Parallel(n_jobs=-1)]: Done 7872 tasks      | elapsed: 18.6min
[Parallel(n_jobs=-1)]: Done 9134 tasks      | elapsed: 22.5min
[Parallel(n_jobs=-1)]: Done 10680 tasks      | elapsed: 26.8min
[Parallel(n_jobs=-1)]: Done 12130 tasks     

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=88),
             n_jobs=-1,
             param_grid={'class_weight': [None, 'balanced'],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': range(3, 15),
                         'max_leaf_nodes': range(0, 50, 2),
                         'min_samples_split': range(2, 12)},
             scoring='f1', verbose=1)

In [24]:
print("Best Score: ", grid_tree.best_score_)
print("Best Parameters: ", grid_tree.best_params_)
print("Best Estimator: ", grid_tree.best_estimator_)

Best Score:  0.8118010088419844
Best Parameters:  {'class_weight': None, 'criterion': 'gini', 'max_depth': 13, 'max_leaf_nodes': 40, 'min_samples_split': 2}
Best Estimator:  DecisionTreeClassifier(max_depth=13, max_leaf_nodes=40, random_state=88)


In [25]:
y_pred = grid_tree.best_estimator_.predict(X_test)

print('F1 Test: ', f1_score(y_test, y_pred))

F1 Test:  0.8110749185667753
