
## Initial Analysis of Wildfire Cover Data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the raw table; the path is relative to the notebook's working directory.
csv_path = 'covtype.csv'
df = pd.read_csv(csv_path)

In [None]:
# Bare expression as the cell's last line, so the notebook renders the frame.
df

In [None]:
# Print pandas' concise summary of the frame (index, column dtypes, non-null
# counts, memory usage) as a quick check for missing values and wrong types.
df.info()

In [None]:
# Target: the cover-type label column.
y = df["Cover_Type"]
# Predictors: every remaining column of the frame.
X = df.drop(columns="Cover_Type")

In [None]:
# Cover-type code -> species name, as listed in the dataset description
# (Kaggle / UCI Covertype).
cover_type_names = {1: 'Spruce/Fir', 2: 'Lodgepole Pine', 3: 'Ponderosa Pine',
                    4: 'Cottonwood/Willow', 5: 'Aspen', 6: 'Douglas-fir',
                    7: 'Krummholz'}

# Keep the target as ONE multiclass label column. One-hot encoding it with
# drop_first=True removed cover type 1 from the target altogether (it was never
# scored in the classification reports) and turned the task into six
# independent binary outputs whose predictions need not be mutually exclusive.
# Mapping from `df` rather than from `y` keeps this cell safe to re-run.
y = df["Cover_Type"].map(cover_type_names)

# Series.map turns any unmapped code into NaN; fail loudly instead of training
# on corrupted labels.
assert y.notna().all(), "Cover_Type contains codes outside 1-7"

In [None]:
# Display the prepared target as the cell output for a visual sanity check.
y

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split

In [None]:
# Hyperparameter Tuning

# Hold out a third of the rows for the final report. Stratifying on the target
# keeps the class proportions the same in the train and test parts, matching
# the stratified splitter used for the later evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

# Candidate settings: forest size, tree depth and cost-complexity pruning strength.
param_grid = {'n_estimators':[100,250],'max_depth':[5,7,9],'ccp_alpha':[0.0,0.1,0.25,0.5]}

# Seed the forest so that the selected parameters and the reported scores are
# the same on every Restart & Run All.
clf = RandomForestClassifier(max_features="sqrt", class_weight="balanced", random_state=42)

# Exhaustive search over param_grid using GridSearchCV's default
# cross-validation and the estimator's default score.
grid = GridSearchCV(clf, param_grid)
grid.fit(X_train, y_train)

print(grid.best_params_)
# grid.predict delegates to the best estimator, which GridSearchCV refits on
# the whole training part by default (refit=True).
print(classification_report(y_test, grid.predict(X_test)))

In [None]:
# Forest evaluated on the repeated stratified splits. The fixed random_state
# makes its bootstrap samples and feature draws repeatable, so the per-fold
# reports and importance plots do not change between runs.
clf = RandomForestClassifier(
    n_estimators=100,
    max_features="sqrt",
    class_weight="balanced",
    random_state=42,
)

In [None]:
# Splitter for five seeded, stratified shuffles, each holding out 25% of the rows.
sss = StratifiedShuffleSplit(
    test_size=0.25,
    n_splits=5,
    random_state=42,
)

In [None]:
# Repeated stratified hold-out evaluation: refit the forest on every split,
# print the per-class metrics and plot the fitted forest's feature importances.
y_pred_all = []  # test-set predictions, one array per fold

# Short positional labels keep the x-axis readable. They are the same for every
# fold, so build them once; a later cell maps them back to the column names.
feature_names = [f"feature {col_idx}" for col_idx in range(X.shape[1])]

for fold, (train_index, test_index) in enumerate(sss.split(X, y)):
    # split() yields integer row positions, so select with .iloc. Label-based
    # .loc only gave the same rows while the frame kept its default 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred_all.append(y_pred)

    print(f"Fold {fold}")
    print(classification_report(y_test, y_pred))
    print()

    importances = clf.feature_importances_
    forest_importances = pd.Series(importances, index=feature_names)

    plt.figure(figsize=(10,10))
    sns.barplot(x = forest_importances.index, y = forest_importances.values)
    # Label the figure so it can be read on its own when skimming the output.
    plt.title(f"Random forest feature importances (fold {fold})")
    plt.xlabel("Feature")
    plt.ylabel("Importance")
    plt.xticks(rotation=70)
    plt.tight_layout()
    plt.show()
    

In [None]:
# Lookup table from the positional labels used on the importance plots to the
# real column names. Explicit column names replace the two identical `0`
# headers that concatenating two unnamed single-column frames produced.
pd.DataFrame({"feature": feature_names, "column": X.columns})