# Reproduce Staley (2016) Results

Following the calculations in the USGS Emergency Debris Flow Assessment database (https://landslides.usgs.gov/hazards/postfire_debrisflow/), the peak 15-minute rainfall intensity $I_{15}$ (in mm/h) is converted into the total rainfall accumulated during those 15 minutes, i.e. $I_{15}/4$ (in mm). This is the same as the value stored in the column `Acc015_mm`.

In [1]:
import pandas as pd
# Do not truncate long cell contents when frames are displayed.
# The fully qualified option name is used so the call does not depend on
# pandas' pattern matching of abbreviated option names.
pd.set_option("display.max_colwidth", None)

In [2]:
import sklearn
import numpy as np

In [3]:
# Open the workbook in a context manager so the underlying file handle is
# released as soon as both worksheets have been parsed.
with pd.ExcelFile("ofr20161106_appx-1.xlsx") as xl:
    # First worksheet -> `desc`, second worksheet -> `modelData`.
    desc = xl.parse(xl.sheet_names[0])
    modelData = xl.parse(xl.sheet_names[1])

In [4]:
# Show the column names of the frame parsed from the second worksheet.
modelData.columns

Index(['Fire Name', 'Year', 'Fire_ID', 'Fire_SegID', 'Database', 'State',
       'UTM_Zone', 'UTM_X', 'UTM_Y', 'Response', 'StormDate', 'GaugeDist_m',
       'StormStart', 'StormEnd', 'StormDur_H', 'StormAccum_mm',
       'StormAvgI_mm/h', 'Peak_I15_mm/h', 'Peak_I30_mm/h', 'Peak_I60_mm/h',
       'ContributingArea_km2', 'PropHM23', 'dNBR/1000', 'KF', 'Acc015_mm',
       'Acc030_mm', 'Acc060_mm'],
      dtype='object')

In [5]:
from sklearn.neighbors import KNeighborsClassifier, DistanceMetric
from sklearn.metrics import accuracy_score, f1_score, jaccard_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, DistanceMetric
from sklearn.neural_network import MLPClassifier

In [6]:
def get_scores(trues, preds):
    """Evaluate predictions against reference labels with four sklearn metrics.

    Parameters
    ----------
    trues
        Reference labels; passed as the first argument to every metric.
    preds
        Predicted values; passed as the second argument to every metric.

    Returns
    -------
    list
        Metric values in the order accuracy_score, jaccard_score, f1_score,
        roc_auc_score.

    Notes
    -----
    The cells in this notebook pass the output of ``.predict(...)`` as
    ``preds``, so the ROC AUC entry is computed from predicted labels rather
    than from class probabilities.
    """
    metrics = (accuracy_score, jaccard_score, f1_score, roc_auc_score)
    results = []
    for metric in metrics:
        results.append(metric(trues, preds))
    return results

def get_scoredf(TrTr, TrPr, TeTr, TePr):
    """Build a table comparing training and test scores.

    Parameters
    ----------
    TrTr
        Reference labels of the training set.
    TrPr
        Predictions for the training set.
    TeTr
        Reference labels of the test set.
    TePr
        Predictions for the test set.

    Returns
    -------
    pandas.DataFrame
        Columns "Training" and "Test"; one row per metric, labelled
        "Accuracy", "Jaccard", "F1" and "AUC".
    """
    train_scores = get_scores(TrTr, TrPr)
    test_scores = get_scores(TeTr, TePr)

    # Row labels follow the order in which get_scores returns its metrics.
    # The first label used to be misspelled as "Accurary".
    scoredf = pd.DataFrame(
        {"Training": train_scores, "Test": test_scores},
        index=["Accuracy", "Jaccard", "F1", "AUC"],
    )
    return scoredf

In [7]:
# Column listing shown again for reference when selecting the model inputs.
modelData.columns

Index(['Fire Name', 'Year', 'Fire_ID', 'Fire_SegID', 'Database', 'State',
       'UTM_Zone', 'UTM_X', 'UTM_Y', 'Response', 'StormDate', 'GaugeDist_m',
       'StormStart', 'StormEnd', 'StormDur_H', 'StormAccum_mm',
       'StormAvgI_mm/h', 'Peak_I15_mm/h', 'Peak_I30_mm/h', 'Peak_I60_mm/h',
       'ContributingArea_km2', 'PropHM23', 'dNBR/1000', 'KF', 'Acc015_mm',
       'Acc030_mm', 'Acc060_mm'],
      dtype='object')

In [8]:
# Columns carried into the working frame: the four predictors, the response,
# and the "Database" column that is used later to separate training and test rows.
usecols = [
    "Acc015_mm",
    "PropHM23",
    "dNBR/1000",
    "KF",
    "Response",
    "Database",
]

In [9]:
# Keep only the selected columns (as an independent copy) and discard every
# row that has a missing value in any of them.
cdata = modelData[usecols].copy().dropna()

# Number of complete rows that remain.
cdata.shape[0]

1243

Adjusting unrealistic $K_f$ values (commented out, because it is unclear whether Staley did this):

In [10]:
#mask=cdata["KF"] > 0.64
#cdata.loc[mask,"KF"] = cdata["KF"].median()

In [11]:
# Build the model inputs as in Staley (2016): each predictor is multiplied
# element-wise by the 15-minute rainfall accumulation.

cdata["PropHM23_x_i15"] = cdata["Acc015_mm"].mul(cdata["PropHM23"])
cdata["dNBR_x_i15"] = cdata["Acc015_mm"].mul(cdata["dNBR/1000"])
cdata["KF_x_i15"] = cdata["Acc015_mm"].mul(cdata["KF"])

In [12]:
# Names of the three rainfall-interaction columns used as classifier inputs.
usecols2 = [
    "PropHM23_x_i15",
    "dNBR_x_i15",
    "KF_x_i15",
]

In [13]:
# Split the rows by the value of the "Database" column.
# Rows labelled 'Training' form the fitting set ...
trainX = cdata.loc[cdata["Database"] == "Training", usecols2]
trainY = cdata.loc[cdata["Database"] == "Training", "Response"]

# ... and rows labelled 'Test' form the hold-out set.
testX = cdata.loc[cdata["Database"] == "Test", usecols2]
testY = cdata.loc[cdata["Database"] == "Test", "Response"]

### 1. Logistic Regression

In [14]:
# Fit an L2-penalised logistic regression on the training rows.
clfl = LogisticRegression(penalty='l2', random_state=0)
clfl.fit(trainX, trainY)

# Predicted labels for both splits.
trainYp = clfl.predict(trainX)
testYp = clfl.predict(testX)

# Training vs. test score table; the bare name on the last line displays it.
scoredf_lr = get_scoredf(TrTr=trainY, TrPr=trainYp, TeTr=testY, TePr=testYp)
scoredf_lr

Unnamed: 0,Training,Test
Accurary,0.833129,0.647196
Jaccard,0.411255,0.386179
F1,0.582822,0.557185
AUC,0.711888,0.703778


### 2.  Random Forest Classifier

In [15]:
# Fit a random forest with a fixed seed on the training rows.
clfl = RandomForestClassifier(random_state=0)
clfl.fit(trainX, trainY)

# Predicted labels for both splits.
trainYp = clfl.predict(trainX)
testYp = clfl.predict(testX)

# NOTE(review): the result is stored under `scoredf_lr` (and the model under
# `clfl`), which overwrites the values assigned in the logistic-regression cell.
scoredf_lr = get_scoredf(TrTr=trainY, TrPr=trainYp, TeTr=testY, TePr=testYp)
scoredf_lr

Unnamed: 0,Training,Test
Accurary,1.0,0.581776
Jaccard,1.0,0.353791
F1,1.0,0.522667
AUC,1.0,0.667301


### 3. KNeighbors Classifier

In [16]:
# Fit a k-nearest-neighbours classifier with sklearn's default settings.
clfl = KNeighborsClassifier()
clfl.fit(trainX, trainY)

# Predicted labels for both splits.
trainYp = clfl.predict(trainX)
testYp = clfl.predict(testX)

# NOTE(review): `clfl` and `scoredf_lr` are reused here, so whatever those
# names held before this cell ran is overwritten.
scoredf_lr = get_scoredf(TrTr=trainY, TrPr=trainYp, TeTr=testY, TePr=testYp)
scoredf_lr

Unnamed: 0,Training,Test
Accurary,0.871166,0.600467
Jaccard,0.556962,0.337209
F1,0.715447,0.504348
AUC,0.799042,0.649826


### 4. Decision Tree Classifier

In [17]:
# Fit a decision tree with a fixed seed on the training rows.
clfl = DecisionTreeClassifier(random_state=0)
clfl.fit(trainX, trainY)

# Predicted labels for both splits.
trainYp = clfl.predict(trainX)
testYp = clfl.predict(testX)

# NOTE(review): `clfl` and `scoredf_lr` are reused here, so whatever those
# names held before this cell ran is overwritten.
scoredf_lr = get_scoredf(TrTr=trainY, TrPr=trainYp, TeTr=testY, TePr=testYp)
scoredf_lr

Unnamed: 0,Training,Test
Accurary,1.0,0.544393
Jaccard,1.0,0.31338
F1,1.0,0.477212
AUC,1.0,0.616988


### 5. Neural Network

In [18]:
# Multi-layer perceptron with three hidden layers of 30 units each.
# random_state is fixed (as for the other seeded models in this notebook) so
# that the scores below are reproducible from one run to the next; without it
# every execution starts from different random weights.
clfl = MLPClassifier(hidden_layer_sizes=(30, 30, 30), max_iter=1000, random_state=0).fit(trainX, trainY)

# Predicted labels for both splits.
trainYp = clfl.predict(trainX)
testYp = clfl.predict(testX)

# NOTE(review): `clfl` and `scoredf_lr` are reused here, so whatever those
# names held before this cell ran is overwritten.
scoredf_lr = get_scoredf(trainY, trainYp, testY, testYp)
scoredf_lr

Unnamed: 0,Training,Test
Accurary,0.840491,0.609813
Jaccard,0.460581,0.345098
F1,0.630682,0.51312
AUC,0.743546,0.658967
