# Exploratory Data Analysis Demo 

- Alejandro Hohmann
- Bhanu Muvva
- Chunxia Tong

The purpose of this notebook is to examine the data used in the Staley et al. (2016) model, hereafter ST16, and to recreate their debris-flow risk prediction model.

# Table of Contents

### - [EDA](#EDA)
### - [Logistic Regression](#LR)

In [1]:
# file structure libraries
import pandas as pd
import os

# # visualization libraries
# from matplotlib import pyplot as plt
# # Have plots display in notebook
# %matplotlib inline
# import seaborn as sns

# # ML libraries
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import confusion_matrix

# # for sigmoid function, in case we need to manually implement in LR
# from scipy.stats import logistic

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split

## The Staley et al. (2016) model report and data 
https://pubs.er.usgs.gov/publication/ofr20161106

## Read in and Examine Staley Raw Data <a id="EDA"></a>

In [1]:
# Path to the Excel workbook, relative to the notebook's working directory
file = "../data/ofr20161106_appx-1.xlsx"

# Open the workbook so its sheet names can be listed before loading a sheet
xl = pd.ExcelFile(file)

sheets = xl.sheet_names  # see all sheet names
sheets

NameError: name 'pd' is not defined

In [2]:
# # can be helpful for reference
# data_dict = pd.read_excel(file, sheet_name=sheets[0])

# dict_cols = data_dict.columns

# data_dict.sort_values(by=dict_cols[0])

In [3]:
# Load the second sheet (index 1) of the workbook into a DataFrame and preview its first rows
raw_data = pd.read_excel(file, sheet_name=sheets[1])
raw_data.head()

NameError: name 'pd' is not defined

In [4]:
# Number of rows for each distinct value of the State column
raw_data['State'].value_counts()

NameError: name 'raw_data' is not defined

In [5]:
# Number of rows for each distinct value of the Database column
raw_data['Database'].value_counts()

NameError: name 'raw_data' is not defined

In [6]:
# (number of rows, number of columns) of the loaded sheet
raw_data.shape

NameError: name 'raw_data' is not defined

In [7]:
# raw_data.describe().T

## Handling Missing Values

### Determine features with missing values

In [9]:
# Count the missing entries in every column
print("NAs for each feature:")
raw_data.isna().sum()

NAs for each feature:


Fire Name                 0
Year                      0
Fire_ID                   0
Fire_SegID                0
Database                  0
State                     0
UTM_Zone                  0
UTM_X                     0
UTM_Y                     0
Response                  0
StormDate                 0
GaugeDist_m               0
StormStart              160
StormEnd                160
StormDur_H                0
StormAccum_mm             0
StormAvgI_mm/h            0
Peak_I15_mm/h           230
Peak_I30_mm/h           214
Peak_I60_mm/h           256
ContributingArea_km2      0
PropHM23                  0
dNBR/1000                77
KF                        0
Acc015_mm               230
Acc030_mm               214
Acc060_mm               256
dtype: int64

In [10]:
# Names of the columns that contain no missing values at all
list(raw_data.columns[raw_data.isna().sum() == 0])

['Fire Name',
 'Year',
 'Fire_ID',
 'Fire_SegID',
 'Database',
 'State',
 'UTM_Zone',
 'UTM_X',
 'UTM_Y',
 'Response',
 'StormDate',
 'GaugeDist_m',
 'StormDur_H',
 'StormAccum_mm',
 'StormAvgI_mm/h',
 'ContributingArea_km2',
 'PropHM23',
 'KF']

### Remove rows with missing values

In [11]:
# Report the frame size before and after dropping incomplete rows.
# dropna() with its defaults removes a row if ANY column is missing, so rows
# are discarded even when the gap is in a column the models below never use.
print("Dimensions before removing null values:\n ", raw_data.shape)

# The drop happens in place: to get the unfiltered data back, the cell that
# reads the Excel sheet has to be run again.
raw_data.dropna(inplace=True)   # NOTE: Original raw_data is overwritten
print("Dimensions after removing null values: \n ", raw_data.shape)

Dimensions before removing null values:
  (1550, 27)
Dimensions after removing null values: 
  (1091, 27)


In [12]:
# raw_data.describe().round(2).T

### Save data without missing values to new file

In [13]:
# Write the rows that survived the NA drop to a CSV in the current working
# directory; index=False keeps the row index out of the file.
raw_data.to_csv("ofr20161106-na-omit.csv", index=False)

## Explore by Data Types

In [14]:
# Summarise each column's dtype and non-null count
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1091 entries, 0 to 1546
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Fire Name             1091 non-null   object        
 1   Year                  1091 non-null   int64         
 2   Fire_ID               1091 non-null   object        
 3   Fire_SegID            1091 non-null   object        
 4   Database              1091 non-null   object        
 5   State                 1091 non-null   object        
 6   UTM_Zone              1091 non-null   int64         
 7   UTM_X                 1091 non-null   float64       
 8   UTM_Y                 1091 non-null   float64       
 9   Response              1091 non-null   int64         
 10  StormDate             1091 non-null   object        
 11  GaugeDist_m           1091 non-null   float64       
 12  StormStart            1091 non-null   datetime64[ns]
 13  StormEnd          

In [15]:
# raw_data

In [16]:
# Select four predictor columns (Peak_I15_mm/h, PropHM23, dNBR/1000, KF)
# followed by the Response label. The column order matters: the training cell
# that reads this CSV takes the first four columns as inputs and the fifth as
# the target.
raw_data_feat_15 = raw_data[['Peak_I15_mm/h','PropHM23','dNBR/1000','KF','Response']]

# Save the Peak_I15_mm/h subset to its own CSV, without the row index
raw_data_feat_15.to_csv('ofr20161106-na-omit-feat-15.csv', index=False)

In [17]:
# Display the Peak_I15_mm/h subset
raw_data_feat_15

Unnamed: 0,Peak_I15_mm/h,PropHM23,dNBR/1000,KF,Response
0,3.2,0.217933,0.297853,0.250000,0
1,3.2,0.061249,0.224896,0.250000,0
2,3.2,0.042968,0.065537,0.248541,0
3,1.6,0.092164,0.141711,0.250000,0
4,1.6,0.058353,0.210158,0.250000,0
...,...,...,...,...,...
1530,63.0,0.066777,0.373291,0.000000,0
1534,63.0,0.115890,0.604177,0.000000,0
1538,63.0,0.112607,0.428204,0.000000,1
1542,63.0,0.009801,0.187053,0.000000,1


In [18]:
# Select four predictor columns (Peak_I30_mm/h, PropHM23, dNBR/1000, KF)
# followed by the Response label. The column order matters: the training cell
# that reads this CSV takes the first four columns as inputs and the fifth as
# the target.
raw_data_feat_30 = raw_data[['Peak_I30_mm/h','PropHM23','dNBR/1000','KF','Response']]

# Save the Peak_I30_mm/h subset to its own CSV, without the row index
raw_data_feat_30.to_csv('ofr20161106-na-omit-feat-30.csv', index=False)

In [19]:
# Display the Peak_I30_mm/h subset
raw_data_feat_30

Unnamed: 0,Peak_I30_mm/h,PropHM23,dNBR/1000,KF,Response
0,2.0,0.217933,0.297853,0.250000,0
1,2.0,0.061249,0.224896,0.250000,0
2,2.0,0.042968,0.065537,0.248541,0
3,1.2,0.092164,0.141711,0.250000,0
4,1.2,0.058353,0.210158,0.250000,0
...,...,...,...,...,...
1530,54.0,0.066777,0.373291,0.000000,0
1534,54.0,0.115890,0.604177,0.000000,0
1538,54.0,0.112607,0.428204,0.000000,1
1542,54.0,0.009801,0.187053,0.000000,1


In [20]:
# Select four predictor columns (Peak_I60_mm/h, PropHM23, dNBR/1000, KF)
# followed by the Response label. The column order matters: the training cell
# that reads this CSV takes the first four columns as inputs and the fifth as
# the target.
raw_data_feat_60 = raw_data[['Peak_I60_mm/h','PropHM23','dNBR/1000','KF','Response']]

# Save the Peak_I60_mm/h subset to its own CSV, without the row index
raw_data_feat_60.to_csv('ofr20161106-na-omit-feat-60.csv', index=False)

In [21]:
# Display the Peak_I60_mm/h subset
raw_data_feat_60

Unnamed: 0,Peak_I60_mm/h,PropHM23,dNBR/1000,KF,Response
0,2.0,0.217933,0.297853,0.250000,0
1,2.0,0.061249,0.224896,0.250000,0
2,2.0,0.042968,0.065537,0.248541,0
3,0.8,0.092164,0.141711,0.250000,0
4,0.8,0.058353,0.210158,0.250000,0
...,...,...,...,...,...
1530,39.0,0.066777,0.373291,0.000000,0
1534,39.0,0.115890,0.604177,0.000000,0
1538,39.0,0.112607,0.428204,0.000000,1
1542,39.0,0.009801,0.187053,0.000000,1


In [None]:
class Net(nn.Module):
    """Fully connected network mapping 4 input features to 1 output value.

    Four 500-unit hidden layers, each followed by a sigmoid, feed a final
    linear layer that emits a single unbounded value per sample.

    Experiment notes carried over from the original cell: a variant that used
    torch.relu instead of torch.sigmoid on the hidden layers was annotated
    with 0.13260960578918457, and this sigmoid version with
    0.13114412128925323 (presumably the printed validation loss; confirm).
    """

    def __init__(self):
        super().__init__()
        n_features, width = 4, 500
        # The attribute names fc1..fc5 are the keys of the saved state_dict,
        # so they are kept exactly as they were.
        self.fc1 = nn.Linear(n_features, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, width)
        self.fc4 = nn.Linear(width, width)
        self.fc5 = nn.Linear(width, 1)

    def forward(self, x):
        """Return the raw (un-activated) output of fc5 for the batch ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = torch.sigmoid(layer(hidden))
        return self.fc5(hidden)
    
if __name__ == '__main__':
    # Train Net on the Peak_I15 feature file and report the loss on a held-out split.

    # Seed PyTorch so the weight initialisation, and therefore the reported
    # loss, is repeatable; the data split below is already seeded.
    torch.manual_seed(42)

    # Load data: the first four columns are the inputs, the fifth is the target
    data = pd.read_csv('ofr20161106-na-omit-feat-15.csv')
    X = data.iloc[:, :4]
    y = data.iloc[:, 4]
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert to float32 tensors; targets become column vectors so they match
    # the (N, 1) shape of the model output.
    X_train = torch.tensor(X_train.values).float()
    y_train = torch.tensor(y_train.values).float().view(-1, 1)
    X_val = torch.tensor(X_val.values).float()
    y_val = torch.tensor(y_val.values).float().view(-1, 1)

    model = Net()
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters())

    # Full-batch training: each epoch is one optimizer step on the whole training set
    model.train()
    for epoch in range(1000):
        y_train_pred = model(X_train)
        loss = criterion(y_train_pred, y_train)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # NOTE(review): every training cell in this notebook saves to 'model.pth',
    # so a later cell overwrites the weights written here.
    torch.save(model.state_dict(), 'model.pth')

    # Evaluate in inference mode without building an autograd graph
    model.eval()
    with torch.no_grad():
        y_val_pred = model(X_val)
        val_loss = criterion(y_val_pred, y_val)
    print(f'Validation loss: {val_loss}')

In [None]:
class Net(nn.Module):
    """Fully connected network mapping 4 input features to 1 output value.

    Four 500-unit hidden layers, each followed by a ReLU, feed a final linear
    layer that emits a single unbounded value per sample.
    """

    def __init__(self):
        super().__init__()
        n_features, width = 4, 500
        # The attribute names fc1..fc5 are the keys of the saved state_dict,
        # so they are kept exactly as they were.
        self.fc1 = nn.Linear(n_features, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, width)
        self.fc4 = nn.Linear(width, width)
        self.fc5 = nn.Linear(width, 1)

    def forward(self, x):
        """Return the raw (un-activated) output of fc5 for the batch ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = torch.relu(layer(hidden))
        return self.fc5(hidden)


if __name__ == '__main__':
    # Train Net on the Peak_I30 feature file and report the loss on a held-out split.

    # Seed PyTorch so the weight initialisation, and therefore the reported
    # loss, is repeatable; the data split below is already seeded.
    torch.manual_seed(42)

    # Load data: the first four columns are the inputs, the fifth is the target
    data = pd.read_csv('ofr20161106-na-omit-feat-30.csv')
    X = data.iloc[:, :4]
    y = data.iloc[:, 4]
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert to float32 tensors; targets become column vectors so they match
    # the (N, 1) shape of the model output.
    X_train = torch.tensor(X_train.values).float()
    y_train = torch.tensor(y_train.values).float().view(-1, 1)
    X_val = torch.tensor(X_val.values).float()
    y_val = torch.tensor(y_val.values).float().view(-1, 1)

    model = Net()
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters())

    # Full-batch training: each epoch is one optimizer step on the whole training set
    model.train()
    for epoch in range(1000):
        y_train_pred = model(X_train)
        loss = criterion(y_train_pred, y_train)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # NOTE(review): every training cell in this notebook saves to 'model.pth',
    # so the weights written here replace or are replaced by those of the other cells.
    torch.save(model.state_dict(), 'model.pth')

    # Evaluate in inference mode without building an autograd graph
    model.eval()
    with torch.no_grad():
        y_val_pred = model(X_val)
        val_loss = criterion(y_val_pred, y_val)
    print(f'Validation loss: {val_loss}')

In [None]:
class Net(nn.Module):
    """Fully connected network mapping 4 input features to 1 output value.

    Four 500-unit hidden layers, each followed by a ReLU, feed a final linear
    layer that emits a single unbounded value per sample.
    """

    def __init__(self):
        super().__init__()
        n_features, width = 4, 500
        # The attribute names fc1..fc5 are the keys of the saved state_dict,
        # so they are kept exactly as they were.
        self.fc1 = nn.Linear(n_features, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, width)
        self.fc4 = nn.Linear(width, width)
        self.fc5 = nn.Linear(width, 1)

    def forward(self, x):
        """Return the raw (un-activated) output of fc5 for the batch ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = torch.relu(layer(hidden))
        return self.fc5(hidden)


if __name__ == '__main__':
    # Train Net on the Peak_I60 feature file and report the loss on a held-out split.

    # Seed PyTorch so the weight initialisation, and therefore the reported
    # loss, is repeatable; the data split below is already seeded.
    torch.manual_seed(42)

    # Load data: the first four columns are the inputs, the fifth is the target
    data = pd.read_csv('ofr20161106-na-omit-feat-60.csv')
    X = data.iloc[:, :4]
    y = data.iloc[:, 4]
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert to float32 tensors; targets become column vectors so they match
    # the (N, 1) shape of the model output.
    X_train = torch.tensor(X_train.values).float()
    y_train = torch.tensor(y_train.values).float().view(-1, 1)
    X_val = torch.tensor(X_val.values).float()
    y_val = torch.tensor(y_val.values).float().view(-1, 1)

    model = Net()
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters())

    # Full-batch training: each epoch is one optimizer step on the whole training set
    model.train()
    for epoch in range(1000):
        y_train_pred = model(X_train)
        loss = criterion(y_train_pred, y_train)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # NOTE(review): every training cell in this notebook saves to 'model.pth',
    # so the weights written here overwrite those saved by the earlier cells.
    torch.save(model.state_dict(), 'model.pth')

    # Evaluate in inference mode without building an autograd graph
    model.eval()
    with torch.no_grad():
        y_val_pred = model(X_val)
        val_loss = criterion(y_val_pred, y_val)
    print(f'Validation loss: {val_loss}')