In [2]:
# The following is a program to estimate the damage to buildings using various risk factors: the strength
# of the storm, the physical features of a building, and the assesed social vulnerability metric of
# a given area. The events of Hurricane Harvey, ETC NEED TO LOOK UP are used in this case study.

import pandas as pd
import numpy as np

df = pd.read_csv("frontiers_all.csv")

In [3]:
# Clean and format data.

# Drop null values.
df = df.dropna()

# Map the categorical data names for roof types to numbers so they can be used in the random
# forest model.
roof_shape_mapping = {'Gable': 1, 'Complex': 2, 'Hip': 3, 'Flat': 4, 'Gambrel': 5, 'Monoslope': 6, 'Other': 7}
df['roof_shape_1'] = df['roof_shape_1'].map(roof_shape_mapping).fillna(df['roof_shape_1'])
df['roof_shape_1'] = df['roof_shape_1'].astype('int64')

# Replace values in the 'damage_rating' column to reduce the number of categories.
df['damage_rating'].replace({2: 1, 4: 3}, inplace=True)

In [28]:
# Select training data for the damage estimate that does NOT include social vulnerability data.

from sklearn.model_selection import train_test_split

selected_columns = df[['damage_rating', 'design_exceeded_7_16', 'age', 'roof_shape_1', 
                    'dist_to_coast_m', 'density_100m', 'density_500m', 'surge_depth_ft']]

X = selected_columns[['design_exceeded_7_16', 'age', 'roof_shape_1', 
                    'dist_to_coast_m', 'density_100m', 'density_500m', 'surge_depth_ft']]
y = selected_columns['damage_rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# Create Random Forest model and test on the test data that was set aside.

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 100, max_features=2, min_samples_split = 7)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [30]:
# Print the results of the test using the classification report and confusion matrix

from sklearn.metrics import classification_report, confusion_matrix


print('Classification Report----------')
print(classification_report(y_test, y_predict))

report = classification_report(y_test, y_predict, output_dict = True) # Store the accuracy for comparison
accuracy = report["accuracy"]
accuracy_comparison = pd.DataFrame(columns = ["replication", "sovi"])
accuracy_comparison.loc[0, "replication"] = accuracy

print('Confusion Matrix---------------')
cm = pd.DataFrame(confusion_matrix(y_test, y_predict))
print(cm)

Classification Report----------
              precision    recall  f1-score   support

           0       0.77      0.50      0.60       123
           1       0.73      0.89      0.80       288
           3       0.69      0.55      0.61        99

    accuracy                           0.73       510
   macro avg       0.73      0.64      0.67       510
weighted avg       0.73      0.73      0.72       510

Confusion Matrix---------------
    0    1   2
0  61   57   5
1  12  257  19
2   6   39  54


In [31]:
# Select training data for the damage estimate that DOES include social vulnerability data.

from sklearn.model_selection import train_test_split

selected_columns_sovi = df[['damage_rating', 'design_exceeded_7_16', 'age', 'roof_shape_1', 
                    'dist_to_coast_m', 'density_100m', 'density_500m','surge_depth_ft', 'RPL_THEMES']]

X = selected_columns_sovi[['design_exceeded_7_16', 'age', 'roof_shape_1', 
                    'dist_to_coast_m', 'density_100m', 'density_500m','surge_depth_ft', 'RPL_THEMES']]
y = selected_columns_sovi['damage_rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
# Create Random Forest model and test on the test data that was set aside.

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 100, max_features=2, min_samples_split = 7)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [33]:
# Print the results of the test using the classification report and confusion matrix

from sklearn.metrics import classification_report, confusion_matrix


print('Classification Report----------')
print(classification_report(y_test, y_predict))

report = classification_report(y_test, y_predict, output_dict = True) # Store the accuracy for comparison
accuracy = report["accuracy"]
accuracy_comparison = pd.DataFrame(columns = ["replication", "sovi"])
accuracy_comparison.loc[0, "replication"] = accuracy

print('Confusion Matrix---------------')
cm = pd.DataFrame(confusion_matrix(y_test, y_predict))
print(cm)

Classification Report----------
              precision    recall  f1-score   support

           0       0.77      0.50      0.61       123
           1       0.74      0.90      0.81       288
           3       0.71      0.57      0.63        99

    accuracy                           0.74       510
   macro avg       0.74      0.66      0.68       510
weighted avg       0.74      0.74      0.73       510

Confusion Matrix---------------
    0    1   2
0  62   56   5
1  12  258  18
2   7   36  56
