In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
pd.set_option('display.max_columns', 100)

In [2]:
df = pd.read_csv('2000-2018_disasters_min50k_for_modeling.csv')
df.drop(columns = ['BEGIN_DATE_TIME','END_DATE_TIME','BEGIN_TIME','END_TIME'], inplace = True)
df.head()

Unnamed: 0,STATE,CZ_TYPE,EVENT_TYPE,CZ_TIMEZONE,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,BEGIN_RANGE,BEGIN_AZIMUTH,END_RANGE,END_AZIMUTH,MAGNITUDE_TYPE,MAGNITUDE,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,YEAR,BEGIN_MONTH,BEGIN_DAY,END_MONTH,END_DAY,DURATION_DAYS,DURATION_HOURS,DURATION_MINUTES,TOTAL_DURATION_HR,DISTANCE_COVERED(KM),BEGIN_HOUR,BEGIN_MINUTE,END_HOUR,END_MINUTE
0,MISSOURI,C,Thunderstorm Wind,CST,3,0,0,0,250000.0,0.0,1.0,E,0.0,N,E,83.0,37.25,-89.67,37.3,-89.53,2000,1,3,1,3,0,0,10,0.2,16.651331,8,50,9,0
1,ILLINOIS,C,Thunderstorm Wind,CST,0,0,0,0,150000.0,0.0,2.0,NE,4.0,NE,E,100.0,37.73,-89.18,37.75,-89.17,2000,1,3,1,3,0,0,4,0.1,4.535308,9,18,9,22
2,ILLINOIS,C,Thunderstorm Wind,CST,0,0,0,0,100000.0,0.0,1.0,S,4.0,NE,E,75.0,37.78,-89.12,37.78,-89.12,2000,1,3,1,3,0,0,2,0.0,0.0,9,22,9,24
3,INDIANA,C,Thunderstorm Wind,CST,0,0,0,0,100000.0,0.0,0.0,E,0.0,E,E,65.0,37.98,-87.55,37.98,-87.55,2000,1,3,1,3,0,0,0,0.0,0.0,12,10,12,10
4,ILLINOIS,C,Thunderstorm Wind,CST,0,0,0,0,100000.0,0.0,0.0,E,1.0,E,E,87.0,37.73,-88.93,37.73,-88.93,2000,1,3,1,3,0,0,0,0.0,0.0,14,7,14,7


In [3]:
df.shape

(11047, 34)

In [6]:
df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,BEGIN_RANGE,END_RANGE,MAGNITUDE,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,YEAR,BEGIN_MONTH,BEGIN_DAY,END_MONTH,END_DAY,DURATION_DAYS,DURATION_HOURS,DURATION_MINUTES,TOTAL_DURATION_HR,DISTANCE_COVERED(KM),BEGIN_HOUR,BEGIN_MINUTE,END_HOUR,END_MINUTE,STATE_ARIZONA,STATE_ARKANSAS,STATE_ATLANTIC NORTH,STATE_ATLANTIC SOUTH,STATE_CALIFORNIA,STATE_COLORADO,STATE_CONNECTICUT,STATE_DELAWARE,STATE_DISTRICT OF COLUMBIA,STATE_E PACIFIC,STATE_FLORIDA,STATE_GEORGIA,STATE_GULF OF MEXICO,STATE_HAWAII,STATE_IDAHO,STATE_ILLINOIS,STATE_INDIANA,STATE_IOWA,STATE_KANSAS,STATE_KENTUCKY,STATE_LAKE MICHIGAN,STATE_LAKE ST CLAIR,STATE_LOUISIANA,...,EVENT_TYPE_Marine High Wind,EVENT_TYPE_Marine Strong Wind,EVENT_TYPE_Marine Thunderstorm Wind,EVENT_TYPE_Thunderstorm Wind,EVENT_TYPE_Tornado,CZ_TIMEZONE_CST,CZ_TIMEZONE_CST-6,CZ_TIMEZONE_EST,CZ_TIMEZONE_EST-5,CZ_TIMEZONE_HST,CZ_TIMEZONE_MST,CZ_TIMEZONE_MST-7,CZ_TIMEZONE_PST,CZ_TIMEZONE_PST-8,BEGIN_AZIMUTH_ENE,BEGIN_AZIMUTH_ESE,BEGIN_AZIMUTH_N,BEGIN_AZIMUTH_NE,BEGIN_AZIMUTH_NNE,BEGIN_AZIMUTH_NNW,BEGIN_AZIMUTH_NW,BEGIN_AZIMUTH_S,BEGIN_AZIMUTH_SE,BEGIN_AZIMUTH_SSE,BEGIN_AZIMUTH_SSW,BEGIN_AZIMUTH_SW,BEGIN_AZIMUTH_W,BEGIN_AZIMUTH_WNW,BEGIN_AZIMUTH_WSW,END_AZIMUTH_ENE,END_AZIMUTH_ESE,END_AZIMUTH_N,END_AZIMUTH_NE,END_AZIMUTH_NNE,END_AZIMUTH_NNW,END_AZIMUTH_NW,END_AZIMUTH_S,END_AZIMUTH_SE,END_AZIMUTH_SSE,END_AZIMUTH_SSW,END_AZIMUTH_SW,END_AZIMUTH_W,END_AZIMUTH_WNW,END_AZIMUTH_WSW,MAGNITUDE_TYPE_EG,MAGNITUDE_TYPE_ES,MAGNITUDE_TYPE_HA,MAGNITUDE_TYPE_M,MAGNITUDE_TYPE_MG,MAGNITUDE_TYPE_MS
0,3,0,0,0,250000.0,0.0,1.0,0.0,83.0,37.25,-89.67,37.3,-89.53,2000,1,3,1,3,0,0,10,0.2,16.651331,8,50,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,150000.0,0.0,2.0,4.0,100.0,37.73,-89.18,37.75,-89.17,2000,1,3,1,3,0,0,4,0.1,4.535308,9,18,9,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,100000.0,0.0,1.0,4.0,75.0,37.78,-89.12,37.78,-89.12,2000,1,3,1,3,0,0,2,0.0,0.0,9,22,9,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,100000.0,0.0,0.0,0.0,65.0,37.98,-87.55,37.98,-87.55,2000,1,3,1,3,0,0,0,0.0,0.0,12,10,12,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,100000.0,0.0,0.0,1.0,87.0,37.73,-88.93,37.73,-88.93,2000,1,3,1,3,0,0,0,0.0,0.0,14,7,14,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
X = df.drop(columns = ['INJURIES_DIRECT','INJURIES_INDIRECT','DEATHS_DIRECT','DEATHS_INDIRECT','DAMAGE_PROPERTY','DAMAGE_CROPS'])
y = df['DAMAGE_PROPERTY']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 9)

ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Predicting `DAMAGE_PROPERTY`

## Random Forest

In [None]:
params = {'n_estimators': np.arange(50,500,50),
          'max_depth': np.arange(50,500,50),
          'max_features': np.arange(50,130,20)}
rf = RandomForestRegressor(n_jobs = -1, verbose = 1) 
rfr = GridSearchCV(rf, params, cv=5)
rfr.fit(X_train, y_train)
rfr.best_params_

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jo

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    6.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    6.4s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    5.2s finished
[Parallel(n_jobs=4)]: Done  42 

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.7s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.4s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jo

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    8.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    7.7s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   10.0s finished
[Parallel(n_jobs=4)]: Done  42 

In [None]:
rfr.score(X_train, y_train)

In [None]:
rfr.score(X_test, y_test)