In [78]:
import numpy as np
import os
import pandas as pd
import glob
from ast import literal_eval
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import KFold

In [79]:
data = pd.read_hdf('data.h5')
data.head()

Unnamed: 0,date_key,zip5,impact_score,grid_lat,grid_lon,Time,ForecastRange,x,y,5_Wave_Geopotential_Height_isobaric,...,Vertical_velocity_geometric_isobaric,Ice_growth_rate_altitude_above_msl,density,state,Region,USR,Weekday,day,month,year
0,2017-01-01,2722,20.268081,41.5,-71.0,0.0,0.0,578.0,97.0,5493.18457,...,,,0.0,MA,New England Northeast,Rural,6,1,1,2017
1,2017-01-01,2722,20.268081,41.5,-71.0,6.0,0.0,578.0,97.0,5514.240723,...,,,0.0,MA,New England Northeast,Rural,6,1,1,2017
2,2017-01-01,2722,20.268081,41.5,-71.0,12.0,0.0,578.0,97.0,5544.382324,...,,,0.0,MA,New England Northeast,Rural,6,1,1,2017
3,2017-01-01,2722,20.268081,41.5,-71.0,18.0,0.0,578.0,97.0,5575.850586,...,,,0.0,MA,New England Northeast,Rural,6,1,1,2017
4,2017-01-02,2722,16.868994,41.5,-71.0,0.0,0.0,578.0,97.0,5614.513184,...,,,0.0,MA,New England Northeast,Rural,0,2,1,2017


## Dataframe transformation

In [80]:
data = data[data['date_key'].dt.year < 2019]
data['year'] = pd.Categorical(data['year'])
data['month'] = pd.Categorical(data['month'])
data['day'] = pd.Categorical(data['day'])
data['zip5'] = pd.Categorical(data['zip5'])
#data.dropna(inplace=True)
data = data.drop(['ForecastRange', 'x', 'y', 'grid_lat', 'grid_lon'], axis = 1)

In [81]:
# Add dummy variables to introduce sparseness
cols = ['state', 'USR', 'Region', 'zip5', 'Weekday', 'year', 'month', 'day']
df = data[cols]
dataDummies = pd.get_dummies(df,drop_first=True)
data = pd.concat([data, dataDummies], axis = 1)
data = data.drop(['state', 'USR', 'Region', 'zip5'], axis = 1)

In [82]:
data = data.dropna(axis= 1,how="any")

Should date_key and Time be dropped?

In [83]:
data.head()

Unnamed: 0,date_key,impact_score,Time,5_Wave_Geopotential_Height_isobaric,Absolute_vorticity_isobaric,Apparent_temperature_height_above_ground,Best_4_layer_Lifted_Index_surface,Cloud_mixing_ratio_isobaric,Cloud_water_entire_atmosphere_single_layer,Convective_available_potential_energy_pressure_difference_layer,...,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31
0,2017-01-01,20.268081,0.0,5493.18457,9.9e-05,275.700012,11.8,0.0,0.19,19.0,...,0,0,0,0,0,0,0,0,0,0
1,2017-01-01,20.268081,6.0,5514.240723,9.4e-05,275.399994,10.7,0.0,0.65,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2017-01-01,20.268081,12.0,5544.382324,9.3e-05,274.299988,11.1,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2017-01-01,20.268081,18.0,5575.850586,9.3e-05,275.700012,19.1,0.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0
4,2017-01-02,16.868994,0.0,5614.513184,8.6e-05,272.899994,22.9,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


## Train/ Test Functions

In [96]:
def randomsplit(data, n): # n = test size
    X = data.drop(['date_key','impact_score'], axis = 1)
    y = data['impact_score']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = n)
    return X_train, X_test, y_train, y_test

In [88]:
def split_by_year(data, train_year, test_year):
    train = data[data['date_key'].dt.year == train_year]
    test = data[data['date_key'].dt.year == test_year]
    X_train = train.drop(['date_key','impact_score'], axis = 1)
    y_train = train['impact_score']
    X_test = test.drop(['date_key','impact_score'], axis = 1)
    y_test = test['impact_score']
    return X_train, X_test, y_train, y_test

## Model 1

In [128]:
test_size = 0.3
n_est = 100
max_d = 10
min_samples = 100

In [129]:
# Random split
X_train, X_test, y_train, y_test = randomsplit(data, test_size)
m1 = RandomForestRegressor(n_estimators = n_est, max_depth = max_d, min_samples_leaf = min_samples)
m1.fit(X_train,y_train)
y_pred = m1.predict(X_test)
print("Error:", metrics.mean_squared_error(y_test, y_pred))

Error: 20.90037517321647


In [130]:
# Test on 2017
X_train, X_test, y_train, y_test = split_by_year(data, 2018, 2017)
m1 = RandomForestRegressor(n_estimators = n_est, max_depth = max_d, min_samples_leaf = min_samples)
m1.fit(X_train,y_train)
y_pred = m1.predict(X_test)
print("Error:", metrics.mean_squared_error(y_test, y_pred))

Error: 40.24558618124567


In [131]:
# Test on 2018
X_train, X_test, y_train, y_test = split_by_year(data, 2017, 2018)
m1 = RandomForestRegressor(n_estimators = n_est, max_depth = max_d, min_samples_leaf = min_samples)
m1.fit(X_train,y_train)
y_pred = m1.predict(X_test)
print("Error:", metrics.mean_squared_error(y_test, y_pred))

Error: 28.952202135194693


n_estimators|max_depth|min_sampels_leaf|Random split|Train on 2018, Test on 2017| Train on 2017, Test on 2018|
---|---|---|---|---|---|
100|5|50|30.600|40.636|31.345|
100|10|50|20.852|40.270|28.972|
100|10|100|20.900|40.245|28.952
100|20|50|10.123|39.242|32.593|
100|10|1000|23.731|45.756|33.375
200|5|50|31.494|40.791|31.488|
