In [16]:
import numpy as np
import os
import pandas as pd
import glob
from ast import literal_eval
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the dataset from the local HDF5 file 'random_forest.h5' into `data`;
# the trailing expression previews the first rows as the cell output.
data = pd.read_hdf('random_forest.h5')
data.head()

Unnamed: 0,zip5,impact_score,x,y,lat,lng,5_Wave_Geopotential_Height_isobaric,Absolute_vorticity_isobaric,Apparent_temperature_height_above_ground,Best_4_layer_Lifted_Index_surface,...,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31
0,2722,20.268081,578.0,97.0,41.5,-71.0,5493.18457,9.9e-05,275.700012,11.8,...,0,0,0,0,0,0,0,0,0,0
1,2722,20.268081,578.0,97.0,41.5,-71.0,5514.240723,9.4e-05,275.399994,10.7,...,0,0,0,0,0,0,0,0,0,0
2,2722,20.268081,578.0,97.0,41.5,-71.0,5544.382324,9.3e-05,274.299988,11.1,...,0,0,0,0,0,0,0,0,0,0
3,2722,20.268081,578.0,97.0,41.5,-71.0,5575.850586,9.3e-05,275.700012,19.1,...,0,0,0,0,0,0,0,0,0,0
4,2722,16.868994,578.0,97.0,41.5,-71.0,5614.513184,8.6e-05,272.899994,22.9,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# Snapshot the current column names of `data` as a plain Python list.
# NOTE(review): `cols` is reassigned further down in the dummy-variable cell.
cols = data.columns.tolist()

In [37]:
# Echo the column list captured in `cols` as the cell output.
cols

['impact_score',
 'x',
 'y',
 'lat',
 'lng',
 '5_Wave_Geopotential_Height_isobaric',
 'Absolute_vorticity_isobaric',
 'Apparent_temperature_height_above_ground',
 'Best_4_layer_Lifted_Index_surface',
 'Cloud_mixing_ratio_isobaric',
 'Cloud_water_entire_atmosphere_single_layer',
 'Convective_available_potential_energy_pressure_difference_layer',
 'Convective_available_potential_energy_surface',
 'Convective_inhibition_pressure_difference_layer',
 'Convective_inhibition_surface',
 'Dewpoint_temperature_height_above_ground',
 'Field_Capacity_surface',
 'Geopotential_height_highest_tropospheric_freezing',
 'Geopotential_height_isobaric',
 'Geopotential_height_maximum_wind',
 'Geopotential_height_surface',
 'Geopotential_height_tropopause',
 'Geopotential_height_zeroDegC_isotherm',
 'Haines_Index_surface',
 'ICAO_Standard_Atmosphere_Reference_Height_maximum_wind',
 'ICAO_Standard_Atmosphere_Reference_Height_tropopause',
 'Ice_cover_surface',
 'Land_cover_0__sea_1__land_surface',
 'MSLP_Eta_

## Dataframe transformation

In [3]:
# Remove the 'zip5' column from the frame.
# errors='ignore' keeps this cell idempotent: `data` is rebound in place, so re-running
# the cell after 'zip5' is already gone previously raised KeyError.
data = data.drop(columns=['zip5'], errors='ignore')

In [None]:
# Keep only rows whose 'datetime' falls in a year before 2019.
data = data[data['datetime'].dt.year < 2019]
# Disabled experiments: casting the date parts / zip5 to pandas Categoricals and
# dropping rows with missing values. Left commented out, i.e. not applied.
#data['year'] = pd.Categorical(data['year'])
#data['month'] = pd.Categorical(data['month'])
#data['day'] = pd.Categorical(data['day'])
#data['zip5'] = pd.Categorical(data['zip5'])
#data.dropna(inplace=True)
# Drop the listed columns from the frame.
# NOTE(review): the column preview earlier in the notebook is truncated, so the presence of
# 'ForecastRange', 'grid_lat' and 'grid_lon' could not be confirmed — drop() raises KeyError
# if any of them is missing. Confirm against the actual dataset.
data = data.drop(['ForecastRange', 'x', 'y', 'grid_lat', 'grid_lon'], axis = 1)

In [None]:
# Scratch cell: a bare list of column names that is not assigned to anything, so running
# the cell only echoes the list.
# NOTE(review): presumably candidate columns to drop — confirm, or delete this cell.
['grid_lat', 'grid_lon', 'Date', 'date_key', 'Time','Weekday']

In [None]:
# Add dummy variables to introduce sparseness.
#
# `columns=cols` forces get_dummies to one-hot encode every listed column regardless of its
# dtype. Without it, only object/string/category columns are encoded and numeric ones are
# passed through unchanged, which made the concat below append duplicate copies of them.
# drop_first=True omits the first level of each column.
#
# All encoded source columns are removed from `data` before the dummies are appended, so the
# raw 'Weekday', 'year', 'month' and 'day' values no longer sit next to their own indicator
# columns (previously only 'state', 'USR', 'Region' and 'zip5' were removed).
#
# NOTE(review): every name in `cols` must still be present in `data`; an earlier cell in this
# notebook drops 'zip5', in which case the selection below raises KeyError.
cols = ['state', 'USR', 'Region', 'zip5', 'Weekday', 'year', 'month', 'day']
df = data[cols]
dataDummies = pd.get_dummies(df, columns=cols, drop_first=True)
data = pd.concat([data.drop(columns=cols), dataDummies], axis = 1)

In [None]:
# Drop every column (axis=1) that contains at least one missing value.
data = data.dropna(axis= 1,how="any")

**TODO:** Should `date_key` and `Time` also be dropped?

In [None]:
# Preview the first rows of `data` after the transformation cells above.
data.head()

## Train/Test Functions

In [4]:
def randomsplit(data, n, random_state=None): # n = test size
    """Randomly split ``data`` into train/test features and targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'datetime' and 'impact_score'. Both are removed
        from the feature matrix; 'impact_score' is used as the target.
    n : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. Pass an int to get the same split on
        every run; the default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X = data.drop(columns=['datetime', 'impact_score'])
    y = data['impact_score']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=n, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [5]:
def split_by_year(data, train_year, test_year):
    """Build a train/test split in which each side is one calendar year.

    Rows are selected by the year of the 'datetime' column. The feature frames
    are the selected rows without 'datetime' and 'impact_score'; the targets are
    the 'impact_score' column of the same rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a datetime-typed 'datetime' column and 'impact_score'.
    train_year, test_year : int
        Years whose rows form the training and the test set respectively.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    row_years = data['datetime'].dt.year
    non_features = ['datetime', 'impact_score']

    train_rows = data[row_years == train_year]
    test_rows = data[row_years == test_year]

    return (
        train_rows.drop(columns=non_features),
        test_rows.drop(columns=non_features),
        train_rows['impact_score'],
        test_rows['impact_score'],
    )

## Model 1

In [50]:
# Shared settings for the Model 1 cells below.
# Passed to randomsplit() as the test size for the random split.
test_size = 0.3
# Passed to RandomForestRegressor as n_estimators.
n_est = 300
# Passed to RandomForestRegressor as max_depth.
max_d = 15
# Passed to RandomForestRegressor as min_samples_leaf.
min_samples = 50

In [None]:
# Random split: hold out `test_size` of the rows at random, fit Model 1 and report the
# mean squared error on the held-out rows.
X_train, X_test, y_train, y_test = randomsplit(data, test_size)
# random_state pins the forest's bootstrap sampling so the fit is repeatable for a given
# split. The split itself is still drawn at random by randomsplit(), so the printed score
# can still vary between runs of this cell.
m1 = RandomForestRegressor(n_estimators = n_est, max_depth = max_d, min_samples_leaf = min_samples, random_state = 42)
m1.fit(X_train,y_train)
y_pred = m1.predict(X_test)
print("Error:", metrics.mean_squared_error(y_test, y_pred))

In [None]:
# Train on 2018, test on 2017; the printed value is the mean squared error on the 2017 rows.
X_train, X_test, y_train, y_test = split_by_year(data, 2018, 2017)
# random_state pins the forest's bootstrap sampling. The year split is deterministic, so
# with a fixed seed the reported error is reproducible and hyperparameter settings can be
# compared without run-to-run noise.
m1 = RandomForestRegressor(n_estimators = n_est, max_depth = max_d, min_samples_leaf = min_samples, random_state = 42)
m1.fit(X_train,y_train)
y_pred = m1.predict(X_test)
print("Error:", metrics.mean_squared_error(y_test, y_pred))

In [51]:
# Train on 2017, test on 2018; the printed value is the mean squared error on the 2018 rows.
X_train, X_test, y_train, y_test = split_by_year(data, 2017, 2018)
# random_state pins the forest's bootstrap sampling. The year split is deterministic, so
# with a fixed seed the reported error is reproducible and hyperparameter settings can be
# compared without run-to-run noise.
m1 = RandomForestRegressor(n_estimators = n_est, max_depth = max_d, min_samples_leaf = min_samples, random_state = 42)
m1.fit(X_train,y_train)
y_pred = m1.predict(X_test)
print("Error:", metrics.mean_squared_error(y_test, y_pred))

Error: 25.78019268035011


n_estimators|max_depth|min_samples_leaf|MSE (train on 2017, test on 2018)|
---|---|---|---|
100|15|100|25.71629|
100|5|100|28.9782|
100|10|100|25.683|
25|10|100|25.890|
25|10|150|25.678|
25|10|200|25.592|
25|10|300|25.603|
25|15|300|25.780|

In [59]:
# Display the true target values of the current test split, for comparison with y_pred.
y_test

1460      18.051772
1461      18.051772
1462      18.051772
1463      18.051772
1464      17.500546
1465      17.500546
1466      17.500546
1467      17.500546
1468      18.677563
1469      18.677563
1470      18.677563
1471      18.677563
1472      16.278987
1473      16.278987
1474      16.278987
1475      16.278987
1476      25.751868
1477      25.751868
1478      25.751868
1479      25.751868
1480      21.095345
1481      21.095345
1482      21.095345
1483      21.095345
1484      18.110282
1485      18.110282
1486      18.110282
1487      18.110282
1488      14.689284
1489      14.689284
1490      14.689284
1491      14.689284
1492      13.895124
1493      13.895124
1494      13.895124
1495      13.895124
1496      14.937986
1497      14.937986
1498      14.937986
1499      14.937986
1500      12.718678
1501      12.718678
1502      12.718678
1503      12.718678
1504      14.241020
1505      14.241020
1506      14.241020
1507      14.241020
1508      16.923277
1509      16.923277


In [58]:
# Display the model's predictions as a plain Python list, to eyeball against y_test.
# NOTE(review): this echoes every prediction; consider slicing (e.g. the first few values)
# to keep the saved notebook output small.
list(y_pred)

[15.567918341760636,
 15.564354805843168,
 15.570012827236388,
 15.565914935597217,
 15.53625847276378,
 15.582486652469727,
 15.548607885324436,
 15.582142343737187,
 15.44143704463717,
 15.426284775289254,
 15.431053100490635,
 15.378418838118474,
 15.399783200086201,
 15.41298242339328,
 15.476117127473174,
 15.812398301645807,
 15.530811223218812,
 15.647519708161854,
 15.548949180520285,
 15.515966383331818,
 16.81958364147753,
 16.815031350285555,
 16.817005059692118,
 16.87925455938891,
 16.832408659588314,
 16.844486743568602,
 16.73846313378405,
 16.908263151051212,
 15.640494617705398,
 15.632286319174346,
 15.642673118203456,
 15.59560322635762,
 15.595036209284402,
 15.654509642567794,
 15.655370245798162,
 15.631795516602272,
 15.619519043267065,
 15.45595180490066,
 15.479518869144577,
 15.443782521446659,
 15.367709096291613,
 15.367158325640405,
 14.953491642649205,
 15.334145786693549,
 15.345940946690776,
 14.889296234367045,
 14.898192091075392,
 14.789747661879238,
