In [14]:
from os.path import join, basename, splitext
from glob import glob
from dask import dataframe as dd
from matplotlib import rcParams
import pandas as pd
import dask
from collections import Counter
import pickle
import numpy as np
from datetime import datetime
import seaborn as sns
from bokeh.plotting import figure, output_notebook, show


from sklearn.metrics import mean_absolute_error, median_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

from deep_aqi import ROOT


pd.set_option('expand_frame_repr', False)
pd.set_option('max_rows', 24)

output_notebook()

In [2]:
def MAPE(data):
    data = data[data['True'] > 0]
    return np.mean(np.abs(data['True'] - data.Pred) / data['True'])

In [3]:
def MdAPE(data):
    data = data[data['True'] > 0]
    return np.median(np.abs(data['True'] - data.Pred) / data['True'])

In [4]:
def shift_target(data, number_of_days=1):
    timeshift = data.loc[:, target_column].reset_index()

    timeshift['PredictTime'] = timeshift.LocalDate
    timeshift.PredictTime = timeshift.PredictTime + pd.Timedelta(f'{number_of_days}D')
    timeshift = timeshift.rename(columns={target_column: f'{target_column}-{number_of_days*24}h'})
    timeshift.drop('LocalDate', axis=1, inplace=True)
    data.reset_index(inplace=True)
    data = pd.merge(data, timeshift, left_on='LocalDate', right_on='PredictTime', how='left')
    data.drop('PredictTime', axis=1, inplace=True)
    return data.sort_values('LocalDate').set_index('LocalDate')

In [5]:
def clean_outliers(data, sigma=10):
    # according to chebyshev inequality 10sigma covers 99% percent of data despite the distribution
    top = data[target_column].mean() + sigma * data[target_column].std()
    return data.loc[data[target_column] < top]

In [6]:
def prev_day_feature(data, fun, feature_name):
    feature = data.groupby(data.LocalDate.dt.date)[target_column].apply(fun)
    feature = pd.DataFrame(feature).reset_index()
    
    feature.LocalDate = pd.to_datetime(feature.LocalDate)
    feature.LocalDate = feature.LocalDate + pd.Timedelta('1D')
    feature.rename(columns={target_column: feature_name}, inplace=True)
    return pd.merge(df, feature, on='LocalDate', how='outer').sort_values('LocalDate').fillna(method='ffill')


In [7]:
def datetime_feature(data, feature_name):
    feature = getattr(data.LocalDate.dt, feature_name)
    dummies = pd.get_dummies(feature, prefix=feature_name)
    return pd.concat([data, dummies], axis=1)


In [8]:
def plot_predictions(data):
    plot = figure(plot_width=1200,
                 plot_height=600,
                 x_axis_type='datetime',
                 title='Prediction vs True')
    
    plot.line(data.index, data['True'], color='blue', legend='True')
    plot.line(data.index, data.Pred, color='orange', legend='Prediction')
    
    show(plot)

In [9]:
PROCESSED_DATA = join(ROOT, 'data', 'processed')
INTERIM_DATA = join(ROOT, 'data', 'interim')
RAW_DATA = join(ROOT, 'data', 'raw')

In [10]:
file_path = join(PROCESSED_DATA, '88101.parquet')

data_source = pd.read_parquet(file_path)
target_column = data_source.columns[-1]

## grid search parameters for RF

In [11]:
to_concat = []
prediction_dicts = {}
model_dicts = {}
for site_code, df in data_source.groupby(by='SiteCode'):
    df = clean_outliers(df)
    
    df = datetime_feature(df, 'hour')    
    df = datetime_feature(df, 'month')    
    df = datetime_feature(df, 'weekday')    
    
    df = prev_day_feature(df, np.mean, 'mean-24h')
    df = prev_day_feature(df, np.sum, 'sum-24h')
    df = prev_day_feature(df, np.min, 'min-24h')
    df = prev_day_feature(df, np.max, 'max-24h')
    df = prev_day_feature(df, np.std, 'std-24h')
    
    df = df.sort_values('LocalDate').set_index('LocalDate')
    
    df = shift_target(df, number_of_days=1)
    df = shift_target(df, number_of_days=2)
    df.dropna(inplace=True)
    
    Y = df.loc[:, target_column]
    X = df.drop([target_column, 'SiteCode'], axis=1)
    Y = MinMaxScaler().fit_transform(Y.values.reshape(-1, 1))
    Y = pd.Series(Y.flatten(), index=X.index)    
    
    SPLIT_POINT = int(0.8 * len(df))
    X_train, X_test = X.iloc[:SPLIT_POINT, :], X.iloc[SPLIT_POINT:, :]
    y_train, y_test = Y[:SPLIT_POINT], Y[SPLIT_POINT:]

    # first iteration
#     param_grid = {'bootstrap': [True],
# #                   'max_depth': [80, 90, 100, 110],
# #                   'max_features': ['auto', 'sqrt'],
#                   'min_samples_leaf': [3, 4, 5],
#                   'min_samples_split': [8, 10, 12],
#                   'n_estimators': [100, 200]
#                  }
  
    # second iteration
#     param_grid = {'bootstrap': [True],
# #                   'max_depth': [80, 90, 100, 110],
#                   'max_features': ['auto', 'sqrt'],
#                   'min_samples_leaf': [4, 5, 6, 7],
#                   'min_samples_split': [1, 2, 4, 6, 8],
#                   'n_estimators': [200, 300]
#                  }
    
#     third iteration
#     param_grid = {'bootstrap': [False, True],
#                   'max_depth': [80, 90, 100, 110],
#                   'max_features': ['sqrt'],
#                   'min_samples_leaf': [4, 5, 6, 7],
#                   'min_samples_split': [2, 4, 6, 8],
#                   'n_estimators': [200]
#                  }
    
    # final params
    param_grid = {'max_depth': [100],
                  'max_features': ['sqrt'],
                  'min_samples_leaf': [4],
                  'min_samples_split': [4],
                  'n_estimators': [200]
                 }
    
    model = GridSearchCV(estimator=RandomForestRegressor(),
                         param_grid=param_grid,
                         cv=3,
                         n_jobs=-1,
                         verbose=2)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mdae = median_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mean_ = np.mean(y_test)
    
    y_test = pd.DataFrame(y_test, columns=['True']).reset_index(drop=False)  
    y_pred = pd.DataFrame(y_pred, columns=['Pred'])    
    prediction = pd.concat([y_test, y_pred], axis=1).set_index('LocalDate')
    
    mape = MAPE(prediction)
    mdape = MdAPE(prediction)

    result = pd.DataFrame(index = [site_code],
                          data={'MAE': mae,
                                'MSE': mse,
                                'MdAE': mdae,
                                'R2': r2,
                                'MAPE': mape,
                                'MdAPE': mdape,
                                'MEAN': mean_,
                               })
    to_concat.append(result)
    prediction_dicts[site_code] = prediction
    model_dicts[site_code] = model.best_estimator_
results = pd.concat(to_concat)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    5.9s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.2s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.2s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.2s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.5s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.2s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.8s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.9s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.5s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.9s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    5.2s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.6s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.2s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.7s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    5.7s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.7s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.9s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.9s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.6s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.2s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    5.8s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.6s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.5s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.7s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.2s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.8s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.5s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.6s finished


In [12]:
file_name = 'RF_nonRandom.pkl'

to_save = {}
to_save['Results'] = results
to_save['Models'] = model_dicts
to_save['Predictions'] = prediction_dicts

with open(file_name, 'wb') as f:
    pickle.dump(to_save, f, pickle.HIGHEST_PROTOCOL)


In [19]:
plot_predictions(prediction_dicts['Vermont_Rutland_2.0'])

In [17]:
results.sort_values('R2')

Unnamed: 0,MAE,MSE,MdAE,R2,MAPE,MdAPE,MEAN
Vermont_Chittenden_7.0,0.050443,0.005135,0.034270,0.028674,0.125971,0.098828,0.370813
Missouri_Jefferson_19.0,0.042578,0.003051,0.035273,0.095433,0.300202,0.184413,0.186065
New Hampshire_Hillsborough_5001.0,0.052667,0.004715,0.042521,0.123493,0.324560,0.201442,0.208872
Missouri_Clay_5.0,0.041145,0.003155,0.033155,0.132244,0.287550,0.162472,0.196895
Vermont_Bennington_4.0,0.055439,0.005029,0.046281,0.134904,0.217746,0.163294,0.285049
Missouri_St. Louis City_94.0,0.050631,0.005036,0.037751,0.157848,0.328795,0.184377,0.209918
Ohio_Hamilton_40.0,0.083533,0.013556,0.064211,0.158384,0.861825,0.391230,0.179237
Wyoming_Fremont_99.0,0.036226,0.002755,0.027227,0.162015,0.194650,0.129465,0.211803
New Jersey_Essex_3.0,0.068626,0.008650,0.053457,0.178533,0.417626,0.270125,0.206694
Missouri_St. Louis City_85.0,0.037481,0.002446,0.030263,0.198936,0.246917,0.172025,0.178110


In [20]:
results.describe()

Unnamed: 0,MAE,MSE,MdAE,R2,MAPE,MdAPE,MEAN
count,49.0,49.0,49.0,49.0,49.0,49.0,49.0
mean,0.048759,0.004674,0.037703,0.27131,0.324389,0.191944,0.214043
std,0.010608,0.002219,0.008467,0.10795,0.246536,0.059799,0.05428
min,0.027264,0.001476,0.020433,0.028674,0.125971,0.098828,0.099616
25%,0.042484,0.003269,0.032721,0.206243,0.217746,0.150474,0.17811
50%,0.045937,0.004298,0.035664,0.258404,0.272928,0.176184,0.208872
75%,0.051973,0.005036,0.04053,0.311019,0.329193,0.212975,0.228465
max,0.083533,0.013556,0.064211,0.53769,1.773546,0.39123,0.370813


In [13]:
# first iteration
model.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=5, min_samples_split=8,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [18]:
# second iteration
model.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=4, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [51]:
# third iteration
model.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=100,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=4, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [53]:
# fourth iteration
model.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=90,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=4, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [21]:
# features importance
list(zip(X_test.columns, model.best_estimator_.feature_importances_))

[('WindDir', 0.0848235030865132),
 ('WindSpeed', 0.05453981740820993),
 ('Temperature', 0.07203999155114571),
 ('Pressure', 0.06740103524715729),
 ('RelHum', 0.06072780929654791),
 ('hour_0', 0.001123732292905874),
 ('hour_1', 0.0006867741553498399),
 ('hour_2', 0.0006013965169171858),
 ('hour_3', 0.0005751540085242645),
 ('hour_4', 0.0005834334988859949),
 ('hour_5', 0.0008032910821627859),
 ('hour_6', 0.0011384328706746684),
 ('hour_7', 0.0009855654810145221),
 ('hour_8', 0.0037988405688315964),
 ('hour_9', 0.003560713210159311),
 ('hour_10', 0.0029697191034426095),
 ('hour_11', 0.0019109890536318716),
 ('hour_12', 0.0011181869463264924),
 ('hour_13', 0.0007525677932021585),
 ('hour_14', 0.000833960300250228),
 ('hour_15', 0.0010176609940514174),
 ('hour_16', 0.0015913561225962272),
 ('hour_17', 0.0013363256321759123),
 ('hour_18', 0.0020538587936905876),
 ('hour_19', 0.002454684854726534),
 ('hour_20', 0.0011848942564772818),
 ('hour_21', 0.0010965672739756472),
 ('hour_22', 0.00180

## grid search them parameters for RF | randomize train\test

In [22]:
to_concat = []
prediction_dicts = {}
model_dicts = {}
for site_code, df in data_source.groupby(by='SiteCode'):
    df = clean_outliers(df)
    
    df = datetime_feature(df, 'hour')    
    df = datetime_feature(df, 'month')    
    df = datetime_feature(df, 'weekday')    
    
    df = prev_day_feature(df, np.mean, 'mean-24h')
    df = prev_day_feature(df, np.sum, 'sum-24h')
    df = prev_day_feature(df, np.min, 'min-24h')
    df = prev_day_feature(df, np.max, 'max-24h')
    df = prev_day_feature(df, np.std, 'std-24h')
    
    df = df.sort_values('LocalDate').set_index('LocalDate')
    
    df = shift_target(df, number_of_days=1)
    df = shift_target(df, number_of_days=2)
    df.dropna(inplace=True)
    
    Y = df.loc[:, target_column]
    X = df.drop([target_column, 'SiteCode'], axis=1)
    Y = MinMaxScaler().fit_transform(Y.values.reshape(-1, 1))
    Y = pd.Series(Y.flatten(), index=X.index)    
    
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=23)

#     SPLIT_POINT = int(0.8 * len(df))
#     X_train, X_test = X.iloc[:SPLIT_POINT, :], X.iloc[SPLIT_POINT:, :]
#     y_train, y_test = Y[:SPLIT_POINT], Y[SPLIT_POINT:]


    # final params
    param_grid = {'max_depth': [100],
                  'max_features': ['sqrt'],
                  'min_samples_leaf': [4],
                  'min_samples_split': [4],
                  'n_estimators': [200]
                 }
    
    model = GridSearchCV(estimator=RandomForestRegressor(),
                         param_grid=param_grid,
                         cv=3,
                         n_jobs=-1,
                         verbose=2)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mdae = median_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mean_ = np.mean(y_test)
    
    y_test = pd.DataFrame(y_test, columns=['True']).reset_index(drop=False)  
    y_pred = pd.DataFrame(y_pred, columns=['Pred'])    
    prediction = pd.concat([y_test, y_pred], axis=1).set_index('LocalDate')
    
    mape = MAPE(prediction)
    mdape = MdAPE(prediction)

    result = pd.DataFrame(index = [site_code],
                          data={'MAE': mae,
                                'MSE': mse,
                                'MdAE': mdae,
                                'R2': r2,
                                'MAPE': mape,
                                'MdAPE': mdape,
                                'MEAN': mean_,
                               })
    to_concat.append(result)
    prediction_dicts[site_code] = prediction
    model_dicts[site_code] = model.best_estimator_
results = pd.concat(to_concat)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.9s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.2s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.2s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.2s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.5s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    5.9s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    5.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.6s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    6.7s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.5s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.9s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.5s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.5s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.9s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.8s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    6.8s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.2s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    6.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.7s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.2s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    7.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.7s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.5s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.5s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.7s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.3s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.6s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.4s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.7s finished


In [23]:
file_name = 'RF_Random.pkl'

to_save = {}
to_save['Results'] = results
to_save['Models'] = model_dicts
to_save['Predictions'] = prediction_dicts

with open(file_name, 'wb') as f:
    pickle.dump(to_save, f, pickle.HIGHEST_PROTOCOL)


In [24]:
plot_predictions(prediction_dicts['California_Fresno_5001.0'].sort_index())

In [25]:
results.sort_values('R2')

Unnamed: 0,MAE,MSE,MdAE,R2,MAPE,MdAPE,MEAN
New Mexico_Bernalillo_23.0,0.043371,0.004659,0.030033,0.409452,0.455458,0.272167,0.127165
Missouri_St. Louis City_94.0,0.041008,0.003412,0.030241,0.421852,0.253612,0.146882,0.210615
Wyoming_Laramie_100.0,0.036719,0.002970,0.025667,0.430143,0.249954,0.136725,0.191222
Maryland_Dorchester_4.0,0.051344,0.004848,0.038918,0.479174,0.202837,0.126386,0.315310
Missouri_Clay_5.0,0.037882,0.002682,0.029078,0.485019,0.221948,0.136513,0.216677
Missouri_Cass_3.0,0.042082,0.003659,0.031660,0.486943,0.155316,0.111688,0.289390
Missouri_Jackson_42.0,0.036804,0.002666,0.027755,0.490673,0.198264,0.127917,0.223623
Missouri_St. Louis City_93.0,0.036077,0.002554,0.026955,0.499956,0.182451,0.125381,0.224611
Missouri_Buchanan_5.0,0.033642,0.002406,0.025035,0.513144,0.180609,0.125243,0.207260
Colorado_Rio Blanco_6.0,0.035989,0.002536,0.026500,0.523492,0.144248,0.098444,0.274241


In [26]:
results.describe()

Unnamed: 0,MAE,MSE,MdAE,R2,MAPE,MdAPE,MEAN
count,49.0,49.0,49.0,49.0,49.0,49.0,49.0
mean,0.038211,0.002953,0.028748,0.583952,0.25207,0.146882,0.216847
std,0.00645,0.000917,0.005478,0.076641,0.202827,0.045637,0.051593
min,0.027034,0.001705,0.018501,0.409452,0.103522,0.074359,0.10969
25%,0.033867,0.002414,0.025282,0.536986,0.177999,0.122444,0.184692
50%,0.036743,0.002664,0.028098,0.59355,0.199534,0.135611,0.210615
75%,0.041151,0.003299,0.030688,0.631005,0.245107,0.154122,0.241394
max,0.057663,0.00603,0.047323,0.77691,1.489671,0.312522,0.341996


In [27]:
# features importance
list(zip(X_test.columns, model.best_estimator_.feature_importances_))

[('WindDir', 0.07897039157910928),
 ('WindSpeed', 0.050649933251062665),
 ('Temperature', 0.06881827126816123),
 ('Pressure', 0.06763632313777351),
 ('RelHum', 0.05363740453013465),
 ('hour_0', 0.0011822577235058075),
 ('hour_1', 0.000637931974703947),
 ('hour_2', 0.0006012283932737514),
 ('hour_3', 0.0005676602300014367),
 ('hour_4', 0.0005644417623821282),
 ('hour_5', 0.0007076878475028948),
 ('hour_6', 0.000883570213467461),
 ('hour_7', 0.0011338733083739471),
 ('hour_8', 0.0031697029019714783),
 ('hour_9', 0.003768412700045413),
 ('hour_10', 0.0036991132961848366),
 ('hour_11', 0.002809284410600607),
 ('hour_12', 0.0014424975581418024),
 ('hour_13', 0.0008085365305090399),
 ('hour_14', 0.000899499458835145),
 ('hour_15', 0.0010164164703378425),
 ('hour_16', 0.0012404623834925735),
 ('hour_17', 0.0014495099633068179),
 ('hour_18', 0.00215125547029936),
 ('hour_19', 0.00402687418727591),
 ('hour_20', 0.0015289514935474717),
 ('hour_21', 0.0013056013394480111),
 ('hour_22', 0.00195869

## gradient boosting| randomize train\test

In [28]:
to_concat = []
prediction_dicts = {}
model_dicts = {}
for site_code, df in data_source.groupby(by='SiteCode'):
    df = clean_outliers(df)
    
    df = datetime_feature(df, 'hour')    
    df = datetime_feature(df, 'month')    
    df = datetime_feature(df, 'weekday')    
    
    df = prev_day_feature(df, np.mean, 'mean-24h')
    df = prev_day_feature(df, np.sum, 'sum-24h')
    df = prev_day_feature(df, np.min, 'min-24h')
    df = prev_day_feature(df, np.max, 'max-24h')
    df = prev_day_feature(df, np.std, 'std-24h')
    
    df = df.sort_values('LocalDate').set_index('LocalDate')
    
    df = shift_target(df, number_of_days=1)
    df = shift_target(df, number_of_days=2)
    df.dropna(inplace=True)
    
    Y = df.loc[:, target_column]
    X = df.drop([target_column, 'SiteCode'], axis=1)
    Y = MinMaxScaler().fit_transform(Y.values.reshape(-1, 1))
    Y = pd.Series(Y.flatten(), index=X.index)    
    
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=23)

#     SPLIT_POINT = int(0.8 * len(df))
#     X_train, X_test = X.iloc[:SPLIT_POINT, :], X.iloc[SPLIT_POINT:, :]
#     y_train, y_test = Y[:SPLIT_POINT], Y[SPLIT_POINT:]


    # final params
    param_grid = {'max_depth': [100],
                  'max_features': ['sqrt'],
                  'min_samples_leaf': [4],
                  'min_samples_split': [4],
                  'n_estimators': [200]
                 }
    
    model = GridSearchCV(estimator=GradientBoostingRegressor(),
                         param_grid=param_grid,
                         cv=2,
                         n_jobs=-1,
                         verbose=2)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mdae = median_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mean_ = np.mean(y_test)
    
    y_test = pd.DataFrame(y_test, columns=['True']).reset_index(drop=False)  
    y_pred = pd.DataFrame(y_pred, columns=['Pred'])    
    prediction = pd.concat([y_test, y_pred], axis=1).set_index('LocalDate')
    
    mape = MAPE(prediction)
    mdape = MdAPE(prediction)

    result = pd.DataFrame(index = [site_code],
                          data={'MAE': mae,
                                'MSE': mse,
                                'MdAE': mdae,
                                'R2': r2,
                                'MAPE': mape,
                                'MdAPE': mdape,
                                'MEAN': mean_,
                               })
    to_concat.append(result)
    prediction_dicts[site_code] = prediction
    model_dicts[site_code] = model.best_estimator_
results = pd.concat(to_concat)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   15.5s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   10.1s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    5.0s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   10.5s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.9s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.3s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    9.4s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    1.3s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.0s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   40.2s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   39.3s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   10.9s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    3.5s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   10.8s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.5s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   49.1s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   17.6s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   10.2s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    5.7s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   19.9s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    5.2s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   26.8s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   18.9s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   18.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    9.7s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.5s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   49.8s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.5s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    9.8s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   17.1s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    8.3s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   35.5s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   17.0s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.2s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    1.5s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    3.4s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   49.4s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.6s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    1.6s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    1.3s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   27.3s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.6s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   18.5s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.8s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   16.3s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    9.6s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    1.6s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.8s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   19.5s finished


In [29]:
file_name = 'gradient_boosting_random.pkl'

to_save = {}
to_save['Results'] = results
to_save['Models'] = model_dicts
to_save['Predictions'] = prediction_dicts

with open(file_name, 'wb') as f:
    pickle.dump(to_save, f, pickle.HIGHEST_PROTOCOL)


In [30]:
results.sort_values('R2')

Unnamed: 0,MAE,MSE,MdAE,R2,MAPE,MdAPE,MEAN
Missouri_St. Louis City_94.0,0.040217,0.003314,0.029435,0.438434,0.248092,0.144453,0.210615
New Mexico_Bernalillo_23.0,0.040311,0.003979,0.027620,0.495640,0.411990,0.250277,0.127165
Wyoming_Laramie_100.0,0.035080,0.002570,0.025293,0.506891,0.233252,0.136419,0.191222
Missouri_Clay_5.0,0.036613,0.002530,0.028080,0.514038,0.214629,0.131909,0.216677
Maryland_Dorchester_4.0,0.048222,0.004268,0.037136,0.541521,0.184053,0.120176,0.315310
Missouri_Jackson_42.0,0.034682,0.002384,0.025954,0.544392,0.186277,0.120152,0.223623
Missouri_St. Louis City_93.0,0.033989,0.002299,0.025271,0.549957,0.169235,0.117184,0.224611
Missouri_Cass_3.0,0.039686,0.003149,0.029858,0.558535,0.147212,0.107827,0.289390
Missouri_Buchanan_5.0,0.031524,0.002111,0.023465,0.572688,0.168685,0.117486,0.207260
Missouri_Cedar_1.0,0.032807,0.002295,0.024766,0.591003,0.167472,0.110869,0.231379


In [32]:
plot_predictions(prediction_dicts['Vermont_Rutland_2.0'].sort_index())