In [1]:
import sys, pandas_profiling
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
import plotly.graph_objects as go

from collections import OrderedDict
np.set_printoptions(suppress=True, threshold=sys.maxsize)

In [None]:
import os
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort that can
# occur when several numeric libraries each load their own OpenMP copy — TODO confirm it is
# still required in this environment before keeping it.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
# Notebook-wide display configuration.
%matplotlib inline
# Show up to 100 columns / 1000 rows before pandas truncates a rendered frame.
pd.set_option("display.max_columns", 100)
pd.set_option('display.max_rows', 1000)
# Render every float with two decimals (display only; stored values are untouched).
pd.options.display.float_format = '{:.2f}'.format

In [4]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor

In [5]:
# Load the pre-formatted observations; the first CSV column becomes the row index.
dataset_hourly = pd.read_csv('dataset_formatted.csv', index_col=0)
dataset_hourly.shape

(127470, 12)

In [6]:
dataset_hourly.head()

Unnamed: 0,Date,Clock_Time,Temperature,Weather,Wind_Speed,Humidity,Visibility,Formatted_Date,Year,Month,Day,Weather_Category
0,09-Sep-09,0:40,30.0,Haze.,15.0,0.75,4.0,2009-09-09 00:30:00,2009,9,9,Foggy
1,09-Sep-09,1:40,30.0,Haze.,17.0,0.75,4.0,2009-09-09 01:30:00,2009,9,9,Foggy
2,09-Sep-09,11:40,31.0,Scattered clouds.,17.0,0.7,6.0,2009-09-09 11:30:00,2009,9,9,Cloudy
3,09-Sep-09,13:40,33.0,Clear.,19.0,0.59,6.0,2009-09-09 13:30:00,2009,9,9,Clear
4,09-Sep-09,14:40,34.0,Clear.,17.0,0.56,6.0,2009-09-09 14:30:00,2009,9,9,Clear


In [7]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Reframe a time series as a supervised-learning table of lagged columns.

    Every column of ``data`` is repeated once per lag ``t-n_in .. t-1`` and once
    per forecast step ``t .. t+n_out-1``. Resulting columns are named
    ``<column>(t-k)``, ``<column>(t)`` and ``<column>(t+k)``.

    Parameters
    ----------
    data : DataFrame, list or array-like
        Observations ordered in time, one row per time step. Anything accepted
        by ``pd.DataFrame`` works; inputs without column labels get pandas'
        default integer labels (``0``, ``1``, ...).
    n_in : int
        Number of lagged steps used as inputs.
    n_out : int
        Number of steps (starting at ``t``) used as outputs.
    dropnan : bool
        When true, rows made incomplete by the shifting are removed.

    Returns
    -------
    pandas.DataFrame
        ``(n_in + n_out) * n_columns`` columns, indexed like the input.
    """
    df = pd.DataFrame(data)
    # Take the names from the frame actually being shifted so that list / array
    # input (which has no ``.columns``) is handled as well.
    column_names = list(df.columns)

    cols, names = [], []

    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [f'{name}(t-{i})' for name in column_names]

    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [f'{name}(t)' for name in column_names]
        else:
            # One label per column (the previous version appended a single
            # generator object here, breaking every n_out > 1 call).
            names += [f'{name}(t+{i})' for name in column_names]

    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names

    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()

    return agg



In [8]:
dataset_hourly.head()

Unnamed: 0,Date,Clock_Time,Temperature,Weather,Wind_Speed,Humidity,Visibility,Formatted_Date,Year,Month,Day,Weather_Category
0,09-Sep-09,0:40,30.0,Haze.,15.0,0.75,4.0,2009-09-09 00:30:00,2009,9,9,Foggy
1,09-Sep-09,1:40,30.0,Haze.,17.0,0.75,4.0,2009-09-09 01:30:00,2009,9,9,Foggy
2,09-Sep-09,11:40,31.0,Scattered clouds.,17.0,0.7,6.0,2009-09-09 11:30:00,2009,9,9,Cloudy
3,09-Sep-09,13:40,33.0,Clear.,19.0,0.59,6.0,2009-09-09 13:30:00,2009,9,9,Clear
4,09-Sep-09,14:40,34.0,Clear.,17.0,0.56,6.0,2009-09-09 14:30:00,2009,9,9,Clear


In [9]:
# Modelling frame: every year except 2019 and 2020, restricted to rows with Wind_Speed < 20,
# keeping only the five columns used as features / target. `.copy()` detaches it from
# `dataset_hourly` so later column assignments do not touch the source frame.
reqd_df = dataset_hourly[(~dataset_hourly.Year.isin([2019, 2020])) & (dataset_hourly.Wind_Speed < 20)][
    ['Temperature', 'Humidity', 'Visibility', 'Weather_Category', 'Wind_Speed']].copy()

In [10]:
# Replace the Weather_Category strings with integer codes. The fitted `encoder` object is
# used again further down on the 2019-2020 frame.
encoder = LabelEncoder()
reqd_df['Weather_Category'] = encoder.fit_transform(reqd_df['Weather_Category'])

In [11]:
# Build the lagged table: 30 past steps of all five columns plus the current step,
# i.e. (30 + 1) * 5 columns per row.
n_sup_df = series_to_supervised(data=reqd_df,
                     n_in=30,
                     n_out=1)

In [12]:
n_sup_df.head()

Unnamed: 0,Temperature(t-30),Humidity(t-30),Visibility(t-30),Weather_Category(t-30),Wind_Speed(t-30),Temperature(t-29),Humidity(t-29),Visibility(t-29),Weather_Category(t-29),Wind_Speed(t-29),Temperature(t-28),Humidity(t-28),Visibility(t-28),Weather_Category(t-28),Wind_Speed(t-28),Temperature(t-27),Humidity(t-27),Visibility(t-27),Weather_Category(t-27),Wind_Speed(t-27),Temperature(t-26),Humidity(t-26),Visibility(t-26),Weather_Category(t-26),Wind_Speed(t-26),Temperature(t-25),Humidity(t-25),Visibility(t-25),Weather_Category(t-25),Wind_Speed(t-25),Temperature(t-24),Humidity(t-24),Visibility(t-24),Weather_Category(t-24),Wind_Speed(t-24),Temperature(t-23),Humidity(t-23),Visibility(t-23),Weather_Category(t-23),Wind_Speed(t-23),Temperature(t-22),Humidity(t-22),Visibility(t-22),Weather_Category(t-22),Wind_Speed(t-22),Temperature(t-21),Humidity(t-21),Visibility(t-21),Weather_Category(t-21),Wind_Speed(t-21),...,Temperature(t-9),Humidity(t-9),Visibility(t-9),Weather_Category(t-9),Wind_Speed(t-9),Temperature(t-8),Humidity(t-8),Visibility(t-8),Weather_Category(t-8),Wind_Speed(t-8),Temperature(t-7),Humidity(t-7),Visibility(t-7),Weather_Category(t-7),Wind_Speed(t-7),Temperature(t-6),Humidity(t-6),Visibility(t-6),Weather_Category(t-6),Wind_Speed(t-6),Temperature(t-5),Humidity(t-5),Visibility(t-5),Weather_Category(t-5),Wind_Speed(t-5),Temperature(t-4),Humidity(t-4),Visibility(t-4),Weather_Category(t-4),Wind_Speed(t-4),Temperature(t-3),Humidity(t-3),Visibility(t-3),Weather_Category(t-3),Wind_Speed(t-3),Temperature(t-2),Humidity(t-2),Visibility(t-2),Weather_Category(t-2),Wind_Speed(t-2),Temperature(t-1),Humidity(t-1),Visibility(t-1),Weather_Category(t-1),Wind_Speed(t-1),Temperature(t),Humidity(t),Visibility(t),Weather_Category(t),Wind_Speed(t)
30,30.0,0.75,4.0,2.0,15.0,30.0,0.75,4.0,2.0,17.0,31.0,0.7,6.0,1.0,17.0,33.0,0.59,6.0,0.0,19.0,34.0,0.56,6.0,0.0,17.0,35.0,0.53,6.0,0.0,19.0,35.0,0.56,6.0,1.0,17.0,29.0,0.66,4.0,2.0,15.0,28.0,0.74,4.0,2.0,11.0,27.0,0.79,4.0,2.0,9.0,...,34.0,0.53,6.0,1.0,17.0,34.0,0.53,6.0,1.0,15.0,34.0,0.53,6.0,1.0,9.0,33.0,0.56,6.0,1.0,11.0,32.0,0.63,4.0,1.0,9.0,31.0,0.66,4.0,1.0,13.0,30.0,0.7,4.0,2.0,11.0,29.0,0.74,4.0,2.0,9.0,29.0,0.74,4.0,2.0,7.0,28.0,0.79,4.0,2,9.0
31,30.0,0.75,4.0,2.0,17.0,31.0,0.7,6.0,1.0,17.0,33.0,0.59,6.0,0.0,19.0,34.0,0.56,6.0,0.0,17.0,35.0,0.53,6.0,0.0,19.0,35.0,0.56,6.0,1.0,17.0,29.0,0.66,4.0,2.0,15.0,28.0,0.74,4.0,2.0,11.0,27.0,0.79,4.0,2.0,9.0,27.0,0.79,4.0,1.0,15.0,...,34.0,0.53,6.0,1.0,15.0,34.0,0.53,6.0,1.0,9.0,33.0,0.56,6.0,1.0,11.0,32.0,0.63,4.0,1.0,9.0,31.0,0.66,4.0,1.0,13.0,30.0,0.7,4.0,2.0,11.0,29.0,0.74,4.0,2.0,9.0,29.0,0.74,4.0,2.0,7.0,28.0,0.79,4.0,2.0,9.0,28.0,0.74,4.0,2,17.0
32,31.0,0.7,6.0,1.0,17.0,33.0,0.59,6.0,0.0,19.0,34.0,0.56,6.0,0.0,17.0,35.0,0.53,6.0,0.0,19.0,35.0,0.56,6.0,1.0,17.0,29.0,0.66,4.0,2.0,15.0,28.0,0.74,4.0,2.0,11.0,27.0,0.79,4.0,2.0,9.0,27.0,0.79,4.0,1.0,15.0,27.0,0.79,4.0,1.0,15.0,...,34.0,0.53,6.0,1.0,9.0,33.0,0.56,6.0,1.0,11.0,32.0,0.63,4.0,1.0,9.0,31.0,0.66,4.0,1.0,13.0,30.0,0.7,4.0,2.0,11.0,29.0,0.74,4.0,2.0,9.0,29.0,0.74,4.0,2.0,7.0,28.0,0.79,4.0,2.0,9.0,28.0,0.74,4.0,2.0,17.0,27.0,0.74,4.0,2,11.0
33,33.0,0.59,6.0,0.0,19.0,34.0,0.56,6.0,0.0,17.0,35.0,0.53,6.0,0.0,19.0,35.0,0.56,6.0,1.0,17.0,29.0,0.66,4.0,2.0,15.0,28.0,0.74,4.0,2.0,11.0,27.0,0.79,4.0,2.0,9.0,27.0,0.79,4.0,1.0,15.0,27.0,0.79,4.0,1.0,15.0,27.0,0.79,4.0,1.0,15.0,...,33.0,0.56,6.0,1.0,11.0,32.0,0.63,4.0,1.0,9.0,31.0,0.66,4.0,1.0,13.0,30.0,0.7,4.0,2.0,11.0,29.0,0.74,4.0,2.0,9.0,29.0,0.74,4.0,2.0,7.0,28.0,0.79,4.0,2.0,9.0,28.0,0.74,4.0,2.0,17.0,27.0,0.74,4.0,2.0,11.0,27.0,0.74,4.0,2,11.0
34,34.0,0.56,6.0,0.0,17.0,35.0,0.53,6.0,0.0,19.0,35.0,0.56,6.0,1.0,17.0,29.0,0.66,4.0,2.0,15.0,28.0,0.74,4.0,2.0,11.0,27.0,0.79,4.0,2.0,9.0,27.0,0.79,4.0,1.0,15.0,27.0,0.79,4.0,1.0,15.0,27.0,0.79,4.0,1.0,15.0,28.0,0.79,3.0,1.0,11.0,...,32.0,0.63,4.0,1.0,9.0,31.0,0.66,4.0,1.0,13.0,30.0,0.7,4.0,2.0,11.0,29.0,0.74,4.0,2.0,9.0,29.0,0.74,4.0,2.0,7.0,28.0,0.79,4.0,2.0,9.0,28.0,0.74,4.0,2.0,17.0,27.0,0.74,4.0,2.0,11.0,27.0,0.74,4.0,2.0,11.0,27.0,0.79,4.0,2,11.0


In [13]:
n_sup_df.shape

(100147, 155)

In [14]:
# Features are the lagged columns only: all five "(t)" columns are removed so that nothing
# observed at the predicted time step is fed to the model.
X_columns = n_sup_df.columns.drop(['Temperature(t)', 'Humidity(t)', 'Visibility(t)', 
                                   'Weather_Category(t)', 'Wind_Speed(t)'])
# Prediction target: wind speed at the current step.
label = 'Wind_Speed(t)'

In [15]:
def X_y(dataset, columns_of_interest, label):
    """Split ``dataset`` into a feature frame and a target series.

    Rows whose ``label`` value is null are discarded first. The shape of the
    incoming frame and the shapes of the returned objects are printed.

    Parameters
    ----------
    dataset : pandas.DataFrame holding both features and target.
    columns_of_interest : column labels to use as features.
    label : name of the target column.

    Returns
    -------
    (X, y) : the selected feature columns and the target column, both limited
        to rows with a non-null target.
    """
    print(f"Shape of dataset: {dataset.shape}")

    has_target = pd.notnull(dataset[label])
    labelled_rows = dataset[has_target]

    X, y = labelled_rows[columns_of_interest], labelled_rows[label]
    print(f"X.shape: {X.shape}, y.shape: {y.shape}")
    return X, y

In [16]:
# Feature matrix / target vector for the pre-2019 data (a copy is passed so `n_sup_df`
# itself is never handed to the helper).
X, y = X_y(dataset=n_sup_df.copy(), 
           columns_of_interest=X_columns,
           label=label)

Shape of dataset: (100147, 155)
X.shape: (100147, 150), y.shape: (100147,)


In [18]:
# Candidate preprocessing steps tried by `run_all_models`. `None` is passed straight into
# the Pipeline's 'scaler' slot (reported as "NoneType" in the run log), i.e. no scaling.
# The commented-out entries are alternatives that are currently switched off.
SCALERS = [
    None,
#     StandardScaler(),
#     RobustScaler(),
#     MinMaxScaler()
]
# Candidate regressors tried by `run_all_models`; only LinearRegression is active.
REGRESSORS = [
    LinearRegression(),
#     DecisionTreeRegressor(),
#     AdaBoostRegressor(),
#     GradientBoostingRegressor(),
#     RandomForestRegressor()
]

In [19]:
def run_all_models(X, y):
    """Fit and score every scaler/regressor combination from SCALERS x REGRESSORS.

    The data is split in order (last 20% held out, no shuffling). Each
    combination is wrapped in a Pipeline, fitted on the first 80%, evaluated on
    the held-out 20% (MAE and MSE are printed) and additionally scored with
    3-fold cross-validated R2 on the training part.

    Every pipeline is built from *clones* of the configured scaler/regressor, so
    the pipelines stored in ``all_pipes`` stay independent of each other; the
    objects listed in SCALERS / REGRESSORS themselves are left unfitted.

    Parameters
    ----------
    X : feature table accepted by scikit-learn estimators.
    y : target values aligned with ``X``.

    Returns
    -------
    cv_scores_df : pandas.DataFrame
        One row per scaler class name, one column per regressor class name,
        containing the mean cross-validated R2.
    all_pipes : OrderedDict
        ``all_pipes[scaler_class_name][regressor_class_name]`` is the fitted
        Pipeline. Two entries of the same class share a key (last one wins).
    """
    import warnings
    from sklearn.base import clone

    # NOTE: this silences warnings for the rest of the kernel session, not only
    # for the duration of this call.
    warnings.filterwarnings(action='ignore')

    def _fresh(step):
        # Estimators are cloned so each pipeline owns its own fitted state;
        # placeholders such as None / 'passthrough' are passed through as-is.
        return clone(step) if hasattr(step, 'get_params') else step

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    print(f"X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}, X_test.shape: {X_test.shape}, y_test.shape: {y_test.shape}")

    score_frames = []
    all_pipes = OrderedDict()

    for scaler in SCALERS:
        scaler_name = scaler.__class__.__name__
        print("\n\n****************************************************************************************")
        print(f"Current Scaler: {scaler_name}")
        print("****************************************************************************************")

        cv_scores_dict = OrderedDict()
        all_pipes[scaler_name] = {}

        for regressor in REGRESSORS:
            regressor_name = regressor.__class__.__name__
            print("\n****************************************************************************************")
            print(f"Current Regressor: {regressor_name}")
            pipe = Pipeline(
                steps=[('scaler', _fresh(scaler)),
                       ('regressor', _fresh(regressor))])
            pipe.fit(X_train, y_train)

            y_predicted = pipe.predict(X_test)
            print(f"Test MAE: {mean_absolute_error(y_true=y_test, y_pred=y_predicted):.2f}")
            print(f"Test MSE: {mean_squared_error(y_true=y_test, y_pred=y_predicted):.2f}")

            # cross_val_score works on its own clones, so the fitted `pipe`
            # stored below is not affected by it.
            cross_val_scores = cross_val_score(estimator=pipe,
                                               X=X_train,
                                               y=y_train,
                                               cv=3,
                                               scoring='r2')
            print(f"3 fold cross validation scores {cross_val_scores}")
            print(f"Mean CV R2: {cross_val_scores.mean():.2f}\n")

            all_pipes[scaler_name][regressor_name] = pipe
            cv_scores_dict[regressor_name] = cross_val_scores.mean()

        # Collect one row per scaler; the rows are concatenated once at the end
        # (DataFrame.append no longer exists in pandas >= 2.0).
        score_frames.append(pd.DataFrame(cv_scores_dict, index=[scaler_name]))
        print("****************************************************************************************")

    cv_scores_df = pd.concat(score_frames) if score_frames else pd.DataFrame()
    return cv_scores_df, all_pipes


In [20]:
#%%time
cv_scores_df, all_pipes = run_all_models(
    X=X, 
    y=y)

X_train.shape: (80117, 150), y_train.shape: (80117,), X_test.shape: (20030, 150), y_test.shape: (20030,)


****************************************************************************************
Current Scaler: NoneType
****************************************************************************************

****************************************************************************************
Current Regressor: LinearRegression
Test MAE: 2.03
Test MSE: 7.56
3 fold cross validation scores [0.6525312  0.70638018 0.74019268]
Mean CV R2: 0.70

****************************************************************************************


In [23]:
# Pick the fitted pipeline without scaling + LinearRegression (keys are class names;
# 'NoneType' is the class name of the `None` scaler entry).
best_pipe = all_pipes['NoneType']['LinearRegression']

In [27]:
# Evaluation frame: only the years 2019 and 2020 (the years excluded from `reqd_df`), same
# Wind_Speed < 20 filter and columns, indexed by the Formatted_Date timestamp string.
test_df = dataset_hourly[(dataset_hourly.Year.isin([2019, 2020])) & (dataset_hourly.Wind_Speed < 20)][
    ['Formatted_Date', 'Temperature', 'Humidity', 'Visibility', 'Weather_Category', 'Wind_Speed']
].copy().set_index('Formatted_Date')

In [28]:
test_df.shape

(16621, 5)

In [29]:
# Encode with the mapping learned on the training frame. Re-fitting the encoder here
# (fit_transform) could assign different integers to the same category whenever the set of
# categories in 2019-2020 differs, silently corrupting the model inputs. `transform` raises
# a ValueError if a category was never seen during training.
test_df['Weather_Category'] = encoder.transform(test_df['Weather_Category'])

In [30]:
# Same 30-lag framing as used for the training data, so the column layout matches X_columns.
test_sup_df = series_to_supervised(data=test_df,
                     n_in=30,
                     n_out=1)

In [31]:
test_sup_df.head()

Unnamed: 0_level_0,Temperature(t-30),Humidity(t-30),Visibility(t-30),Weather_Category(t-30),Wind_Speed(t-30),Temperature(t-29),Humidity(t-29),Visibility(t-29),Weather_Category(t-29),Wind_Speed(t-29),Temperature(t-28),Humidity(t-28),Visibility(t-28),Weather_Category(t-28),Wind_Speed(t-28),Temperature(t-27),Humidity(t-27),Visibility(t-27),Weather_Category(t-27),Wind_Speed(t-27),Temperature(t-26),Humidity(t-26),Visibility(t-26),Weather_Category(t-26),Wind_Speed(t-26),Temperature(t-25),Humidity(t-25),Visibility(t-25),Weather_Category(t-25),Wind_Speed(t-25),Temperature(t-24),Humidity(t-24),Visibility(t-24),Weather_Category(t-24),Wind_Speed(t-24),Temperature(t-23),Humidity(t-23),Visibility(t-23),Weather_Category(t-23),Wind_Speed(t-23),Temperature(t-22),Humidity(t-22),Visibility(t-22),Weather_Category(t-22),Wind_Speed(t-22),Temperature(t-21),Humidity(t-21),Visibility(t-21),Weather_Category(t-21),Wind_Speed(t-21),...,Temperature(t-9),Humidity(t-9),Visibility(t-9),Weather_Category(t-9),Wind_Speed(t-9),Temperature(t-8),Humidity(t-8),Visibility(t-8),Weather_Category(t-8),Wind_Speed(t-8),Temperature(t-7),Humidity(t-7),Visibility(t-7),Weather_Category(t-7),Wind_Speed(t-7),Temperature(t-6),Humidity(t-6),Visibility(t-6),Weather_Category(t-6),Wind_Speed(t-6),Temperature(t-5),Humidity(t-5),Visibility(t-5),Weather_Category(t-5),Wind_Speed(t-5),Temperature(t-4),Humidity(t-4),Visibility(t-4),Weather_Category(t-4),Wind_Speed(t-4),Temperature(t-3),Humidity(t-3),Visibility(t-3),Weather_Category(t-3),Wind_Speed(t-3),Temperature(t-2),Humidity(t-2),Visibility(t-2),Weather_Category(t-2),Wind_Speed(t-2),Temperature(t-1),Humidity(t-1),Visibility(t-1),Weather_Category(t-1),Wind_Speed(t-1),Temperature(t),Humidity(t),Visibility(t),Weather_Category(t),Wind_Speed(t)
Formatted_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
2019-01-01 15:00:00,14.0,0.55,4.0,0.0,0.0,13.0,0.55,4.0,0.0,6.0,13.0,0.59,4.0,0.0,0.0,13.0,0.55,4.0,0.0,0.0,12.0,0.54,4.0,0.0,0.0,11.0,0.62,4.0,0.0,0.0,11.0,0.67,4.0,2.0,0.0,11.0,0.62,4.0,2.0,0.0,12.0,0.58,4.0,2.0,0.0,10.0,0.67,4.0,2.0,0.0,...,20.0,0.35,5.0,0.0,7.0,22.0,0.33,5.0,0.0,6.0,23.0,0.31,5.0,0.0,6.0,25.0,0.28,5.0,0.0,6.0,27.0,0.26,5.0,0.0,6.0,28.0,0.25,5.0,0.0,6.0,28.0,0.23,5.0,0.0,4.0,29.0,0.22,5.0,0.0,0.0,29.0,0.19,5.0,0.0,0.0,30.0,0.17,5.0,0,6.0
2019-01-01 15:30:00,13.0,0.55,4.0,0.0,6.0,13.0,0.59,4.0,0.0,0.0,13.0,0.55,4.0,0.0,0.0,12.0,0.54,4.0,0.0,0.0,11.0,0.62,4.0,0.0,0.0,11.0,0.67,4.0,2.0,0.0,11.0,0.62,4.0,2.0,0.0,12.0,0.58,4.0,2.0,0.0,10.0,0.67,4.0,2.0,0.0,9.0,0.71,4.0,2.0,4.0,...,22.0,0.33,5.0,0.0,6.0,23.0,0.31,5.0,0.0,6.0,25.0,0.28,5.0,0.0,6.0,27.0,0.26,5.0,0.0,6.0,28.0,0.25,5.0,0.0,6.0,28.0,0.23,5.0,0.0,4.0,29.0,0.22,5.0,0.0,0.0,29.0,0.19,5.0,0.0,0.0,30.0,0.17,5.0,0.0,6.0,29.0,0.19,5.0,0,0.0
2019-01-01 16:00:00,13.0,0.59,4.0,0.0,0.0,13.0,0.55,4.0,0.0,0.0,12.0,0.54,4.0,0.0,0.0,11.0,0.62,4.0,0.0,0.0,11.0,0.67,4.0,2.0,0.0,11.0,0.62,4.0,2.0,0.0,12.0,0.58,4.0,2.0,0.0,10.0,0.67,4.0,2.0,0.0,9.0,0.71,4.0,2.0,4.0,11.0,0.62,4.0,2.0,0.0,...,23.0,0.31,5.0,0.0,6.0,25.0,0.28,5.0,0.0,6.0,27.0,0.26,5.0,0.0,6.0,28.0,0.25,5.0,0.0,6.0,28.0,0.23,5.0,0.0,4.0,29.0,0.22,5.0,0.0,0.0,29.0,0.19,5.0,0.0,0.0,30.0,0.17,5.0,0.0,6.0,29.0,0.19,5.0,0.0,0.0,29.0,0.19,5.0,0,6.0
2019-01-01 16:30:00,13.0,0.55,4.0,0.0,0.0,12.0,0.54,4.0,0.0,0.0,11.0,0.62,4.0,0.0,0.0,11.0,0.67,4.0,2.0,0.0,11.0,0.62,4.0,2.0,0.0,12.0,0.58,4.0,2.0,0.0,10.0,0.67,4.0,2.0,0.0,9.0,0.71,4.0,2.0,4.0,11.0,0.62,4.0,2.0,0.0,9.0,0.71,4.0,2.0,0.0,...,25.0,0.28,5.0,0.0,6.0,27.0,0.26,5.0,0.0,6.0,28.0,0.25,5.0,0.0,6.0,28.0,0.23,5.0,0.0,4.0,29.0,0.22,5.0,0.0,0.0,29.0,0.19,5.0,0.0,0.0,30.0,0.17,5.0,0.0,6.0,29.0,0.19,5.0,0.0,0.0,29.0,0.19,5.0,0.0,6.0,29.0,0.19,5.0,0,0.0
2019-01-01 17:00:00,12.0,0.54,4.0,0.0,0.0,11.0,0.62,4.0,0.0,0.0,11.0,0.67,4.0,2.0,0.0,11.0,0.62,4.0,2.0,0.0,12.0,0.58,4.0,2.0,0.0,10.0,0.67,4.0,2.0,0.0,9.0,0.71,4.0,2.0,4.0,11.0,0.62,4.0,2.0,0.0,9.0,0.71,4.0,2.0,0.0,9.0,0.71,3.0,2.0,0.0,...,27.0,0.26,5.0,0.0,6.0,28.0,0.25,5.0,0.0,6.0,28.0,0.23,5.0,0.0,4.0,29.0,0.22,5.0,0.0,0.0,29.0,0.19,5.0,0.0,0.0,30.0,0.17,5.0,0.0,6.0,29.0,0.19,5.0,0.0,0.0,29.0,0.19,5.0,0.0,6.0,29.0,0.19,5.0,0.0,0.0,29.0,0.19,5.0,0,0.0


In [32]:
# Features / target for 2019-2020, selected with the same columns and label as the training split.
test_X, test_y = X_y(dataset=test_sup_df.copy(), 
           columns_of_interest=X_columns,
           label=label)

Shape of dataset: (16278, 155)
X.shape: (16278, 150), y.shape: (16278,)


In [34]:
# Linear-regression predictions for the 2019-2020 rows.
pred_y = best_pipe.predict(test_X)

# Important: linear regression — actual vs. predicted wind speed on the 2019–2020 data

In [121]:
# Compare linear-regression predictions with the observed wind speed after averaging both
# over windows of increasing length; one interactive figure per window size.
for time_gap in ['0.5H', '3H', '6H', '12H', '1d']:
    # The predictions carry no index of their own, so they borrow the timestamps of test_y.
    lr_pred_y_sampled = pd.Series(pred_y)
    lr_pred_y_sampled.index = pd.to_datetime(test_y.index)
    # Windows that contain no observation average to NaN and are dropped.
    lr_pred_y_sampled = lr_pred_y_sampled.resample(time_gap).mean().dropna()

    actual_y_sampled = test_y.copy()
    actual_y_sampled.index = pd.to_datetime(test_y.index)
    actual_y_sampled = actual_y_sampled.resample(time_gap).mean().dropna()

    mae = mean_absolute_error(y_true=actual_y_sampled, y_pred=lr_pred_y_sampled)
    r2 = r2_score(y_true=actual_y_sampled, y_pred=lr_pred_y_sampled)

    fig = go.Figure(data=[
        go.Scatter(x=actual_y_sampled.index, y=actual_y_sampled, name="Actual",
                   line_color='deepskyblue'),
        go.Scatter(x=lr_pred_y_sampled.index, y=lr_pred_y_sampled, name="Predicted",
                   line_color='dimgray'),
    ])
    fig.update_layout(title_text=f'LR: {time_gap}: Actual vs Predicted, \tMAE: {mae:.3f}, \tR2: {r2:.3f}',
                      xaxis_rangeslider_visible=True)
    fig.show()

# MLP

In [41]:
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout
from keras import backend as K

import tensorflow as tf

Using TensorFlow backend.


In [42]:
# Reset Keras' global state so re-running this cell starts from a fresh graph / layer naming.
K.clear_session()
# Feed-forward network: two hidden blocks of Dense(200, relu) -> Dropout(0.2) -> BatchNorm,
# followed by a single-unit output.
model = Sequential()

# First hidden block; the input width equals the number of lagged feature columns in X.
model.add(Dense(200, input_dim=X.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Second hidden block.
model.add(Dense(200, kernel_initializer='normal', activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Output unit. ReLU clamps predictions to >= 0.
# NOTE(review): a ReLU output can get stuck at 0 with no gradient; confirm this is preferred
# over a linear output.
model.add(Dense(1, kernel_initializer='normal', activation='relu'))

# Optimised for MAE; MSE and Huber are tracked as additional metrics only.
loss = 'mean_absolute_error'
# loss = tf.keras.losses.Huber()
# loss = tf.keras.losses.KLDivergence()
# NOTE(review): the model is built with standalone `keras` but the Huber metric comes from
# `tf.keras` — verify the two are compatible in the installed versions.
model.compile(loss=loss, optimizer='adam', metrics=['mse', tf.keras.losses.Huber()])

model.summary()






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 200)               30200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 200)               800       
_________________________________________________________________
dense_2 (Dense)              (None, 200)               40200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 200)               800       
_____________________

In [43]:
# Ordered 80/20 split (no shuffling) of the pre-2019 data at notebook level, using the same
# arguments as the split performed inside `run_all_models`.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
print(f"X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}, X_test.shape: {X_test.shape}, y_test.shape: {y_test.shape}")
    

X_train.shape: (80117, 150), y_train.shape: (80117,), X_test.shape: (20030, 150), y_test.shape: (20030,)


In [44]:
# Train for 10 epochs with mini-batches of 64. The last 20% of the pre-2019 data
# (X_test / y_test) is used for per-epoch validation; the 2019-2020 frame is not involved.
# NOTE(review): the features are fed unscaled — confirm this is intended for the MLP.
history = model.fit(x=X_train, 
                    y=y_train, 
                    epochs=10, 
                    batch_size=64, 
                    validation_data=(X_test, y_test),
                    verbose=1)

Train on 80117 samples, validate on 20030 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# MLP predictions for the 2019-2020 rows: reshaped to the shape of test_y, then rounded to
# the nearest integer.
mlp_pred_y = np.rint(model.predict(test_X).reshape(test_y.shape))

In [120]:
# Compare the MLP predictions with the observed wind speed after averaging both over windows
# of increasing length; one interactive figure per window size.
for time_gap in ['0.5H', '3H', '6H', '12H', '1d']:
    # The predictions carry no index of their own, so they borrow the timestamps of test_y.
    mlp_pred_y_sampled = pd.Series(mlp_pred_y)
    mlp_pred_y_sampled.index = pd.to_datetime(test_y.index)
    # Windows that contain no observation average to NaN and are dropped.
    mlp_pred_y_sampled = mlp_pred_y_sampled.resample(time_gap).mean().dropna()

    actual_y_sampled = test_y.copy()
    actual_y_sampled.index = pd.to_datetime(test_y.index)
    actual_y_sampled = actual_y_sampled.resample(time_gap).mean().dropna()

    mae = mean_absolute_error(y_true=actual_y_sampled, y_pred=mlp_pred_y_sampled)
    r2 = r2_score(y_true=actual_y_sampled, y_pred=mlp_pred_y_sampled)

    fig = go.Figure(data=[
        go.Scatter(x=actual_y_sampled.index, y=actual_y_sampled, name="Actual",
                   line_color='deepskyblue'),
        go.Scatter(x=mlp_pred_y_sampled.index, y=mlp_pred_y_sampled, name="Predicted",
                   line_color='dimgray'),
    ])
    fig.update_layout(title_text=f'MLP: {time_gap}: Actual vs Predicted, \tMAE: {mae:.3f}, \tR2: {r2:.3f}',
                      xaxis_rangeslider_visible=True)
    fig.show()