In [1]:
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
warnings.filterwarnings('ignore',category=DeprecationWarning)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression

In [2]:
# Load the workbook. Because `sheet_name` is passed as a list, read_excel
# returns a dict keyed by sheet name, so the frame is pulled out by key below.
data = pd.read_excel(
    "../Dataset.xlsx",
    sheet_name=['Total Consumers'],
)
df = data['Total Consumers']
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,2.964,2.322959,1.544607,0.778310,1.962012,2.677445,0.237877,0.689194,0.358525,0.814643,...,0.898895,0.203825,0.221624,0.319531,0.830996,0.924987,0.219128,0.274880,0.990488,0.779475
1,2.584,2.371797,1.544607,0.778310,1.962012,2.733737,0.192929,0.558967,0.358525,0.660712,...,0.917793,0.165311,0.179747,0.319531,0.848467,0.944434,0.177722,0.222940,1.011313,0.795863
2,3.071,2.415961,1.319880,0.665072,1.676555,2.784640,0.382869,1.109272,0.377198,1.311186,...,0.934883,0.328060,0.356708,0.336174,0.864266,0.962019,0.352691,0.442426,1.030144,0.810682
3,2.694,2.302538,1.319880,0.665072,1.676555,2.653908,0.442052,1.280743,0.377198,1.513868,...,0.890992,0.378772,0.411848,0.336174,0.823691,0.916855,0.407209,0.510816,0.981781,0.772623
4,2.569,2.363063,0.913154,0.460128,1.159919,2.723669,0.192242,0.556976,0.668500,0.658358,...,0.914413,0.164722,0.179106,0.595793,0.845343,0.940956,0.177089,0.222146,1.007588,0.792932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35131,1.664,2.244719,1.455982,0.733653,1.849437,2.587266,0.205654,0.595835,0.171793,0.704291,...,0.868619,0.176214,0.191602,0.153109,0.803007,0.893832,0.189444,0.237645,0.957128,0.753222
35132,1.659,2.136340,1.201186,0.605264,1.525786,2.462348,0.201219,0.582985,0.067223,0.689101,...,0.826680,0.172414,0.187470,0.059912,0.764237,0.850676,0.185359,0.232519,0.910916,0.716855
35133,1.664,2.192805,1.201186,0.605264,1.525786,2.527430,0.228585,0.662271,0.067223,0.782819,...,0.848530,0.195862,0.212966,0.059912,0.784436,0.873160,0.210568,0.264142,0.934992,0.735802
35134,1.697,1.446083,0.259545,0.130782,0.329682,1.666757,0.189302,0.548459,0.070958,0.648292,...,0.559578,0.162203,0.176368,0.063241,0.517310,0.575820,0.174381,0.218749,0.616596,0.485237


## Auxiliary Functions

In [3]:
def plot_results(preds: np.array, actuals: np.array, title: str):
    """Scatter predictions against actual values with a y = x reference line.

    Args:
        preds: Predicted values, drawn on the vertical axis.
        actuals: Observed values, drawn on the horizontal axis.
        title: Title shown above the figure.

    Both axes are anchored at 0 while keeping the upper limits chosen for the
    scatter. The figure is shown immediately; nothing is returned.
    """
    plt.scatter(actuals, preds, c='b', label='predicted')
    plt.xlabel('actual')
    plt.ylabel('predicted')
    plt.title(title)

    # Keep the current upper limits but start both axes at the origin.
    x_upper = plt.xlim()[1]
    y_upper = plt.ylim()[1]
    plt.xlim(0, x_upper)
    plt.ylim(0, y_upper)

    # Draw the diagonal out to the larger axis limit so it always spans the
    # visible area; a fixed 0..100 segment stops short for larger values.
    diag_end = max(x_upper, y_upper)
    plt.plot([0, diag_end], [0, diag_end], '--r', label='y=x')
    plt.legend()
    plt.show()
    
    
def performance_metrics(preds: np.array, actuals: np.array):
    """Compute, print and return regression error metrics.

    Args:
        preds: Predicted values.
        actuals: Observed values; must expose ``.mean()``.

    Returns:
        Tuple ``(rmse, mse, mape, r2)``. ``mape`` is the mean absolute error
        divided by the mean of ``actuals`` (a fraction, not a percentage, and
        not a per-sample percentage error), rounded to two decimals.

    The printed "MAPE" line is the same ratio times 100, formatted from the
    unrounded value so the four printed decimals are meaningful. The returned
    value keeps its two-decimal rounding so existing callers see the same
    numbers as before.
    """
    mse = mean_squared_error(actuals, preds)
    rmse = np.sqrt(mse)

    # Normalised MAE: MAE relative to the mean level of the actuals.
    mape_exact = mean_absolute_error(actuals, preds) / actuals.mean()
    mape = np.round(mape_exact, 2)

    r2 = r2_score(actuals, preds)

    # Report; the percentage is derived from the unrounded ratio.
    print('RMSE: %.4f' % rmse)
    print('MSE: %.4f' % mse)
    print('MAPE: %.4f' % (mape_exact * 100))
    print('R2: %.4f' % r2)
    return rmse, mse, mape, r2

def build_model(estimator, X_train: np.array, y_train: np.array, X_test: np.array):
    """Fit ``estimator`` on the training data and predict on ``X_test``.

    The estimator object passed in is fitted directly (no copy is made) and
    handed back together with its predictions.

    Returns:
        Tuple ``(estimator, predictions)``.
    """
    estimator.fit(X_train, y_train)
    return estimator, estimator.predict(X_test)


## Last energy point (15 minutes)

In [20]:
# One single-column frame per column of `df`: that column's previous reading.
# The first row has no predecessor (NaN after the shift), so it is dropped and
# the index is renumbered from 0.
X_tmp = df.copy()
X_15 = []

for h in range(X_tmp.shape[1]):
    X_house = (
        pd.DataFrame({'Energy_15': X_tmp[h].shift(1)})
        .dropna()
        .reset_index(drop=True)
    )
    X_15.append(X_house)
X_15

[       Energy_15
 0          2.964
 1          2.584
 2          3.071
 3          2.694
 4          2.569
 ...          ...
 35130      1.670
 35131      1.664
 35132      1.659
 35133      1.664
 35134      1.697
 
 [35135 rows x 1 columns],
        Energy_15
 0       2.322959
 1       2.371797
 2       2.415961
 3       2.302538
 4       2.363063
 ...          ...
 35130   3.275491
 35131   2.244719
 35132   2.136340
 35133   2.192805
 35134   1.446083
 
 [35135 rows x 1 columns],
        Energy_15
 0       1.544607
 1       1.544607
 2       1.319880
 3       1.319880
 4       0.913154
 ...          ...
 35130   1.455982
 35131   1.455982
 35132   1.201186
 35133   1.201186
 35134   0.259545
 
 [35135 rows x 1 columns],
        Energy_15
 0       0.778310
 1       0.778310
 2       0.665072
 3       0.665072
 4       0.460128
 ...          ...
 35130   0.733653
 35131   0.733653
 35132   0.605264
 35133   0.605264
 35134   0.130782
 
 [35135 rows x 1 columns],
        Energy_15
 0

In [23]:
# Targets: every row of `df` except the first. The original row labels are
# kept, so the index of `y_15` starts at 1 rather than 0.
y_15 = df.iloc[1:].copy()
y_15

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
1,2.584,2.371797,1.544607,0.778310,1.962012,2.733737,0.192929,0.558967,0.358525,0.660712,...,0.917793,0.165311,0.179747,0.319531,0.848467,0.944434,0.177722,0.222940,1.011313,0.795863
2,3.071,2.415961,1.319880,0.665072,1.676555,2.784640,0.382869,1.109272,0.377198,1.311186,...,0.934883,0.328060,0.356708,0.336174,0.864266,0.962019,0.352691,0.442426,1.030144,0.810682
3,2.694,2.302538,1.319880,0.665072,1.676555,2.653908,0.442052,1.280743,0.377198,1.513868,...,0.890992,0.378772,0.411848,0.336174,0.823691,0.916855,0.407209,0.510816,0.981781,0.772623
4,2.569,2.363063,0.913154,0.460128,1.159919,2.723669,0.192242,0.556976,0.668500,0.658358,...,0.914413,0.164722,0.179106,0.595793,0.845343,0.940956,0.177089,0.222146,1.007588,0.792932
5,3.174,2.334031,0.913154,0.460128,1.159919,2.690207,0.195137,0.565365,0.668500,0.668274,...,0.903179,0.167203,0.181804,0.595793,0.834957,0.929395,0.179757,0.225492,0.995209,0.783190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35131,1.664,2.244719,1.455982,0.733653,1.849437,2.587266,0.205654,0.595835,0.171793,0.704291,...,0.868619,0.176214,0.191602,0.153109,0.803007,0.893832,0.189444,0.237645,0.957128,0.753222
35132,1.659,2.136340,1.201186,0.605264,1.525786,2.462348,0.201219,0.582985,0.067223,0.689101,...,0.826680,0.172414,0.187470,0.059912,0.764237,0.850676,0.185359,0.232519,0.910916,0.716855
35133,1.664,2.192805,1.201186,0.605264,1.525786,2.527430,0.228585,0.662271,0.067223,0.782819,...,0.848530,0.195862,0.212966,0.059912,0.784436,0.873160,0.210568,0.264142,0.934992,0.735802
35134,1.697,1.446083,0.259545,0.130782,0.329682,1.666757,0.189302,0.548459,0.070958,0.648292,...,0.559578,0.162203,0.176368,0.063241,0.517310,0.575820,0.174381,0.218749,0.616596,0.485237


### Split into train and test

In [26]:
# Positional (unshuffled) 80/20 split: the first 80% of rows train, the rest test.
#
# NOTE(review): only the LAST column's feature frame is used here, while the
# targets keep every column. Looping over all columns and re-assigning these
# same names on each pass ends in exactly this state, so the redundant passes
# are skipped. If one split per column/house is intended, the per-column
# results must be collected instead of overwritten -- confirm against the
# cells that consume these names.
h = df.shape[1] - 1  # position of the last column
n_train_samples_15 = int(len(X_15[h]) * 0.8)
X_15_train = X_15[h].iloc[:n_train_samples_15]
X_15_test = X_15[h].iloc[n_train_samples_15:]
y_15_train = y_15.iloc[:n_train_samples_15]  # all columns, not just column h
y_15_test = y_15.iloc[n_train_samples_15:]
print(X_15_train.shape, X_15_test.shape, y_15_train.shape, y_15_test.shape)

(28108, 1) (7027, 1) (28108, 51) (7027, 51)


## Last 2 energy points (30 minutes)

In [27]:
# One two-column frame per column of `df`: the readings one and two steps
# back. Rows lacking either lag (NaN after shifting) are dropped and the index
# is renumbered from 0.
X_30 = []
X_tmp = df.copy()

for h in range(X_tmp.shape[1]):
    X_house = pd.DataFrame(
        {f'Energy_{i*15}': X_tmp[h].shift(i) for i in range(1, 3)}
    )
    X_house = X_house.dropna().reset_index(drop=True)
    X_30.append(X_house)
X_30

[       Energy_15  Energy_30
 0          2.584      2.964
 1          3.071      2.584
 2          2.694      3.071
 3          2.569      2.694
 4          3.174      2.569
 ...          ...        ...
 35129      1.670      1.686
 35130      1.664      1.670
 35131      1.659      1.664
 35132      1.664      1.659
 35133      1.697      1.664
 
 [35134 rows x 2 columns],
        Energy_15  Energy_30
 0       2.371797   2.322959
 1       2.415961   2.371797
 2       2.302538   2.415961
 3       2.363063   2.302538
 4       2.334031   2.363063
 ...          ...        ...
 35129   3.275491   1.352712
 35130   2.244719   3.275491
 35131   2.136340   2.244719
 35132   2.192805   2.136340
 35133   1.446083   2.192805
 
 [35134 rows x 2 columns],
        Energy_15  Energy_30
 0       1.544607   1.544607
 1       1.319880   1.544607
 2       1.319880   1.319880
 3       0.913154   1.319880
 4       0.913154   0.913154
 ...          ...        ...
 35129   1.455982   1.467061
 35130   1.455

### Split into train and test

In [36]:
# Targets: every row of `df` except the first two. The original row labels
# are kept, so the index of `y_30` starts at 2.
y_30 = df.iloc[2:].copy()
y_30

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
2,3.071,2.415961,1.319880,0.665072,1.676555,2.784640,0.382869,1.109272,0.377198,1.311186,...,0.934883,0.328060,0.356708,0.336174,0.864266,0.962019,0.352691,0.442426,1.030144,0.810682
3,2.694,2.302538,1.319880,0.665072,1.676555,2.653908,0.442052,1.280743,0.377198,1.513868,...,0.890992,0.378772,0.411848,0.336174,0.823691,0.916855,0.407209,0.510816,0.981781,0.772623
4,2.569,2.363063,0.913154,0.460128,1.159919,2.723669,0.192242,0.556976,0.668500,0.658358,...,0.914413,0.164722,0.179106,0.595793,0.845343,0.940956,0.177089,0.222146,1.007588,0.792932
5,3.174,2.334031,0.913154,0.460128,1.159919,2.690207,0.195137,0.565365,0.668500,0.668274,...,0.903179,0.167203,0.181804,0.595793,0.834957,0.929395,0.179757,0.225492,0.995209,0.783190
6,2.558,2.299585,0.997031,0.502393,1.266463,2.650505,0.238003,0.689557,0.216609,0.815072,...,0.889850,0.203932,0.221740,0.193050,0.822635,0.915679,0.219243,0.275025,0.980522,0.771632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35131,1.664,2.244719,1.455982,0.733653,1.849437,2.587266,0.205654,0.595835,0.171793,0.704291,...,0.868619,0.176214,0.191602,0.153109,0.803007,0.893832,0.189444,0.237645,0.957128,0.753222
35132,1.659,2.136340,1.201186,0.605264,1.525786,2.462348,0.201219,0.582985,0.067223,0.689101,...,0.826680,0.172414,0.187470,0.059912,0.764237,0.850676,0.185359,0.232519,0.910916,0.716855
35133,1.664,2.192805,1.201186,0.605264,1.525786,2.527430,0.228585,0.662271,0.067223,0.782819,...,0.848530,0.195862,0.212966,0.059912,0.784436,0.873160,0.210568,0.264142,0.934992,0.735802
35134,1.697,1.446083,0.259545,0.130782,0.329682,1.666757,0.189302,0.548459,0.070958,0.648292,...,0.559578,0.162203,0.176368,0.063241,0.517310,0.575820,0.174381,0.218749,0.616596,0.485237


In [37]:
# Positional (unshuffled) 80/20 split: the first 80% of rows train, the rest test.
#
# NOTE(review): only the LAST column's feature frame is used here, while the
# targets keep every column. Looping over all columns and re-assigning these
# same names on each pass ends in exactly this state, so the redundant passes
# are skipped. If one split per column/house is intended, the per-column
# results must be collected instead of overwritten -- confirm against the
# cells that consume these names.
h = df.shape[1] - 1  # position of the last column
n_train_samples_30 = int(len(X_30[h]) * 0.8)
X_30_train = X_30[h].iloc[:n_train_samples_30]
X_30_test = X_30[h].iloc[n_train_samples_30:]
y_30_train = y_30.iloc[:n_train_samples_30]  # all columns, not just column h
y_30_test = y_30.iloc[n_train_samples_30:]
print(X_30_train.shape, X_30_test.shape, y_30_train.shape, y_30_test.shape)

(28107, 2) (7027, 2) (28107, 51) (7027, 51)


## Last 3 energy points (45 minutes)

In [38]:
# One three-column frame per column of `df`: the readings one, two and three
# steps back. Rows lacking any lag (NaN after shifting) are dropped and the
# index is renumbered from 0.
X_45 = []
y_45 = df.copy()
X_tmp = df.copy()

for h in range(X_tmp.shape[1]):
    X_house = pd.DataFrame(
        {f'Energy_{i*15}': X_tmp[h].shift(i) for i in range(1, 4)}
    )
    X_house = X_house.dropna().reset_index(drop=True)
    X_45.append(X_house)
X_45

[       Energy_15  Energy_30  Energy_45
 0          3.071      2.584      2.964
 1          2.694      3.071      2.584
 2          2.569      2.694      3.071
 3          3.174      2.569      2.694
 4          2.558      3.174      2.569
 ...          ...        ...        ...
 35128      1.670      1.686      1.664
 35129      1.664      1.670      1.686
 35130      1.659      1.664      1.670
 35131      1.664      1.659      1.664
 35132      1.697      1.664      1.659
 
 [35133 rows x 3 columns],
        Energy_15  Energy_30  Energy_45
 0       2.415961   2.371797   2.322959
 1       2.302538   2.415961   2.371797
 2       2.363063   2.302538   2.415961
 3       2.334031   2.363063   2.302538
 4       2.299585   2.334031   2.363063
 ...          ...        ...        ...
 35128   3.275491   1.352712   1.435749
 35129   2.244719   3.275491   1.352712
 35130   2.136340   2.244719   3.275491
 35131   2.192805   2.136340   2.244719
 35132   1.446083   2.192805   2.136340
 
 [35133 r

### Split into train and test

In [39]:
# Targets: every row of `df` except the first three. The original row labels
# are kept, so the index of `y_45` starts at 3.
y_45 = df.iloc[3:].copy()
y_45

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
3,2.694,2.302538,1.319880,0.665072,1.676555,2.653908,0.442052,1.280743,0.377198,1.513868,...,0.890992,0.378772,0.411848,0.336174,0.823691,0.916855,0.407209,0.510816,0.981781,0.772623
4,2.569,2.363063,0.913154,0.460128,1.159919,2.723669,0.192242,0.556976,0.668500,0.658358,...,0.914413,0.164722,0.179106,0.595793,0.845343,0.940956,0.177089,0.222146,1.007588,0.792932
5,3.174,2.334031,0.913154,0.460128,1.159919,2.690207,0.195137,0.565365,0.668500,0.668274,...,0.903179,0.167203,0.181804,0.595793,0.834957,0.929395,0.179757,0.225492,0.995209,0.783190
6,2.558,2.299585,0.997031,0.502393,1.266463,2.650505,0.238003,0.689557,0.216609,0.815072,...,0.889850,0.203932,0.221740,0.193050,0.822635,0.915679,0.219243,0.275025,0.980522,0.771632
7,2.922,2.316070,0.997031,0.502393,1.266463,2.669505,0.194367,0.563131,0.216609,0.665634,...,0.896229,0.166543,0.181086,0.193050,0.828532,0.922243,0.179046,0.224601,0.987551,0.777163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35131,1.664,2.244719,1.455982,0.733653,1.849437,2.587266,0.205654,0.595835,0.171793,0.704291,...,0.868619,0.176214,0.191602,0.153109,0.803007,0.893832,0.189444,0.237645,0.957128,0.753222
35132,1.659,2.136340,1.201186,0.605264,1.525786,2.462348,0.201219,0.582985,0.067223,0.689101,...,0.826680,0.172414,0.187470,0.059912,0.764237,0.850676,0.185359,0.232519,0.910916,0.716855
35133,1.664,2.192805,1.201186,0.605264,1.525786,2.527430,0.228585,0.662271,0.067223,0.782819,...,0.848530,0.195862,0.212966,0.059912,0.784436,0.873160,0.210568,0.264142,0.934992,0.735802
35134,1.697,1.446083,0.259545,0.130782,0.329682,1.666757,0.189302,0.548459,0.070958,0.648292,...,0.559578,0.162203,0.176368,0.063241,0.517310,0.575820,0.174381,0.218749,0.616596,0.485237


In [40]:
# Positional (unshuffled) 80/20 split: the first 80% of rows train, the rest test.
#
# NOTE(review): only the LAST column's feature frame is used here, while the
# targets keep every column. Looping over all columns and re-assigning these
# same names on each pass ends in exactly this state, so the redundant passes
# are skipped. If one split per column/house is intended, the per-column
# results must be collected instead of overwritten -- confirm against the
# cells that consume these names.
h = df.shape[1] - 1  # position of the last column
n_train_samples_45 = int(len(X_45[h]) * 0.8)
X_45_train = X_45[h].iloc[:n_train_samples_45]
X_45_test = X_45[h].iloc[n_train_samples_45:]
y_45_train = y_45.iloc[:n_train_samples_45]  # all columns, not just column h
y_45_test = y_45.iloc[n_train_samples_45:]
print(X_45_train.shape, X_45_test.shape, y_45_train.shape, y_45_test.shape)

(28106, 3) (7027, 3) (28106, 51) (7027, 51)


## Last 4 energy points (1 hour)

In [41]:
# One four-column frame per column of `df`: the readings one to four steps
# back. Rows lacking any lag (NaN after shifting) are dropped and the index is
# renumbered from 0.
X_60 = []
y_60 = df.copy()
X_tmp = df.copy()

for h in range(X_tmp.shape[1]):
    X_house = pd.DataFrame(
        {f'Energy_{i*15}': X_tmp[h].shift(i) for i in range(1, 5)}
    )
    X_house = X_house.dropna().reset_index(drop=True)
    X_60.append(X_house)
X_60

[       Energy_15  Energy_30  Energy_45  Energy_60
 0          2.694      3.071      2.584      2.964
 1          2.569      2.694      3.071      2.584
 2          3.174      2.569      2.694      3.071
 3          2.558      3.174      2.569      2.694
 4          2.922      2.558      3.174      2.569
 ...          ...        ...        ...        ...
 35127      1.670      1.686      1.664      1.651
 35128      1.664      1.670      1.686      1.664
 35129      1.659      1.664      1.670      1.686
 35130      1.664      1.659      1.664      1.670
 35131      1.697      1.664      1.659      1.664
 
 [35132 rows x 4 columns],
        Energy_15  Energy_30  Energy_45  Energy_60
 0       2.302538   2.415961   2.371797   2.322959
 1       2.363063   2.302538   2.415961   2.371797
 2       2.334031   2.363063   2.302538   2.415961
 3       2.299585   2.334031   2.363063   2.302538
 4       2.316070   2.299585   2.334031   2.363063
 ...          ...        ...        ...        ...
 3

In [42]:
# Targets: every row of `df` except the first four. The original row labels
# are kept, so the index of `y_60` starts at 4.
y_60 = df.iloc[4:].copy()
y_60

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
4,2.569,2.363063,0.913154,0.460128,1.159919,2.723669,0.192242,0.556976,0.668500,0.658358,...,0.914413,0.164722,0.179106,0.595793,0.845343,0.940956,0.177089,0.222146,1.007588,0.792932
5,3.174,2.334031,0.913154,0.460128,1.159919,2.690207,0.195137,0.565365,0.668500,0.668274,...,0.903179,0.167203,0.181804,0.595793,0.834957,0.929395,0.179757,0.225492,0.995209,0.783190
6,2.558,2.299585,0.997031,0.502393,1.266463,2.650505,0.238003,0.689557,0.216609,0.815072,...,0.889850,0.203932,0.221740,0.193050,0.822635,0.915679,0.219243,0.275025,0.980522,0.771632
7,2.922,2.316070,0.997031,0.502393,1.266463,2.669505,0.194367,0.563131,0.216609,0.665634,...,0.896229,0.166543,0.181086,0.193050,0.828532,0.922243,0.179046,0.224601,0.987551,0.777163
8,2.994,2.341043,1.074578,0.541468,1.364965,2.698289,0.192219,0.556909,0.224078,0.658279,...,0.905892,0.164702,0.179085,0.199707,0.837465,0.932187,0.177068,0.222119,0.998199,0.785543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35131,1.664,2.244719,1.455982,0.733653,1.849437,2.587266,0.205654,0.595835,0.171793,0.704291,...,0.868619,0.176214,0.191602,0.153109,0.803007,0.893832,0.189444,0.237645,0.957128,0.753222
35132,1.659,2.136340,1.201186,0.605264,1.525786,2.462348,0.201219,0.582985,0.067223,0.689101,...,0.826680,0.172414,0.187470,0.059912,0.764237,0.850676,0.185359,0.232519,0.910916,0.716855
35133,1.664,2.192805,1.201186,0.605264,1.525786,2.527430,0.228585,0.662271,0.067223,0.782819,...,0.848530,0.195862,0.212966,0.059912,0.784436,0.873160,0.210568,0.264142,0.934992,0.735802
35134,1.697,1.446083,0.259545,0.130782,0.329682,1.666757,0.189302,0.548459,0.070958,0.648292,...,0.559578,0.162203,0.176368,0.063241,0.517310,0.575820,0.174381,0.218749,0.616596,0.485237


In [43]:
# Positional (unshuffled) 80/20 split: the first 80% of rows train, the rest test.
#
# NOTE(review): only the LAST column's feature frame is used here, while the
# targets keep every column. Looping over all columns and re-assigning these
# same names on each pass ends in exactly this state, so the redundant passes
# are skipped. If one split per column/house is intended, the per-column
# results must be collected instead of overwritten -- confirm against the
# cells that consume these names.
h = df.shape[1] - 1  # position of the last column
n_train_samples_60 = int(len(X_60[h]) * 0.8)
X_60_train = X_60[h].iloc[:n_train_samples_60]
X_60_test = X_60[h].iloc[n_train_samples_60:]
y_60_train = y_60.iloc[:n_train_samples_60]  # all columns, not just column h
y_60_test = y_60.iloc[n_train_samples_60:]
print(X_60_train.shape, X_60_test.shape, y_60_train.shape, y_60_test.shape)

(28105, 4) (7027, 4) (28105, 51) (7027, 51)
