## Import libraries

In [1]:
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
warnings.filterwarnings('ignore',category=DeprecationWarning)
import sys
sys.path.append('..')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
import time
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from cuml.ensemble import RandomForestRegressor as cuRF
from scripts.function_utils import last_energy_points, predict_results, prepare_polynomial, normalize_training, expanding_window_split, retrieve_selected_features, total_averaged_metrics, no_ml_predict

## Load Data

In [2]:
# Load the consumption workbook. Passing a list to sheet_name makes read_excel
# return a dict keyed by sheet name, hence the lookup on the next line.
data = pd.read_excel("~/datasets/Dataset.xlsx", sheet_name=['Total Consumers'])
df = data['Total Consumers']

# Number of columns in the sheet (presumably one per consumer/house — confirm).
number_of_houses = df.shape[1]
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,2.964,2.322959,1.544607,0.778310,1.962012,2.677445,0.237877,0.689194,0.358525,0.814643,...,0.898895,0.203825,0.221624,0.319531,0.830996,0.924987,0.219128,0.274880,0.990488,0.779475
1,2.584,2.371797,1.544607,0.778310,1.962012,2.733737,0.192929,0.558967,0.358525,0.660712,...,0.917793,0.165311,0.179747,0.319531,0.848467,0.944434,0.177722,0.222940,1.011313,0.795863
2,3.071,2.415961,1.319880,0.665072,1.676555,2.784640,0.382869,1.109272,0.377198,1.311186,...,0.934883,0.328060,0.356708,0.336174,0.864266,0.962019,0.352691,0.442426,1.030144,0.810682
3,2.694,2.302538,1.319880,0.665072,1.676555,2.653908,0.442052,1.280743,0.377198,1.513868,...,0.890992,0.378772,0.411848,0.336174,0.823691,0.916855,0.407209,0.510816,0.981781,0.772623
4,2.569,2.363063,0.913154,0.460128,1.159919,2.723669,0.192242,0.556976,0.668500,0.658358,...,0.914413,0.164722,0.179106,0.595793,0.845343,0.940956,0.177089,0.222146,1.007588,0.792932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35131,1.664,2.244719,1.455982,0.733653,1.849437,2.587266,0.205654,0.595835,0.171793,0.704291,...,0.868619,0.176214,0.191602,0.153109,0.803007,0.893832,0.189444,0.237645,0.957128,0.753222
35132,1.659,2.136340,1.201186,0.605264,1.525786,2.462348,0.201219,0.582985,0.067223,0.689101,...,0.826680,0.172414,0.187470,0.059912,0.764237,0.850676,0.185359,0.232519,0.910916,0.716855
35133,1.664,2.192805,1.201186,0.605264,1.525786,2.527430,0.228585,0.662271,0.067223,0.782819,...,0.848530,0.195862,0.212966,0.059912,0.784436,0.873160,0.210568,0.264142,0.934992,0.735802
35134,1.697,1.446083,0.259545,0.130782,0.329682,1.666757,0.189302,0.548459,0.070958,0.648292,...,0.559578,0.162203,0.176368,0.063241,0.517310,0.575820,0.174381,0.218749,0.616596,0.485237


In [3]:
# Sum across all columns for each row to obtain one aggregated series,
# stored as a single-column frame named 'Energy'.
df_total = df.sum(axis=1).to_frame(name='Energy')
df_total

Unnamed: 0,Energy
0,58.514142
1,57.575249
2,63.011688
3,62.735722
4,63.404029
...,...
35131,49.211311
35132,43.139892
35133,44.599341
35134,28.956084


#### Example of past timesteps

In [4]:
# Illustrate the lag-feature construction with 3 past timesteps.
# In the displayed output, lag_1 is the most recent past value and lag_3 the
# oldest; yt holds the value that follows them.
Xt, yt = last_energy_points(df_total, 3)
Xt

Unnamed: 0,lag_1,lag_2,lag_3
0,63.011688,57.575249,58.514142
1,62.735722,63.011688,57.575249
2,63.404029,62.735722,63.011688
3,63.710440,63.404029,62.735722
4,50.544573,63.710440,63.404029
...,...,...,...
35128,62.642645,39.394640,42.250176
35129,49.211311,62.642645,39.394640
35130,43.139892,49.211311,62.642645
35131,44.599341,43.139892,49.211311


In [5]:
# normalize_training returns the scaled features plus a second value (used as a
# scaler elsewhere in this notebook); only the scaled array is kept here.
# NOTE(review): the displayed values lie between 0 and 1, presumably min-max
# scaling — confirm in scripts.function_utils.
Xt_norm, _ = normalize_training(Xt)
Xt_norm

array([[0.36540976, 0.33113298, 0.33705271],
       [0.36366979, 0.36540976, 0.33113298],
       [0.36788347, 0.36366979, 0.36540976],
       ...,
       [0.24011798, 0.27839831, 0.36308294],
       [0.24931982, 0.24011798, 0.27839831],
       [0.15068901, 0.24931982, 0.24011798]])

In [6]:
# Targets matching the example lag features built above.
yt

Unnamed: 0,Energy
0,62.735722
1,63.404029
2,63.710440
3,50.544573
4,49.966228
...,...
35128,49.211311
35129,43.139892
35130,44.599341
35131,28.956084


# No ML

In [7]:
# Baseline input: every observation except the last one, re-indexed from 0.
X15 = df_total.iloc[:-1].reset_index(drop=True)
X15

Unnamed: 0,Energy
0,58.514142
1,57.575249
2,63.011688
3,62.735722
4,63.404029
...,...
35130,62.642645
35131,49.211311
35132,43.139892
35133,44.599341


In [8]:
# Baseline target: the series shifted by one step (drops the first observation),
# re-indexed from 0 so it lines up row-by-row with X15.
y15 = df_total.iloc[1:].reset_index(drop=True)
y15

Unnamed: 0,Energy
0,57.575249
1,63.011688
2,62.735722
3,63.404029
4,63.710440
...,...
35130,49.211311
35131,43.139892
35132,44.599341
35133,28.956084


In [9]:
# Score the no-ML baseline: X15 holds the previous observation and y15 the
# following one. The helper prints RMSE / WAPE / R2 and returns them.
rmse_15, wape_15, r2_15 = no_ml_predict(X15.values, y15.values)

RMSE: 11.3868
WAPE: 19.97
R2: 0.6880


# Linear Regression

In [None]:
# Number of lagged observations used as features. The polynomial section equates
# 8 points with 2 hours, i.e. 15 minutes per point, so 96 lags span 24 hours.
number_of_past_timesteps = 96

In [None]:
# Number of expanding-window splits evaluated by each model loop below.
n_splits = 10

In [None]:
# Expanding-window evaluation of a plain linear regression on all lag features.
# One (rmse, wape, r2) tuple is collected per split.
metrics_list_lr = []

# Lag features/targets are built once here; the XGBoost and Random Forest
# cells further down reuse X and y.
X, y = last_energy_points(df_total, number_of_past_timesteps)

for fold in range(n_splits):
    tic = time.time()
    print("\nIteration ", fold)

    X_train, X_test, y_train, y_test = expanding_window_split(X, y, cv=fold, n_splits=n_splits)

    # The scaler comes from the training window and is then applied to the test window.
    X_train_norm, scaler = normalize_training(X_train)
    X_test_norm = scaler.transform(X_test)

    estimator = LinearRegression()
    rmse, wape, r2, _ = predict_results(X_train_norm, X_test_norm, y_train, y_test, estimator)
    metrics_list_lr.append((rmse, wape, r2))
    print("\nElapsed time: %.3f seconds" % (time.time() - tic))

In [None]:
# Reduce the per-split (rmse, wape, r2) tuples to one value per metric
# (presumably an average, judging by the helper's name — confirm).
rmse_lr, wape_lr, r2_lr = total_averaged_metrics(metrics_list_lr)

# XGBoost

In [None]:
# Expanding-window evaluation of a default GPU XGBoost regressor on all lag
# features. X and y come from the Linear Regression section.
metrics_list_xgb = []

for fold in range(n_splits):
    tic = time.time()
    print("\nIteration ", fold)

    X_train, X_test, y_train, y_test = expanding_window_split(X, y, cv=fold, n_splits=n_splits)

    # The scaler comes from the training window and is then applied to the test window.
    X_train_norm, scaler = normalize_training(X_train)
    X_test_norm = scaler.transform(X_test)

    xgb_estimator = xgb.XGBRegressor(seed=42, tree_method='gpu_hist')
    # model_xgb keeps the model returned for the most recent split; the Feature
    # Selection section reads its feature_importances_.
    rmse, wape, r2, model_xgb = predict_results(X_train_norm, X_test_norm, y_train, y_test, xgb_estimator)
    metrics_list_xgb.append((rmse, wape, r2))
    print("\nElapsed time: %.3f seconds" % (time.time() - tic))

In [None]:
# Reduce the per-split XGBoost metrics to one value per metric
# (presumably an average, judging by the helper's name — confirm).
rmse_xgb, wape_xgb, r2_xgb = total_averaged_metrics(metrics_list_xgb)

# Random Forest

In [None]:
# Expanding-window evaluation of a default cuML random forest on all lag
# features. X and y come from the Linear Regression section.
metrics_list_rf = []

for fold in range(n_splits):
    tic = time.time()
    print("\nIteration ", fold)

    X_train, X_test, y_train, y_test = expanding_window_split(X, y, cv=fold, n_splits=n_splits)

    # The scaler comes from the training window and is then applied to the test window.
    X_train_norm, scaler = normalize_training(X_train)
    X_test_norm = scaler.transform(X_test)

    forest = cuRF()
    rmse, wape, r2, _ = predict_results(X_train_norm, X_test_norm, y_train, y_test, forest)
    metrics_list_rf.append((rmse, wape, r2))
    print("\nElapsed time: %.3f seconds" % (time.time() - tic))

In [None]:
# Reduce the per-split Random Forest metrics to one value per metric
# (presumably an average, judging by the helper's name — confirm).
rmse_rf, wape_rf, r2_rf = total_averaged_metrics(metrics_list_rf)

# Feature Selection

In [None]:
# Label the importances of the last fitted XGBoost model with the lag-column
# names of X. model_xgb must be the model trained on all columns of X, so run
# this right after the baseline XGBoost section.
feature_importance = pd.Series(data=model_xgb.feature_importances_, index=X.columns)

# Horizontal bar chart of the ten most important lags.
top_importances = feature_importance.nlargest(10)
top_importances.plot(kind='barh')

In [None]:
# Keep only the ten lag columns with the highest XGBoost importance.
features = feature_importance.nlargest(10).index
X_selected = X[features]
X_selected

# Hyper-parameter tuning

In [None]:
# Search grid for the XGBoost regressor (3 * 3 * 3 * 2 = 54 combinations).
params_xgb = dict(
    max_depth=[3, 6, 10],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 500, 1000],
    colsample_bytree=[0.3, 0.7],
)

In [None]:
# Search grid for the cuML random forest (4 * 3 * 2 * 3 = 72 combinations).
params_rf = dict(
    max_depth=[8, 16, 24, 30],
    max_batch_size=[2, 32, 128],
    min_samples_split=[2, 10],
    n_estimators=[100, 250, 500],
)

In [None]:
# Exhaustive search over params_xgb on the selected lag features, scored by R2.
# The rows are time-ordered, so folds come from TimeSeriesSplit: every
# validation fold lies after the data it was trained on. An integer `cv=5`
# would use unshuffled K-fold and fit on observations from the future, which is
# inconsistent with the expanding-window evaluation used in the rest of the
# notebook.
xgb_model = xgb.XGBRegressor(tree_method="gpu_hist", seed=42)
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params_xgb,
    cv=TimeSeriesSplit(n_splits=5),
    n_jobs=4,
    verbose=0,
    scoring="r2",
)
grid_search.fit(X_selected, y)

In [None]:
# Show the winning XGBoost parameter combination and its mean cross-validated R2.
grid_search.best_params_, grid_search.best_score_

In [None]:
# Keep the best XGBoost parameters found by the grid search under a short name.
xgb_params = grid_search.best_params_

In [None]:
# Exhaustive search over params_rf on the selected lag features, scored by R2.
# The rows are time-ordered, so folds come from TimeSeriesSplit: every
# validation fold lies after the data it was trained on. An integer `cv=5`
# would use unshuffled K-fold and fit on observations from the future, which is
# inconsistent with the expanding-window evaluation used in the rest of the
# notebook.
rf = cuRF()
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=params_rf,
    cv=TimeSeriesSplit(n_splits=5),
    n_jobs=8,
    verbose=1,
    scoring="r2",
)
grid_search_rf.fit(X_selected, y)

In [None]:
# Show the winning Random Forest parameter combination and its mean cross-validated R2.
grid_search_rf.best_params_, grid_search_rf.best_score_

### Linear Regression (after feature selection)

In [None]:
# Expanding-window evaluation of linear regression restricted to the selected
# lag columns (X_selected). One (rmse, wape, r2) tuple is collected per split.
metrics_list_lr_sel = []

for fold in range(n_splits):
    tic = time.time()
    print("\nIteration ", fold)

    X_train, X_test, y_train, y_test = expanding_window_split(X_selected, y, cv=fold, n_splits=n_splits)

    # The scaler comes from the training window and is then applied to the test window.
    X_train_norm, scaler = normalize_training(X_train)
    X_test_norm = scaler.transform(X_test)

    estimator = LinearRegression()
    rmse, wape, r2, _ = predict_results(X_train_norm, X_test_norm, y_train, y_test, estimator)
    metrics_list_lr_sel.append((rmse, wape, r2))
    print("\nElapsed time: %.3f seconds" % (time.time() - tic))

In [None]:
# Reduce the per-split metrics of the selected-feature linear model to one value
# per metric (presumably an average, judging by the helper's name — confirm).
rmse_lr_sel, wape_lr_sel, r2_lr_sel = total_averaged_metrics(metrics_list_lr_sel)

### XGBoost (after feature selection + hyper tuning)

In [None]:
# Expanding-window evaluation of XGBoost on the selected lag columns, using the
# hyper-parameters found by the grid search (xgb_params). Reading them from
# xgb_params instead of repeating literal values keeps this evaluation in sync
# with whatever the most recent search produced.
metrics_list_xgb_sel = []

for i in range(n_splits):
    start = time.time()
    print("\nIteration ", i)

    X_train, X_test, y_train, y_test = expanding_window_split(X_selected, y, cv=i, n_splits=n_splits)

    # The scaler comes from the training window and is then applied to the test window.
    X_train_norm, scaler = normalize_training(X_train)
    X_test_norm = scaler.transform(X_test)

    tuned_xgb = xgb.XGBRegressor(seed=42, tree_method='gpu_hist', **xgb_params)
    # NOTE: this rebinds model_xgb to a model fitted on X_selected only, so the
    # Feature Selection cells (which index by X.columns) must not be re-run
    # after this cell without re-running the baseline XGBoost section first.
    rmse, wape, r2, model_xgb = predict_results(X_train_norm, X_test_norm, y_train, y_test, tuned_xgb)
    metrics_list_xgb_sel.append((rmse,wape,r2))
    print("\nElapsed time: %.3f seconds" % (time.time() - start))

In [None]:
# Reduce the per-split metrics of the tuned XGBoost model to one value per
# metric (presumably an average, judging by the helper's name — confirm).
rmse_xgb_sel, wape_xgb_sel, r2_xgb_sel = total_averaged_metrics(metrics_list_xgb_sel)

### Random Forest (after feature selection + hyper tuning)

In [None]:
# Expanding-window evaluation of a cuML random forest with fixed
# hyper-parameters on the selected lag columns (X_selected).
# NOTE(review): max_features, min_samples_leaf and min_samples_split=8 are not
# part of the params_rf grid defined in this notebook, so these values do not
# come from grid_search_rf as written — confirm where they originate.
metrics_list_rf_sel = []

for fold in range(n_splits):
    tic = time.time()
    print("\nIteration ", fold)

    X_train, X_test, y_train, y_test = expanding_window_split(X_selected, y, cv=fold, n_splits=n_splits)

    # The scaler comes from the training window and is then applied to the test window.
    X_train_norm, scaler = normalize_training(X_train)
    X_test_norm = scaler.transform(X_test)

    forest = cuRF(
        max_depth=8,
        max_features=3,
        min_samples_leaf=4,
        min_samples_split=8,
        n_estimators=500,
    )
    rmse, wape, r2, _ = predict_results(X_train_norm, X_test_norm, y_train, y_test, forest)
    metrics_list_rf_sel.append((rmse, wape, r2))
    print("\nElapsed time: %.3f seconds" % (time.time() - tic))

In [None]:
# Reduce the per-split metrics of the tuned Random Forest to one value per
# metric (presumably an average, judging by the helper's name — confirm).
rmse_rf_sel, wape_rf_sel, r2_rf_sel = total_averaged_metrics(metrics_list_rf_sel)

## Plot performance metrics

In [None]:
# Bar chart comparing the R2 of the three baseline models
# (all lag features, default hyper-parameters).
plt.figure(figsize=(16, 7.2))
plt.bar(("Linear Regression", "XGBoost", "Random Forest"), (r2_lr, r2_xgb, r2_rf))
# The bars and the y-axis are R2, so the title names R2 as well (it previously
# read 'RMSE', contradicting the plotted data).
plt.title('R2')
plt.xlabel('Algorithm')
plt.ylabel('R2')
plt.show()

In [None]:
# WAPE as a function of the number of past timesteps used as features.
# NOTE(review): wape_list is not assigned in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel unless it is defined elsewhere.
fig, ax = plt.subplots(figsize=(16, 7.2))
ax.plot(range(1, number_of_past_timesteps + 1), wape_list)
ax.set_title('WAPE per past timestep')
ax.set_xlabel('Number of past timesteps')
ax.set_ylabel('WAPE')
plt.show()

In [None]:
# R2 as a function of the number of past timesteps used as features.
# NOTE(review): r2_list is not assigned in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel unless it is defined elsewhere.
fig, ax = plt.subplots(figsize=(16, 7.2))
ax.plot(range(1, number_of_past_timesteps + 1), r2_list)
ax.set_title('R2 score per past timestep')
ax.set_xlabel('Number of past timesteps')
ax.set_ylabel('R2')
plt.show()

# Polynomial Regression

### Test for last 8 energy points (2 hours) degree 2, 3 and 4

In [None]:
# Polynomial-regression sweep: for every degree in 2..4 and every lag count in
# 1..number_of_past_timesteps, fit a linear model on polynomial lag features and
# record (rmse, wape, r2). degree_list[d][k] holds the metrics for degree d+2
# with k+1 past timesteps.
# NOTE(review): the section heading mentions only the last 8 energy points, but
# this loop runs up to number_of_past_timesteps — confirm the intended range.
degree_list = []
full_start = time.time()
for degree in range(2,5):
    metrics_list = []
    print("\n\t\tDegree ", degree)
    for i in range(1,number_of_past_timesteps+1):
        start = time.time()
        print("\nIteration ", i)
        # Dedicated names so the notebook-level X / y built for the other models
        # are not overwritten while this sweep is running.
        X_lag, y_lag = last_energy_points(df_total, i)
        X_train, X_test, y_train, y_test = prepare_polynomial(X_lag, y_lag, deg=degree)
        print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
        # Features and targets are both scaled here, each with a scaler obtained
        # from the training part only.
        X_train_norm, scaler = normalize_training(X_train)
        X_test_norm = scaler.transform(X_test)
        y_train_norm, scaler2 = normalize_training(y_train)
        y_test_norm = pd.DataFrame(scaler2.transform(y_test))

        # predict_results yields four values at every other call site in this
        # notebook (three metrics plus the fitted model); unpacking only three
        # would raise ValueError. The model is not needed here.
        rmse, wape, r2, _ = predict_results(X_train_norm, X_test_norm, y_train_norm, y_test_norm, LinearRegression())
        print()
        metrics_list.append((rmse,wape,r2))
        print("\nElapsed time: %.3f seconds" % (time.time() - start))
    degree_list.append(metrics_list)
print("\nFull Elapsed time: %.3f seconds" % (time.time() - full_start))

In [None]:
# Regroup degree_list (degree -> lag count -> (rmse, wape, r2)) into one nested
# list per metric: p_<metric>_list[d][k] is the metric for the d-th degree with
# k+1 past timesteps. Three degrees are expected, matching the sweep above.
lag_positions = range(number_of_past_timesteps)

p_rmse_list = [[degree_list[j][i][0] for i in lag_positions] for j in range(3)]
p_wape_list = [[degree_list[j][i][1] for i in lag_positions] for j in range(3)]
p_r2_list = [[degree_list[j][i][2] for i in lag_positions] for j in range(3)]

## Comparison between Linear and Polynomial

In [None]:
# 2x2 grid of RMSE versus number of past timesteps: the linear model followed
# by polynomial degrees 2, 3 and 4.
# NOTE(review): rmse_list is not assigned in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel unless it is defined elsewhere.
figure, axis = plt.subplots(2, 2, figsize=(16,10))
timesteps = range(1, number_of_past_timesteps+1)

panels = (
    (axis[0,0], rmse_list, "Linear - RMSE per past timestep"),
    (axis[0,1], p_rmse_list[0], "Polynomial (Degree 2) - RMSE per past timestep"),
    (axis[1,0], p_rmse_list[1], "Polynomial (Degree 3) - RMSE per past timestep"),
    (axis[1,1], p_rmse_list[2], "Polynomial (Degree 4) - RMSE per past timestep"),
)
for panel, values, title in panels:
    panel.plot(timesteps, values)
    panel.set_title(title)
    panel.set_xlabel("Number of past timesteps")
    panel.set_ylabel("RMSE")

In [None]:
# 2x2 grid of WAPE versus number of past timesteps: the linear model followed
# by polynomial degrees 2, 3 and 4.
# NOTE(review): wape_list is not assigned in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel unless it is defined elsewhere.
figure, axis = plt.subplots(2, 2, figsize=(16,10))
timesteps = range(1, number_of_past_timesteps+1)

panels = (
    (axis[0,0], wape_list, "Linear - WAPE per past timestep"),
    (axis[0,1], p_wape_list[0], "Polynomial (Degree 2) - WAPE per past timestep"),
    (axis[1,0], p_wape_list[1], "Polynomial (Degree 3) - WAPE per past timestep"),
    (axis[1,1], p_wape_list[2], "Polynomial (Degree 4) - WAPE per past timestep"),
)
for panel, values, title in panels:
    panel.plot(timesteps, values)
    panel.set_title(title)
    panel.set_xlabel("Number of past timesteps")
    panel.set_ylabel("WAPE")

In [None]:
# 2x2 grid of R2 versus number of past timesteps: the linear model followed
# by polynomial degrees 2, 3 and 4.
# NOTE(review): r2_list is not assigned in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel unless it is defined elsewhere.
figure, axis = plt.subplots(2, 2, figsize=(16,10))
timesteps = range(1, number_of_past_timesteps+1)

panels = (
    (axis[0,0], r2_list, "Linear - R2 score per past timestep"),
    (axis[0,1], p_r2_list[0], "Polynomial (Degree 2) - R2 score per past timestep"),
    (axis[1,0], p_r2_list[1], "Polynomial (Degree 3) - R2 score per past timestep"),
    (axis[1,1], p_r2_list[2], "Polynomial (Degree 4) - R2 score per past timestep"),
)
for panel, values, title in panels:
    panel.plot(timesteps, values)
    panel.set_title(title)
    panel.set_xlabel("Number of past timesteps")
    panel.set_ylabel("R2 score")