In [1]:
import numpy as np
import pandas as pd

# Route every DataFrame.plot / Series.plot call in this notebook through plotly.
pd.set_option("plotting.backend", "plotly")

In [2]:
# Load the hourly table twice over: `hourly_data` keeps `datetime` as a regular
# column, `hourly_data_index` is a copy that uses it as the row index.
hourly_data_path = 'students_drahi_production_consumption_hourly_completed.csv'
hourly_data = pd.read_csv(filepath_or_buffer=hourly_data_path)
hourly_data_index = hourly_data.set_index(keys="datetime")

In [3]:
# Build the regression target: one energy value per calendar day, obtained by
# integrating the zone-2 total power over that day's samples.
time = pd.to_datetime(hourly_data['datetime']).values.astype('datetime64[s]')
power_consumption = hourly_data['kw_total_zone2'].values

# Seconds since the epoch, computed once. An explicit int64 avoids the
# platform-dependent width of plain `int` (32-bit on older Windows NumPy builds).
time_seconds = time.astype('int64')

# NumPy 2.0 renamed `trapz` to `trapezoid` and deprecated the old name;
# pick whichever this NumPy version provides.
_trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz

dec = [] # daily energy consumption
t_dec = [] # midnight timestamp of every day that made it into `dec`

for t in time:
    tmp_t = pd.Timestamp(t)

    # Only a sample stamped exactly at 00:00 opens a new day.
    if tmp_t.hour == 0 and tmp_t.minute == 0:

        day_end = np.datetime64(tmp_t + pd.Timedelta(days=1))
        # NOTE(review): both bounds are strict, so the integral only spans the
        # samples strictly inside the day (the midnight samples at either end
        # are excluded). Kept unchanged because it defines the target values;
        # confirm this is the intended definition of "daily" energy.
        ind = (time > tmp_t) & (time < day_end)

        # Skip days with no samples or with any missing power reading.
        if ind.any() and not np.isnan(power_consumption[ind]).any():
            t_dec.append(np.datetime64(tmp_t).astype('datetime64[s]'))
            # Power integrated over seconds, divided by 3600 s/h
            # (kWh if the column is in kW, as its name suggests).
            dec.append(_trapezoid(power_consumption[ind], time_seconds[ind])/3600)

t_dec = np.array(t_dec)
dec = np.array(dec)

# Assemble the design matrix: for every target day, the predictors are all
# feature columns over the N days that precede that day's midnight.
N = 7 # N days of predictors beforehand
final_ind = []
final_hourly = []

predictor_window = pd.Timedelta(days=N)

# Every column except the first one (presumably `datetime` - TODO confirm),
# materialised once here instead of twice per loop iteration.
hourly_features = hourly_data.iloc[:, 1::].values

for ti, t in enumerate(t_dec):
    tmp_t = pd.Timestamp(t)

    # Samples in [t - N days, t): the target day itself is never part of its own predictors.
    ind = (time >= tmp_t - predictor_window) & (time < tmp_t)
    window = hourly_features[ind]

    # Keep only complete windows without NaNs; rejecting any data with NaNs is useful for the student dataset.
    if len(window) >= 24 * N and not np.isnan(window).any():
        final_ind.append(ti)
        final_hourly.append(window)

target_time = t_dec[final_ind]
predictors = np.array(final_hourly)

# Flatten each (hours, features) window into a single row; flat column i therefore
# holds feature i % n_features_per_hour at hour i // n_features_per_hour.
X = predictors.reshape(len(predictors), -1)
y = np.array(dec[final_ind])

In [4]:
from sklearn.ensemble import GradientBoostingRegressor
from joblib import Memory
from tempfile import mkdtemp
from sklearn.neural_network import MLPRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# Names of the hourly features, in the order they appear inside each flattened
# window of X (presumably the CSV column order after `datetime` - TODO confirm).
cols = ['AirTemp', 'pres', 'rain', 'rh', 'wd', 'ws', 'Global_Solar_Flux',
        'Diffuse_Solar_Flux', 'Direct_Solar_Flux', 'Downwelling_IR_Flux', 'SAA',
        'SZA', 'PAC', 'TGBT [kW]', 'kw_heater_corridor1_zone1',
        'kw_heaters_corridor_zone2', 'kw_heaters_toilets_zone2',
        'kw_heatingcoolingtotal_zone1', 'kw_heatingcoolingtotal_zone2',
        'kw_lights_zone1', 'kw_lights_zone2', 'kw_total_zone1',
        'kw_total_zone2', 'kw_ventilation_zone1', 'kw_ventilation_zone2',
        'kw_water_heater_zone2', 'plugs_zone2']

# Derive the layout from the data instead of hard-coding 27 features x 168 hours.
n_cols = len(cols)
n_features = X.shape[1]
if n_features % n_cols != 0:
    raise ValueError(
        f"X has {n_features} columns, which is not a multiple of the {n_cols} feature names in `cols`"
    )

# One scaler template per feature. They are deliberately left unfitted:
# ColumnTransformer clones every transformer before fitting it on its own column,
# so a fit done here would be thrown away. A `memory=` cache is also pointless on a
# single-step pipeline (only steps before the final one are cached) and would leave
# one temporary directory per feature behind.
pipelines = [Pipeline([('scaler', StandardScaler())]) for _ in cols]

# Cache directory for the fitted preprocessor of the outer pipeline.
memmod = Memory(location=mkdtemp(), verbose=10)

# One named transformer per flat column: "<feature>-<hour offset within the window>".
# NOTE(review): this appears equivalent to a single StandardScaler applied to all of X
# (which would fit and convert to ONNX far faster); the per-column form is kept for
# the readable transformer names.
preprocessor = ColumnTransformer(transformers=[
    (f"{cols[i % n_cols]}-{i // n_cols}", pipelines[i % n_cols], [i]) for i in range(n_features)
])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # random_state pins the estimator's internal randomness so re-runs give the same model.
    ('regressor', GradientBoostingRegressor(learning_rate=0.05, n_estimators=200, verbose=2, random_state=0))
    # Alternative regressors that were tried (Ridge would additionally need an import):
    #('regressor', MLPRegressor(hidden_layer_sizes=(100, 100, 50), max_iter=1000, verbose=True)),
    #('regressor', Ridge(alpha=1e8, random_state=0))
], memory=memmod)

model

In [5]:
# Fit the whole pipeline (preprocessor + regressor) on every sample in X / y;
# no hold-out split is made in this cell.
model.fit(X, y)

________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(ColumnTransformer(transformers=[('AirTemp-0',
                                 Pipeline(memory=Memory(location=C:\Users\CONCOR~1\AppData\Local\Temp\tmpw5dy0qso\joblib),
                                          steps=[('scaler', StandardScaler())]),
                                 [0]),
                                ('pres-0',
                                 Pipeline(memory=Memory(location=C:\Users\CONCOR~1\AppData\Local\Temp\tmp9ykslhum\joblib),
                                          steps=[('scaler', StandardScaler())]),
                                 [1]),
                                ('rain-0',
                                 Pipeline(memory=Memory(...
                      ..., 
array([[10.279, ..., -0.219],
       ...,
       [ 9.449, ..., -0.207]]), array([45.964, ..., 63.33 ]), None, message_clsname='Pipeline', 

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(


________________________________________________fit_transform_one - 6.4s, 0.1min
      Iter       Train Loss   Remaining Time 
         1         872.8780            2.26m
         2         805.5884            2.25m
         3         743.7762            2.23m
         4         687.8446            2.21m
         5         636.5607            2.20m
         6         588.9376            2.19m
         7         546.7239            2.17m
         8         506.9134            2.16m
         9         471.3164            2.16m
        10         437.8136            2.16m
        11         407.0645            2.14m
        12         379.1997            2.14m
        13         353.9863            2.13m
        14         331.2997            2.12m
        15         309.9513            2.13m
        16         290.5598            2.11m
        17         272.0434            2.13m
        18         255.5766            2.12m
        19         240.8023            2.10m
        20        

In [6]:
from skimage.metrics import mean_squared_error
from sklearn.model_selection import cross_validate

def relative_squared_error(y_pred, y_true):
    """Relative squared error (RSE; also called relative mean square error).

    The mean squared error of the predictions divided by the mean squared
    deviation of the true values from their own mean. < 1 is good, = 1 is bad
    (no better than always predicting the mean), > 1 really bad.

    Parameters
    ----------
    y_pred : array-like
        Predicted values.
    y_true : array-like
        Observed values, same shape as ``y_pred``.

    Returns
    -------
    float
        The ratio described above. When ``y_true`` is constant the denominator
        is zero and the result follows NumPy's division rules (nan or inf).
    """
    # Accept plain Python sequences as well as arrays: list - list and
    # list.mean() would otherwise raise.
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    return np.mean((y_pred - y_true)**2)/np.mean((y_true - y_true.mean())**2)

# Use scikit-learn's regression metric; this rebinds the name that the top of this
# cell imports from skimage.metrics (an image-comparison helper).
from sklearn.metrics import mean_squared_error

# Training-set scores (negated MSE, RSE). Predict once: every pass through the
# pipeline runs all per-column transformers, so calling predict twice doubles the cost.
y_train_pred = model.predict(X)
-mean_squared_error(y, y_train_pred), relative_squared_error(y_train_pred, y)

(-10.17429571194524, 0.01073119713593671)

In [7]:
# Mean negated MSE over the test folds of a 5-fold cross-validation.
cv_results = cross_validate(model, X, y, scoring='neg_mean_squared_error', cv=5)
cv_results['test_score'].mean()

________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(ColumnTransformer(transformers=[('AirTemp-0',
                                 Pipeline(memory=Memory(location=C:\Users\CONCOR~1\AppData\Local\Temp\tmpw5dy0qso\joblib),
                                          steps=[('scaler', StandardScaler())]),
                                 [0]),
                                ('pres-0',
                                 Pipeline(memory=Memory(location=C:\Users\CONCOR~1\AppData\Local\Temp\tmp9ykslhum\joblib),
                                          steps=[('scaler', StandardScaler())]),
                                 [1]),
                                ('rain-0',
                                 Pipeline(memory=Memory(...
                      ..., 
array([[13.661, ..., -0.275],
       ...,
       [ 9.449, ..., -0.207]]), array([36.181, ..., 63.33 ]), None, message_clsname='Pipeline', 

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(


________________________________________________fit_transform_one - 6.5s, 0.1min
      Iter       Train Loss   Remaining Time 
         1         611.2699            1.71m
         2         561.5078            1.73m
         3         516.1403            1.72m
         4         475.2880            1.71m
         5         437.6639            1.69m
         6         403.6083            1.69m
         7         373.0177            1.68m
         8         344.6027            1.67m
         9         318.9136            1.66m
        10         295.4168            1.65m
        11         274.0355            1.64m
        12         254.3011            1.63m
        13         235.8962            1.62m
        14         219.3438            1.61m
        15         203.9386            1.60m
        16         190.0661            1.60m
        17         177.5371            1.58m
        18         165.8361            1.58m
        19         154.9495            1.57m
        20        

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(


________________________________________________fit_transform_one - 6.2s, 0.1min
      Iter       Train Loss   Remaining Time 
         1         976.7789            1.75m
         2         900.1042            1.75m
         3         831.1022            1.74m
         4         767.7245            1.73m
         5         709.9497            1.72m
         6         657.5452            1.71m
         7         608.2073            1.70m
         8         563.1756            1.70m
         9         523.0033            1.69m
        10         486.3275            1.68m
        11         451.7145            1.67m
        12         420.5843            1.66m
        13         391.9137            1.65m
        14         366.1770            1.64m
        15         341.6945            1.63m
        16         320.2857            1.62m
        17         300.1404            1.61m
        18         281.0404            1.60m
        19         264.4431            1.59m
        20        

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(


________________________________________________fit_transform_one - 6.2s, 0.1min
      Iter       Train Loss   Remaining Time 
         1         841.4315            1.73m
         2         773.2821            1.75m
         3         711.7147            1.73m
         4         655.4931            1.73m
         5         604.0356            1.71m
         6         556.9612            1.71m
         7         514.6355            1.70m
         8         475.9106            1.69m
         9         440.7934            1.68m
        10         407.8447            1.67m
        11         378.1430            1.66m
        12         351.5206            1.66m
        13         326.5038            1.65m
        14         303.7177            1.64m
        15         283.0726            1.63m
        16         263.7270            1.62m
        17         246.3398            1.61m
        18         230.3754            1.60m
        19         215.6708            1.59m
        20        

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(


________________________________________________fit_transform_one - 6.2s, 0.1min
      Iter       Train Loss   Remaining Time 
         1         926.8652            1.74m
         2         854.8549            1.76m
         3         786.8154            1.74m
         4         726.8530            1.73m
         5         670.7082            1.72m
         6         620.9553            1.72m
         7         574.5620            1.71m
         8         531.7476            1.70m
         9         494.1846            1.69m
        10         458.4158            1.68m
        11         426.6738            1.67m
        12         396.9612            1.67m
        13         370.1782            1.66m
        14         345.0631            1.65m
        15         322.8828            1.64m
        16         302.1340            1.64m
        17         282.6519            1.63m
        18         264.7388            1.62m
        19         248.3649            1.61m
        20        

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(


________________________________________________fit_transform_one - 6.2s, 0.1min
      Iter       Train Loss   Remaining Time 
         1         898.1983            1.74m
         2         828.6388            1.75m
         3         764.4593            1.74m
         4         707.0583            1.74m
         5         654.1300            1.73m
         6         606.2596            1.72m
         7         563.0231            1.71m
         8         523.3821            1.70m
         9         485.8409            1.69m
        10         451.5024            1.68m
        11         421.1228            1.67m
        12         393.6125            1.67m
        13         366.9524            1.66m
        14         342.9182            1.65m
        15         320.7974            1.64m
        16         301.0621            1.63m
        17         283.0797            1.62m
        18         265.6556            1.61m
        19         248.7759            1.60m
        20        

-271.36869645246924

In [8]:
from skl2onnx import to_onnx

# Convert the fitted pipeline to ONNX; the single float32 sample row is passed
# so the converter can infer the input signature.
onnx_sample = X[:1].astype(np.float32)
onnx_model = to_onnx(model, onnx_sample, target_opset=12, verbose=True)

[to_onnx] initial_types=[('X', FloatTensorType(shape=[None, 4536]))]
[convert_sklearn] parse_sklearn_model
[convert_sklearn] convert_topology
[convert_operators] begin
[convert_operators] iteration 1 - n_vars=0 n_ops=9074
[call_converter] call converter for 'SklearnArrayFeatureExtractor'.
[call_converter] call converter for 'SklearnScaler'.
[call_converter] call converter for 'SklearnArrayFeatureExtractor'.
[call_converter] call converter for 'SklearnScaler'.
[call_converter] call converter for 'SklearnArrayFeatureExtractor'.
[call_converter] call converter for 'SklearnScaler'.
[call_converter] call converter for 'SklearnArrayFeatureExtractor'.
[call_converter] call converter for 'SklearnScaler'.
[call_converter] call converter for 'SklearnArrayFeatureExtractor'.
[call_converter] call converter for 'SklearnScaler'.
[call_converter] call converter for 'SklearnArrayFeatureExtractor'.
[call_converter] call converter for 'SklearnScaler'.
[call_converter] call converter for 'SklearnArrayFea

In [9]:
# Persist the converted model as a binary ONNX file in the working directory.
with open('AStar-GradientBoost.onnx', mode='wb') as onnx_file:
    onnx_bytes = onnx_model.SerializeToString()
    onnx_file.write(onnx_bytes)

In [10]:
from onnxruntime import InferenceSession

# Reload the exported file with ONNX Runtime (CPU only) and score the full design matrix.
sess = InferenceSession('AStar-GradientBoost.onnx', providers=["CPUExecutionProvider"])
onnx_inputs = {"X": X.astype(np.float32)}
onnx_outputs = sess.run(None, onnx_inputs)
# First output, first column: one prediction per row of X.
rerun = onnx_outputs[0][:, 0]

rerun

array([ 51.336212 ,  58.222004 , 104.797455 ,  97.66365  ,  97.85977  ,
        99.70541  ,  97.59239  ,  62.02991  ,  78.22247  ,  96.29935  ,
        95.15741  , 108.41914  , 113.20386  , 116.8207   ,  78.75587  ,
        88.57376  , 133.77213  , 149.59976  , 132.67177  , 132.75331  ,
       116.75346  ,  88.20912  ,  86.01943  , 127.01332  , 117.278625 ,
       108.527176 , 109.14191  , 116.890045 ,  88.41673  ,  82.12076  ,
       124.5159   , 114.25835  , 104.367386 , 108.71654  , 102.40436  ,
        86.049675 ,  85.88189  , 116.09068  , 123.541214 , 101.617836 ,
       102.47592  , 100.55977  ,  84.33989  ,  78.14575  , 124.526596 ,
       115.71432  , 102.027374 ,  54.349953 ,   4.6094894,   1.3512878,
         1.8301239,   6.8823776,  66.816444 , 111.8513   , 116.88832  ,
       116.2038   ,  94.86139  ,  98.8379   , 131.90546  , 135.20149  ,
       123.35846  , 107.488525 , 100.14201  ,  77.38715  ,  81.19957  ,
       129.95296  , 114.48186  , 111.84845  , 115.039215 , 109.0

In [11]:
# Predictions of the scikit-learn pipeline on the same X, displayed so they can be
# compared by eye with the ONNX Runtime output (`rerun`) of the previous cell.
model.predict(X)

array([ 51.33620962,  58.22200359, 104.79745562,  97.66365208,
        97.85976785,  99.70540926,  97.5923902 ,  62.02990841,
        78.22246653,  96.29935199,  95.15740699, 108.41914835,
       113.20385741, 116.82069504,  78.75587229,  88.57375544,
       133.77212283, 149.59976642, 132.67175853, 132.75330204,
       116.75346768,  88.20912065,  86.01942948, 127.01331891,
       117.27862001, 108.52717457, 109.14191162, 116.89005961,
        88.41672977,  82.12075394, 124.5159106 , 114.25834325,
       104.36739466, 108.71654222, 102.4043577 ,  86.04966677,
        85.8818842 , 116.09069029, 123.54121845, 101.61784141,
       102.47592534, 100.5597749 ,  84.33988728,  78.14575421,
       124.52660557, 115.71431373, 102.02738197,  54.34996103,
         4.60948974,   1.35129276,   1.8301256 ,   6.88237333,
        66.81644209, 111.85130618, 116.88832312, 116.20380151,
        94.86138302,  98.83789477, 131.90544303, 135.20149157,
       123.35846603, 107.48851666, 100.14201818,  77.38