In [1]:
import numpy as np
import pandas as pd

# Make DataFrame.plot / Series.plot render through Plotly instead of pandas' default backend.
pd.options.plotting.backend = "plotly"

In [2]:
# Load the hourly production/consumption CSV and keep two views of it:
#   hourly_data        - "datetime" stays a regular column (used for positional slicing later)
#   hourly_data_index  - the same table indexed by its "datetime" column (used for per-column lookups)
hourly_data_path = 'students_drahi_production_consumption_hourly.csv'
hourly_data = pd.read_csv(hourly_data_path)
hourly_data_index = hourly_data.set_index("datetime")

In [3]:
from sklearn.preprocessing import FunctionTransformer

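# Small hack to "freeze" a transformer after its first fit: the model input is a flattened
# window of hourly rows, so one already-fitted transformer must be reused for every lag column.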

def fit_and_freeze(transformer):
    """Wrap ``transformer`` so it is fitted once and afterwards only applied.

    The returned ``FunctionTransformer`` wraps a closure. The first time the
    closure is called it fits ``transformer`` on the array it receives; on
    that call and on every later call it returns ``transformer.transform``
    of its input. The fitting therefore happens lazily, on the first array
    that is actually pushed through the closure.

    Parameters
    ----------
    transformer : object with ``fit(x)`` and ``transform(x)`` methods.

    Returns
    -------
    FunctionTransformer
        Wrapper around the fit-once closure.
    """
    already_fitted = False

    def _fit_once_then_transform(x):
        nonlocal already_fitted
        if not already_fitted:
            transformer.fit(x)
            already_fitted = True
        return transformer.transform(x)

    return FunctionTransformer(_fit_once_then_transform)

In [4]:
# ---------------------------------------------------------------------------
# Target: energy consumed per calendar day, integrated from the hourly power.
# ---------------------------------------------------------------------------
time = pd.to_datetime(hourly_data['datetime']).values.astype('datetime64[s]')
power_consumption = hourly_data['kw_total_zone2'].values

# np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; pick whichever exists.
_trapezoid = getattr(np, 'trapezoid', None) or np.trapz

dec = []    # daily energy consumption, one value per accepted day
t_dec = []  # midnight timestamp that opens each accepted day

for ti, t in enumerate(time):
    tmp_t = pd.Timestamp(t)

    # Only samples that fall exactly on midnight open a new day.
    if tmp_t.hour != 0 or tmp_t.minute != 0:
        continue

    day_end = np.datetime64(tmp_t + pd.Timedelta(days=1))
    # NOTE(review): both bounds are strict, so the midnight sample itself is excluded;
    # with hourly data the integral below runs from 01:00 to 23:00 — confirm this is intended.
    ind = (time > tmp_t) & (time < day_end)

    day_power = power_consumption[ind]
    # Days with no samples or with any missing power reading are skipped.
    if len(day_power) > 0 and not np.isnan(day_power).any():
        day_seconds = time[ind].astype('int64')  # seconds since epoch (time is datetime64[s])
        t_dec.append(np.datetime64(tmp_t).astype('datetime64[s]'))
        # Integrate over seconds, then divide by 3600 to express the result per hour
        # (presumably kWh, assuming the column is in kW — TODO confirm).
        dec.append(_trapezoid(day_power, day_seconds) / 3600)

t_dec = np.array(t_dec)
dec = np.array(dec)

# ---------------------------------------------------------------------------
# Predictors: every feature column over the N days preceding each target day.
# ---------------------------------------------------------------------------
N = 7 # N days of predictors beforehand
final_ind = []
final_hourly = []

predictor_window = pd.Timedelta(days=N)

# All columns except the first ("datetime"); extracted once instead of re-slicing
# the DataFrame inside the loop.
feature_values = hourly_data.iloc[:, 1:].values

for ti, t in enumerate(t_dec):
    tmp_t = pd.Timestamp(t)

    # Rows inside the N days immediately before the target day (target day excluded).
    ind = (time >= tmp_t - predictor_window) & (time < tmp_t)
    window = feature_values[ind]

    # Keep only complete windows without NaNs; useful for the student dataset.
    # A window needs at least 24 * N rows to count as complete.
    if len(window) >= 24 * N and not np.isnan(window).any():
        final_ind.append(ti)
        final_hourly.append(window)

target_time = t_dec[final_ind]
predictors = np.array(final_hourly)

# One row per sample: the (rows x features) window is flattened row-major, so
# flat position k holds feature (k % n_features) of window row (k // n_features).
X = predictors.reshape(len(predictors), -1)
y = np.array(dec[final_ind])

In [5]:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Layout of a flattened sample: position k holds feature (k % n_features) of
# window row (k // n_features). Both sizes are read from the data instead of
# being hard-coded, so they follow changes to N or to the CSV columns.
feature_names = list(hourly_data.columns[1:])   # every column after "datetime"
n_features = len(feature_names)
n_flat = X.shape[1]

airtemp_pos = feature_names.index("AirTemp")
solar_pos = feature_names.index("Global_Solar_Flux")

# AirTemp preprocessor

airtemp_pipe = Pipeline([
    ('scaler', fit_and_freeze(StandardScaler()))
])
# The scaler is fitted inside the wrapped function, which only runs on transform;
# Pipeline.fit alone never calls it. fit_transform pushes the full series through
# once so the scaler is frozen on the whole column rather than on the first lag
# column it meets later.
airtemp_pipe.fit_transform(hourly_data_index["AirTemp"].values.reshape(-1, 1))
airtemp_transformer = [(f'AirTemp-{i}', airtemp_pipe, [i])
                       for i in range(airtemp_pos, n_flat, n_features)]

#Global Solar Flux preprocessor

def _deskew_solar_flux(x):
    """Return log2(4 + x) element-wise (np.log2 already broadcasts over arrays)."""
    return np.log2(4 + x)

global_solar_flux_pipe = Pipeline([
    ('deskew', FunctionTransformer(_deskew_solar_flux)),
    ('scaler', fit_and_freeze(StandardScaler()))
])
# Same reasoning as for AirTemp: transform once to freeze the scaler on the full series.
global_solar_flux_pipe.fit_transform(hourly_data_index["Global_Solar_Flux"].values.reshape(-1, 1))
global_solar_flux_transformer = [(f'Global_Solar_Flux-{i}', global_solar_flux_pipe, [i])
                                 for i in range(solar_pos, n_flat, n_features)]

# Every remaining feature is passed through untouched, ordered by feature then by lag.
passthrough_positions = [j for j in range(n_features) if j not in (airtemp_pos, solar_pos)]
a = [(f'{j}-{i}', 'passthrough', [i])
     for j in passthrough_positions
     for i in range(j, n_flat, n_features)]


In [6]:

from sklearn.model_selection import train_test_split

# Hold out 20 % of the samples as a test set; the fixed seed keeps the split reproducible.
# NOTE(review): the split is shuffled while consecutive samples share most of their
# 7-day predictor window, so train and test rows overlap in time — confirm this is
# acceptable or switch to a chronological split.
holdout = train_test_split(X, y, test_size=0.2, random_state=148)
X_train, X_test, y_train, y_test = holdout

In [7]:
from sklearn.linear_model import Ridge
from joblib import Memory
from tempfile import mkdtemp
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# Names of the per-hour features, in the order they occupy inside each flattened sample
# (presumably the CSV column order after "datetime" — verify against hourly_data.columns[1:]).
cols = ['AirTemp', 'pres', 'rain', 'rh', 'wd', 'ws', 'Global_Solar_Flux',
        'Diffuse_Solar_Flux', 'Direct_Solar_Flux', 'Downwelling_IR_Flux', 'SAA',
        'SZA', 'PAC', 'TGBT [kW]', 'kw_heater_corridor1_zone1',
        'kw_heaters_corridor_zone2', 'kw_heaters_toilets_zone2',
        'kw_heatingcoolingtotal_zone1', 'kw_heatingcoolingtotal_zone2',
        'kw_lights_zone1', 'kw_lights_zone2', 'kw_total_zone1',
        'kw_total_zone2', 'kw_ventilation_zone1', 'kw_ventilation_zone2',
        'kw_water_heater_zone2', 'plugs_zone2']

# Sizes come from the data rather than from literals, so they follow N and the column list.
n_features = len(cols)
n_flat = X.shape[1]
assert n_flat % n_features == 0, "flattened width must be a whole number of hourly rows"

# One scratch cache directory shared by all column pipelines (instead of one per column).
mem = Memory(location=mkdtemp(), verbose=10)

pipelines = []

for col in cols:
    scaler_pipeline = Pipeline([('scaler', StandardScaler())], memory=mem)
    # Fit each scaler on its own column (previously every scaler was fitted on "AirTemp").
    scaler_pipeline.fit(hourly_data_index[col].values.reshape(-1, 1))
    pipelines.append(scaler_pipeline)

memmod = Memory(location=mkdtemp(), verbose=10)

# Flat position i holds feature (i % n_features) of window row (i // n_features);
# each position is routed to the scaler pipeline of its feature.
# NOTE(review): ColumnTransformer fits clones of the transformers it is given, so the
# pre-fitted statistics above are presumably discarded once `preprocessor` is fitted — confirm.
preprocessor = ColumnTransformer(transformers=[
    (f"{cols[i % n_features]}-{i // n_features}", pipelines[i % n_features], [i])
    for i in range(n_flat)
])

# The preprocessor is currently NOT part of the model: Ridge is trained on the raw,
# unscaled features. Earlier experiments used GradientBoostingRegressor and MLPRegressor
# as the regressor step.
model = Pipeline(steps=[
    ('regressor', Ridge(alpha=1e8, random_state=0))
])

model

In [8]:
# Fit the pipeline on the training split only; the held-out split is used for scoring afterwards.
model.fit(X_train, y_train)

In [9]:
# mean_squared_error comes from scikit-learn's regression metrics; the previous import
# pulled the image-comparison metric from scikit-image, an unrelated dependency.
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate

# Negated so the number reads on the same scale as the 'neg_mean_squared_error'
# scorer used in the cross-validation cells (closer to zero is better).
-mean_squared_error(y_test, model.predict(X_test))

-685.3041160545804

In [10]:
# Average negated MSE over 5 cross-validation folds of the full dataset.
cv_scores = cross_validate(model, X, y, cv=5, scoring='neg_mean_squared_error')['test_score']
cv_scores.mean()

-853.343192440404

In [11]:
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths: 21 log-spaced values from 1e-2 to 1e10.
param_grid = {'alpha': np.logspace(start=-2, stop=10, num=21)}

grid = GridSearchCV(
    estimator=Ridge(),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)

# The search is run on every sample (no separate hold-out in this cell).
grid.fit(X, y)

best_alpha = grid.best_params_['alpha']

# Best cross-validated MSE expressed relative to the variance of y.
y_variance = np.mean((y - y.mean())**2)
best_rse = -grid.best_score_/y_variance
print('Best alpha from validation:', best_alpha)

Best alpha from validation: 10000000.0


In [12]:
# Re-score the estimator selected by the grid search with the same 5-fold protocol.
best_model = grid.best_estimator_
best_cv = cross_validate(best_model, X, y, cv=5, scoring='neg_mean_squared_error')
best_cv['test_score'].mean()

-785.0043742391169

In [13]:
def relative_squared_error(y_pred, y_true):
    """Relative squared error (RSE; also called relative mean square error).

    Mean squared error of the prediction divided by the mean squared deviation
    of ``y_true`` from its own mean. < 1 is good, = 1 is bad, > 1 really bad.

    Parameters
    ----------
    y_pred : array-like
        Predicted values.
    y_true : array-like
        Reference values.

    Returns
    -------
    float
        The ratio described above. If ``y_true`` is constant the denominator
        is zero and NumPy's division semantics apply (``nan`` or ``inf``).
    """
    # Convert to plain float arrays so that lists are accepted and pandas Series
    # are compared position by position rather than aligned on their index.
    y_pred = np.asarray(y_pred, dtype=float)
    y_true = np.asarray(y_true, dtype=float)
    return np.mean((y_pred - y_true)**2)/np.mean((y_true - y_true.mean())**2)

In [14]:
# NOTE(review): best_model comes from grid.fit(X, y), so scoring it on the same X, y
# presumably yields an in-sample figure — confirm before reading it as generalisation error.
relative_squared_error(best_model.predict(X), y)

0.24059752974880574

In [None]:
from skl2onnx import to_onnx

# Convert the fitted pipeline to ONNX, handing the converter a single float32 row
# as the example input.
sample_row = X[:1].astype(np.float32)
onnx_model = to_onnx(model, sample_row, target_opset=12, verbose=True)

In [None]:
# Persist the converted model as a binary ONNX file.
# NOTE(review): the file name mentions GradientBoost while the exported `model`
# pipeline wraps a Ridge regressor — confirm the name is still the one consumers expect.
with open('AStar-GradientBoost.onnx', 'wb') as onnx_file:
    payload = onnx_model.SerializeToString()
    onnx_file.write(payload)

In [None]:
from onnxruntime import InferenceSession

# Reload the exported file and run it on the test split (CPU only).
sess = InferenceSession('AStar-GradientBoost.onnx', providers=["CPUExecutionProvider"])

onnx_inputs = {"X": X_test.astype(np.float32)}
onnx_outputs = sess.run(None, onnx_inputs)
# Take the first output and keep its first column as a flat vector of predictions.
rerun = onnx_outputs[0][:, 0]

rerun

In [None]:
# Predictions of the original scikit-learn pipeline on the same test split,
# shown for a side-by-side comparison with the ONNX runtime output.
model.predict(X_test)