In [188]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Test: a simple model

In [189]:
from hackaton_eet.data.loaders import get_data
from hackaton_eet.data.transformers import align_data

First, let's get the data. To make this easier, a helper function `get_data` has been created. As input, this function expects a dictionary of your desired datasets, with the columns and lags to use for each. Note that all lags are given in units of 15 minutes, and the dates in the columns are the dates you are trying to predict.

Since we run the model each day at 10:00:00, this means that lag 1 is 09:45:00, and so on.

For each dataset, the code knows which parts are known in advance and which are only known after the fact, and it will only return the data you have available at each point in time. It is highly recommended to use this function instead of writing your own.

In [212]:
# Feature sets to request from get_data: per dataset, which columns and which lags.
# NOTE(review): the original inline comment says lags are expressed in days, whereas the
# notebook prose describes 15-minute steps -- confirm the unit against get_data.
feature_datasets = {
    'tennet.verrekenprijzen': {
        'lags': [1, 6, 13, 15],
        'columns': ['invoeden', 'Afnemen'],
    },
    'tennet.igcc': {
        'lags': [1],
        'columns': [
            "mean_IGCC_op",
            "max_IGCC_op",
            "mean_IGCC_af",
            "max_IGCC_af",
            "mean_opregelen",
            "mean_Afregelen",
            "mean_opregelen_reserve",
            "mean_afregelen_reserve",
            "mean_Mid_prijs_opregelen",
            "max_Hoogste_prijs_opregelen",
            "min_Laagste_prijs_afregelen",
            "max_rampUp",
            "avg_rampUp",
            "max_rampCrossOpregel",
            "max_rampCrossOpregel_sqr",
        ],
    },
}

# The target dataset is requested with an empty spec (no explicit columns or lags).
target_dataset = {'tennet.target': {}}

feature_data = get_data(feature_datasets)
target_data = get_data(target_dataset)

In [213]:
# Display the assembled feature table: one row per lagged feature, one column per timestamp.
feature_data

datetime,2014-05-17 02:00:00+02:00,2014-05-17 02:15:00+02:00,2014-05-17 02:30:00+02:00,2014-05-17 02:45:00+02:00,2014-05-17 03:00:00+02:00,2014-05-17 03:15:00+02:00,2014-05-17 03:30:00+02:00,2014-05-17 03:45:00+02:00,2014-05-17 04:00:00+02:00,2014-05-17 04:15:00+02:00,...,2016-05-30 23:45:00+02:00,2016-05-31 00:00:00+02:00,2016-05-31 00:15:00+02:00,2016-05-31 00:30:00+02:00,2016-05-31 00:45:00+02:00,2016-05-31 01:00:00+02:00,2016-05-31 01:15:00+02:00,2016-05-31 01:30:00+02:00,2016-05-31 01:45:00+02:00,2016-05-31 02:00:00+02:00
tennet.igcc.mean_IGCC_op.lag_1,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tennet.igcc.max_IGCC_op.lag_1,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tennet.igcc.mean_IGCC_af.lag_1,,,,,,,,,,,...,178.1,193.4,111.3,98.8,97.5,19.3,108.1,151.1,90.9,38.1
tennet.igcc.max_IGCC_af.lag_1,,,,,,,,,,,...,345.0,345.0,207.0,227.0,206.0,95.0,202.0,202.0,182.0,135.0
tennet.igcc.mean_opregelen.lag_1,,,,,,,,,,,...,0.0,0.1,4.2,0.0,0.0,7.5,7.9,0.0,0.0,0.1
tennet.igcc.mean_Afregelen.lag_1,,,,,,,,,,,...,18.2,51.3,1.5,25.3,75.3,34.1,0.2,0.0,0.0,0.0
tennet.igcc.mean_opregelen_reserve.lag_1,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tennet.igcc.mean_afregelen_reserve.lag_1,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tennet.igcc.mean_Mid_prijs_opregelen.lag_1,,,,,,,,,,,...,24.1,23.8,23.8,23.2,23.2,23.2,23.2,23.2,23.2,23.2
tennet.igcc.max_Hoogste_prijs_opregelen.lag_1,,,,,,,,,,,...,,25.69,30.8,,,40.02,23.36,,,23.36


## Training the model

In [206]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, explained_variance_score, mean_squared_error

In [207]:
# Index at which the samples are split into train and test:
# 24 hours * 4 quarter-hour steps * 400, i.e. presumably 400 days of 15-minute samples.
s = 24*4*400

In [208]:
# Replace missing values with 0 and pass the raw arrays to align_data.
# NOTE(review): align_data presumably trims both arrays to matching timestamps --
# confirm in hackaton_eet.data.transformers.
X, y = align_data(feature_data.fillna(0).values, target_data.fillna(0).values)

# feature_data is laid out with one column per timestamp, so transpose to get
# one row per sample, as the estimators expect.
X, y = X.T, y.T

# Chronological split: the first `s` samples are used for training, the rest is held out.
X_train, y_train = X[:s], y[:s]
X_test, y_test = X[s:], y[s:]

In [209]:
# Linear-regression baseline: fit on the training split, then predict the held-out split.
# fit() returns the estimator itself, so construction and fitting can be chained.
linest = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = linest.predict(X_test)

In [210]:
# Score the predictions currently held in y_pred against the held-out targets:
# mean absolute error and explained variance (reported as "VAF").
print(
    "MAE: %.2f" % mean_absolute_error(y_test, y_pred),
    "VAF: %.2f" % explained_variance_score(y_test, y_pred),
    sep="\n",
)

MAE: 35.25
VAF: -0.02


In [116]:
# Show the fitted linear model's repr (its constructor parameters).
linest

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [117]:
# Shallow random forest as a non-linear comparison to the linear model.
est = RandomForestRegressor(
    n_estimators=200,
    max_depth=3,
    n_jobs=-1,  # use all available cores
    random_state=42,  # fixed seed so the reported scores are reproducible across runs
)
est = est.fit(X_train, y_train)
# Note: this overwrites the y_pred produced by the linear model.
y_pred = est.predict(X_test)

So, did it work?

In [118]:
# Score the predictions currently held in y_pred against the held-out targets:
# mean absolute error and explained variance (reported as "VAF").
print(
    "MAE: %.2f" % mean_absolute_error(y_test, y_pred),
    "VAF: %.2f" % explained_variance_score(y_test, y_pred),
    sep="\n",
)

MAE: 29.09
VAF: 0.02


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Residuals (prediction minus truth) for output column 1 over the test split.
# Uses the explicit fig/ax interface and labels the axes so the figure stands on its own.
fig, ax = plt.subplots()
ax.plot(y_pred[:, 1] - y_test[:, 1])
ax.set(
    title="Prediction error on the test split (output column 1)",
    xlabel="Test sample index",
    ylabel="y_pred - y_test",
)
plt.show()