<a href="https://colab.research.google.com/github/fabricekfr/Backify/blob/master/Auto_sklearn_Automl_svr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title
!apt-get -y install build-essential swig
!curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip3 install
!pip3 install auto-sklearn

In [3]:
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \
    UniformIntegerHyperparameter, CategoricalHyperparameter

import sklearn.metrics
import autosklearn
import autosklearn.regression
import autosklearn.pipeline.components.regression
from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm
from autosklearn.pipeline.constants import SPARSE, DENSE, \
    SIGNED_DATA, UNSIGNED_DATA, PREDICTIONS

import pandas as pd
import datetime
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import tensorflow as tf

In [4]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	n_vars = 1 if type(data) is list else data.shape[1]
	df = pd.DataFrame(data)
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
		names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
	# forecast sequence (t, t+1, ... t+n)
	for i in range(0, n_out):
		cols.append(df.shift(-i))
		if i == 0:
			names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
		else:
			names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
	# put it all together
	agg = pd.concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg

In [5]:
df_full_sample = pd.read_csv('/content/df_full_sample.csv')
date_times = pd.to_datetime(df_full_sample.pop('timestamp'), format="%Y-%m-%d %H:%M:%S")

In [6]:
timestamp_s = date_times.map(datetime.datetime.timestamp)
day = 24*60*60
year = (365.2425)*day

df_full_sample['Day sin'] = np.sin(timestamp_s * (2 * np.pi / day))
df_full_sample['Day cos'] = np.cos(timestamp_s * (2 * np.pi / day))
df_full_sample['Year sin'] = np.sin(timestamp_s * (2 * np.pi / year))
df_full_sample['Year cos'] = np.cos(timestamp_s * (2 * np.pi / year))

INFO:numexpr.utils:NumExpr defaulting to 4 threads.


In [7]:
# specify the number of lag hours
n_hours = 24
n_features = 8
# frame as supervised learning
df_full_sample_reframed = series_to_supervised(df_full_sample, n_hours, 24)
yColumnIndex = df_full_sample.columns.get_loc('meter_reading')+1
for column in df_full_sample_reframed.columns:
	 if ('-' not in column) & (f"var{yColumnIndex}" not in column):
		 del df_full_sample_reframed[column]

In [8]:
n = len(df_full_sample_reframed)
df_train = df_full_sample_reframed[0:int(n*0.8)]

df_test = df_full_sample_reframed[int(n*0.8):]


train_mean = df_train.mean()
train_std = df_train.std()

df_train = (df_train - train_mean) / train_std
df_test = (df_test - train_mean) / train_std

In [9]:
# split into input and outputs
n_obs = n_hours * n_features
train_X, train_y = df_train.values[:, :n_obs], df_train.values[:, n_obs:]
test_X, test_y = df_test.values[:, :n_obs], df_test.values[:, n_obs:]
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

(804416, 192) (804416, 24) (201105, 192) (201105, 24)


In [10]:
automl  = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=60,
    per_run_time_limit=20,
    include_estimators=['libsvm_svr'],
    resampling_strategy='holdout',
    resampling_strategy_arguments={'train_size': 0.75},
    ensemble_memory_limit=2*1024,
    ml_memory_limit=5*1024,
    #metric=autosklearn.metrics.mean_absolute_error
)

from sklearn.multioutput import RegressorChain
wrapper = RegressorChain(automl)
wrapper.fit(train_X, train_y)







RegressorChain(base_estimator=AutoSklearnRegressor(dask_client=None,
                                                   delete_output_folder_after_terminate=True,
                                                   delete_tmp_folder_after_terminate=True,
                                                   disable_evaluator_output=False,
                                                   ensemble_memory_limit=2048,
                                                   ensemble_nbest=50,
                                                   ensemble_size=50,
                                                   exclude_estimators=None,
                                                   exclude_preprocessors=None,
                                                   get_smac_object_callback=None,
                                                   include_estimators=['libsvm...
                                                   initial_configurations_via_metalearning=25,
                               

In [11]:
# Print prediction score and statistics
# =====================================
pred_y = wrapper.predict(test_X)
#print(wrapper.sprint_statistics())
print("MAE score: ", mean_absolute_error(pred_y, test_y))
print("RMSE score: ", mean_squared_error(pred_y, test_y, squared=True))
print("MAPE score: ", np.mean(tf.keras.losses.mape(test_y, pred_y).numpy()))
#print(automl.show_models())
print("r2 score: ", sklearn.metrics.r2_score(pred_y, test_y))

MAE score:  0.4695889855288702
RMSE score:  0.4224103313924225
MAPE score:  100.36293795652037
r2 score:  0.0
