# Module - Preprocessing

In [1]:
# Put the project's `src` directory first on the module search path so that
# local packages can be imported below (presumably where `ontime` lives).
# The path is relative, so it depends on the notebook's working directory.
import sys
sys.path.insert(0, '../src')

In [2]:
import pandas as pd
import numpy as np
from darts.datasets import EnergyDataset

In [3]:
import ontime as on

The `LightGBM` module could not be imported. To enable LightGBM support in Darts, follow the detailed instructions in the installation guide: https://github.com/unit8co/darts/blob/master/INSTALL.md
The `Prophet` module could not be imported. To enable Prophet support in Darts, follow the detailed instructions in the installation guide: https://github.com/unit8co/darts/blob/master/INSTALL.md
  from tqdm.autonotebook import tqdm


---
## Load data

In [4]:
# Load the darts energy dataset and cast its values to float32 in one chain.
ts = EnergyDataset().load().astype(np.float32)

---

## TensorFlow Preprocessing

In [5]:
from ontime.module import preprocessing

### Normalize

In [6]:
# Normalize the series with the project's preprocessing helper.
# NOTE(review): the recorded output shows warnings raised at np.nanmin /
# np.nanmax calls — presumably "All-NaN slice" warnings caused by columns
# that contain only NaN. Confirm, and consider dropping those columns first.
ts_t = preprocessing.common.normalize(ts)

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


### Train test split (for time series)

In [7]:
# Split the normalized series into a train part and a test part.
# NOTE(review): train_split=0.8 presumably keeps the first 80% of the time
# steps for training (chronological split) — confirm in `ontime`.
train, test = preprocessing.common.train_test_split(ts_t, train_split=0.8)

### Split time series into chunks

In [8]:
# Cut both splits into chunks, passing the same length argument (6) to each.
train_list, test_list = (
    preprocessing.common.split_by_length(series, 6)
    for series in (train, test)
)

### Split into X and y

In [9]:
# Separate every chunk into an input part and a target part, using the same
# (4, 2) arguments for the train and the test chunks.
(X_train, y_train), (X_test, y_test) = (
    preprocessing.common.split_inputs_from_targets(chunks, 4, 2)
    for chunks in (train_list, test_list)
)

### Transform into a generic data type

In [10]:
# Convert the four lists of time series to NumPy arrays.
# Note: the names are rebound in place, so this cell is meant to run once
# per kernel session (a second run would feed arrays back into the helper).
X_train, y_train, X_test, y_test = map(
    preprocessing.common.timeseries_list_to_numpy,
    (X_train, y_train, X_test, y_test),
)

In [11]:
# Print one shape per line, in the order X_train, y_train, X_test, y_test.
print(
    *(array.shape for array in (X_train, y_train, X_test, y_test)),
    sep='\n',
)

(4675, 4, 28)
(4675, 2, 28)
(1168, 4, 28)
(1168, 2, 28)


In [33]:
import numpy as np
import tensorflow as tf

In [37]:
# Toy data for the windowing demo: the integers 1..9 and their negations.
features = np.arange(start=1, stop=10)
labels = np.negative(features)

In [43]:
# Sliding windows of 3 consecutive features, one window per batch. Each
# window is paired with the label at the window's first position (see the
# printed output of the loop further down).
dataset = tf.keras.utils.timeseries_dataset_from_array(
    data=features,
    targets=labels,
    sequence_length=3,
    batch_size=1,
)

In [45]:
# Display the toy feature array.
features

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [46]:
# Display the toy label array (the negated features).
labels

array([-1, -2, -3, -4, -5, -6, -7, -8, -9])

In [44]:
# Walk through the demo dataset and show each (window, label) batch.
for inputs, targets in dataset:
    print(f"Input: {inputs.numpy()!s} Target: {targets.numpy()!s}")

Input: [[1 2 3]] Target: [-1]
Input: [[2 3 4]] Target: [-2]
Input: [[3 4 5]] Target: [-3]
Input: [[4 5 6]] Target: [-4]
Input: [[5 6 7]] Target: [-5]
Input: [[6 7 8]] Target: [-6]
Input: [[7 8 9]] Target: [-7]


In [None]:
import numpy as np
import tensorflow as tf


class WindowGenerator:
    """Build sliding (inputs, targets) windows over a time series for Keras.

    Each window covers ``input_width + offset`` consecutive time steps. The
    first ``input_width`` steps are the inputs and the last ``target_width``
    steps are the targets, so inputs and targets overlap whenever
    ``target_width > offset``.

    Args:
        input_width: Number of time steps in the input part (>= 1).
        target_width: Number of time steps in the target part (>= 1 and at
            most ``input_width + offset``).
        offset: Number of steps between the end of the inputs and the end of
            the targets (>= 0).
        ts: Object exposing ``pd_dataframe()``; the returned frame provides
            the data and the column names.
        target_columns: Column name, or list of column names, to keep in the
            targets. ``None`` keeps every column.

    Raises:
        ValueError: If the widths/offset are out of range.
        KeyError: If a requested target column is not in the data.
    """

    def __init__(self, input_width, target_width, offset, ts, target_columns=None):
        # Fail early: a target wider than the window would give a negative
        # slice start and silently select the wrong time steps.
        if input_width < 1 or target_width < 1:
            raise ValueError(
                'input_width and target_width must be >= 1, got '
                f'input_width={input_width}, target_width={target_width}')
        if offset < 0:
            raise ValueError(f'offset must be >= 0, got {offset}')
        if target_width > input_width + offset:
            raise ValueError(
                f'target_width ({target_width}) cannot exceed '
                f'input_width + offset ({input_width + offset})')

        # Store the raw data.
        self.ts = ts
        self.df = ts.pd_dataframe()

        # Map every column name of the data to its positional index.
        self.column_indices = {name: i for i, name in
                               enumerate(self.df.columns)}

        # Work out the target column indices. A bare string is treated as a
        # single column name instead of being iterated character by character.
        if isinstance(target_columns, str):
            target_columns = [target_columns]
        self.target_columns = target_columns
        # Always defined, so the attribute can be read even without targets.
        self.target_columns_indices = {}
        if target_columns is not None:
            missing = [name for name in target_columns
                       if name not in self.column_indices]
            if missing:
                raise KeyError(f'Unknown target column(s): {missing}')
            self.target_columns_indices = {name: i for i, name in
                                           enumerate(target_columns)}

        # Work out the window parameters.
        self.input_width = input_width
        self.target_width = target_width
        self.offset = offset

        self.total_window_size = input_width + offset

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        self.target_start = self.total_window_size - self.target_width
        self.targets_slice = slice(self.target_start, None)
        self.target_indices = np.arange(self.total_window_size)[self.targets_slice]

        # Cache slot used by the `example` property.
        self._example = None

    def __repr__(self):
        """Summarize the window geometry and the target column names."""
        return '\n'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Target indices: {self.target_indices}',
            f'Target column name(s): {self.target_columns}'])

    def split_window(self, features):
        """Split a batch of full windows into ``(inputs, targets)``.

        Args:
            features: Batch of windows indexed as [batch, time, column].

        Returns:
            Tuple ``(inputs, targets)``; the targets are restricted to
            ``target_columns`` when those were given.
        """
        inputs = features[:, self.input_slice, :]
        targets = features[:, self.targets_slice, :]
        if self.target_columns is not None:
            targets = tf.stack(
                [targets[:, :, self.column_indices[name]] for name in self.target_columns],
                axis=-1)

        # Slicing doesn't preserve static shape information, so set the shapes
        # manually. This way the `tf.data.Datasets` are easier to inspect.
        inputs.set_shape([None, self.input_width, None])
        targets.set_shape([None, self.target_width, None])

        return inputs, targets

    def make_dataset(self, data, batch_size=32, shuffle=True):
        """Create a batched dataset of ``(inputs, targets)`` windows.

        Args:
            data: Array-like of rows (time steps); converted to float32.
            batch_size: Number of windows per batch (default 32).
            shuffle: Whether to shuffle the windows (default True).

        Returns:
            The windowed dataset mapped through `split_window`.
        """
        data = np.array(data, dtype=np.float32)
        ds = tf.keras.utils.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            shuffle=shuffle,
            batch_size=batch_size,)
        return ds.map(self.split_window)

    @property
    def dataset(self):
        """Windowed dataset built from this generator's own data frame."""
        return self.make_dataset(self.df)

    @property
    def example(self):
        """Get and cache an example batch of `inputs, targets` for plotting."""
        if self._example is None:
            # No example batch was found, so get one from the dataset
            # and cache it for next time.
            self._example = next(iter(self.dataset))
        return self._example

In [None]:
# Forecasting setup: the column(s) to predict and the window geometry.
target_columns = ['generation solar']
input_width = 24
target_width = 12
# NOTE(review): with offset=1 the 12 target steps end one step after the
# inputs, so 11 of them overlap the input window. If a genuine 12-step-ahead
# forecast is intended, offset should probably equal target_width — confirm.
offset = 1

# `val` was never defined in this notebook (NameError on a fresh kernel), so
# carve a validation split out of the training series. The training window
# uses only the remaining part, to keep validation data unseen.
train_fit, val = preprocessing.common.train_test_split(train, train_split=0.8)


def make_window(ts):
    """Build a WindowGenerator over `ts` using the shared window settings."""
    return WindowGenerator(
        input_width=input_width,
        target_width=target_width,
        offset=offset,
        target_columns=target_columns,
        ts=ts)


train_window = make_window(train_fit)
val_window = make_window(val)
test_window = make_window(test)

In [None]:
# Display the training window's summary (rendered by WindowGenerator.__repr__).
train_window

In [None]:
# Inspect the element spec of the (inputs, targets) training dataset.
train_window.dataset.element_spec

In [1]:
# Inspect the element spec of the (inputs, targets) test dataset.
# Requires the cell that defines `test_window` to have been run first; the
# recorded NameError comes from executing this cell on a fresh kernel.
test_window.dataset.element_spec

NameError: name 'test_window' is not defined

In [None]:
# Collect the three windowed datasets under their split names.
# Note: this rebinds `dataset`, which earlier held the toy demo dataset.
dataset = dict(
    train=train_window.dataset,
    val=val_window.dataset,
    test=test_window.dataset,
)

In [None]:
# Convolutional model: one Conv1D layer followed by three Dense layers.
# NOTE(review): `model` is rebound by the next cell, so this definition is
# not the one that gets trained. Also, a width-6 convolution with default
# padding over 24 input steps presumably yields 19 output steps, which would
# not match the 12-step targets — confirm before using this model.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv1D(filters=32, kernel_size=(6,), activation='relu'))
model.add(tf.keras.layers.Dense(units=32, activation='relu'))
model.add(tf.keras.layers.Dense(units=32, activation='relu'))
model.add(tf.keras.layers.Dense(units=1))

In [None]:
# `OUT_STEPS` and `num_features` were never defined in this notebook
# (NameError). Derive them from the window settings so the model output has
# the same shape as the targets built by WindowGenerator.split_window:
# [batch, target_width, len(target_columns)].
OUT_STEPS = target_width
num_features = len(target_columns)

model = tf.keras.Sequential([
    # Shape [batch, time, features] => [batch, lstm_units].
    # Adding more `lstm_units` just overfits more quickly.
    tf.keras.layers.LSTM(32, return_sequences=False),
    # Shape => [batch, out_steps*features].
    tf.keras.layers.Dense(OUT_STEPS*num_features,
                          kernel_initializer=tf.initializers.zeros()),
    # Shape => [batch, out_steps, features].
    tf.keras.layers.Reshape([OUT_STEPS, num_features])
])

In [None]:
# Upper bound on training epochs; early stopping may end training sooner.
MAX_EPOCHS = 20

# Mean squared error loss, Adam optimizer, mean absolute error as a metric.
model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=[tf.keras.metrics.MeanAbsoluteError()],
)

# Stop once the validation loss has not improved for 2 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=2, mode='min')

# Fit on the training windows and validate on the validation windows.
history = model.fit(
    dataset['train'],
    epochs=MAX_EPOCHS,
    validation_data=dataset['val'],
    callbacks=[early_stopping],
)

In [None]:
# Evaluate the trained model on the test windows without progress output.
# The model was compiled with an MSE loss and an MAE metric, so the result
# presumably holds those two values in that order — confirm.
performance = model.evaluate(dataset['test'], verbose=0)

In [None]:
# Display the evaluation result computed in the previous cell.
performance