# Module - Preprocessing

In [5]:
# Add ../src to the module search path so that Python packages located there can be imported
import sys
sys.path.insert(0, '../src')

In [6]:
import pandas as pd
import numpy as np
from darts.datasets import EnergyDataset

In [7]:
import ontime as on

The `LightGBM` module could not be imported. To enable LightGBM support in Darts, follow the detailed instructions in the installation guide: https://github.com/unit8co/darts/blob/master/INSTALL.md
The `Prophet` module could not be imported. To enable Prophet support in Darts, follow the detailed instructions in the installation guide: https://github.com/unit8co/darts/blob/master/INSTALL.md
  from tqdm.autonotebook import tqdm


---
## Load data

In [8]:
# Load the energy dataset shipped with Darts and cast its values to 32-bit floats in one step.
ts = EnergyDataset().load().astype(np.float32)

---

## Common Preprocessing

In [9]:
from ontime.module import preprocessing

### Normalize

In [10]:
# Normalize the loaded series with the common preprocessing helper; the result is kept in ts_t.
# NOTE(review): the stored output below references np.nanmin / np.nanmax, presumably a warning
# raised during scaling (e.g. for all-NaN columns) — confirm before relying on every component.
ts_t = preprocessing.common.normalize(ts)

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


In [17]:
# Keep only the component at index 0, so the following cells work on a single-component series.
ts_t = ts_t.univariate_component(0)

### Train test split (for time series)

In [18]:
# Split the series into a train part and a test part.
# NOTE(review): train_split=0.8 presumably assigns the first 80% of the time steps to train — confirm against the helper.
train, test = preprocessing.common.train_test_split(ts_t, train_split=0.8)

### Split time series in chunks

In [19]:
from ontime import TimeSeries

In [23]:
def split_by_length(ts: TimeSeries, length: int, shift: int = 1, drop_last: bool = True) -> list:
    """
    Split a TimeSeries into (possibly overlapping) parts of a given length.

    Parts start at positions 0, shift, 2 * shift, ... of the series. When shift is
    smaller than length, consecutive parts overlap and several trailing parts (not
    only the very last one) run past the end of the series and are therefore shorter
    than length.

    :param ts: TimeSeries to split
    :param length: int, number of time steps in each part (must be >= 1)
    :param shift: int, number of time steps between the starts of two consecutive
        parts (must be >= 1)
    :param drop_last: bool, whether to drop every trailing part that is shorter than
        length; if False, the shorter trailing parts are kept
    :return: list of TimeSeries (empty if no part satisfies the constraints)
    :raises ValueError: if length or shift is smaller than 1
    """
    if length < 1:
        raise ValueError(f"length must be a positive integer, got {length}")
    if shift < 1:
        # A non-positive shift would never advance the window start.
        raise ValueError(f"shift must be a positive integer, got {shift}")

    # Get DataFrame
    df = ts.pd_dataframe()
    n_rows = len(df)

    if drop_last:
        # Only keep start positions that leave room for a complete window.
        # When the series is shorter than length, the range is empty.
        starts = range(0, n_rows - length + 1, shift)
    else:
        # Every start position inside the series; trailing windows may be shorter.
        starts = range(0, n_rows, shift)

    # Slice positionally; .iloc clips the end index at the end of the frame
    splits_df = [df.iloc[start:start + length] for start in starts]

    # Change the data structure from DataFrame to TimeSeries
    return list(map(TimeSeries.from_dataframe, splits_df))

In [24]:
# Cut both splits into windows of 6 time steps, moving the window start by 1 step each time.
train_list = split_by_length(train, length=6, shift=1)
test_list = split_by_length(test, length=6, shift=1)

In [25]:
# Preview only the first two windows: displaying the whole list prints one repr per window
# and floods the notebook output.
train_list[:2]

[<TimeSeries (DataArray) (time: 6, component: 1, sample: 1)>
 array([[[0.7550676]],
 
        [[0.758446 ]],
 
        [[0.7567568]],
 
        [[0.7398649]],
 
        [[0.722973 ]],
 
        [[0.6925676]]], dtype=float32)
 Coordinates:
   * time       (time) datetime64[ns] 2014-12-31T23:00:00 ... 2015-01-01T04:00:00
   * component  (component) object 'generation biomass'
 Dimensions without coordinates: sample
 Attributes:
     static_covariates:  None
     hierarchy:          None,
 <TimeSeries (DataArray) (time: 6, component: 1, sample: 1)>
 array([[[0.758446 ]],
 
        [[0.7567568]],
 
        [[0.7398649]],
 
        [[0.722973 ]],
 
        [[0.6925676]],
 
        [[0.6773649]]], dtype=float32)
 Coordinates:
   * time       (time) datetime64[ns] 2015-01-01 ... 2015-01-01T05:00:00
   * component  (component) object 'generation biomass'
 Dimensions without coordinates: sample
 Attributes:
     static_covariates:  None
     hierarchy:          None,
 <TimeSeries (DataArray) (t

### Split in X and y

In [15]:
# Separate every window into inputs (X) and targets (y).
# NOTE(review): 4 and 2 are presumably the number of input and target time steps
# (4 + 2 equals the window length of 6 used above) — confirm against split_inputs_from_targets.
X_train, y_train = preprocessing.common.split_inputs_from_targets(train_list, 4, 2)
X_test, y_test = preprocessing.common.split_inputs_from_targets(test_list, 4, 2)

### Transform into a generic data type (NumPy arrays)

In [16]:
# Convert the four lists of TimeSeries with the same helper, keeping the original order.
X_train, y_train, X_test, y_test = map(
    preprocessing.common.timeseries_list_to_numpy,
    (X_train, y_train, X_test, y_test),
)

In [17]:
# Print the shape of each array on its own line.
# NOTE(review): the stored output shows a last dimension of 28 although ts_t was reduced to a
# single component earlier, and the execution counts are out of order — the output looks stale;
# re-run the notebook from a fresh kernel.
print(*(arr.shape for arr in (X_train, y_train, X_test, y_test)), sep="\n")

(4675, 4, 28)
(4675, 2, 28)
(1168, 4, 28)
(1168, 2, 28)
