# Tabular dataloaders

> Dataloaders for tabular data

In [None]:
#| default_exp dataloaders.tabular

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export

import numpy as np
from abc import ABC, abstractmethod
from typing import Union

from ddopnew.dataloaders.base import BaseDataLoader

In [None]:
#| export
class XYDataLoader(BaseDataLoader):

    """

    A class for datasets with the typical X, Y structure. Both X
    and Y are numpy arrays. X may be of shape (datapoints, features) or (datapoints, sequence_length, features) 
    if lag features are used. The prep_lag_features method can be used to create those lag features. Y is of shape
    (datapoints, units). One-dimensional X or Y inputs are reshaped to a single column.

    The data can be split into consecutive train, validation and test parts via val_index_start and
    test_index_start. The attribute dataset_type ("train", "val" or "test") determines which part
    is addressed when indexing the dataloader.

    """
    
    def __init__(self,
        X: np.ndarray,
        Y: np.ndarray,
        val_index_start: Union[int, None] = None, # first row of the validation set (None: no validation set)
        test_index_start: Union[int, None] = None, # first row of the test set (None: no test set)
        lag_window_params: Union[dict, None] = None # default: {'lag_window': 0, 'include_y': False, 'pre_calc': False}
    ):

        # Bring the inputs into 2-D form *before* anything else uses them, so that the
        # lag-feature preparation and num_units work for 1-D inputs as well.
        # X must at least have datapoint and feature dimension
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        # Y must have at least datapoint and unit dimension (even if only one unit is present)
        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)

        # validate before any rows are dropped by the lag-feature preparation
        if len(X) != len(Y):
            raise ValueError('X and Y must have the same length')

        self.X = X
        self.Y = Y

        self.val_index_start = val_index_start
        self.test_index_start = test_index_start

        # train index ends either at the start of the validation set, the start of the test set or at the end of the dataset
        if self.val_index_start is not None:
            self.train_index_end = self.val_index_start-1
        elif self.test_index_start is not None:
            self.train_index_end = self.test_index_start-1
        else:
            self.train_index_end = len(Y)-1

        self.dataset_type = "train"

        lag_window_params = lag_window_params or {'lag_window': 0, 'include_y': False, 'pre_calc': False}

        self.prep_lag_features(**lag_window_params)

        self.num_units = self.Y.shape[1] # shape 0 is always time, shape 1 is the number of units (e.g., SKUs)

        super().__init__()

    def _shift_split_indices(self,
        offset: int # number of leading rows that were removed from X and Y
        ):

        """
        Move the split boundaries towards the start of the data after leading rows have been dropped.
        Boundaries that are not defined (None) stay undefined.
        """

        if self.val_index_start is not None:
            self.val_index_start -= offset
        if self.test_index_start is not None:
            self.test_index_start -= offset
        self.train_index_end -= offset

    def prep_lag_features(self,
        lag_window: int = 0, # length of the lag window
        include_y: bool = False, # if lag demand shall be included as feature
        pre_calc: bool = False # if all lags are pre-calculated for the entire dataset
        ):

        """
        Create lag features for the dataset. If "include_y" is true, then a lag-1 of the target variable is added as a feature.
        If lag_window is > 0, the lag features are added as middle dimension to X. Note that this, e.g., means that with a lag
        window of 1, the data will include 2 time steps, the current features including lag-1 demand and the lag-1 features
        including lag-2 demand. If pre_calc is true, all these calculations are performed on the entire dataset to reduce
        computation time later on at the expense of increased memory usage. If pre_calc is false, the lag settings are
        reset (lag_window=None, include_y=False) and the data is left unchanged.

        Rows without a complete history are dropped from the start of X and Y and the train/val/test
        boundaries are shifted accordingly.

        """
        # to be discussed: Do we need option to only provide lag demand without lag features?
        self.lag_window = lag_window
        self.include_y = include_y
        self.pre_calc = pre_calc

        if not self.pre_calc:
            # without pre-calculation no lag features are produced, so the settings are reset
            self.lag_window = None
            self.include_y = False
            return

        if self.include_y:
            # add additional column(s) to X with demand shifted by 1
            self.X = np.concatenate((self.X, np.roll(self.Y, 1, axis=0)), axis=1)
            # np.roll wraps the last demand into the first row, which therefore has no valid lag
            self.X = self.X[1:]
            self.Y = self.Y[1:]

            self._shift_split_indices(1)

        if self.lag_window is not None and self.lag_window > 0:

            # add lag features as dimension 2 to X (making it dimension (datapoints, sequence_length, features))
            num_rows = self.X.shape[0]
            X_lag = np.zeros((num_rows, self.lag_window+1, self.X.shape[1]))
            for i in range(self.lag_window+1):
                # lag i is stored at sequence position lag_window-i, so the current step comes last
                X_lag[i:, self.lag_window-i, :] = self.X[:num_rows-i, :]

            # the first lag_window rows do not have a complete history
            self.X = X_lag[self.lag_window:]
            self.Y = self.Y[self.lag_window:]

            self._shift_split_indices(self.lag_window)
    
    def __getitem__(self, idx): 

        """
        Get item by index, depending on the dataset type (train, val, test). The index is relative to
        the start of the active part. Raises IndexError if the index lies beyond the active part and
        ValueError if the active part is not defined or the dataset type is unknown.
        """

        if self.dataset_type == "train":
            if idx > self.train_index_end:
                raise IndexError(f'index{idx} out of range{self.train_index_end}')

        elif self.dataset_type == "val":
            if self.val_index_start is None:
                raise ValueError('no validation set defined')
            idx = idx + self.val_index_start

            # without a test set the validation part runs until the end of the data
            val_end = self.test_index_start if self.test_index_start is not None else len(self.X)
            if idx >= val_end:
                raise IndexError(f'index{idx} out of range{val_end}')
            
        elif self.dataset_type == "test":
            if self.test_index_start is None:
                raise ValueError('no test set defined')
            idx = idx + self.test_index_start
            
            if idx >= len(self.X):
                raise IndexError(f'index{idx} out of range{len(self.X)}')
        
        else:
            raise ValueError('dataset_type not set')

        return self.X[idx], self.Y[idx]

    def __len__(self):
        """ Number of datapoints over all parts (train, val and test together) """
        return len(self.X)
    
    @property
    def X_shape(self):
        """ Shape of the full feature array """
        return self.X.shape
    
    @property
    def Y_shape(self):
        """ Shape of the full target array """
        return self.Y.shape

    @property
    def len_train(self):
        """ Number of datapoints in the train set """
        return self.train_index_end+1

    @property
    def len_val(self):
        """ Number of datapoints in the validation set; raises ValueError if none is defined """
        if self.val_index_start is None:
            raise ValueError('no validation set defined')
        # without a test set the validation part runs until the end of the data
        val_end = self.test_index_start if self.test_index_start is not None else len(self.Y)
        return val_end-self.val_index_start

    @property
    def len_test(self):
        """ Number of datapoints in the test set; raises ValueError if none is defined """
        if self.test_index_start is None:
            raise ValueError('no test set defined')
        return len(self.Y)-self.test_index_start

    def _split_slice(self,
        dataset_type: str # can be 'train', 'val', 'test', 'all'
        ) -> slice:

        """
        Translate a dataset type into the row slice of that part. Raises ValueError for unknown types.
        """

        if dataset_type == 'train':
            # use train_index_end so that the test rows are excluded even if no validation set is defined
            return slice(None, self.train_index_end+1)
        if dataset_type == 'val':
            # NOTE(review): if a boundary is None, numpy slicing falls back to the start/end of the array
            return slice(self.val_index_start, self.test_index_start)
        if dataset_type == 'test':
            return slice(self.test_index_start, None)
        if dataset_type == 'all':
            return slice(None)
        raise ValueError('dataset_type not recognized')

    def get_all_X(self,
                dataset_type: str = 'train' # can be 'train', 'val', 'test', 'all'
                ): 

        """
        Returns the entire features dataset.
        Return either the train, val, test, or all data. The result is a copy, so modifying
        it does not affect the dataloader.
        """

        rows = self._split_slice(dataset_type)
        return self.X[rows].copy() if self.X is not None else None

    def get_all_Y(self,
                dataset_type: str = 'train' # can be 'train', 'val', 'test', 'all'
                ): 

        """
        Returns the entire target dataset.
        Return either the train, val, test, or all data. The result is a copy, so modifying
        it does not affect the dataloader.
        """

        rows = self._split_slice(dataset_type)
        return self.Y[rows].copy() if self.Y is not None else None
        

In [None]:
show_doc(XYDataLoader, title_level=2)

---

[source](https://github.com/opimwue/ddopnew/blob/main/ddopnew/dataloaders/tabular.py#L14){target="_blank" style="float:right; font-size:smaller"}

## XYDataLoader

>      XYDataLoader (X:numpy.ndarray, Y:numpy.ndarray,
>                    val_index_start:Optional[int]=None,
>                    test_index_start:Optional[int]=None,
>                    lag_window_params:dict=None)

*A class for datasets with the typical X, Y structure. Both X
and Y are numpy arrays. X may be of shape (datapoints, features) or (datapoints, sequence_length, features) 
if lag features are used. The prep_lag_features method can be used to create those lag features. Y is of shape
(datapoints, units).*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| X | ndarray |  |  |
| Y | ndarray |  |  |
| val_index_start | Optional | None |  |
| test_index_start | Optional | None |  |
| lag_window_params | dict | None | default: {'lag_window': 0, 'include_y': False, 'pre_calc': False} |

In [None]:
show_doc(XYDataLoader.prep_lag_features)

---

[source](https://github.com/opimwue/ddopnew/blob/main/ddopnew/dataloaders/tabular.py#L58){target="_blank" style="float:right; font-size:smaller"}

### XYDataLoader.prep_lag_features

>      XYDataLoader.prep_lag_features (lag_window:int=0, include_y:bool=False,
>                                      pre_calc:bool=False)

*Create lag features for the dataset. If "include_y" is true, then a lag-1 of the target variable is added as a feature.
If lag_window is > 0, the lag features are added as middle dimension to X. Note that this, e.g., means that with a lag
window of 1, the data will include 2 time steps, the current features including lag-1 demand and the lag-1 features
including lag-2 demand. If pre_calc is true, all these calculations are performed on the entire dataset to reduce
computation time later on at the expense of increased memory usage.*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| lag_window | int | 0 | length of the lag window |
| include_y | bool | False | if lag demand shall be included as feature |
| pre_calc | bool | False | if all lags are pre-calculated for the entire dataset |

In [None]:
show_doc(XYDataLoader.__getitem__)

---

[source](https://github.com/opimwue/ddopnew/blob/main/ddopnew/dataloaders/tabular.py#L99){target="_blank" style="float:right; font-size:smaller"}

### XYDataLoader.__getitem__

>      XYDataLoader.__getitem__ (idx)

*get item by index, depending on the dataset type (train, val, test)*

In [None]:
show_doc(XYDataLoader.get_all_X)

---

[source](https://github.com/opimwue/ddopnew/blob/main/ddopnew/dataloaders/tabular.py#L150){target="_blank" style="float:right; font-size:smaller"}

### XYDataLoader.get_all_X

>      XYDataLoader.get_all_X (dataset_type:str='train')

*Returns the entire features dataset.
Return either the train, val, test, or all data.*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| dataset_type | str | train | can be 'train', 'val', 'test', 'all' |

In [None]:
show_doc(XYDataLoader.get_all_Y)

---

[source](https://github.com/opimwue/ddopnew/blob/main/ddopnew/dataloaders/tabular.py#L169){target="_blank" style="float:right; font-size:smaller"}

### XYDataLoader.get_all_Y

>      XYDataLoader.get_all_Y (dataset_type:str='train')

*Returns the entire target dataset.
Return either the train, val, test, or all data.*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| dataset_type | str | train | can be 'train', 'val', 'test', 'all' |

Example usage of ```XYDataLoader``` for a simple dataset:

In [None]:
# Synthetic data: 100 datapoints with 2 features and a single target unit
X = np.random.standard_normal((100, 2))
Y = np.random.standard_normal((100, 1))
# the target is a noisy linear combination of both feature columns
Y += 2*X[:, [0]] + 3*X[:, [1]]

dataloader = XYDataLoader(X=X, Y=Y)

# indexing returns one (features, target) pair
sample_X, sample_Y = dataloader[0]
print("sample:", sample_X, sample_Y)
print("sample shape Y:", sample_Y.shape)

print("length:", len(dataloader))

sample: [0.92106972 0.70156946] [3.65913853]
sample shape Y: (1,)
length: 100


Example usage of ```XYDataLoader``` showing how to handle train, val, and test sets:

In [None]:
# Synthetic data: 10 datapoints with 2 features and a single target unit
X = np.random.standard_normal((10, 2))
Y = np.random.standard_normal((10, 1))
Y += 2*X[:, [0]] + 3*X[:, [1]]

# rows 0-5 form the train set, rows 6-7 the validation set and rows 8-9 the test set
dataloader = XYDataLoader(X=X, Y=Y, val_index_start=6, test_index_start=8)

sample_X, sample_Y = dataloader[0]

print("length train:", dataloader.len_train, "length val:", dataloader.len_val, "length test:", dataloader.len_test)

# (headline, method that activates the part or None if it is already active, name of the length property)
parts = [
    ("### Data from train set ###", None, "len_train"),
    ("### Data from val set ###", dataloader.val, "len_val"),
    ("### Data from test set ###", dataloader.test, "len_test"),
    ("### Data from train set again ###", dataloader.train, "len_train"),
]

for headline, activate, length_name in parts:
    if activate is not None:
        activate()

    print("")
    print(headline)
    # indices always start at 0 within the active part
    for i in range(getattr(dataloader, length_name)):
        sample_X, sample_Y = dataloader[i]
        print("idx:", i, "data:", sample_X, sample_Y)

length train: 6 length val: 2 length test: 2

### Data from train set ###
idx: 0 data: [ 0.46617153 -1.5426306 ] [-3.42515345]
idx: 1 data: [0.5959217  0.03931713] [1.02734347]
idx: 2 data: [-0.28027772 -0.3641926 ] [-0.21429458]
idx: 3 data: [-0.28347742 -1.60037267] [-4.4814403]
idx: 4 data: [ 0.01677989 -0.84802836] [-2.56829448]
idx: 5 data: [0.54772979 0.48605575] [1.9488545]

### Data from val set ###
idx: 0 data: [ 0.30611839 -1.38076852] [-2.76311498]
idx: 1 data: [0.46367473 1.40884102] [4.47144858]

### Data from test set ###
idx: 0 data: [-0.30705721  0.42819263] [1.50589912]
idx: 1 data: [ 0.76539705 -0.06903068] [2.19095387]

### Data from train set again ###
idx: 0 data: [ 0.46617153 -1.5426306 ] [-3.42515345]
idx: 1 data: [0.5959217  0.03931713] [1.02734347]
idx: 2 data: [-0.28027772 -0.3641926 ] [-0.21429458]
idx: 3 data: [-0.28347742 -1.60037267] [-4.4814403]
idx: 4 data: [ 0.01677989 -0.84802836] [-2.56829448]
idx: 5 data: [0.54772979 0.48605575] [1.9488545]


In [None]:
# | hide
# Smoke check: request every supported part once.
# Only the value of the last expression (the test part) is displayed by the notebook.
dataloader.get_all_X('all')
dataloader.get_all_X('train')
dataloader.get_all_X('val')
dataloader.get_all_X('test')

array([[-0.30705721,  0.42819263],
       [ 0.76539705, -0.06903068]])

In [None]:
# | hide

# Smoke check: request every supported part once.
# Only the value of the last expression (the test part) is displayed by the notebook.
dataloader.get_all_Y('all')
dataloader.get_all_Y('train')
dataloader.get_all_Y('val')
dataloader.get_all_Y('test')

array([[1.50589912],
       [2.19095387]])

Example usage of ```XYDataLoader``` showing how to include lag features:

In [None]:
# Synthetic data: 10 datapoints with 2 features and a single target unit
X = np.random.standard_normal((10, 2))
Y = np.random.standard_normal((10, 1))
Y += 2*X[:, [0]] + 3*X[:, [1]]

# one lagged time step, lagged demand as extra feature, everything pre-calculated up front
lag_window_params = {'lag_window': 1, 'include_y': True, 'pre_calc': True}

dataloader = XYDataLoader(X=X, Y=Y, val_index_start=6, test_index_start=8, lag_window_params=lag_window_params)

sample_X, sample_Y = dataloader[0]

print("length train:", dataloader.len_train, "length val:", dataloader.len_val, "length test:", dataloader.len_test)

# (headline, method that activates the part or None if it is already active, name of the length property)
parts = [
    ("### Data from train set ###", None, "len_train"),
    ("### Data from val set ###", dataloader.val, "len_val"),
    ("### Data from test set ###", dataloader.test, "len_test"),
    ("### Data from train set again ###", dataloader.train, "len_train"),
]

for headline, activate, length_name in parts:
    if activate is not None:
        activate()

    print("")
    print(headline)
    # indices always start at 0 within the active part
    for i in range(getattr(dataloader, length_name)):
        sample_X, sample_Y = dataloader[i]
        print("idx:", i, "data:", sample_X, sample_Y)

length train: 4 length val: 2 length test: 2

### Data from train set ###
idx: 0 data: [[-0.02395273 -0.36204873 -3.2205055 ]
 [ 1.45525805 -1.11224179 -0.94619   ]] [0.2521341]
idx: 1 data: [[ 1.45525805 -1.11224179 -0.94619   ]
 [ 0.23480631  1.18639722  0.2521341 ]] [3.53852786]
idx: 2 data: [[0.23480631 1.18639722 0.2521341 ]
 [1.82872894 1.5804164  3.53852786]] [7.25937768]
idx: 3 data: [[ 1.82872894  1.5804164   3.53852786]
 [ 0.31580198 -0.31130287  7.25937768]] [-1.96853374]

### Data from val set ###
idx: 0 data: [[ 0.31580198 -0.31130287  7.25937768]
 [ 0.72142481 -0.76287586 -1.96853374]] [-1.33508208]
idx: 1 data: [[ 0.72142481 -0.76287586 -1.96853374]
 [ 0.27407661 -2.12056758 -1.33508208]] [-6.31542487]

### Data from test set ###
idx: 0 data: [[ 0.27407661 -2.12056758 -1.33508208]
 [ 0.20329526 -1.08744979 -6.31542487]] [-4.01934226]
idx: 1 data: [[ 0.20329526 -1.08744979 -6.31542487]
 [ 1.89736436  0.80393495 -4.01934226]] [6.99178952]

### Data from train set again ###

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()