# Tabular dataloaders

> Dataloaders for tabular data

In [None]:
#| default_exp dataloaders.tabular

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export

import logging
logging.basicConfig(level=logging.INFO)

import numpy as np
from abc import ABC, abstractmethod
from typing import Union, Tuple, List
import pandas as pd

from ddopnew.dataloaders.base import BaseDataLoader

from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
#| export
class XYDataLoader(BaseDataLoader):

    """

    A class for datasets with the typical X, Y structure. Both X
    and Y are numpy arrays. X may be of shape (datapoints, features) or (datapoints, sequence_length, features) 
    if lag features are used. The method prep_lag_features can be used to create those lag features. Y is of shape
    (datapoints, units).

    One-dimensional X or Y arrays are reshaped to a single column. The data is split by row position:
    rows before val_index_start (or before test_index_start if no validation set is given) form the
    training set, rows from val_index_start up to test_index_start the validation set, and rows from
    test_index_start onwards the test set.

    """
    
    def __init__(self,
        X: np.ndarray,
        Y: np.ndarray,
        val_index_start: Union[int, None] = None, 
        test_index_start: Union[int, None] = None, 
        lag_window_params: Union[dict, None] = None, # default: {'lag_window': 0, 'include_y': False, 'pre_calc': False}
        normalize_features: Union[dict, None] = None, # default: {'normalize': True, 'ignore_one_hot': True}
    ):

        # X must at least have datapoint and feature dimension. The reshape has to happen
        # before normalization and lag preparation, because both operate on 2-D arrays.
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        
        # Y must have at least datapoint and unit dimension (even if only one unit is present)
        if len(Y.shape) == 1:
            Y = Y.reshape(-1, 1)

        assert len(X) == len(Y), 'X and Y must have the same length'

        self.X = X
        self.Y = Y

        self.num_units = Y.shape[1] # shape 0 is always time, shape 1 is the number of units (e.g., SKUs)

        self.val_index_start = val_index_start
        self.test_index_start = test_index_start

        # train index ends either at the start of the validation set, the start of the test set or at the end of the dataset
        if self.val_index_start is not None:
            self.train_index_end = self.val_index_start-1
        elif self.test_index_start is not None:
            self.train_index_end = self.test_index_start-1
        else:
            self.train_index_end = len(Y)-1

        self.dataset_type = "train"

        normalize_features = normalize_features or {'normalize': True, 'ignore_one_hot': True}
        lag_window_params = lag_window_params or {'lag_window': 0, 'include_y': False, 'pre_calc': False}

        # Normalization must run before lag features are added (it only supports 2-D X)
        self.normalize_features(**normalize_features, initial_normalization=True)
        self.prep_lag_features(**lag_window_params)

        super().__init__()

    @staticmethod
    def _is_binary_column(column: np.ndarray) -> bool:

        """ Return True if the column only contains the values 0 and 1 (treated as one-hot encoded) """

        return bool(np.isin(column, (0, 1)).all())

    def _shift_split_indices(self, num_removed: int):

        """ Move all split indices forward after num_removed rows were cut from the start of the data """

        # validation and test set are optional, so their start indices may be None
        if self.val_index_start is not None:
            self.val_index_start = self.val_index_start-num_removed
        if self.test_index_start is not None:
            self.test_index_start = self.test_index_start-num_removed
        self.train_index_end = self.train_index_end-num_removed

    def _split_bounds(self, dataset_type: str) -> Tuple[int, int]:

        """
        Return the (start, stop) row positions of the requested split ('train', 'val', 'test' or 'all').
        Raises a ValueError if the split is not defined or the name is not recognized.
        """

        if dataset_type == 'train':
            # based on train_index_end so that the test set is excluded even if no validation set is defined
            return 0, self.train_index_end+1
        if dataset_type == 'val':
            if self.val_index_start is None:
                raise ValueError('no validation set defined')
            # without a test set, the validation set runs until the end of the data
            stop = self.test_index_start if self.test_index_start is not None else len(self.Y)
            return self.val_index_start, stop
        if dataset_type == 'test':
            if self.test_index_start is None:
                raise ValueError('no test set defined')
            return self.test_index_start, len(self.Y)
        if dataset_type == 'all':
            return 0, len(self.Y)
        raise ValueError('dataset_type not recognized')

    def normalize_features(self,
        normalize: bool = True,
        ignore_one_hot: bool = True,
        initial_normalization=False # Flag if it is set before having added lag features
        ):

        """
        Normalize features using a standard scaler that is fitted on the training rows only. 
        If ignore_one_hot is true, one-hot encoded features (columns containing only 0 and 1)
        are not normalized. The normalized features are stored in a new float array, the array
        passed by the caller is not modified.

        Raises a NotImplementedError if called with initial_normalization=False and a ValueError
        if X already contains lag features (3 dimensions).

        """

        if not normalize:
            return

        if not initial_normalization:
            raise NotImplementedError('Normalization after lag features have been set not implemented yet')

            # Idea:
                # remove time dimension
                # normalize features
                # add time_dimension back
            # Problem:
                # usage of prep_lag_features needs to ensure y is not added a second time

        if len(self.X.shape) == 3:
            raise ValueError('Normalization not possible with lag features. Please set initial_normalization=False')

        # astype returns a copy, so the caller's array stays untouched and integer inputs are not truncated
        X_normalized = self.X.astype(float)

        if ignore_one_hot:
            columns = [j for j in range(X_normalized.shape[1]) if not self._is_binary_column(X_normalized[:, j])]
        else:
            columns = list(range(X_normalized.shape[1]))

        if len(columns) > 0:
            scaler = StandardScaler()
            scaler.fit(X_normalized[:self.train_index_end+1][:, columns]) # +1 to include the last training point
            # transform returns a new array, the result must be written back explicitly
            X_normalized[:, columns] = scaler.transform(X_normalized[:, columns])

        self.X = X_normalized

    def prep_lag_features(self,
        lag_window: int = 0, # length of the lag window
        include_y: bool = False, # if lag demand shall be included as feature
        pre_calc: bool = False # if all lags are pre-calculated for the entire dataset
        ):

        """
        Create lag features for the dataset. If "include_y" is true, then a lag-1 of the target variable is added as a feature.
        If lag-window is > 0, the lag features are added as middle dimension to X. Note that this, e.g., means that with a lag
        window of 1, the data will include 2 time steps, the current features including lag-1 demand and the lag-1 features
        including lag-2 demand. If pre-calc is true, all these calculations are performed on the entire dataset to reduce
        computation time later on at the expense of increased memory usage. 

        Rows without a complete history are removed from the start of X and Y and the split indices
        are shifted accordingly. If pre-calc is false, no lag features are created and lag_window
        and include_y are reset.

        """
        # to be discussed: Do we need option to only provide lag demand without lag features?
        self.lag_window = lag_window
        self.pre_calc = pre_calc
        self.include_y = include_y

        if not self.pre_calc:
            self.lag_window = None
            self.include_y = False
            return

        if self.include_y:
            # add additional column(s) to X with demand shifted by 1; the first row has no lag demand and is removed
            self.X = np.concatenate((self.X[1:], self.Y[:-1]), axis=1)
            self.Y = self.Y[1:]
            self._shift_split_indices(1)

        if self.lag_window is not None and self.lag_window > 0:

            # add lag features as dimension 2 to X (making it dimension (datapoints, sequence_length, features))
            # position lag_window on the sequence axis holds the current features, position 0 the oldest lag
            num_rows = max(self.X.shape[0]-self.lag_window, 0)
            X_lag = np.zeros((num_rows, self.lag_window+1, self.X.shape[1]))
            for step in range(self.lag_window+1):
                X_lag[:, step, :] = self.X[step:step+num_rows]
            self.X = X_lag
            self.Y = self.Y[self.lag_window:]

            self._shift_split_indices(self.lag_window)

    def update_lag_features(self,
        lag_window: int,
        ):

        """ Update lag window parameters for dataloader object that is already initialized """

        raise NotImplementedError('Not implemented yet')

        # Problem: updating lag_features naively would shorten the dataset each time it is called

    def __getitem__(self, idx): 

        """ 
        Get item by index, depending on the dataset type (train, val, test). The index is relative
        to the start of the active split. Raises an IndexError if the index lies beyond the split. 
        """

        if self.dataset_type == "train":
            if idx > self.train_index_end:
                raise IndexError(f'index {idx} out of range {self.train_index_end+1}')

        elif self.dataset_type == "val":
            start, stop = self._split_bounds('val')
            idx = idx + start
            
            if idx >= stop:
                raise IndexError(f'index {idx} out of range {stop}')
            
        elif self.dataset_type == "test":
            start, stop = self._split_bounds('test')
            idx = idx + start
            
            if idx >= len(self.X):
                raise IndexError(f'index {idx} out of range {len(self.X)}')
        
        else:
            raise ValueError('dataset_type not set')

        return self.X[idx], self.Y[idx]

    def __len__(self):
        # number of rows of the entire dataset, independent of the active split
        return len(self.X)
    
    @property
    def X_shape(self):
        return self.X.shape
    
    @property
    def Y_shape(self):
        return self.Y.shape

    @property
    def len_train(self):
        return self.train_index_end+1

    @property
    def len_val(self):
        start, stop = self._split_bounds('val')
        return stop-start

    @property
    def len_test(self):
        start, stop = self._split_bounds('test')
        return stop-start

    def get_all_X(self,
                dataset_type: str = 'train' # can be 'train', 'val', 'test', 'all'
                ): 

        """
        Returns a copy of the entire features dataset.
        Return either the train, val, test, or all data.
        Raises a ValueError if the requested split is not defined.
        """

        start, stop = self._split_bounds(dataset_type)
        return self.X[start:stop].copy() if self.X is not None else None

    def get_all_Y(self,
                dataset_type: str = 'train' # can be 'train', 'val', 'test', 'all'
                ): 

        """
        Returns a copy of the entire target dataset.
        Return either the train, val, test, or all data.
        Raises a ValueError if the requested split is not defined.
        """

        start, stop = self._split_bounds(dataset_type)
        return self.Y[start:stop].copy() if self.Y is not None else None
        

In [None]:
show_doc(XYDataLoader, title_level=2)

---

[source](https://github.com/opimwue/ddopnew/blob/main/ddopnew/dataloaders/tabular.py#L20){target="_blank" style="float:right; font-size:smaller"}

## XYDataLoader

>      XYDataLoader (X:numpy.ndarray, Y:numpy.ndarray,
>                    val_index_start:Optional[int]=None,
>                    test_index_start:Optional[int]=None,
>                    lag_window_params:dict=None, normalize_features:dict=None)

*A class for datasets with the typical X, Y structure. Both X
and Y are numpy arrays. X may be of shape (datapoints, features) or (datapoints, sequence_length, features) 
if lag features are used. The prep_lag_features method can be used to create those lag features. Y is of shape
(datapoints, units).*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| X | ndarray |  |  |
| Y | ndarray |  |  |
| val_index_start | Optional | None |  |
| test_index_start | Optional | None |  |
| lag_window_params | dict | None | default: {'lag_window': 0, 'include_y': False, 'pre_calc': False} |
| normalize_features | dict | None | default: {'normalize': True, 'ignore_one_hot': True} |

In [None]:
show_doc(XYDataLoader.prep_lag_features)

---

[source](https://github.com/opimwue/ddopnew/blob/main/ddopnew/dataloaders/tabular.py#L111){target="_blank" style="float:right; font-size:smaller"}

### XYDataLoader.prep_lag_features

>      XYDataLoader.prep_lag_features (lag_window:int=0, include_y:bool=False,
>                                      pre_calc:bool=False)

*Create lag features for the dataset. If "include_y" is true, then a lag-1 of the target variable is added as a feature.
If lag-window is > 0, the lag features are added as middle dimension to X. Note that this, e.g., means that with a lag
window of 1, the data will include 2 time steps, the current features including lag-1 demand and the lag-1 features
including lag-2 demand. If pre-calc is true, all these calculations are performed on the entire dataset to reduce
computation time later on at the expense of increased memory usage.*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| lag_window | int | 0 | length of the lag window |
| include_y | bool | False | if lag demand shall be included as feature |
| pre_calc | bool | False | if all lags are pre-calculated for the entire dataset |

In [None]:
show_doc(XYDataLoader.__getitem__)

---

[source](https://github.com/opimwue/ddopnew/blob/main/ddopnew/dataloaders/tabular.py#L173){target="_blank" style="float:right; font-size:smaller"}

### XYDataLoader.__getitem__

>      XYDataLoader.__getitem__ (idx)

*get item by index, depending on the dataset type (train, val, test)*

In [None]:
show_doc(XYDataLoader.get_all_X)

---

[source](https://github.com/opimwue/ddopnew/blob/main/ddopnew/dataloaders/tabular.py#L226){target="_blank" style="float:right; font-size:smaller"}

### XYDataLoader.get_all_X

>      XYDataLoader.get_all_X (dataset_type:str='train')

*Returns the entire features dataset.
Return either the train, val, test, or all data.*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| dataset_type | str | train | can be 'train', 'val', 'test', 'all' |

In [None]:
show_doc(XYDataLoader.get_all_Y)

---

[source](https://github.com/opimwue/ddopnew/blob/main/ddopnew/dataloaders/tabular.py#L246){target="_blank" style="float:right; font-size:smaller"}

### XYDataLoader.get_all_Y

>      XYDataLoader.get_all_Y (dataset_type:str='train')

*Returns the entire target dataset.
Return either the train, val, test, or all data.*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| dataset_type | str | train | can be 'train', 'val', 'test', 'all' |

Example usage of ```XYDataLoader``` for simple dataset:

In [None]:
# Synthetic regression data: 100 datapoints, 2 features, 1 target unit
X = np.random.standard_normal(size=(100, 2))
Y = np.random.standard_normal(size=(100, 1))
# the target depends linearly on both features (plus the noise drawn above)
Y += 2 * X[:, [0]] + 3 * X[:, [1]]

dataloader = XYDataLoader(X=X, Y=Y)

# indexing the dataloader returns one (features, target) pair
sample_X, sample_Y = dataloader[0]
print("sample:", sample_X, sample_Y)
print("sample shape Y:", sample_Y.shape)

print("length:", len(dataloader))

sample: [-1.63994308  0.13549824] [-3.38753308]
sample shape Y: (1,)
length: 100


Example usage of ```XYDataLoader``` on how to handle train, val, and test set:

In [None]:
# Synthetic regression data: 10 datapoints, 2 features, 1 target unit
X = np.random.standard_normal(size=(10, 2))
Y = np.random.standard_normal(size=(10, 1))
Y += 2 * X[:, [0]] + 3 * X[:, [1]]

# validation set starts at row 6, test set at row 8
dataloader = XYDataLoader(X=X, Y=Y, val_index_start=6, test_index_start=8)

sample_X, sample_Y = dataloader[0]

print("length train:", dataloader.len_train, "length val:", dataloader.len_val, "length test:", dataloader.len_test)

# Each step: (headline, method that activates the split or None to keep the current one, length property)
walkthrough = [
    ("### Data from train set ###", None, "len_train"),
    ("### Data from val set ###", dataloader.val, "len_val"),
    ("### Data from test set ###", dataloader.test, "len_test"),
    ("### Data from train set again ###", dataloader.train, "len_train"),
]

for headline, activate_split, length_property in walkthrough:
    if activate_split is not None:
        activate_split()

    print("")
    print(headline)
    # indices are relative to the start of the active split
    for i in range(getattr(dataloader, length_property)):
        sample_X, sample_Y = dataloader[i]
        print("idx:", i, "data:", sample_X, sample_Y)

length train: 6 length val: 2 length test: 2

### Data from train set ###
idx: 0 data: [0.18776023 1.66839591] [4.28944588]
idx: 1 data: [-0.27719155  0.40190162] [0.15596495]
idx: 2 data: [-0.71553719  0.08537152] [-1.57621627]
idx: 3 data: [ 1.25414844 -1.15060407] [-0.87212214]
idx: 4 data: [-0.13077158 -0.64079997] [-3.36943747]
idx: 5 data: [2.40752771 0.15625665] [4.71208213]

### Data from val set ###
idx: 0 data: [ 1.3429748  -0.38041163] [1.14371727]
idx: 1 data: [-0.77508115 -0.79383888] [-3.29342162]

### Data from test set ###
idx: 0 data: [-1.56573895 -1.19523184] [-5.87876387]
idx: 1 data: [0.08372153 1.31012091] [4.52378226]

### Data from train set again ###
idx: 0 data: [0.18776023 1.66839591] [4.28944588]
idx: 1 data: [-0.27719155  0.40190162] [0.15596495]
idx: 2 data: [-0.71553719  0.08537152] [-1.57621627]
idx: 3 data: [ 1.25414844 -1.15060407] [-0.87212214]
idx: 4 data: [-0.13077158 -0.64079997] [-3.36943747]
idx: 5 data: [2.40752771 0.15625665] [4.71208213]


In [None]:
# | hide
# Smoke check: call get_all_X with every supported split selector.
# Only the result of the last expression (the test split) is displayed by the notebook.
dataloader.get_all_X('all')
dataloader.get_all_X('train')
dataloader.get_all_X('val')
dataloader.get_all_X('test')

array([[-1.56573895, -1.19523184],
       [ 0.08372153,  1.31012091]])

In [None]:
# | hide

# Smoke check: call get_all_Y with every supported split selector.
# Only the result of the last expression (the test split) is displayed by the notebook.
dataloader.get_all_Y('all')
dataloader.get_all_Y('train')
dataloader.get_all_Y('val')
dataloader.get_all_Y('test')

array([[-5.87876387],
       [ 4.52378226]])

Example usage of ```XYDataLoader``` on how to include lag features:

In [None]:
# Synthetic regression data: 10 datapoints, 2 features, 1 target unit
X = np.random.standard_normal(size=(10, 2))
Y = np.random.standard_normal(size=(10, 1))
Y += 2 * X[:, [0]] + 3 * X[:, [1]]

# one lagged time step, lag demand as additional feature, everything pre-calculated
lag_window_params = dict(lag_window=1, include_y=True, pre_calc=True)

dataloader = XYDataLoader(
    X=X,
    Y=Y,
    val_index_start=6,
    test_index_start=8,
    lag_window_params=lag_window_params,
)

sample_X, sample_Y = dataloader[0]

print("length train:", dataloader.len_train, "length val:", dataloader.len_val, "length test:", dataloader.len_test)

# Each step: (headline, method that activates the split or None to keep the current one, length property)
walkthrough = [
    ("### Data from train set ###", None, "len_train"),
    ("### Data from val set ###", dataloader.val, "len_val"),
    ("### Data from test set ###", dataloader.test, "len_test"),
    ("### Data from train set again ###", dataloader.train, "len_train"),
]

for headline, activate_split, length_property in walkthrough:
    if activate_split is not None:
        activate_split()

    print("")
    print(headline)
    # indices are relative to the start of the active split
    for i in range(getattr(dataloader, length_property)):
        sample_X, sample_Y = dataloader[i]
        print("idx:", i, "data:", sample_X, sample_Y)

length train: 4 length val: 2 length test: 2

### Data from train set ###
idx: 0 data: [[ 0.12064842  0.92218769  3.22653523]
 [ 0.83660435 -0.88768222  1.2122358 ]] [-1.3309468]
idx: 1 data: [[ 0.83660435 -0.88768222  1.2122358 ]
 [ 0.67280815  0.07891949 -1.3309468 ]] [2.32867388]
idx: 2 data: [[ 0.67280815  0.07891949 -1.3309468 ]
 [-0.1513143   0.03342441  2.32867388]] [0.22463874]
idx: 3 data: [[-0.1513143   0.03342441  2.32867388]
 [ 0.15557226 -0.02865607  0.22463874]] [0.88872031]

### Data from val set ###
idx: 0 data: [[ 0.15557226 -0.02865607  0.22463874]
 [-0.8775118  -0.59691176  0.88872031]] [-4.1931668]
idx: 1 data: [[-0.8775118  -0.59691176  0.88872031]
 [-1.3087275  -1.47004448 -4.1931668 ]] [-6.13630354]

### Data from test set ###
idx: 0 data: [[-1.3087275  -1.47004448 -4.1931668 ]
 [-0.54701461 -0.89519827 -6.13630354]] [-4.72374999]
idx: 1 data: [[-0.54701461 -0.89519827 -6.13630354]
 [-0.42618545  0.449891   -4.72374999]] [-0.30863742]

### Data from train set aga

In [None]:
#| export
class MultiShapeLoader(BaseDataLoader):

    """
    A class designed for comlex datasets with mutlipe feature types. The class is more
    memory-efficient than the XYDataLoader, as it separate the storeage of SKU-specific
    feature, time-specific features, and time-SKU-specific features. The class works generically
    as long as those feature classes are provided during pre-processing. The class is designed 
    to handle classic learning, but able to work in a meta-learning pipeline where no SKU-dimension
    is present and the model needs to make prediction on SKU-time level without knowhing the
    specific SKU.
    """
    
    def __init__(self,
        demand: pd.DataFrame, # Demand data of shape time x SKU
        SKU_features: pd.DataFrame, # Features constant over time of shape SKU x SKU_features
        time_features: pd.DataFrame, # Features constant over SKU of shape time x time_features
        time_SKU_features: pd.DataFrame, # Features varying over time and SKU of shape time x (time_SKU_features*SKU) with double index
        mask: pd.DataFrame, # Mask of shape time x SKU telling which SKUs are available at which time (can be used as mask during trainig or added to features)
        
        val_index_start: Union[int, None] = None, # Validation index start on the time dimension
        test_index_start: Union[int, None] = None, # Test index start on the time dimension
        in_sample_val_test_SKUs: Union[List, None] = None, # SKUs in the training set to be used for validation and testing, out-of-sample w.r.t. time dimension
        out_of_sample_val_SKUs: Union[List, None] = None, # SKUs to be hold-out for validation (can be same as test if no validation on out-of-sample SKUs required)
        out_of_sample_test_SKUs: Union[List, None] = None, # SKUs to be hold-out for testing
        lag_window_params: Union[dict, None] = None, # default: {'lag_window': 0, 'include_y': False, 'pre_calc': True}
        normalize_features: Union[dict, None] = None, # default: {'normalize': True, 'ignore_one_hot': True}
        engineered_SKU_features: Union[List, None] = None, # default: ["mean_demand", "std_demand", "kurtosis_demand", "skewness_demand", "percentile_10_demand", "percentile_30_demand", "median_demand", "percentile_70_demand", "percentile_90_demand", "inter_quartile_range"]
        include_non_available: bool = False, # if timestep/SKU combination where the SKU was not available for sale shall be included. If included, it will be used as feature, otherwise as mask.
        train_subset: Union[int, bool] = False ,# if only a subset of SKUs is used for training. Will always contain in_sample_val_test_SKUs and then fills the rest with random SKUs
        train_subset_SKUs: Union[List, None] = None, # if train_subset is set, specific SKUs can be provided
        SKU_as_batch: bool = False # if get_index during training gets an index for the time dimension (in batch) or from time*SKU dimension
    ):

        """
        Store the data, separate the out-of-sample SKUs, build engineered SKU features from the
        training timesteps, normalize the in-sample data and convert all DataFrames to numpy arrays.

        Note that the DataFrames passed by the caller are modified in place: the out-of-sample SKUs
        are dropped from them with inplace=True.
        """

        normalize_features = normalize_features or {'normalize': True, 'ignore_one_hot': True}
        # NOTE(review): the signature comment documents 'pre_calc': True as default, the fallback below uses False -- confirm
        lag_window_params = lag_window_params or {'lag_window': 0, 'include_y': False, 'pre_calc': False}
        self.lag_window_params = lag_window_params
        self.train_index_start = self.lag_window_params["lag_window"] # start index for training data
        self.train_index_start += self.lag_window_params["include_y"] # if lag demand is included as feature need one more timestep
        # an empty or missing list falls back to the full default feature set
        engineered_SKU_features = engineered_SKU_features or ["mean_demand", "std_demand", "kurtosis_demand", "skewness_demand", "percentile_10_demand", "percentile_30_demand", "median_demand", "percentile_70_demand", "percentile_90_demand", "inter_quartile_range"]

        self.demand = demand
        self.SKU_features = SKU_features
        self.time_features = time_features
        self.time_SKU_features = time_SKU_features
        self.mask = mask
        self.num_time_SKU_features = len(self.time_SKU_features.columns.get_level_values(0).unique())
        self.num_units = len(self.demand.columns)
        self.num_features = len(self.SKU_features.columns) + len(self.time_features.columns) + self.num_time_SKU_features
        if engineered_SKU_features is not None:
            self.num_features += len(engineered_SKU_features)
        if lag_window_params["include_y"]:
            self.num_features += 1
        if include_non_available:
            self.num_features += 1

        self.normalized_in_sample_SKUs = False
        self.normalized_out_of_sample_val_SKUs = False
        self.normalized_out_of_sample_test_SKUs = False

        self.include_non_available = include_non_available
        self.train_subset = train_subset
        self.train_subset_SKUs = train_subset_SKUs
        self.SKU_as_batch = SKU_as_batch

        self.SKU_type = "in_sample" # or "out_of_sample_val" or "out_of_sample_test" # affecting the SKU-dimension
        self.dataset_type = "train" # or "val" or "test", affecting the time-dimension

        logging.info("Setting indices for validation and test set")
        self.val_index_start = val_index_start
        self.test_index_start = test_index_start

        # train index ends either at the start of the validation set, the start of the test set or at the end of the dataset
        if self.val_index_start is not None:
            self.train_index_end = self.val_index_start-1
        elif self.test_index_start is not None:
            self.train_index_end = self.test_index_start-1
        else:
            self.train_index_end = len(self.demand)-1
        
        logging.info("Setting out-of-sample SKUs")
        # print("Number of SKUs in dataset:", self.demand.shape[1])
        out_of_sample_sets = [(out_of_sample_val_SKUs, 'val'), (out_of_sample_test_SKUs, 'test')]

        # Step 1: extract the data of all hold-out sets before anything is dropped. Validation and test
        # SKUs may overlap (or be identical), so dropping the validation SKUs first would make the lookup
        # of the test SKUs fail.
        for sku, attr_suffix in out_of_sample_sets:
            if sku is not None:
                setattr(self, f'demand_out_of_sample_{attr_suffix}', self.demand.loc[:, sku])
                setattr(self, f'SKU_features_out_of_sample_{attr_suffix}', self.SKU_features.loc[sku])
                setattr(self, f'time_SKU_features_out_of_sample_{attr_suffix}', # here SKU are in columns on index level 2
                        self.time_SKU_features.loc[:, pd.IndexSlice[:, sku]])   
                setattr(self, f'mask_out_of_sample_{attr_suffix}', self.mask.loc[:, sku])
                # time_features are independent of SKU

        # Step 2: remove every hold-out SKU exactly once from the in-sample data
        SKUs_to_drop = []
        for sku, _ in out_of_sample_sets:
            if sku is not None:
                SKUs_to_drop.extend(sku if isinstance(sku, list) else [sku])
        SKUs_to_drop = list(dict.fromkeys(SKUs_to_drop)) # unique SKUs, original order preserved

        if len(SKUs_to_drop) > 0:
            self.demand.drop(columns=SKUs_to_drop, inplace=True)
            self.SKU_features.drop(index=SKUs_to_drop, inplace=True)
            for single_sku in SKUs_to_drop:
                columns_to_drop = self.time_SKU_features.columns.get_loc_level(single_sku, level=1)
                self.time_SKU_features.drop(columns=self.time_SKU_features.columns[columns_to_drop[0]], inplace=True)
            self.mask.drop(columns=SKUs_to_drop, inplace=True)
        self.in_sample_val_test_SKUs = in_sample_val_test_SKUs
        
        logging.info("Identifying training SKUs")
        self.identify_train_SKUs()

        logging.info("Creating engineered SKU features for training data")
        engineered_SKU_features = self.build_engineered_SKU_features(engineered_SKU_features, self.demand.iloc[:self.train_index_end+1]) # only for training data initially
        # engineered features come as features x SKU, SKU_features are stored as SKU x features
        self.SKU_features = pd.concat([self.SKU_features, engineered_SKU_features.transpose()], axis=1)
        
        logging.info("Normalizing in-sample SKU features (based on training timesteps)")
        self.normalize_features_in_sample(**normalize_features, initial_normalization=True)

        # store row and column indices of demand, SKU_features time_features mask and then convert to numpy array

        self.demand_indices = self.save_indices(self.demand)
        self.SKU_features_indices = self.save_indices(self.SKU_features)
        self.time_features_indices = self.save_indices(self.time_features)
        self.time_SKU_features_indices = self.save_indices(self.time_SKU_features)
        self.mask_indices = self.save_indices(self.mask)

        self.demand = self.demand.to_numpy()
        self.SKU_features = self.SKU_features.to_numpy()
        self.time_features = self.time_features.to_numpy()
        self.time_SKU_features = self.time_SKU_features.to_numpy()
        self.mask = self.mask.to_numpy()

        self.len_train_time = self.train_index_end-self.train_index_start+1
        if SKU_as_batch:
            logging.info("Creating time-SKU index for training data")
            # one (SKU position, time position) pair per training SKU and training timestep
            self.sku_time_index = [(i, j) for i in range(self.train_SKUs_indices.shape[0]) for j in range(self.len_train_time)]

        super().__init__()

    def identify_train_SKUs(self):
        """ determine which SKUs are used for training, validation and testing """

        if self.train_subset:

            if self.train_subset_SKUs is not None:
                if len(self.train_subset_SKUs) != self.train_subset:
                    raise ValueError('train_subset_SKUs must have the same length as train_subset')
                train_SKUs = self.train_subset_SKUs
                # check that all train_SKUs are in demand.collumns
                if not set(train_SKUs).issubset(self.demand.columns):
                    raise ValueError('train_subset_SKUs must be a subset of all training SKUs')
                if self.in_sample_val_test_SKUs is not None:
                    if not set(self.in_sample_val_test_SKUs).issubset(train_SKUs):
                        raise ValueError('train_subset_SKUs must contain in_sample_val_test_SKUs')
            else:
                if self.in_sample_val_test_SKUs is not None and self.train_subset <= len(self.in_sample_val_test_SKUs):
                    raise ValueError('train_subset must be equal or larger than the number of in_sample_val_test_SKUs')
                train_SKUs = self.in_sample_val_test_SKUs if self.in_sample_val_test_SKUs is not None else []
                remaining_SKUs = self.demand.columns.difference(train_SKUs)
                additional_SKUs = np.random.choice(remaining_SKUs, self.train_subset-len(train_SKUs), replace=False)
                train_SKUs = np.concatenate((train_SKUs, additional_SKUs))
    
        else:
            train_SKUs = self.demand.columns # val and test SKUs have been removed before, only training SKUs remain
        
        self.train_SKUs = train_SKUs

        self.train_SKUs_indices = self.demand.columns.get_indexer(self.train_SKUs)
        if self.in_sample_val_test_SKUs is not None:
            self.in_sample_val_test_SKUs_indices = self.demand.columns.get_indexer(self.in_sample_val_test_SKUs)
        

    @staticmethod
    def build_engineered_SKU_features(engineered_SKU_features: List, demand: pd.DataFrame):

        """
        Create engineered features for each SKU
        """

        feature_names = []
        feature_values = []

        for feature in engineered_SKU_features:

            if feature == "mean_demand":
                mean_demand = demand.mean(axis=0)
            elif feature == "std_demand":
                std_demand = demand.std(axis=0)
            elif feature == "kurtosis_demand":
                kurtosis_demand = demand.kurtosis(axis=0)
            elif feature == "skewness_demand":
                skewness_demand = demand.skew(axis=0)
            elif feature == "percentile_10_demand":
                percentile_10_demand = demand.quantile(0.1, axis=0)
            elif feature == "percentile_30_demand":
                percentile_30_demand = demand.quantile(0.3, axis=0)
            elif feature == "median_demand":
                median_demand = demand.median(axis=0)
            elif feature == "percentile_70_demand":
                percentile_70_demand = demand.quantile(0.7, axis=0)
            elif feature == "percentile_90_demand":
                percentile_90_demand = demand.quantile(0.9, axis=0)
            elif feature == "inter_quartile_range":
                inter_quartile_range = demand.quantile(0.75, axis=0) - demand.quantile(0.25, axis=0)
            else:  
                raise ValueError(f'Feature {feature} not recognized')
            
            feature_names.append(feature)
            feature_values.append(locals()[feature])

        return pd.DataFrame(feature_values, columns=demand.columns, index=feature_names)
    
    def normalize_features_in_sample(self,
        normalize: bool = True,
        ignore_one_hot: bool = True,
        initial_normalization = False # Flag if it is set before having added lag features
        ):

        """
        Standardize demand, SKU features, time features and time-SKU features in place.

        One StandardScaler is kept for demand, one for the SKU features, one for the
        time features and one per time-SKU feature. Scalers of time-indexed data are
        fitted on the training rows only (positions 0 .. train_index_end) and then
        applied to all rows. Columns for which `is_one_hot` / `is_one_hot_across_skus`
        returns True are left unchanged.

        NOTE(review): `ignore_one_hot` is currently not consulted - one-hot columns
        are always skipped, whatever its value.

        Raises:
            ValueError: if the in-sample features were already normalized.
            NotImplementedError: if called with `initial_normalization=False`.
        """

        if not normalize:
            return

        if self.normalized_in_sample_SKUs:
            raise ValueError('Features already normalized')

        if not initial_normalization:
            # Rejected before any scaler is created, so an unsupported call cannot
            # overwrite scalers that already exist on the object.
            raise NotImplementedError('Training data can only normalized during initialization - later normlization not implemented yet')

        # Exclusive positional end of the training rows
        train_end = self.train_index_end+1

        self.scaler_demand = StandardScaler()
        self.scaler_SKU_features = StandardScaler()
        self.scaler_time_features = StandardScaler()
        self.scaler_time_SKU_features= [StandardScaler() for _ in range(self.num_time_SKU_features)]

        logging.info("--Normalizing demand")
        self.scaler_demand.fit(self.demand.iloc[:train_end])
        transformed_demand = self.scaler_demand.transform(self.demand)
        self.demand.iloc[:,:] = transformed_demand

        logging.info("--Normalizing SKU features")
        continuous_features = [col for col in self.SKU_features.columns if not self.is_one_hot(self.SKU_features[col])]
        if len(continuous_features) > 0:
            self.scaler_SKU_features.fit(self.SKU_features[continuous_features]) # SKU features are already calculated based on training index
            transformed_SKU_features = self.scaler_SKU_features.transform(self.SKU_features[continuous_features])
            self.SKU_features[continuous_features] = transformed_SKU_features

        logging.info("--Normalizing time features")
        continuous_features = [col for col in self.time_features.columns if not self.is_one_hot(self.time_features[col])]
        if len(continuous_features) > 0:
            # Positional slice: a label-based `.loc[:train_end]` is inclusive and would
            # pull one extra (non-training) row into the fit.
            self.scaler_time_features.fit(self.time_features[continuous_features].iloc[:train_end])
            transformed_time_features = self.scaler_time_features.transform(self.time_features.loc[:,continuous_features])
            self.time_features.loc[:,continuous_features] = transformed_time_features

        logging.info("--Normalizing time-SKU features")
        # Normalize time-SKU features (double-indexed)
        for i, feature in enumerate(self.time_SKU_features.columns.get_level_values(0).unique()):
            # Select all columns corresponding to the current feature in level 0
            feature_df = self.time_SKU_features.xs(key=feature, axis=1, level=0)
            if not self.is_one_hot_across_skus(feature_df):
                self.scaler_time_SKU_features[i].fit(feature_df.iloc[:train_end])
                transformed_feature_df = self.scaler_time_SKU_features[i].transform(feature_df)
                self.time_SKU_features.loc[:, (feature, slice(None))] = transformed_feature_df

        self.normalized_in_sample_SKUs = True

    def update_lag_features(self,
        lag_window: int,
        ):

        """
        Placeholder for changing the lag window of an already initialized dataloader.

        Always raises NotImplementedError; `lag_window` is not used yet.
        """

        # Open problem: updating lag_features naively would shorten the dataset
        # each time this method is called.
        raise NotImplementedError('Not implemented yet')

    def get_time_SKU_idx(self, idx):

        """
        Translate a dataset-relative index into an absolute time index and SKU indices.

        The mapping depends on `self.dataset_type`:

        - "train" with `SKU_as_batch`: `idx` selects one (SKU, time) pair from
          `self.sku_time_index`; a single-element SKU list is returned.
        - "train" otherwise: `idx` is a time offset and all training SKUs are returned.
          In both train cases `train_index_start` is added to the time index.
        - "val" / "test": `idx` is an offset from `val_index_start` / `test_index_start`.
          The in-sample val/test SKUs are returned when defined, else the training SKUs.

        Returns:
            Tuple `(idx_time, idx_skus)`.

        Raises:
            IndexError: if `idx` points past the end of the selected split.
            ValueError: if `dataset_type` is none of "train", "val", "test".
        """

        if self.dataset_type == "train":

            if self.SKU_as_batch:
                if idx >= len(self.sku_time_index):
                    raise IndexError(f'index {idx} out of range{len(self.sku_time_index)}')
                idx_sku, idx_time = self.sku_time_index[idx]
                idx_skus = [idx_sku]

            else:
                if idx+self.train_index_start > self.train_index_end:
                    raise IndexError(f'index {idx} out of range{self.train_index_end-self.train_index_start}')
                idx_skus = self.train_SKUs_indices
                idx_time = idx
            idx_time += self.train_index_start

        elif self.dataset_type == "val":
            # The validation split ends where the test split starts, or at the end
            # of the data when no test split is defined.
            val_end = len(self.demand) if self.test_index_start is None else self.test_index_start
            idx_time = idx + self.val_index_start
            # Compare the absolute time index: `idx` itself is relative to the split start
            if idx_time >= val_end:
                raise IndexError(f'index{idx} out of range{val_end-self.val_index_start}')
            idx_skus = self._val_test_SKU_indices()

        elif self.dataset_type == "test":
            idx_time = idx + self.test_index_start
            if idx_time >= len(self.demand):
                raise IndexError(f'index{idx} out of range{len(self.demand)-self.test_index_start}')
            idx_skus = self._val_test_SKU_indices()

        else:
            raise ValueError('dataset_type not set')

        return idx_time, idx_skus

    def _val_test_SKU_indices(self):

        """ SKU indices used for validation and test: the in-sample val/test SKUs if defined, else the training SKUs """

        if self.in_sample_val_test_SKUs is not None:
            return self.in_sample_val_test_SKUs_indices
        return self.train_SKUs_indices

    def __getitem__(self, idx):

        """
        Build the feature item and the demand target for one index.

        The index is resolved by `get_time_SKU_idx` for the current dataset type.
        For every lag step t in 0..lag_window the features of time step
        `idx_time - t` are stacked along the feature axis in this order:
        SKU features, time features (repeated for every SKU), time-SKU features,
        then - if enabled - the previous step's demand (`include_y`) and the
        availability mask (`include_non_available`, always the last row).

        Returns:
            Tuple `(item, demand)`. `item` is allocated with shape
            (1, lag_window+1, num_features, num_skus); the lag axis is dropped when
            `lag_window` is 0 and the SKU axis is dropped when `SKU_as_batch` is set.
            `demand` is `self.demand[idx_time, idx_skus]`.

        Raises:
            ValueError: if lagged demand is requested for a time step before the
                first row, or if `SKU_as_batch` is set but more than one SKU was
                selected.
        """

        lag_window = self.lag_window_params["lag_window"]
        include_y = self.lag_window_params["include_y"]

        idx_time, idx_skus = self.get_time_SKU_idx(idx)
        num_skus = len(idx_skus)

        demand = self.demand[idx_time, idx_skus]

        # SKU features do not depend on the time step: look them up once
        SKU_features = self.SKU_features[idx_skus].transpose()
        len_SKU_features = len(SKU_features)

        # For every selected SKU, its column within each time-SKU feature.
        # assumes the columns are ordered feature-major with len(self.train_SKUs)
        # columns per feature - TODO confirm
        SKUs_per_feature = len(self.train_SKUs)
        SKU_columns = [
            [SKUs_per_feature*feature_pos+idx_sku for feature_pos in range(self.num_time_SKU_features)]
            for idx_sku in idx_skus
        ]

        item = np.empty((1,lag_window+1, self.num_features, num_skus))

        for t in range(lag_window+1):

            item_t = np.empty((1, self.num_features, num_skus))
            idx_time_t = idx_time-t

            if include_y:
                # An explicit error instead of `assert`: the check must survive `python -O`,
                # and a negative index would silently wrap around to the end of the data.
                if idx_time_t-1 < 0:
                    raise ValueError(f'lagged demand requested for time index {idx_time_t-1}, before the start of the data')
                lag_demand = self.demand[idx_time_t-1, idx_skus]

            time_features = self.time_features[idx_time_t]
            # repeat time features for all SKUs with SKU as last dimension
            time_features = np.repeat(time_features[:, np.newaxis], num_skus, axis=1)
            len_time_features = len(time_features)

            time_SKU_features = np.empty((self.num_time_SKU_features, num_skus))
            for sku_pos, columns in enumerate(SKU_columns):
                time_SKU_features[:,sku_pos] = self.time_SKU_features[idx_time_t, columns]

            time_start = len_SKU_features
            time_SKU_start = time_start+len_time_features
            time_SKU_end = time_SKU_start+self.num_time_SKU_features

            item_t[:,:time_start,:] = SKU_features
            item_t[:,time_start:time_SKU_start,:] = time_features
            item_t[:,time_SKU_start:time_SKU_end,:] = time_SKU_features

            # Lagged demand is only written when it was actually computed above
            if self.include_non_available:
                if include_y:
                    item_t[:,-2,:] = lag_demand
                # NOTE(review): the mask is read at idx_time for every lag step, not at
                # idx_time_t - confirm this is intended.
                item_t[:,-1,:] = self.mask[idx_time,idx_skus]
            elif include_y:
                item_t[:,-1,:] = lag_demand

            item[:,t,:,:] = item_t

        if lag_window == 0:
            # exactly one time step was built, so the lag axis carries no information
            item = item.squeeze(1)
        if self.SKU_as_batch:
            if item.shape[-1] == 1:
                item = item.squeeze(-1)
            else:
                raise ValueError('SKU as batch, but item has more than one SKU dimension')

        return item, demand

    def __len__(self):
        """ Number of entries in `self.demand` along its first axis, independent of the dataset type """
        num_rows = len(self.demand)
        return num_rows
    
    @property
    def X_shape(self):
        """ Shape tuple `(len_train_time, num_features, num_units)` built from the loader's attributes """
        # `len_train_time` is an attribute of the loader; the bare name raised a NameError
        return (self.len_train_time, self.num_features, self.num_units)
    
    @property
    def Y_shape(self):
        """ Shape tuple `(len_train_time, num_units)` built from the loader's attributes """
        # `len_train_time` is an attribute of the loader; the bare name raised a NameError
        return (self.len_train_time, self.num_units)

    @property
    def len_train(self):
        """
        Number of training items: the number of entries in `sku_time_index` when
        `SKU_as_batch` is set, otherwise `len_train_time`.
        """
        if not self.SKU_as_batch:
            return self.len_train_time
        return len(self.sku_time_index)

    @property
    def len_val(self):
        """
        Number of time steps in the validation split.

        The split runs from `val_index_start` up to `test_index_start`, or up to the
        end of `self.demand` when no test split is defined.

        Raises:
            ValueError: if no validation set is defined.
        """
        if self.val_index_start is None:
            raise ValueError('no validation set defined')
        # Without a test split, `test_index_start` is None and cannot be subtracted from
        val_end = len(self.demand) if self.test_index_start is None else self.test_index_start
        return val_end-self.val_index_start

    @property
    def len_test(self):
        """
        Number of time steps from `test_index_start` to the end of `self.demand`.

        Raises:
            ValueError: if no test set is defined.
        """
        test_start = self.test_index_start
        if test_start is not None:
            return len(self.demand)-test_start
        raise ValueError('no test set defined')

    def get_all_X(
        self,
        dataset_type: str = 'train', # can be 'train', 'val', 'test', 'all'
    ):

        """
        Placeholder for returning the entire features dataset
        (either the train, val, test, or all data).

        Always raises NotImplementedError; `dataset_type` is not used yet.
        """

        # TODO: the earlier draft returned a copy of the requested slice of the
        # features (split at val_index_start / test_index_start) and raised
        # ValueError('dataset_type not recognized') for any other value.
        raise NotImplementedError('Not implemented yet')

    def get_all_Y(
        self,
        dataset_type: str = 'train', # can be 'train', 'val', 'test', 'all'
    ):

        """
        Placeholder for returning the entire target dataset
        (either the train, val, test, or all data).

        Always raises NotImplementedError; `dataset_type` is not used yet.
        """

        # TODO: the earlier draft returned a copy of the requested slice of the
        # targets (split at val_index_start / test_index_start) and raised
        # ValueError('dataset_type not recognized') for any other value.
        raise NotImplementedError('Not implemented yet')

    @staticmethod
    def is_one_hot(column):
        """ True if every distinct value of `column` is 0 or 1 (a constant 0 or constant 1 column also qualifies) """
        distinct_values = column.unique()
        return {0, 1}.issuperset(distinct_values)

    @staticmethod
    def is_one_hot_across_skus(feature_df):
        """
        Check whether every value of a feature, across all SKU_ids, is 0 or 1.

        The test is a subset test: a frame holding only zeros or only ones (or no
        values at all) also qualifies.

        feature_df: DataFrame slice for a specific feature with SKU_ids as columns.
        """
        # Vectorized membership test on the underlying array; building a Python set
        # from every single cell is very slow for frames with many rows and SKUs.
        return bool(np.isin(feature_df.values, (0, 1)).all())

    @staticmethod
    def save_indices(df):
        """
        Return the row index and the column index of a DataFrame in a dict
        with the keys 'rows' and 'columns'.
        """
        row_labels, column_labels = df.index, df.columns
        return dict(rows=row_labels, columns=column_labels)

In [None]:
# Scratch cell (not exported): load the raw inputs used to try out the loader below.
from ddopnew.datasets.kaggle_m5 import KaggleM5DatasetLoader

# Machine-specific location of the data; set to None to skip loading.
data_path = "/Users/magnus/Documents/02_PhD/Reinforcement_Learning/general_purpose_drl/Newsvendor/kaggle_data" # For testing purposes, please specify the path to the data on your machine
if data_path is not None:
    loader = KaggleM5DatasetLoader(data_path, overwrite=False, product_as_feature=False)
    # The five returned objects are passed on, in this order, to the loader constructed in the next cell
    demand, SKU_features, time_features, time_SKU_features, mask = loader.load_dataset()


INFO:root:Using existing data from disk
INFO:root:Importing data


INFO:root:Preprocessing data
INFO:root:--Creating catogory mapping and features
INFO:root:--Preparing sales time series data
INFO:root:--Preparing calendric information
INFO:root:--Preparing snap features
INFO:root:--Preparing price information
INFO:root:--Creating indicator table if products are available for purchase
INFO:root:--Preparing final outputs and ensure consistency of time and feature dimensions


In [None]:
# Hold out the last 300 time steps: the first 200 of them for validation, the final 100 for testing.
val_index_start = len(demand)-300
test_index_start = len(demand)-100

# SKUs handed to the loader as out-of-sample validation / test SKUs
out_of_sample_val_SKUs = ["HOBBIES_1_002_CA_1", "HOBBIES_1_003_CA_1"]
out_of_sample_test_SKUs = ["HOBBIES_1_005_CA_1", "FOODS_3_819_WI_3"]

# Copies are passed so that re-running this cell starts from the unmodified inputs
# (presumably because the loader normalizes its inputs in place - verify against the constructor).
dataloader = MultiShapeLoader(
    demand.copy(),
    SKU_features.copy(),
    time_features.copy(),
    time_SKU_features.copy(),
    mask.copy(),
    val_index_start=val_index_start,
    test_index_start=test_index_start,
    # in_sample_val_test_SKUs=["FOODS_3_825_WI_3"],
    out_of_sample_val_SKUs=out_of_sample_val_SKUs,
    out_of_sample_test_SKUs=out_of_sample_test_SKUs,
    lag_window_params = {'lag_window': 5, 'include_y': True, 'pre_calc': False},
    # train_subset=300,
    # train_subset_SKUs=["HOBBIES_1_001_CA_1", "HOBBIES_1_012_CA_1"],
    SKU_as_batch = True
    )

INFO:root:Setting indices for validation and test set
INFO:root:Setting out-of-sample SKUs
INFO:root:Identifying training SKUs
INFO:root:Creating engineered SKU features for training data
INFO:root:Normalizing in-sample SKU features (based on training timesteps)
INFO:root:--Normalizing demand
INFO:root:--Normalizing SKU features
INFO:root:--Normalizing time features
INFO:root:--Normalizing time-SKU features
INFO:root:Creating time-SKU index for training data


In [None]:
# Smoke test: fetch a single training item through the subscript protocol
dataloader[49844609] #986 with non-zero lag demand

1634 30485
1640 [30485]


(array([[[ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
           0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
           0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
           0.00000000e+00,  1.00000000e+00, -2.03228586e-01,
          -4.43128224e-02, -2.07535661e-01, -1.26581336e-01,
          -5.62582701e-02, -1.35647453e-01, -2.35129336e-01,
          -3.53059783e-01, -1.48281536e-01, -4.23038458e-01,
           1.62981158e+00,  1.72888660e+00,  0.00000000e+00,
           0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
           1.00000000e+0

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()