# Tabular dataloaders

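> Dataloaders for tabular data held in memory as NumPy arrays, with optional train/validation/test splits and pre-calculated lag features.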

In [None]:
#| default_exp dataloaders.tabular

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export

import numpy as np
from abc import ABC, abstractmethod
from typing import Union

from ddopnew.dataloaders.base import BaseDataLoader

In [None]:
#| export
class XYDataLoader(BaseDataLoader):

    """
    Dataloader serving (features, target) pairs from in-memory NumPy arrays.

    Axis 0 of `X` and `Y` is the sample (time) axis. The rows are split into
    contiguous ranges: a train set starting at row 0, an optional validation
    set starting at `val_index_start` and an optional test set starting at
    `test_index_start`. `self.dataset_type` ("train", "val" or "test") selects
    the range that `__getitem__` reads from; it is presumably switched through
    the `train()` / `val()` / `test()` methods of the base class.
    """

    def __init__(self,
        X: np.ndarray, # features; axis 0 is the sample axis, 1-D input is treated as a single feature column
        Y: np.ndarray, # targets; axis 0 is the sample axis, 1-D input is treated as a single unit
        val_index_start: Union[int, None] = None, # first row of the validation set, None for no validation set
        test_index_start: Union[int, None] = None, # first row of the test set, None for no test set
        lag_window_params: Union[dict, None] = None # keys 'lag_window', 'include_y', 'pre-calc'; None disables lag features
    ):

        # Validate up front with a real exception: an assert would be stripped under -O.
        if len(X) != len(Y):
            raise ValueError('X and Y must have the same length')

        # Bring both arrays to 2-D *before* building lag features, so that the
        # lag logic (which concatenates along axis 1) also works for 1-D input
        # and its result is not overwritten afterwards.
        self.X = X.reshape(-1, 1) if len(X.shape) == 1 else X
        self.Y = Y.reshape(-1, 1) if len(Y.shape) == 1 else Y

        self.val_index_start = val_index_start
        self.test_index_start = test_index_start

        # The train set ends right before the first following split (or at the last row).
        if self.val_index_start is not None:
            self.train_index_end = self.val_index_start-1
        elif self.test_index_start is not None:
            self.train_index_end = self.test_index_start-1
        else:
            self.train_index_end = len(Y)-1

        self.dataset_type = "train"

        self.prep_lag_features(lag_window_params)

        self.num_units = self.Y.shape[1] # shape 0 is always time, shape 1 is the number of units (e.g., SKUs)

        super().__init__()

    def prep_lag_features(self, lag_window_params: Union[dict, None] = None):

        """
        Optionally extend `self.X` with lagged targets and a lag window.

        Recognised keys of `lag_window_params` (missing keys fall back to the
        defaults `None`, `False`, `False`):

        * 'lag_window': number of previous rows to stack in front of each row.
        * 'include_y': append the previous row's target as extra feature column(s).
        * 'pre-calc': only if truthy are the two options above applied here;
          otherwise `self.lag_window` and `self.include_y` are reset.

        Rows without a complete history are dropped from `self.X` and
        `self.Y`, and the split indices are shifted accordingly.
        """

        # to be discussed: Do we need option to only provide lag demand without lag features?
        params = {'lag_window': None, 'include_y': False, 'pre-calc': False}
        if lag_window_params is not None:
            params.update(lag_window_params)

        self.lag_window = params['lag_window']
        self.include_y = params['include_y']
        self.pre_calc = params['pre-calc']

        if not self.pre_calc:
            self.lag_window = None
            self.include_y = False
            return

        if self.include_y:
            # Row t gets the target of row t-1 as additional feature column(s).
            # The first row has no predecessor and is dropped.
            self.X = np.concatenate((self.X[1:], self.Y[:-1]), axis=1)
            self.Y = self.Y[1:]
            self._shift_split_indices(1)

        if self.lag_window is not None and self.lag_window > 0:
            # Add the lag steps as axis 1, giving X the shape
            # (datapoints, lag_window+1, features); the last step is the current row.
            num_windows = max(self.X.shape[0]-self.lag_window, 0)
            X_lag = np.zeros((num_windows, self.lag_window+1, self.X.shape[1]))
            for step in range(self.lag_window+1):
                X_lag[:, step, :] = self.X[step:step+num_windows]
            self.X = X_lag
            self.Y = self.Y[self.lag_window:]
            self._shift_split_indices(self.lag_window)

    def _shift_split_indices(self, offset: int):

        """
        Move the split boundaries `offset` rows towards the start after rows were dropped from the front.
        """

        # Splits that are not defined (None) stay undefined.
        if self.val_index_start is not None:
            self.val_index_start = self.val_index_start-offset
        if self.test_index_start is not None:
            self.test_index_start = self.test_index_start-offset
        self.train_index_end = self.train_index_end-offset

    def _val_index_end(self) -> int:

        """
        Return the exclusive end row of the validation set.
        """

        # Without a test set, the validation set runs to the end of the data.
        if self.test_index_start is not None:
            return self.test_index_start
        return len(self.X)

    def __getitem__(self, idx):

        """
        Return the `(X, Y)` pair at position `idx` of the currently selected split.

        Raises `IndexError` if `idx` lies beyond the end of the split and
        `ValueError` if the selected split is not defined or `dataset_type`
        has an unknown value.
        """

        if self.dataset_type == "train":
            if idx > self.train_index_end:
                raise IndexError(f'index {idx} out of range {self.train_index_end}')

        elif self.dataset_type == "val":
            if self.val_index_start is None:
                raise ValueError('no validation set defined')
            # Translate the split-relative position into a row of the full arrays.
            idx = idx + self.val_index_start

            val_index_end = self._val_index_end()
            if idx >= val_index_end:
                raise IndexError(f'index {idx} out of range {val_index_end}')

        elif self.dataset_type == "test":
            if self.test_index_start is None:
                raise ValueError('no test set defined')
            idx = idx + self.test_index_start

            if idx >= len(self.X):
                raise IndexError(f'index {idx} out of range {len(self.X)}')

        else:
            raise ValueError('dataset_type not set')

        return self.X[idx], self.Y[idx]

    def __len__(self):
        # Number of rows over all splits, not only the currently selected one.
        return len(self.X)

    @property
    def X_shape(self):
        return self.X.shape

    @property
    def Y_shape(self):
        return self.Y.shape

    @property
    def len_train(self):
        return self.train_index_end+1

    @property
    def len_val(self):
        if self.val_index_start is None:
            raise ValueError('no validation set defined')
        return self._val_index_end()-self.val_index_start

    @property
    def len_test(self):
        if self.test_index_start is None:
            raise ValueError('no test set defined')
        return len(self.X)-self.test_index_start

    def get_all_X(self):

        """
        Returns a copy of the entire features dataset. If no X data is available, return None.
        """
        return self.X.copy() if self.X is not None else None

    def get_all_Y(self):

        """
        Returns a copy of the entire target dataset. If no Y data is available, return None.
        """
        return self.Y.copy() if self.Y is not None else None

In [None]:
# Synthetic data: two standard-normal features, target = noise + 2*x0 + 3*x1.
X = np.random.standard_normal(size=(100, 2))
Y = np.random.standard_normal(size=(100, 1))
Y += 2 * X[:, [0]] + 3 * X[:, [1]]

dataloader = XYDataLoader(X=X, Y=Y)

# Indexing the dataloader yields one (features, target) pair.
sample_X, sample_Y = dataloader[0]
print("sample:", sample_X, sample_Y)
print("sample shape Y:", sample_Y.shape)

print("length:", len(dataloader))

sample: [0.97649279 1.11038653] [7.80996828]
sample shape Y: (1,)
length: 100


In [None]:
# Synthetic data: two standard-normal features, target = noise + 2*x0 + 3*x1.
X = np.random.standard_normal(size=(10, 2))
Y = np.random.standard_normal(size=(10, 1))
Y += 2 * X[:, [0]] + 3 * X[:, [1]]

dataloader = XYDataLoader(X=X, Y=Y, val_index_start=6, test_index_start=8)

sample_X, sample_Y = dataloader[0]

print("length train:", dataloader.len_train, "length val:", dataloader.len_val, "length test:", dataloader.len_test)

# One entry per pass: (headline, method that switches the split, name of the length property).
# The first pass runs without switching splits.
walkthrough = (
    ("### Data from train set ###", None, "len_train"),
    ("### Data from val set ###", dataloader.val, "len_val"),
    ("### Data from test set ###", dataloader.test, "len_test"),
    ("### Data from train set again ###", dataloader.train, "len_train"),
)

for headline, switch_split, length_attr in walkthrough:
    if switch_split is not None:
        switch_split()

    print("")
    print(headline)
    for i in range(getattr(dataloader, length_attr)):
        sample_X, sample_Y = dataloader[i]
        print("idx:", i, "data:", sample_X, sample_Y)

length train: 6 length val: 2 length test: 2

### Data from train set ###
idx: 0 data: [1.86140907 0.59972911] [5.97134646]
idx: 1 data: [-0.09913633 -0.46507388] [-1.70772751]
idx: 2 data: [-2.17004601  1.54839745] [0.93603037]
idx: 3 data: [-1.53564039  0.67961273] [0.14033715]
idx: 4 data: [-0.31501817 -0.29763493] [-3.83603639]
idx: 5 data: [-0.88941714 -0.99108722] [-5.05767751]

### Data from val set ###
idx: 0 data: [-1.37271469 -0.80566106] [-6.17871165]
idx: 1 data: [0.08755767 1.00959059] [2.78155005]

### Data from test set ###
idx: 0 data: [-0.91741809 -0.32920412] [-1.95309461]
idx: 1 data: [-2.51029432  1.85533508] [-2.36939189]

### Data from train set again ###
idx: 0 data: [1.86140907 0.59972911] [5.97134646]
idx: 1 data: [-0.09913633 -0.46507388] [-1.70772751]
idx: 2 data: [-2.17004601  1.54839745] [0.93603037]
idx: 3 data: [-1.53564039  0.67961273] [0.14033715]
idx: 4 data: [-0.31501817 -0.29763493] [-3.83603639]
idx: 5 data: [-0.88941714 -0.99108722] [-5.05767751]


In [None]:
# Synthetic data: two standard-normal features, target = noise + 2*x0 + 3*x1.
X = np.random.standard_normal(size=(10, 2))
Y = np.random.standard_normal(size=(10, 1))
Y += 2 * X[:, [0]] + 3 * X[:, [1]]

# Pre-calculate a lag window of one step and add the lagged target as a feature.
lag_window_params = {'lag_window': 1, 'include_y': True, 'pre-calc': True}

dataloader = XYDataLoader(X=X, Y=Y, val_index_start=6, test_index_start=8, lag_window_params=lag_window_params)

sample_X, sample_Y = dataloader[0]

print("length train:", dataloader.len_train, "length val:", dataloader.len_val, "length test:", dataloader.len_test)

# One entry per pass: (headline, method that switches the split, name of the length property).
# The first pass runs without switching splits.
walkthrough = (
    ("### Data from train set ###", None, "len_train"),
    ("### Data from val set ###", dataloader.val, "len_val"),
    ("### Data from test set ###", dataloader.test, "len_test"),
    ("### Data from train set again ###", dataloader.train, "len_train"),
)

for headline, switch_split, length_attr in walkthrough:
    if switch_split is not None:
        switch_split()

    print("")
    print(headline)
    for i in range(getattr(dataloader, length_attr)):
        sample_X, sample_Y = dataloader[i]
        print("idx:", i, "data:", sample_X, sample_Y)

length train: 4 length val: 2 length test: 2

### Data from train set ###
idx: 0 data: [[-1.40876254 -0.14268124 -3.32373933]
 [ 1.3568839   0.19228266 -3.47869167]] [2.83952595]
idx: 1 data: [[ 1.3568839   0.19228266 -3.47869167]
 [ 0.57655503 -0.20559243  2.83952595]] [-1.03795912]
idx: 2 data: [[ 0.57655503 -0.20559243  2.83952595]
 [ 0.64965388 -1.22862093 -1.03795912]] [-2.51057617]
idx: 3 data: [[ 0.64965388 -1.22862093 -1.03795912]
 [-0.29481134  2.06600286 -2.51057617]] [4.69638231]

### Data from val set ###
idx: 0 data: [[-0.29481134  2.06600286 -2.51057617]
 [ 0.4712446  -0.19227017  4.69638231]] [-0.89261003]
idx: 1 data: [[ 0.4712446  -0.19227017  4.69638231]
 [ 0.10975213 -0.01414033 -0.89261003]] [0.32833319]

### Data from test set ###
idx: 0 data: [[ 0.10975213 -0.01414033 -0.89261003]
 [ 0.44967256  0.13099119  0.32833319]] [-0.88167709]
idx: 1 data: [[ 0.44967256  0.13099119  0.32833319]
 [ 0.29672722 -0.4092795  -0.88167709]] [-0.30977552]

### Data from train set a

In [None]:
#| hide
# Export the cells marked with `#| export` to the library module.
import nbdev

nbdev.nbdev_export()