# Tabular dataloaders

> Dataloaders for tabular data

In [None]:
#| default_exp dataloaders.tabular

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export

import numpy as np
from abc import ABC, abstractmethod
from typing import Union

from ddopnew.dataloaders.base import BaseDataLoader

from sklearn.preprocessing import StandardScaler

In [None]:
#| export
class XYDataLoader(BaseDataLoader):

    """

    A class for datasets with the typical X, Y structure. Both X
    and Y are numpy arrays. X may be of shape (datapoints, features) or (datapoints, sequence_length, features) 
    if lag features are used. The prep_lag_features can be used to create those lag features. Y is of shape
    (datapoints, units).

    The data is split chronologically into train / val / test by the two optional start indices:
    rows before ``val_index_start`` (or ``test_index_start`` if no validation set is given) are
    training data, rows from ``val_index_start`` up to ``test_index_start`` are validation data
    and rows from ``test_index_start`` onwards are test data.

    """
    
    def __init__(self,
        X: np.ndarray,
        Y: np.ndarray,
        val_index_start: Union[int, None] = None, 
        test_index_start: Union[int, None] = None, 
        lag_window_params: Union[dict, None] = None, # default: {'lag_window': 0, 'include_y': False, 'pre_calc': False}
        normalize_features: Union[dict, None] = None, # default: {'normalize': True, 'ignore_one_hot': True}
    ):

        # Bring X and Y into 2D shape *before* any processing. Doing this afterwards (and on the
        # raw arguments) would overwrite the normalized / lagged arrays with the raw input.

        # X must at least have datapoint and feature dimension
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        
        # Y must have at least datapoint and unit dimension (even if only one unit is present)
        if len(Y.shape) == 1:
            Y = Y.reshape(-1, 1)

        assert len(X) == len(Y), 'X and Y must have the same length'

        self.X = X
        self.Y = Y

        self.val_index_start = val_index_start
        self.test_index_start = test_index_start

        # train index ends either at the start of the validation set, the start of the test set or at the end of the dataset
        if self.val_index_start is not None:
            self.train_index_end = self.val_index_start-1
        elif self.test_index_start is not None:
            self.train_index_end = self.test_index_start-1
        else:
            self.train_index_end = len(Y)-1

        self.dataset_type = "train"

        normalize_features = normalize_features or {'normalize': True, 'ignore_one_hot': True}
        lag_window_params = lag_window_params or {'lag_window': 0, 'include_y': False, 'pre_calc': False}

        # Normalization has to happen on the 2D feature matrix, i.e., before lag features are added
        self.normalize_features(**normalize_features, initial_normalization=True)
        self.prep_lag_features(**lag_window_params)

        self.num_units = self.Y.shape[1] # shape 0 is always time, shape 1 is the number of units (e.g., SKUs)

        super().__init__()

    def normalize_features(self,
        normalize: bool = True,
        ignore_one_hot: bool = True,
        initial_normalization=False # Flag if it is set before having added lag features
        ):

        """
        Normalize features using a standard scaler. The scaler is fitted on the training rows only
        and then applied to the entire dataset. If ignore_one_hot is true, one-hot encoded features
        are not normalized; a column is treated as one-hot encoded if it only contains the values
        0 and 1.

        Raises a ValueError if X already contains lag features (3 dimensions) and a
        NotImplementedError if normalization is requested after initialization
        (initial_normalization=False), since that case is not supported yet.

        """

        if not normalize:
            return

        if not initial_normalization:
            raise NotImplementedError('Normalization after lag features have been set not implemented yet')

            # Idea:
                # remove time dimension
                # normalize features
                # add time_dimension back
            # Problem:
                # usage of prep_lag_features needs to ensure y is not added a second time

        if len(self.X.shape) == 3:
            raise ValueError('Normalization not possible with lag features. Please set initial_normalization=False')

        # Work on a float copy so that the array passed in by the caller is not modified
        X_scaled = np.array(self.X, dtype=float)

        if ignore_one_hot:
            # columns that only contain 0 and 1 are treated as one-hot encoded and left untouched
            one_hot_columns = np.all((X_scaled == 0) | (X_scaled == 1), axis=0)
            columns_to_scale = ~one_hot_columns
        else:
            columns_to_scale = np.ones(X_scaled.shape[1], dtype=bool)

        if columns_to_scale.any():
            scaler = StandardScaler()
            scaler.fit(X_scaled[:self.train_index_end+1][:, columns_to_scale]) # +1 to include the last training point
            # transform returns a new array, the result must be stored
            X_scaled[:, columns_to_scale] = scaler.transform(X_scaled[:, columns_to_scale])

        self.X = X_scaled

    def _shift_split_indices(self, offset: int):

        """ Move the split indices forward by offset rows after rows were dropped from the start of the data """

        # val and test set are optional, only shift the indices that are defined
        if self.val_index_start is not None:
            self.val_index_start = self.val_index_start-offset
        if self.test_index_start is not None:
            self.test_index_start = self.test_index_start-offset
        self.train_index_end = self.train_index_end-offset

    def prep_lag_features(self,
        lag_window: int = 0, # length of the lag window
        include_y: bool = False, # if lag demand shall be included as feature
        pre_calc: bool = False # if all lags are pre-calculated for the entire dataset
        ):

        """
        Create lag features for the dataset. If "include_y" is true, then a lag-1 of the target variable is added as a feature.
        If lag_window is > 0, the lag features are added as middle dimension to X. Note that this, e.g., means that with a lag
        window of 1, the data will include 2 time steps, the current features including lag-1 demand and the lag-1 features
        including lag-2 demand. If pre_calc is true, all these calculations are performed on the entire dataset to reduce
        computation time later on at the expense of increased memory usage. 

        Rows at the start of the dataset for which no complete lag history exists are dropped and
        the train / val / test indices are shifted accordingly. If pre_calc is false, no lag features
        are created and lag_window / include_y are reset.

        """
        # to be discussed: Do we need option to only provide lag demand without lag features?
        self.lag_window = lag_window
        self.pre_calc = pre_calc
        self.include_y = include_y
        
        if self.pre_calc:
            if self.include_y:
                # add additional column(s) to X with demand shifted by 1; the first row has no
                # previous demand and is therefore dropped from X and Y
                self.X = np.concatenate((self.X[1:], self.Y[:-1]), axis=1)
                self.Y = self.Y[1:]

                self._shift_split_indices(1)
        
            if self.lag_window is not None and self.lag_window > 0:

                # add lag features as dimension 2 to X (making it dimension (datapoints, sequence_length, features))
                # position lag_window on the sequence axis holds the current features, position 0 the oldest lag
                X_lag = np.zeros((self.X.shape[0], self.lag_window+1, self.X.shape[1]))
                for i in range(self.lag_window+1):
                    if i == 0:
                        features = self.X
                    else:    
                        features = self.X[:-i, :]
                    X_lag[i:, self.lag_window-i, :] = features

                # the first lag_window rows have an incomplete history and are dropped
                self.X = X_lag[self.lag_window:]
                self.Y = self.Y[self.lag_window:]

                self._shift_split_indices(self.lag_window)

        else:
            self.lag_window = None
            self.include_y = False
            # add time dimension to X

    def update_lag_features(self,
        lag_window: int,
        ):

        """ Update lag window parameters for dataloader object that is already initialized """

        raise NotImplementedError('Not implemented yet')

        # Problem: updating lag_features naively would shorten the dataset each time it is called

    def __getitem__(self, idx): 

        """
        get item by index, depending on the dataset type (train, val, test)

        The index is relative to the start of the currently active set. Raises an IndexError if
        the index lies beyond the end of the active set and a ValueError if the active set is not
        defined or dataset_type is unknown.
        """

        if self.dataset_type == "train":
            if idx > self.train_index_end:
                raise IndexError(f'index {idx} out of range{self.train_index_end}')

        elif self.dataset_type == "val":
            if self.val_index_start is None:
                raise ValueError('no validation set defined')

            idx = idx + self.val_index_start

            # without a test set, the validation set runs until the end of the data
            val_index_end = len(self.X) if self.test_index_start is None else self.test_index_start
            
            if idx >= val_index_end:
                raise IndexError(f'index{idx} out of range{val_index_end}')
            
        elif self.dataset_type == "test":
            if self.test_index_start is None:
                raise ValueError('no test set defined')

            idx = idx + self.test_index_start
            
            if idx >= len(self.X):
                raise IndexError(f'index{idx} out of range{len(self.X)}')
        
        else:
            raise ValueError('dataset_type not set')

        return self.X[idx], self.Y[idx]

    def __len__(self):
        """ Number of datapoints in the entire dataset (train, val and test combined) """
        return len(self.X)
    
    @property
    def X_shape(self):
        """ Shape of the entire feature array """
        return self.X.shape
    
    @property
    def Y_shape(self):
        """ Shape of the entire target array """
        return self.Y.shape

    @property
    def len_train(self):
        """ Number of datapoints in the training set """
        return self.train_index_end+1

    @property
    def len_val(self):
        """ Number of datapoints in the validation set, raises a ValueError if none is defined """
        if self.val_index_start is None:
            raise ValueError('no validation set defined')
        # without a test set, the validation set runs until the end of the data
        val_index_end = len(self.Y) if self.test_index_start is None else self.test_index_start
        return val_index_end-self.val_index_start

    @property
    def len_test(self):
        """ Number of datapoints in the test set, raises a ValueError if none is defined """
        if self.test_index_start is None:
            raise ValueError('no test set defined')
        return len(self.Y)-self.test_index_start

    def _select_split(self, data, dataset_type: str):

        """ Return a copy of the rows of data that belong to dataset_type ('train', 'val', 'test' or 'all') """

        if dataset_type not in ('train', 'val', 'test', 'all'):
            raise ValueError('dataset_type not recognized')

        if data is None:
            return None

        if dataset_type == 'all':
            return data.copy()

        if dataset_type == 'train':
            # use train_index_end rather than val_index_start: if only a test set is defined,
            # val_index_start is None and slicing with it would return the test rows as well
            return data[:self.train_index_end+1].copy()

        if dataset_type == 'val':
            if self.val_index_start is None:
                raise ValueError('no validation set defined')
            # a test_index_start of None slices until the end of the data
            return data[self.val_index_start:self.test_index_start].copy()

        if self.test_index_start is None:
            raise ValueError('no test set defined')
        return data[self.test_index_start:].copy()

    def get_all_X(self,
                dataset_type: str = 'train' # can be 'train', 'val', 'test', 'all'
                ): 

        """
        Returns the entire features dataset.
        Return either the train, val, test, or all data.
        A copy is returned, so changing the result does not change the dataloader.
        Raises a ValueError if dataset_type is unknown or the requested set is not defined.
        """

        return self._select_split(self.X, dataset_type)

    def get_all_Y(self,
                dataset_type: str = 'train' # can be 'train', 'val', 'test', 'all'
                ): 

        """
        Returns the entire target dataset.
        Return either the train, val, test, or all data.
        A copy is returned, so changing the result does not change the dataloader.
        Raises a ValueError if dataset_type is unknown or the requested set is not defined.
        """

        return self._select_split(self.Y, dataset_type)
        

In [None]:
show_doc(XYDataLoader, title_level=2)

---

[source](https://github.com/opimwue/ddopnew/blob/main/ddopnew/dataloaders/tabular.py#L16){target="_blank" style="float:right; font-size:smaller"}

## XYDataLoader

>      XYDataLoader (X:numpy.ndarray, Y:numpy.ndarray,
>                    val_index_start:Optional[int]=None,
>                    test_index_start:Optional[int]=None,
>                    lag_window_params:dict=None, normalize_features:dict=None)

*A class for datasets with the typical X, Y structure. Both X
and Y are numpy arrays. X may be of shape (datapoints, features) or (datapoints, sequence_length, features) 
if lag features are used. The prep_lag_features can be used to create those lag features. Y is of shape
(datapoints, units).*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| X | ndarray |  |  |
| Y | ndarray |  |  |
| val_index_start | Optional | None |  |
| test_index_start | Optional | None |  |
| lag_window_params | dict | None | default: {'lag_window': 0, 'include_y': False, 'pre_calc': False} |
| normalize_features | dict | None | default: {'normalize': True, 'ignore_one_hot': True} |

In [None]:
show_doc(XYDataLoader.prep_lag_features)

---

[source](https://github.com/opimwue/ddopnew/blob/main/ddopnew/dataloaders/tabular.py#L107){target="_blank" style="float:right; font-size:smaller"}

### XYDataLoader.prep_lag_features

>      XYDataLoader.prep_lag_features (lag_window:int=0, include_y:bool=False,
>                                      pre_calc:bool=False)

*Create lag features for the dataset. If "include_y" is true, then a lag-1 of the target variable is added as a feature.
If lag_window is > 0, the lag features are added as middle dimension to X. Note that this, e.g., means that with a lag
window of 1, the data will include 2 time steps, the current features including lag-1 demand and the lag-1 features
including lag-2 demand. If pre_calc is true, all these calculations are performed on the entire dataset to reduce
computation time later on at the expense of increased memory usage.*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| lag_window | int | 0 | length of the lag window |
| include_y | bool | False | if lag demand shall be included as feature |
| pre_calc | bool | False | if all lags are pre-calculated for the entire dataset |

In [None]:
show_doc(XYDataLoader.__getitem__)

---

[source](https://github.com/opimwue/ddopnew/blob/main/ddopnew/dataloaders/tabular.py#L169){target="_blank" style="float:right; font-size:smaller"}

### XYDataLoader.__getitem__

>      XYDataLoader.__getitem__ (idx)

*get item by index, depending on the dataset type (train, val, test)*

In [None]:
show_doc(XYDataLoader.get_all_X)

---

[source](https://github.com/opimwue/ddopnew/blob/main/ddopnew/dataloaders/tabular.py#L222){target="_blank" style="float:right; font-size:smaller"}

### XYDataLoader.get_all_X

>      XYDataLoader.get_all_X (dataset_type:str='train')

*Returns the entire features dataset.
Return either the train, val, test, or all data.*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| dataset_type | str | train | can be 'train', 'val', 'test', 'all' |

In [None]:
show_doc(XYDataLoader.get_all_Y)

---

[source](https://github.com/opimwue/ddopnew/blob/main/ddopnew/dataloaders/tabular.py#L242){target="_blank" style="float:right; font-size:smaller"}

### XYDataLoader.get_all_Y

>      XYDataLoader.get_all_Y (dataset_type:str='train')

*Returns the entire target dataset.
Return either the train, val, test, or all data.*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| dataset_type | str | train | can be 'train', 'val', 'test', 'all' |

Example usage of ```XYDataLoader``` for a simple dataset:

In [None]:
# Synthetic regression data: 100 datapoints, 2 features, 1 target unit.
# The target is a linear combination of the two features plus standard normal noise.
X = np.random.standard_normal((100, 2))
Y = np.random.standard_normal((100, 1))
Y += 2*X[:,0].reshape(-1, 1) + 3*X[:,1].reshape(-1, 1)

# No val/test indices are given, so all datapoints belong to the training set
dataloader = XYDataLoader(X = X, Y = Y)

# Indexing returns one (features, target) pair
sample_X, sample_Y = dataloader[0]
print("sample:", sample_X, sample_Y)
print("sample shape Y:", sample_Y.shape)

# len() is the number of datapoints in the entire dataset
print("length:", len(dataloader))

sample: [-0.42385982  0.7019158 ] [1.13521106]
sample shape Y: (1,)
length: 100


Example usage of ```XYDataLoader``` on how to handle train, val, and test set:

In [None]:
# Synthetic regression data: 10 datapoints, 2 features, 1 target unit
X = np.random.standard_normal((10, 2))
Y = np.random.standard_normal((10, 1))
Y += 2*X[:,0].reshape(-1, 1) + 3*X[:,1].reshape(-1, 1)

# Rows 0-5 are training data, rows 6-7 validation data and rows 8-9 test data
dataloader = XYDataLoader(X = X, Y = Y, val_index_start=6, test_index_start=8)

sample_X, sample_Y = dataloader[0]

print("length train:", dataloader.len_train, "length val:", dataloader.len_val, "length test:", dataloader.len_test)

# The dataloader starts in train mode, indices are relative to the start of the training set
print("")
print("### Data from train set ###")
for i in range(dataloader.len_train):
    sample_X, sample_Y = dataloader[i]
    print("idx:", i, "data:", sample_X, sample_Y)

# Switch to the validation set (val() is not defined in this file, presumably inherited from
# BaseDataLoader); index 0 now refers to the first validation datapoint
dataloader.val()

print("")
print("### Data from val set ###")
for i in range(dataloader.len_val):
    sample_X, sample_Y = dataloader[i]
    print("idx:", i, "data:", sample_X, sample_Y)

# Switch to the test set; index 0 now refers to the first test datapoint
dataloader.test()

print("")
print("### Data from test set ###")
for i in range(dataloader.len_test):
    sample_X, sample_Y = dataloader[i]
    print("idx:", i, "data:", sample_X, sample_Y)

# Switch back to the training set; the same datapoints as in the first loop are returned
dataloader.train()

print("")
print("### Data from train set again ###")
for i in range(dataloader.len_train):
    sample_X, sample_Y = dataloader[i]
    print("idx:", i, "data:", sample_X, sample_Y)

length train: 6 length val: 2 length test: 2

### Data from train set ###
idx: 0 data: [-0.37166809  0.71075037] [1.65376876]
idx: 1 data: [-0.21804472  0.91068277] [2.33257268]
idx: 2 data: [-1.04216162 -0.0807868 ] [-1.98027247]
idx: 3 data: [-1.62845905  0.313468  ] [-1.9377411]
idx: 4 data: [-0.74723207 -1.00907747] [-5.34868116]
idx: 5 data: [ 0.67368307 -2.17422889] [-6.41373988]

### Data from val set ###
idx: 0 data: [-0.51178383  1.03651835] [2.87778826]
idx: 1 data: [ 0.03225077 -0.03668587] [1.2431435]

### Data from test set ###
idx: 0 data: [-0.25727476 -1.37935847] [-4.14031444]
idx: 1 data: [-0.12235761  2.29179115] [6.23461214]

### Data from train set again ###
idx: 0 data: [-0.37166809  0.71075037] [1.65376876]
idx: 1 data: [-0.21804472  0.91068277] [2.33257268]
idx: 2 data: [-1.04216162 -0.0807868 ] [-1.98027247]
idx: 3 data: [-1.62845905  0.313468  ] [-1.9377411]
idx: 4 data: [-0.74723207 -1.00907747] [-5.34868116]
idx: 5 data: [ 0.67368307 -2.17422889] [-6.41373988]

In [None]:
# | hide
# Smoke test: get_all_X must run for every supported dataset_type.
# Only the result of the last call is displayed as cell output.
dataloader.get_all_X('all')
dataloader.get_all_X('train')
dataloader.get_all_X('val')
dataloader.get_all_X('test')

array([[-0.25727476, -1.37935847],
       [-0.12235761,  2.29179115]])

In [None]:
# | hide

# Smoke test: get_all_Y must run for every supported dataset_type.
# Only the result of the last call is displayed as cell output.
dataloader.get_all_Y('all')
dataloader.get_all_Y('train')
dataloader.get_all_Y('val')
dataloader.get_all_Y('test')

array([[-4.14031444],
       [ 6.23461214]])

Example usage of ```XYDataLoader``` on how to include lag features:

In [None]:
# Synthetic regression data: 10 datapoints, 2 features, 1 target unit
X = np.random.standard_normal((10, 2))
Y = np.random.standard_normal((10, 1))
Y += 2*X[:,0].reshape(-1, 1) + 3*X[:,1].reshape(-1, 1)

# lag_window=1: each sample contains the current and the previous time step
# include_y=True: the demand of the previous period is appended as an additional feature
# pre_calc=True: the lag features are computed once for the entire dataset
lag_window_params = {'lag_window': 1, 'include_y': True, 'pre_calc': True}

dataloader = XYDataLoader(X = X, Y = Y, val_index_start=6, test_index_start=8, lag_window_params=lag_window_params)

sample_X, sample_Y = dataloader[0]

# One row is dropped for the lagged demand and one for the lag window, both at the start of
# the data, so the training set shrinks from 6 to 4 datapoints
print("length train:", dataloader.len_train, "length val:", dataloader.len_val, "length test:", dataloader.len_test)

# Each sample_X has shape (lag_window+1, features+1): the first row is the previous time step,
# the second row the current time step
print("")
print("### Data from train set ###")
for i in range(dataloader.len_train):
    sample_X, sample_Y = dataloader[i]
    print("idx:", i, "data:", sample_X, sample_Y)

# Switch to the validation set (val() is not defined in this file, presumably inherited from
# BaseDataLoader)
dataloader.val()

print("")
print("### Data from val set ###")
for i in range(dataloader.len_val):
    sample_X, sample_Y = dataloader[i]
    print("idx:", i, "data:", sample_X, sample_Y)

# Switch to the test set
dataloader.test()

print("")
print("### Data from test set ###")
for i in range(dataloader.len_test):
    sample_X, sample_Y = dataloader[i]
    print("idx:", i, "data:", sample_X, sample_Y)

# Switch back to the training set
dataloader.train()

print("")
print("### Data from train set again ###")
for i in range(dataloader.len_train):
    sample_X, sample_Y = dataloader[i]
    print("idx:", i, "data:", sample_X, sample_Y)

length train: 4 length val: 2 length test: 2

### Data from train set ###
idx: 0 data: [[-2.05550983  1.00697184 -2.09458771]
 [-0.99485469 -0.51723711 -0.95039455]] [-5.04795275]
idx: 1 data: [[-0.99485469 -0.51723711 -0.95039455]
 [-2.03886764 -0.0509625  -5.04795275]] [-4.11651972]
idx: 2 data: [[-2.03886764 -0.0509625  -5.04795275]
 [-0.03441691  0.93914279 -4.11651972]] [2.01628296]
idx: 3 data: [[-0.03441691  0.93914279 -4.11651972]
 [ 0.93748207 -0.61476079  2.01628296]] [1.91951511]

### Data from val set ###
idx: 0 data: [[ 0.93748207 -0.61476079  2.01628296]
 [ 0.97973217  0.08565794  1.91951511]] [2.44694631]
idx: 1 data: [[ 0.97973217  0.08565794  1.91951511]
 [-0.24381243 -0.6729163   2.44694631]] [-1.03887324]

### Data from test set ###
idx: 0 data: [[-0.24381243 -0.6729163   2.44694631]
 [-2.75827219 -0.29493011 -1.03887324]] [-6.96114421]
idx: 1 data: [[-2.75827219 -0.29493011 -1.03887324]
 [-0.20028243  0.73914439 -6.96114421]] [1.84460677]

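### Data from train set again ###
idx: 0 data: [[-2.05550983  1.00697184 -2.09458771]
 [-0.99485469 -0.51723711 -0.95039455]] [-5.04795275]
idx: 1 data: [[-0.99485469 -0.51723711 -0.95039455]
 [-2.03886764 -0.0509625  -5.04795275]] [-4.11651972]
idx: 2 data: [[-2.03886764 -0.0509625  -5.04795275]
 [-0.03441691  0.93914279 -4.11651972]] [2.01628296]
idx: 3 data: [[-0.03441691  0.93914279 -4.11651972]
 [ 0.93748207 -0.61476079  2.01628296]] [1.91951511]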

In [None]:
#| export
class MultiSourceDataLoader(BaseDataLoader):

    """

    A class for datasets that combine multiple sources of data. The data is provided as a list of numpy arrays.
    It converts the data into X,Y pairs at runtime. It also has the capability to handle the multi-product case
    with a meta-learning approach across products.

    NOTE: the current implementation does not yet accept a list of arrays. It takes a single
    feature array X of shape (datapoints, features) and a single target array Y of shape
    (datapoints, units) and splits them chronologically into train / val / test using
    ``val_index_start`` and ``test_index_start``.

    """
    
    def __init__(self,
        X: np.ndarray,
        Y: np.ndarray,
        val_index_start: Union[int, None] = None, 
        test_index_start: Union[int, None] = None, 
        lag_window_params: Union[dict, None] = None, # default: {'lag_window': 0, 'include_y': False, 'pre_calc': False}
        normalize_features: Union[dict, None] = None, # default: {'normalize': True, 'ignore_one_hot': True}
    ):

        # Bring X and Y into 2D shape *before* any processing. Doing this afterwards (and on the
        # raw arguments) would overwrite the normalized / lagged arrays with the raw input.

        # X must at least have datapoint and feature dimension
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        
        # Y must have at least datapoint and unit dimension (even if only one unit is present)
        if len(Y.shape) == 1:
            Y = Y.reshape(-1, 1)

        assert len(X) == len(Y), 'X and Y must have the same length'

        self.X = X
        self.Y = Y

        self.val_index_start = val_index_start
        self.test_index_start = test_index_start

        # train index ends either at the start of the validation set, the start of the test set or at the end of the dataset
        if self.val_index_start is not None:
            self.train_index_end = self.val_index_start-1
        elif self.test_index_start is not None:
            self.train_index_end = self.test_index_start-1
        else:
            self.train_index_end = len(Y)-1

        self.dataset_type = "train"

        normalize_features = normalize_features or {'normalize': True, 'ignore_one_hot': True}
        lag_window_params = lag_window_params or {'lag_window': 0, 'include_y': False, 'pre_calc': False}

        # Normalization has to happen on the 2D feature matrix, i.e., before lag features are added
        self.normalize_features(**normalize_features, initial_normalization=True)
        self.prep_lag_features(**lag_window_params)

        self.num_units = self.Y.shape[1] # shape 0 is always time, shape 1 is the number of units (e.g., SKUs)

        super().__init__()

    def normalize_features(self,
        normalize: bool = True,
        ignore_one_hot: bool = True,
        initial_normalization=False # Flag if it is set before having added lag features
        ):

        """
        Normalize features using a standard scaler. The scaler is fitted on the training rows only
        and then applied to the entire dataset. If ignore_one_hot is true, one-hot encoded features
        are not normalized; a column is treated as one-hot encoded if it only contains the values
        0 and 1.

        Raises a ValueError if X already contains lag features (3 dimensions) and a
        NotImplementedError if normalization is requested after initialization
        (initial_normalization=False), since that case is not supported yet.

        """

        if not normalize:
            return

        if not initial_normalization:
            raise NotImplementedError('Normalization after lag features have been set not implemented yet')

            # Idea:
                # remove time dimension
                # normalize features
                # add time_dimension back
            # Problem:
                # usage of prep_lag_features needs to ensure y is not added a second time

        if len(self.X.shape) == 3:
            raise ValueError('Normalization not possible with lag features. Please set initial_normalization=False')

        # Work on a float copy so that the array passed in by the caller is not modified
        X_scaled = np.array(self.X, dtype=float)

        if ignore_one_hot:
            # columns that only contain 0 and 1 are treated as one-hot encoded and left untouched
            one_hot_columns = np.all((X_scaled == 0) | (X_scaled == 1), axis=0)
            columns_to_scale = ~one_hot_columns
        else:
            columns_to_scale = np.ones(X_scaled.shape[1], dtype=bool)

        if columns_to_scale.any():
            scaler = StandardScaler()
            scaler.fit(X_scaled[:self.train_index_end+1][:, columns_to_scale]) # +1 to include the last training point
            # transform returns a new array, the result must be stored
            X_scaled[:, columns_to_scale] = scaler.transform(X_scaled[:, columns_to_scale])

        self.X = X_scaled

    def _shift_split_indices(self, offset: int):

        """ Move the split indices forward by offset rows after rows were dropped from the start of the data """

        # val and test set are optional, only shift the indices that are defined
        if self.val_index_start is not None:
            self.val_index_start = self.val_index_start-offset
        if self.test_index_start is not None:
            self.test_index_start = self.test_index_start-offset
        self.train_index_end = self.train_index_end-offset

    def prep_lag_features(self,
        lag_window: int = 0, # length of the lag window
        include_y: bool = False, # if lag demand shall be included as feature
        pre_calc: bool = False # if all lags are pre-calculated for the entire dataset
        ):

        """
        Create lag features for the dataset. If "include_y" is true, then a lag-1 of the target variable is added as a feature.
        If lag_window is > 0, the lag features are added as middle dimension to X. Note that this, e.g., means that with a lag
        window of 1, the data will include 2 time steps, the current features including lag-1 demand and the lag-1 features
        including lag-2 demand. If pre_calc is true, all these calculations are performed on the entire dataset to reduce
        computation time later on at the expense of increased memory usage. 

        Rows at the start of the dataset for which no complete lag history exists are dropped and
        the train / val / test indices are shifted accordingly. If pre_calc is false, no lag features
        are created and lag_window / include_y are reset.

        """
        # to be discussed: Do we need option to only provide lag demand without lag features?
        self.lag_window = lag_window
        self.pre_calc = pre_calc
        self.include_y = include_y
        
        if self.pre_calc:
            if self.include_y:
                # add additional column(s) to X with demand shifted by 1; the first row has no
                # previous demand and is therefore dropped from X and Y
                self.X = np.concatenate((self.X[1:], self.Y[:-1]), axis=1)
                self.Y = self.Y[1:]

                self._shift_split_indices(1)
        
            if self.lag_window is not None and self.lag_window > 0:

                # add lag features as dimension 2 to X (making it dimension (datapoints, sequence_length, features))
                # position lag_window on the sequence axis holds the current features, position 0 the oldest lag
                X_lag = np.zeros((self.X.shape[0], self.lag_window+1, self.X.shape[1]))
                for i in range(self.lag_window+1):
                    if i == 0:
                        features = self.X
                    else:    
                        features = self.X[:-i, :]
                    X_lag[i:, self.lag_window-i, :] = features

                # the first lag_window rows have an incomplete history and are dropped
                self.X = X_lag[self.lag_window:]
                self.Y = self.Y[self.lag_window:]

                self._shift_split_indices(self.lag_window)

        else:
            self.lag_window = None
            self.include_y = False
            # add time dimension to X

    def update_lag_features(self,
        lag_window: int,
        ):

        """ Update lag window parameters for dataloader object that is already initialized """

        raise NotImplementedError('Not implemented yet')

        # Problem: updating lag_features naively would shorten the dataset each time it is called

    def __getitem__(self, idx): 

        """
        get item by index, depending on the dataset type (train, val, test)

        The index is relative to the start of the currently active set. Raises an IndexError if
        the index lies beyond the end of the active set and a ValueError if the active set is not
        defined or dataset_type is unknown.
        """

        if self.dataset_type == "train":
            if idx > self.train_index_end:
                raise IndexError(f'index {idx} out of range{self.train_index_end}')

        elif self.dataset_type == "val":
            if self.val_index_start is None:
                raise ValueError('no validation set defined')

            idx = idx + self.val_index_start

            # without a test set, the validation set runs until the end of the data
            val_index_end = len(self.X) if self.test_index_start is None else self.test_index_start
            
            if idx >= val_index_end:
                raise IndexError(f'index{idx} out of range{val_index_end}')
            
        elif self.dataset_type == "test":
            if self.test_index_start is None:
                raise ValueError('no test set defined')

            idx = idx + self.test_index_start
            
            if idx >= len(self.X):
                raise IndexError(f'index{idx} out of range{len(self.X)}')
        
        else:
            raise ValueError('dataset_type not set')

        return self.X[idx], self.Y[idx]

    def __len__(self):
        """ Number of datapoints in the entire dataset (train, val and test combined) """
        return len(self.X)
    
    @property
    def X_shape(self):
        """ Shape of the entire feature array """
        return self.X.shape
    
    @property
    def Y_shape(self):
        """ Shape of the entire target array """
        return self.Y.shape

    @property
    def len_train(self):
        """ Number of datapoints in the training set """
        return self.train_index_end+1

    @property
    def len_val(self):
        """ Number of datapoints in the validation set, raises a ValueError if none is defined """
        if self.val_index_start is None:
            raise ValueError('no validation set defined')
        # without a test set, the validation set runs until the end of the data
        val_index_end = len(self.Y) if self.test_index_start is None else self.test_index_start
        return val_index_end-self.val_index_start

    @property
    def len_test(self):
        """ Number of datapoints in the test set, raises a ValueError if none is defined """
        if self.test_index_start is None:
            raise ValueError('no test set defined')
        return len(self.Y)-self.test_index_start

    def _select_split(self, data, dataset_type: str):

        """ Return a copy of the rows of data that belong to dataset_type ('train', 'val', 'test' or 'all') """

        if dataset_type not in ('train', 'val', 'test', 'all'):
            raise ValueError('dataset_type not recognized')

        if data is None:
            return None

        if dataset_type == 'all':
            return data.copy()

        if dataset_type == 'train':
            # use train_index_end rather than val_index_start: if only a test set is defined,
            # val_index_start is None and slicing with it would return the test rows as well
            return data[:self.train_index_end+1].copy()

        if dataset_type == 'val':
            if self.val_index_start is None:
                raise ValueError('no validation set defined')
            # a test_index_start of None slices until the end of the data
            return data[self.val_index_start:self.test_index_start].copy()

        if self.test_index_start is None:
            raise ValueError('no test set defined')
        return data[self.test_index_start:].copy()

    def get_all_X(self,
                dataset_type: str = 'train' # can be 'train', 'val', 'test', 'all'
                ): 

        """
        Returns the entire features dataset.
        Return either the train, val, test, or all data.
        A copy is returned, so changing the result does not change the dataloader.
        Raises a ValueError if dataset_type is unknown or the requested set is not defined.
        """

        return self._select_split(self.X, dataset_type)

    def get_all_Y(self,
                dataset_type: str = 'train' # can be 'train', 'val', 'test', 'all'
                ): 

        """
        Returns the entire target dataset.
        Return either the train, val, test, or all data.
        A copy is returned, so changing the result does not change the dataloader.
        Raises a ValueError if dataset_type is unknown or the requested set is not defined.
        """

        return self._select_split(self.Y, dataset_type)

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()