# Python libraries that may be used to solve the test task

In [2]:
import numpy as np

# Baseline implementation of the cross-validation split of a dataset
The `DatasetCV_Baseline` class is a baseline implementation of a K-Fold cross-validation split of a dataset into train and test subsets. It provides two views of each split:
* `train_raw` and `test_raw` methods return the train and test splits of the <b>original untransformed</b> data for each cross-validation fold
* `train_standardized` and `test_standardized` methods return the train and test splits of the <b>standardized</b> dataset for each cross-validation fold

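Each cross-validation split (train or test) of the original untransformed dataset is standardized independently, using the mean $\mu$ and the standard deviation $\sigma$ of that split, where $N$ is the number of samples in the split: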

$$z_i^{i_{fold}}=\frac{x_i^{i_{fold}}-\mu^{i_{fold}}}{\sigma^{i_{fold}}}\qquad\qquad\qquad\mu^{i_{fold}}=\frac{1}{N}\sum_{j=1}^Nx_j^{i_{fold}}\qquad\qquad\qquad\sigma^{i_{fold}}=\sqrt{\frac{1}{N}\sum_{j=1}^N(x_j^{i_{fold}})^2-\left(\frac{1}{N}\sum_{j=1}^Nx_j^{i_{fold}}\right)^2}$$

In [3]:
class DatasetCV_Baseline:
    """Reference (deliberately naive) K-Fold split of a random dataset.

    The dataset is a 2-D array with one row per fold. For a given fold
    index the test split is that row and the train split is every other
    row concatenated into a single 1-D array. All statistics are
    recomputed from scratch on every call.
    """

    def __init__(self, n_folds: int, n_samples: int):
        """Draw a random dataset of shape ``(n_folds, n_samples)``.

        Values come from ``np.random.rand`` (global NumPy RNG), so seed
        that generator beforehand if reproducible data is required.
        """
        self._dataset: np.ndarray = np.random.rand(n_folds, n_samples)

    @property
    def dataset(self) -> np.ndarray:
        """The underlying array, one row per fold."""
        return self._dataset

    @property
    def n_folds(self) -> int:
        """Number of folds, i.e. the number of rows of the dataset."""
        return len(self._dataset)

    def train_raw(self, i_fold: int) -> np.ndarray:
        """Return all rows except row ``i_fold``, concatenated end to end.

        Rows are compared by their non-negative position, so an ``i_fold``
        that matches no position (negative or too large) excludes nothing.
        """
        kept_rows = [
            row for position, row in enumerate(self._dataset)
            if position != i_fold
        ]
        return np.concatenate(kept_rows)

    def test_raw(self, i_fold: int) -> np.ndarray:
        """Return row ``i_fold`` of the dataset (the held-out fold)."""
        return self._dataset[i_fold]

    def train_standardized(self, i_fold: int) -> np.ndarray:
        """Return the train split shifted by its mean and scaled by its std."""
        samples = self.train_raw(i_fold)
        centre = self._train_mean(i_fold)
        scale = self._train_std(i_fold)
        return (samples - centre) / scale

    def test_standardized(self, i_fold: int) -> np.ndarray:
        """Return the test split shifted by its mean and scaled by its std."""
        samples = self.test_raw(i_fold)
        centre = self._test_mean(i_fold)
        scale = self._test_std(i_fold)
        return (samples - centre) / scale

    def _train_mean(self, i_fold: int) -> float:
        """Mean of the train split; rebuilds the split on every call."""
        return self.train_raw(i_fold).mean()

    def _test_mean(self, i_fold: int) -> float:
        """Mean of the test split."""
        return self.test_raw(i_fold).mean()

    def _train_std(self, i_fold: int) -> float:
        """Population standard deviation of the train split (NumPy default ddof)."""
        return self.train_raw(i_fold).std()

    def _test_std(self, i_fold: int) -> float:
        """Population standard deviation of the test split (NumPy default ddof)."""
        return self.test_raw(i_fold).std()


<b>Task description:</b>
1. What is cross-validation? When does cross-validation help? What types of cross-validation do you know?
2. What is standardization? Why is standardization important for training machine learning models? What other transformations of the original dataset do you know?
3. Evaluate the asymptotic complexity of the `DatasetCV_Baseline._train_mean`, `DatasetCV_Baseline._test_mean`, `DatasetCV_Baseline._train_std`, and `DatasetCV_Baseline._test_std` methods.

# Optimized version of the cross-validation split of a dataset
Below is the prototype implementation of the `DatasetCV_Optimized` class, which is functionally equivalent to `DatasetCV_Baseline` and is the class to be optimized:

In [12]:
class DatasetCV_Optimized:
    """K-Fold cross-validation split whose statistics are computed once.

    The dataset has one row per fold. For fold ``k`` the test split is
    row ``k`` and the train split is every other row concatenated.

    All means and standard deviations are precomputed in ``__init__`` with a
    fixed number of vectorized passes over the data (O(N) in total, where N
    is the dataset size). Afterwards ``_train_mean``, ``_train_std``,
    ``_test_mean`` and ``_test_std`` are O(1) lookups, and the
    ``*_standardized`` methods build each split exactly once.

    The train statistics are derived from per-fold sums: the sum over the
    train split equals the sum over the whole dataset minus the sum over the
    held-out fold (and likewise for sums of squares).

    Notes:
        * Fold indices follow NumPy conventions: negative indices count from
          the end, out-of-range indices raise ``IndexError``.
        * Because the train statistics share dataset-wide sums, a non-finite
          value (NaN/inf) in any fold makes the train statistics of every
          fold non-finite.
        * A split with zero standard deviation standardizes to NaN/inf, as
          plain NumPy division does.
    """

    def __init__(self, dataset: np.ndarray):
        """Store ``dataset`` and precompute all per-fold statistics.

        Args:
            dataset: array whose first axis enumerates the folds.
        """
        self._dataset: np.ndarray = dataset

        n_folds = self.n_folds
        # Work on a float64 (n_folds, samples_per_fold) view so integer input
        # cannot overflow when squared and extra trailing axes are flattened.
        values = np.asarray(dataset, dtype=np.float64)
        folds = values.reshape(n_folds, -1) if values.size else values.reshape(n_folds, 0)

        # Test split of fold k is row k itself. Kept as lists for backward
        # compatibility with the original public attributes.
        self.dataset_mean_test = list(folds.mean(axis=1))
        self.dataset_std_test = list(folds.std(axis=1))

        n_train = (n_folds - 1) * folds.shape[1]
        if n_train > 0:
            # Centre on the global mean before accumulating squares: variance
            # is shift-invariant, and small centred values avoid the
            # cancellation that E[x^2] - E[x]^2 suffers on raw data.
            center = folds.mean()
            centered = folds - center
            fold_sums = centered.sum(axis=1)
            fold_sq_sums = np.square(centered).sum(axis=1)

            # "Everything except fold k" = dataset total minus fold k.
            train_offset = (fold_sums.sum() - fold_sums) / n_train
            train_var = (fold_sq_sums.sum() - fold_sq_sums) / n_train - np.square(train_offset)

            self._train_means = center + train_offset
            # Rounding can push a (near-)zero variance slightly below zero.
            self._train_stds = np.sqrt(np.maximum(train_var, 0.0))
        else:
            # Fewer than two folds (or empty folds): no train split exists.
            self._train_means = np.full(n_folds, np.nan)
            self._train_stds = np.full(n_folds, np.nan)

    @property
    def dataset(self) -> np.ndarray:
        """The underlying array, one row per fold."""
        return self._dataset

    @property
    def n_folds(self) -> int:
        """Number of folds (length of the first axis of the dataset)."""
        return self._dataset.shape[0]

    def _resolve_fold(self, i_fold: int) -> int:
        """Map ``i_fold`` to a non-negative fold position.

        Raises:
            IndexError: if ``i_fold`` does not address an existing fold.
        """
        n_folds = self.n_folds
        resolved = i_fold + n_folds if i_fold < 0 else i_fold
        if not 0 <= resolved < n_folds:
            raise IndexError(f"fold index {i_fold} is out of range for {n_folds} folds")
        return resolved

    def train_raw(self, i_fold: int) -> np.ndarray:
        """Return every fold except ``i_fold``, concatenated in fold order.

        Raises:
            IndexError: if ``i_fold`` is out of range.
            ValueError: if the dataset has fewer than two folds.
        """
        fold = self._resolve_fold(i_fold)
        if self.n_folds < 2:
            raise ValueError("at least two folds are required to build a training split")
        # Two slices instead of a per-fold Python loop.
        others = np.concatenate((self._dataset[:fold], self._dataset[fold + 1:]))
        if others.ndim < 2:
            return others
        # Merge the fold axis into the sample axis.
        return others.reshape((others.shape[0] * others.shape[1],) + others.shape[2:])

    def test_raw(self, i_fold: int) -> np.ndarray:
        """Return fold ``i_fold`` (the held-out split)."""
        return self._dataset[self._resolve_fold(i_fold)]

    def train_standardized(self, i_fold: int) -> np.ndarray:
        """Return the train split standardized with its own cached mean/std."""
        fold = self._resolve_fold(i_fold)
        return (self.train_raw(fold) - self._train_means[fold]) / self._train_stds[fold]

    def test_standardized(self, i_fold: int) -> np.ndarray:
        """Return the test split standardized with its own cached mean/std."""
        fold = self._resolve_fold(i_fold)
        return (self._dataset[fold] - self.dataset_mean_test[fold]) / self.dataset_std_test[fold]

    def _train_mean(self, i_fold: int) -> float:
        """Cached mean of the train split of fold ``i_fold`` (O(1))."""
        return self._train_means[self._resolve_fold(i_fold)]

    def _test_mean(self, i_fold: int) -> float:
        """Cached mean of the test split of fold ``i_fold`` (O(1))."""
        return self.dataset_mean_test[self._resolve_fold(i_fold)]

    def _train_std(self, i_fold: int) -> float:
        """Cached population std of the train split of fold ``i_fold`` (O(1))."""
        return self._train_stds[self._resolve_fold(i_fold)]

    def _test_std(self, i_fold: int) -> float:
        """Cached population std of the test split of fold ``i_fold`` (O(1))."""
        return self.dataset_std_test[self._resolve_fold(i_fold)]

<b>Task description:</b>
1. Identify the methods of the `DatasetCV_Baseline` class that have performance issues due to a suboptimal implementation.
2. Implement the identified methods of the `DatasetCV_Baseline` class in a more efficient way.
3. Implement an algorithm for `DatasetCV_Optimized` that reduces the asymptotic complexity of the `DatasetCV_Optimized._train_mean`, `DatasetCV_Optimized._test_mean`, `DatasetCV_Optimized._train_std`, and `DatasetCV_Optimized._test_std` methods.
4. Ensure that the proposed implementation is efficient both during `DatasetCV_Optimized` initialization and when calling the `DatasetCV_Optimized.train_standardized` and `DatasetCV_Optimized.test_standardized` methods.

# Check that the optimized and baseline implementations produce the same results

In [13]:
# DatasetCV_Baseline draws its data from NumPy's global RNG; seed it so the
# comparison below runs on the same dataset after every kernel restart.
np.random.seed(0)

dataset_baseline = DatasetCV_Baseline(n_folds=10, n_samples=2)
# Both implementations wrap the very same array, so any mismatch below is
# caused by the implementation and not by the data.
dataset_optimized = DatasetCV_Optimized(dataset=dataset_baseline.dataset)

for i_fold in range(dataset_baseline.n_folds):
    assert np.allclose(dataset_baseline.train_raw(i_fold), dataset_optimized.train_raw(i_fold)), 'Incorrect implementation of "train_raw" method'
    assert np.allclose(dataset_baseline.test_raw(i_fold), dataset_optimized.test_raw(i_fold)), 'Incorrect implementation of "test_raw" method'
    assert np.allclose(dataset_baseline.train_standardized(i_fold), dataset_optimized.train_standardized(i_fold)), 'Incorrect implementation of "train_standardized" method'
    assert np.allclose(dataset_baseline.test_standardized(i_fold), dataset_optimized.test_standardized(i_fold)), 'Incorrect implementation of "test_standardized" method'

    # The statistic helpers are the methods the task asks to optimize, so
    # compare them directly as well.
    assert np.isclose(dataset_baseline._train_mean(i_fold), dataset_optimized._train_mean(i_fold)), 'Incorrect implementation of "_train_mean" method'
    assert np.isclose(dataset_baseline._test_mean(i_fold), dataset_optimized._test_mean(i_fold)), 'Incorrect implementation of "_test_mean" method'
    assert np.isclose(dataset_baseline._train_std(i_fold), dataset_optimized._train_std(i_fold)), 'Incorrect implementation of "_train_std" method'
    assert np.isclose(dataset_baseline._test_std(i_fold), dataset_optimized._test_std(i_fold)), 'Incorrect implementation of "_test_std" method'

print('Correct implementation')

Correct implementation
