In [1]:
import h5py
import numpy as np
from collections import defaultdict
from tqdm.auto import tqdm

In [2]:
PATH = '../../datasets/usps/usps.h5'

# Materialise both splits as in-memory arrays (the trailing `[:]`) while the
# HDF5 file is still open; the `train` / `test` group handles themselves stop
# being usable once the `with` block closes the file.
with h5py.File(PATH, 'r') as hf:
    train, test = hf.get('train'), hf.get('test')
    X_tr, y_tr = train.get('data')[:], train.get('target')[:]
    X_te, y_te = test.get('data')[:], test.get('target')[:]

In [None]:
class KFoldCrossValidation:
    """Stratified k-fold cross-validation driver.

    Each class's samples are shuffled and dealt into ``n_folds`` nearly equal
    chunks, so every fold keeps roughly the class proportions of the full
    label vector. Shuffling is driven by a generator seeded with ``seed``,
    which makes the folds reproducible across calls.
    """

    def __init__(self, n_folds, seed=42):
        """Store the configuration.

        Args:
            n_folds: Number of folds to split the data into.
            seed: Seed for the NumPy random generator used when shuffling.
        """
        self.n_folds = n_folds
        self.seed = seed

    def cross_validate(self, estimator, X, y):
        """Fit and score ``estimator`` once per fold and aggregate results.

        Args:
            estimator: Object exposing ``fit(X, y)`` and ``score(X, y)``;
                ``score`` must return a mapping with ``'error'`` and
                ``'accuracy'`` entries. The same object is re-fitted on
                every fold.
            X: Sample array, indexed by sample along the first axis.
            y: Label array aligned with ``X``.

        Returns:
            dict with the mean rescaled ``'error'``, the mean ``'accuracy'``
            and ``'y_tests'``, the list of per-fold test label arrays.
        """
        y_tests, errors, accuracies = [], [], []

        for X_train, X_test, y_train, y_test in tqdm(
                self.generate_folds(X, y),
                desc='Running on fold',
                total=self.n_folds):
            estimator.fit(X_train, y_train)
            score = estimator.score(X_test, y_test)
            # NOTE(review): n_folds / len(y) is ~1 / (fold size), so this
            # presumably turns an error *count* into a per-sample rate --
            # confirm against the estimator's score() contract.
            rescaled_error = (self.n_folds / len(y)) * score['error']
            errors.append(rescaled_error)
            accuracies.append(score['accuracy'])
            y_tests.append(y_test)

        return {
            'error': np.mean(errors),
            'accuracy': np.mean(accuracies),
            'y_tests': y_tests
        }

    def generate_folds(self, X, y):
        """Yield stratified train/test splits, one per fold.

        Args:
            X: Sample array; samples are selected along axis 0, so rows of a
                multi-dimensional ``X`` are kept intact.
            y: Label array aligned with ``X`` (assumed 1-D -- TODO confirm).

        Yields:
            ``(X_train, X_test, y_train, y_test)`` tuples. The test part of
            fold ``i`` holds the samples assigned to that fold; the train
            part is everything else.
        """
        rng = np.random.default_rng(self.seed)
        folds = defaultdict(list)
        classes, y_indices, class_counts = np.unique(
            y, return_inverse=True, return_counts=True)
        # Sorting the inverse indices groups sample positions by class;
        # splitting at the cumulative counts gives one index array per class.
        class_indices = np.split(
            np.argsort(y_indices), np.cumsum(class_counts)[:-1])

        for i in range(classes.shape[0]):
            shuffled = rng.permutation(class_indices[i])
            n_in_class = shuffled.shape[0]
            # Integer arithmetic avoids float truncation errors such as
            # int((1 / 49) * 49) == 0, which would misplace a cut point.
            cut_points = [
                (j * n_in_class) // self.n_folds
                for j in range(1, self.n_folds)
            ]
            for fold_num, fold_data_indices in enumerate(
                    np.split(shuffled, cut_points)):
                folds[fold_num].extend(fold_data_indices)

        # Shuffle within each fold so samples are not ordered by class. The
        # explicit integer dtype keeps empty folds usable as index arrays.
        folds = {
            i: rng.permutation(np.asarray(folds[i], dtype=np.intp))
            for i in range(self.n_folds)
        }

        for i in range(self.n_folds):
            # axis=0 selects whole samples; without it NumPy would index the
            # flattened array and return scalars instead of rows.
            yield (np.delete(X, folds[i], axis=0),
                   np.take(X, folds[i], axis=0),
                   np.delete(y, folds[i], axis=0),
                   np.take(y, folds[i], axis=0))

    def plot_folds_stratification(self):
        """Visualise the per-fold class balance (not implemented yet).

        Raises:
            NotImplementedError: Always.
        """
        # TODO: compare per-fold class counts, e.g.
        #   np.bincount(np.take(y, folds[0])),
        #   np.bincount(np.take(y, folds[1]))
        raise NotImplementedError(
            'plot_folds_stratification is not implemented yet')