In [1]:
import itertools
import logging
import sys, os, time
from datetime import datetime

import numpy as np
from joblib import Parallel, delayed

parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(parent_dir)

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from estimators.statistical_descriptor import Nagler_WS
# from plot.figure_roc import ROC_plot
from utils.dataset_management import parse_pipeline
from utils.dataset_load import shuffle_data, DatasetLoader
from utils.fold_management import FoldManagement
from utils.label_management import LabelManagement
from utils.balance_management import *
from utils.figures import plot_boxplots, plot_roc_curves
from utils.files_management import (
    load_yaml,
    dump_pkl,
    init_logger,
    report_prediction,
    report_metric_from_log,
    set_folder,
    logger_dataset,
    logger_fold,
    save_metrics
)

In [2]:
# Open the HDF5 dataset with the descriptor columns listed below, then keep
# only the samples dated 1 March whose elevation lies strictly between
# 1000 and 2000.
dataset_loader = DatasetLoader(
    "../data/dataset/dataset_AD_08200821_14Mas3Top3Phy_W15_corrected_V2.h5",
    shuffle=True,
    descrp=["date", "massif", "acquisition", "elevation", "slope", "orientation", "tmin", "hsnow", "tel"],
    print_info=True,
    seed=42,
)

x, y = dataset_loader.request_data(
    "(date.dt.month == 3 and date.dt.day == 1) and ((elevation > 1000) and (elevation < 2000))"
)

Request: (date.dt.month == 3 and date.dt.day == 1) and ((elevation > 1000) and (elevation < 2000)) with 9125 samples
(9125, 15, 15, 9)


In [3]:
# Convert the raw labels with the "3labels" scheme and keep the encoder that
# the label manager exposes.
labels_manager = LabelManagement(method="3labels")
targets = labels_manager.transform(y)
label_encoder = labels_manager.get_encoder()

# Build the main (train, test) folds with the "combinationFold" strategy,
# asking for roughly 80% of the samples on the training side.
fold_manager = FoldManagement(
    method="combinationFold",
    shuffle=True,
    seed=42,
    train_aprox_size=0.8,
)
fold_groups = fold_manager.split(x, y)

In [4]:
def bFold(folds, targets, seed=42):
    """
    Create class-balanced training sub-folds inside each main fold.

    For every ``(train_indices, test_indices)`` pair, the training indices are
    grouped by class label. Each class is shuffled and cut into
    ``len(class) // len(smallest class)`` nearly equal chunks (at least one),
    so every chunk is about as large as the rarest class. One sub-fold is
    emitted per combination of one chunk from each class; the test indices are
    passed through unchanged.

    Parameters
    ----------
    folds : iterable of tuples
        ``(train_indices, test_indices)`` pairs, one per main fold.
    targets : array-like
        Class label of every sample, indexable by the train indices.
    seed : int, optional (default=42)
        Seed for the random number generator used for all shuffling.

    Returns
    -------
    list of tuples
        ``(balanced_train_indices, test_indices)`` for each sub-fold, in main
        fold order.

    Raises
    ------
    ValueError
        If a fold has no training indices.
    """
    targets = np.asarray(targets)
    rng = np.random.default_rng(seed)
    sub_folds = []

    for train_indices, test_indices in folds:
        train_indices = np.array(train_indices)
        if train_indices.size == 0:
            # Fail early with a readable message instead of an opaque
            # indexing / argmin error further down.
            raise ValueError("bFold received a fold with no training indices")
        train_targets = targets[train_indices]

        # Identify the smallest class; its size sets the chunk size for all classes.
        unique, counts = np.unique(train_targets, return_counts=True)
        smallest_class = unique[np.argmin(counts)]
        num_smallest_class = int(counts.min())

        # Shuffle the indices of each class, then split them into chunks.
        class_groups = {}
        for label in unique:
            shuffled = rng.permutation(train_indices[train_targets == label])
            n_chunks = max(1, len(shuffled) // num_smallest_class)
            class_groups[label] = np.array_split(shuffled, n_chunks)

        # The smallest class drives the outer loop; the other classes contribute
        # every combination of one chunk each. With a single class, the product
        # yields one empty combination and the chunk is used on its own.
        smallest_class_groups = class_groups.pop(smallest_class)
        other_class_groups = list(itertools.product(*class_groups.values()))

        for group in smallest_class_groups:
            for combo in other_class_groups:
                combined_train_indices = np.concatenate([group, *combo])
                rng.shuffle(combined_train_indices)
                sub_folds.append((combined_train_indices, test_indices))

    return sub_folds

In [5]:
# Derive the class-balanced sub-folds from the main folds.
bfold_groups = bFold(
    folds=fold_groups,
    targets=targets,
    seed=42,
)

In [6]:
# Report, for every main fold, the class balance of each split, the share of
# samples used for training and the massifs on each side. The massif values of
# both splits are stored in `fold_key`, keyed by fold number.
fold_key = {}
for kfold, (train_index, test_index) in enumerate(fold_groups):
    print(f"------------------ Fold: {kfold} ------------------")

    # Class counts and ratios on the training side.
    train_unique_targets, train_target_counts = np.unique(
        targets[train_index], return_counts=True
    )
    train_target_ratios = train_target_counts / train_target_counts.sum()
    train_entries = [
        f"{target}: {count} ({ratio:.2%})"
        for target, count, ratio in zip(train_unique_targets, train_target_counts, train_target_ratios)
    ]
    train_target_info = ", ".join(train_entries)

    # Same summary for the test side.
    test_unique_targets, test_target_counts = np.unique(
        targets[test_index], return_counts=True
    )
    test_target_ratios = test_target_counts / test_target_counts.sum()
    test_entries = [
        f"{target}: {count} ({ratio:.2%})"
        for target, count, ratio in zip(test_unique_targets, test_target_counts, test_target_ratios)
    ]
    test_target_info = ", ".join(test_entries)

    # Column 1 of the metadata is read as the massif of each sample.
    metadata = y['metadata']
    massif_train = metadata[train_index, 1]
    massif_test = metadata[test_index, 1]

    print(f"    - Distribution class train: {train_target_info}")
    print(f"    - Distribution class test: {test_target_info}")
    print(f"    - Train size: {len(train_index) / (len(train_index) + len(test_index)) * 100:.2f}%")
    print(f"    - Massif in train: {np.unique(massif_train)}")
    print(f"    - Massif in test: {np.unique(massif_test)}")

    fold_key[kfold] = {"train": massif_train, "test": massif_test}


------------------ Fold: 0 ------------------
    - Distribution class train: 0: 272 (3.72%), 1: 5688 (77.85%), 2: 1346 (18.42%)
    - Distribution class test: 0: 60 (3.30%), 1: 1452 (79.82%), 2: 307 (16.88%)
    - Train size: 80.07%
    - Massif in train: ['ARAVIS' 'BAUGES' 'BELLEDONNE' 'CHARTEUSE' 'HTE-MAURIE' 'HTE-TARENT'
 'MAURIENNE' 'MONT-BLANC' 'THABOR' 'VANOISE']
    - Massif in test: ['BEAUFORTAIN' 'GRANDES-ROUSSES' 'VERCORS']
------------------ Fold: 1 ------------------
    - Distribution class train: 0: 322 (4.37%), 1: 5735 (77.85%), 2: 1310 (17.78%)
    - Distribution class test: 0: 10 (0.57%), 1: 1405 (79.92%), 2: 343 (19.51%)
    - Train size: 80.73%
    - Massif in train: ['ARAVIS' 'BEAUFORTAIN' 'CHARTEUSE' 'GRANDES-ROUSSES' 'HTE-MAURIE'
 'HTE-TARENT' 'MAURIENNE' 'MONT-BLANC' 'THABOR' 'VANOISE' 'VERCORS']
    - Massif in test: ['BAUGES' 'BELLEDONNE']
------------------ Fold: 2 ------------------
    - Distribution class train: 0: 263 (3.60%), 1: 5694 (77.84%), 2: 1358 (1

In [7]:
# Same report as for the main folds, this time over the balanced sub-folds.
# Note that `fold_key` is rebuilt here, so it ends up describing `bfold_groups`.
fold_key = {}
for kfold, (train_index, test_index) in enumerate(bfold_groups):
    print(f"------------------ Fold: {kfold} ------------------")

    # Class counts and ratios on the (balanced) training side.
    train_unique_targets, train_target_counts = np.unique(
        targets[train_index], return_counts=True
    )
    train_target_ratios = train_target_counts / train_target_counts.sum()
    train_entries = [
        f"{target}: {count} ({ratio:.2%})"
        for target, count, ratio in zip(train_unique_targets, train_target_counts, train_target_ratios)
    ]
    train_target_info = ", ".join(train_entries)

    # Same summary for the test side.
    test_unique_targets, test_target_counts = np.unique(
        targets[test_index], return_counts=True
    )
    test_target_ratios = test_target_counts / test_target_counts.sum()
    test_entries = [
        f"{target}: {count} ({ratio:.2%})"
        for target, count, ratio in zip(test_unique_targets, test_target_counts, test_target_ratios)
    ]
    test_target_info = ", ".join(test_entries)

    # Column 1 of the metadata is read as the massif of each sample.
    metadata = y['metadata']
    massif_train = metadata[train_index, 1]
    massif_test = metadata[test_index, 1]

    print(f"    - Distribution class train: {train_target_info}")
    print(f"    - Distribution class test: {test_target_info}")
    print(f"    - Train size: {len(train_index) / (len(train_index) + len(test_index)) * 100:.2f}%")
    print(f"    - Massif in train: {np.unique(massif_train)}")
    print(f"    - Massif in test: {np.unique(massif_test)}")

    fold_key[kfold] = {"train": massif_train, "test": massif_test}


------------------ Fold: 0 ------------------
    - Distribution class train: 0: 272 (30.43%), 1: 285 (31.88%), 2: 337 (37.70%)
    - Distribution class test: 0: 60 (3.30%), 1: 1452 (79.82%), 2: 307 (16.88%)
    - Train size: 32.95%
    - Massif in train: ['ARAVIS' 'BAUGES' 'BELLEDONNE' 'CHARTEUSE' 'HTE-MAURIE' 'HTE-TARENT'
 'MAURIENNE' 'MONT-BLANC' 'THABOR' 'VANOISE']
    - Massif in test: ['BEAUFORTAIN' 'GRANDES-ROUSSES' 'VERCORS']
------------------ Fold: 1 ------------------
    - Distribution class train: 0: 272 (30.43%), 1: 285 (31.88%), 2: 337 (37.70%)
    - Distribution class test: 0: 60 (3.30%), 1: 1452 (79.82%), 2: 307 (16.88%)
    - Train size: 32.95%
    - Massif in train: ['ARAVIS' 'BAUGES' 'BELLEDONNE' 'CHARTEUSE' 'HTE-MAURIE' 'HTE-TARENT'
 'MAURIENNE' 'MONT-BLANC' 'THABOR' 'VANOISE']
    - Massif in test: ['BEAUFORTAIN' 'GRANDES-ROUSSES' 'VERCORS']
------------------ Fold: 2 ------------------
    - Distribution class train: 0: 272 (30.46%), 1: 285 (31.91%), 2: 336 (37.6