In [1]:
import sys, os, time
import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import label_binarize

parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(parent_dir)

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from estimators.statistical_descriptor import Nagler_WS
# from plot.figure_roc import ROC_plot
from utils.dataset_management import parse_pipeline
from utils.dataset_load import shuffle_data, DatasetLoader
from utils.fold_management import FoldManagement
from utils.label_management import LabelManagement
from utils.figures import plot_boxplots, plot_roc_curves
from utils.files_management import (
    load_yaml,
    dump_pkl,
    init_logger,
    report_prediction,
    report_metric_from_log,
    set_folder,
    logger_dataset,
    logger_fold
)

In [2]:
os.cpu_count()

8

In [3]:
param_path = "../parameter/config_pipeline.yml"
pipeline_params = load_yaml(param_path)

try:
    data_path = "../data/dataset/dataset_AD_08200821_14Mas3Top3Phy_W15_corrected_V2.h5"
    out_dir = pipeline_params["out_dir"]
    fold_method = pipeline_params["fold_method"]
    seed = pipeline_params["seed"]
    labeling_method = pipeline_params["labeling_method"]
    resampling_method = pipeline_params["resampling_method"]
    balance_data = pipeline_params["balance_data"]
    # orbit = pipeline_params["orbit"]
    request = pipeline_params["request"]
    shuffle_data = pipeline_params["shuffle_data"]
    channel_transformation = pipeline_params["channel_transformation"]
    BANDS_MAX = pipeline_params["BANDS_MAX"]

except KeyError as e:
    print("KeyError: %s undefined" % e)

out_dir = set_folder(out_dir, pipeline_params)
log_dataset, _ = init_logger(out_dir, "dataset_info")
log_results, _ = init_logger(out_dir + "results", "results")

dataset_loader = DatasetLoader(
    data_path,
    shuffle=shuffle_data,
    descrp=[
        "date",
        "massif",
        "acquisition",
        "elevation",
        "slope",
        "orientation",
        "tmin",
        "hsnow",
        "tel"
    ],
    print_info=True
)



In [4]:
x, y = dataset_loader.request_data(request)


Request: (date.dt.month == 3 and date.dt.day == 1) and ((elevation > 1000) and (elevation < 1500)) with 2577 samples
(2577, 15, 15, 9)


In [5]:
labels_manager = LabelManagement(method=labeling_method)

targets = labels_manager.transform(y)


In [6]:
massives_count = {}
for index, name in enumerate(y['metadata'][:, 1]):
            if name not in massives_count:
                massives_count[name] = {'count': 0, 'indices': []}
            massives_count[name]['count'] += 1
            massives_count[name]['indices'].append(index)

In [7]:
import numpy as np
import itertools
import random
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

def combination_method(dict_massives, train_size=0.8, proximity_value=1, shuffle=False, seed=None):
    """
    Generate prioritized combinations of massives based on a given dictionary.

    Parameters:
    - dict_massives : dict
        A dictionary where keys are massives and values are dictionaries containing 'count' and 'indices' keys.
        Example: {
            'massif1': {'count': 10, 'indices': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]},
            'massif2': {'count': 5, 'indices': [10, 11, 12, 13, 14]},
            ...
        }
    - train_size : float, optional (default=0.8)
        The proportion of the dataset to include in the train split.
    - proximity_value : int, optional (default=1)
        A value to control the proximity to the desired train size.
    - shuffle : bool, optional (default=False)
        Whether to shuffle the selection of combinations.
    - seed : int, optional (default=None)
        Seed for random number generator (used if shuffle is True).

    Returns:
    - list of tuples
        A list containing train and test indices for each prioritized combination of massives.
    """
    if shuffle:
        random.seed(seed)

    total_count = sum(value['count'] for value in dict_massives.values())
    massives = list(dict_massives.keys())
    sorted(massives)
    
    all_combinations = []
    for r in range(1, len(massives)):
        combinations_object = itertools.combinations(massives, r)
        combinations_list = list(combinations_object)
        all_combinations.extend(combinations_list)

    valid_combinations = []
    for combo in all_combinations:
        combo_count = sum(dict_massives[massif]['count'] for massif in combo)
        percentage = (combo_count / total_count) * 100
        if (train_size * 100) - proximity_value <= percentage <= (train_size * 100) + proximity_value:
            valid_combinations.append(combo)

    valid_combinations.sort(key=lambda combo: len(combo))

    if shuffle:
        random.shuffle(valid_combinations)

    uncovered_train_massives = set(massives)
    uncovered_test_massives = set(massives)
    selected_combinations = []

    for combo in valid_combinations:
        if not uncovered_train_massives and not uncovered_test_massives:
            break

        train_massifs_in_combo = set(combo)
        test_massifs_in_combo = set(massives) - train_massifs_in_combo

        if uncovered_train_massives & train_massifs_in_combo or uncovered_test_massives & test_massifs_in_combo:
            selected_combinations.append(combo)
            uncovered_train_massives -= train_massifs_in_combo
            uncovered_test_massives -= test_massifs_in_combo

    result = []
    for combo in selected_combinations:
        train_indices = []
        test_indices = []
        for massif in massives:
            if massif in combo:
                train_indices.extend(dict_massives[massif]['indices'])
            else:
                test_indices.extend(dict_massives[massif]['indices'])

        result.append((train_indices, test_indices))

    return result

def balance_classes(results, targets, method='oversample', seed=42):
    """
    Balance the classes within each fold using the specified method.

    Parameters
    ----------
    results : list of tuples
        A list containing train and test indices for each fold.
    targets : numpy.ndarray
        Target labels.
    method : str, optional (default='oversample')
        The resampling method to use ('oversample', 'undersample', 'smote').
    seed : int, optional (default=42)
        Seed for random number generator.

    Returns
    -------
    list of tuples
        A list containing balanced train and test indices for each fold.
    """
    balanced_results = []

    for train_indices, test_indices in results:
        train_targets = targets[train_indices]
        
        if method == 'oversample':
            sampler = RandomOverSampler(random_state=seed)
            balanced_train_indices, _ = sampler.fit_resample(np.array(train_indices).reshape(-1, 1), train_targets)
            balanced_train_indices = balanced_train_indices.flatten()
        
        elif method == 'undersample':
            sampler = RandomUnderSampler(random_state=seed)
            balanced_train_indices, _ = sampler.fit_resample(np.array(train_indices).reshape(-1, 1), train_targets)
            balanced_train_indices = balanced_train_indices.flatten()
        
        elif method == 'smote':
            sampler = SMOTE(random_state=seed)
            balanced_train_indices, _ = sampler.fit_resample(np.array(train_indices).reshape(-1, 1), train_targets)
            balanced_train_indices = balanced_train_indices.flatten()
        
        else:
            raise ValueError(f"Unknown resampling method: {method}")

        balanced_results.append((balanced_train_indices.tolist(), test_indices))

    return balanced_results

In [101]:
results = combination_method(massives_count, train_size=0.8, proximity_value=1, shuffle=True, seed=100)
results = balance_classes(results, targets, method="undersample", seed=100)
for kfold, (train_index, test_index) in enumerate(results):
        print(f"------------------ Fold: {kfold} ------------------")
        
        train_unique_targets, train_target_counts = np.unique(targets[train_index], return_counts=True)
        train_target_ratios = train_target_counts / train_target_counts.sum()
        train_target_info = ", ".join(f"{target}: {count} ({ratio:.2%})" for target, count, ratio in zip(train_unique_targets, train_target_counts, train_target_ratios))
        
        test_unique_targets, test_target_counts = np.unique(targets[test_index], return_counts=True)
        test_target_ratios = test_target_counts / test_target_counts.sum()
        test_target_info = ", ".join(f"{target}: {count} ({ratio:.2%})" for target, count, ratio in zip(test_unique_targets, test_target_counts, test_target_ratios))
        
        print(f"    - Distribution class train: {train_target_info}")
        print(f"    - Distribution class test: {test_target_info}")
        print(f"    - Train size: {len(train_index) / (len(train_index) + len(test_index)) * 100:.2f}%")
        print(f"    - Massif in train: {np.unique(y['metadata'][train_index, 1])}")
        print(f"    - Massif in test: {np.unique(y['metadata'][test_index, 1])}")

------------------ Fold: 0 ------------------
    - Distribution class train: 0: 902 (50.00%), 1: 902 (50.00%)
    - Distribution class test: 0: 328 (63.08%), 1: 192 (36.92%)
    - Train size: 77.62%
    - Massif in train: ['ARAVIS' 'BAUGES' 'BELLEDONNE' 'CHARTEUSE' 'GRANDES-ROUSSES' 'HTE-MAURIE'
 'HTE-TARENT' 'MAURIENNE' 'MONT-BLANC']
    - Massif in test: ['BEAUFORTAIN' 'VANOISE' 'VERCORS']
------------------ Fold: 1 ------------------
    - Distribution class train: 0: 757 (50.00%), 1: 757 (50.00%)
    - Distribution class test: 0: 199 (37.13%), 1: 337 (62.87%)
    - Train size: 73.85%
    - Massif in train: ['ARAVIS' 'BAUGES' 'BEAUFORTAIN' 'CHARTEUSE' 'HTE-MAURIE' 'HTE-TARENT'
 'MAURIENNE' 'VANOISE']
    - Massif in test: ['BELLEDONNE' 'GRANDES-ROUSSES' 'MONT-BLANC' 'VERCORS']
------------------ Fold: 2 ------------------
    - Distribution class train: 0: 883 (50.00%), 1: 883 (50.00%)
    - Distribution class test: 0: 295 (58.30%), 1: 211 (41.70%)
    - Train size: 77.73%
    - Ma

In [15]:
for kfold, (train_index, test_index) in enumerate(results):
        print(f"------------------ Fold: {kfold} ------------------")
        
        train_unique_targets, train_target_counts = np.unique(targets[train_index], return_counts=True)
        train_target_ratios = train_target_counts / train_target_counts.sum()
        train_target_info = ", ".join(f"{target}: {count} ({ratio:.2%})" for target, count, ratio in zip(train_unique_targets, train_target_counts, train_target_ratios))
        
        test_unique_targets, test_target_counts = np.unique(targets[test_index], return_counts=True)
        test_target_ratios = test_target_counts / test_target_counts.sum()
        test_target_info = ", ".join(f"{target}: {count} ({ratio:.2%})" for target, count, ratio in zip(test_unique_targets, test_target_counts, test_target_ratios))
        
        print(f"    - Distribution class train: {train_target_info}")
        print(f"    - Distribution class test: {test_target_info}")
        print(f"    - Train size: {len(train_index) / (len(train_index) + len(test_index)) * 100:.2f}%")
        print(f"    - Massif in train: {np.unique(y['metadata'][train_index, 1])}")
        print(f"    - Massif in test: {np.unique(y['metadata'][test_index, 1])}")

------------------ Fold: 0 ------------------
    - Distribution class train: 0: 902 (50.00%), 1: 902 (50.00%)
    - Distribution class test: 0: 328 (63.08%), 1: 192 (36.92%)
    - Train size: 77.62%
    - Massif in train: ['ARAVIS' 'BAUGES' 'BELLEDONNE' 'CHARTEUSE' 'GRANDES-ROUSSES' 'HTE-MAURIE'
 'HTE-TARENT' 'MAURIENNE' 'MONT-BLANC']
    - Massif in test: ['BEAUFORTAIN' 'VANOISE' 'VERCORS']
------------------ Fold: 1 ------------------
    - Distribution class train: 0: 757 (50.00%), 1: 757 (50.00%)
    - Distribution class test: 0: 199 (37.13%), 1: 337 (62.87%)
    - Train size: 73.85%
    - Massif in train: ['ARAVIS' 'BAUGES' 'BEAUFORTAIN' 'CHARTEUSE' 'HTE-MAURIE' 'HTE-TARENT'
 'MAURIENNE' 'VANOISE']
    - Massif in test: ['BELLEDONNE' 'GRANDES-ROUSSES' 'MONT-BLANC' 'VERCORS']
------------------ Fold: 2 ------------------
    - Distribution class train: 0: 883 (50.00%), 1: 883 (50.00%)
    - Distribution class test: 0: 295 (58.30%), 1: 211 (41.70%)
    - Train size: 77.73%
    - Ma

In [10]:
for kfold, (train_index, test_index) in enumerate(results):
        print(f"------------------ Fold: {kfold} ------------------")
        
        train_unique_targets, train_target_counts = np.unique(targets[train_index], return_counts=True)
        train_target_ratios = train_target_counts / train_target_counts.sum()
        train_target_info = ", ".join(f"{target}: {count} ({ratio:.2%})" for target, count, ratio in zip(train_unique_targets, train_target_counts, train_target_ratios))
        
        test_unique_targets, test_target_counts = np.unique(targets[test_index], return_counts=True)
        test_target_ratios = test_target_counts / test_target_counts.sum()
        test_target_info = ", ".join(f"{target}: {count} ({ratio:.2%})" for target, count, ratio in zip(test_unique_targets, test_target_counts, test_target_ratios))
        
        print(f"    - Distribution class train: {train_target_info}")
        print(f"    - Distribution class test: {test_target_info}")
        print(f"    - Train size: {len(train_index) / (len(train_index) + len(test_index)) * 100:.2f}%")
        print(f"    - Massif in train: {np.unique(y['metadata'][train_index, 1])}")
        print(f"    - Massif in test: {np.unique(y['metadata'][test_index, 1])}")

------------------ Fold: 0 ------------------
    - Distribution class train: 0: 934 (50.00%), 1: 934 (50.00%)
    - Distribution class test: 0: 371 (69.87%), 1: 160 (30.13%)
    - Train size: 77.87%
    - Massif in train: ['ARAVIS' 'BAUGES' 'BELLEDONNE' 'CHARTEUSE' 'GRANDES-ROUSSES' 'HTE-TARENT'
 'MONT-BLANC' 'VANOISE' 'VERCORS']
    - Massif in test: ['BEAUFORTAIN' 'HTE-MAURIE' 'MAURIENNE']
------------------ Fold: 1 ------------------
    - Distribution class train: 0: 774 (50.00%), 1: 774 (50.00%)
    - Distribution class test: 0: 198 (38.22%), 1: 320 (61.78%)
    - Train size: 74.93%
    - Massif in train: ['ARAVIS' 'BAUGES' 'BEAUFORTAIN' 'CHARTEUSE' 'GRANDES-ROUSSES'
 'HTE-TARENT' 'MAURIENNE' 'VANOISE' 'VERCORS']
    - Massif in test: ['BELLEDONNE' 'HTE-MAURIE' 'MONT-BLANC']
------------------ Fold: 2 ------------------
    - Distribution class train: 0: 774 (50.00%), 1: 774 (50.00%)
    - Distribution class test: 0: 195 (37.86%), 1: 320 (62.14%)
    - Train size: 75.04%
    - Ma