# In [1]:
import sys, os, time
import numpy as np
from datetime import datetime
from joblib import Parallel, delayed
import logging

# Put the parent of the current working directory on the import path;
# presumably this is what lets the project-local packages imported below
# (estimators, utils) resolve -- confirm against the repository layout.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from estimators.statistical_descriptor import Nagler_WS
# from plot.figure_roc import ROC_plot
from utils.dataset_management import parse_pipeline
from utils.dataset_load import shuffle_data, DatasetLoader
from utils.fold_management import FoldManagement
from utils.label_management import LabelManagement
from utils.figures import plot_boxplots, plot_roc_curves
from utils.files_management import (
    load_yaml,
    dump_pkl,
    init_logger,
    report_prediction,
    report_metric_from_log,
    set_folder,
    logger_dataset,
    logger_fold,
    save_metrics
)

def fit_predict_fold(pipeline, X_train_k, y_train_k, X_test_k, y_test_k, log_model, label_encoder, kfold, pipeline_name, save_dir):
    """Fit ``pipeline`` on one fold, score the held-out split and pickle the model.

    Args:
        pipeline: Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
        X_train_k: Training samples of this fold.
        y_train_k: Training targets of this fold.
        X_test_k: Held-out samples of this fold.
        y_test_k: Held-out targets of this fold.
        log_model: Logger passed to ``report_prediction`` and used to record
            failures (must provide ``error`` and ``exception``).
        label_encoder: Forwarded unchanged to ``report_prediction``.
        kfold: Fold index; used in log messages and in the pickle file name.
        pipeline_name: Pipeline identifier; used in log messages and in the
            pickle file name.
        save_dir: Directory the fitted pipeline is pickled into.

    Returns:
        ``(fold_metric, y_prob, y_test_k)`` on success, where ``fold_metric``
        is the object returned by ``report_prediction`` with the extra keys
        ``"training_time"`` and ``"prediction_time"`` (elapsed seconds).
        If anything in the fold raises, the error is logged with its
        traceback and ``(None, None, None)`` is returned instead of
        propagating the exception.
    """
    pipeline_id = f"{pipeline_name}_kfold_{kfold}"
    try:
        # perf_counter() is monotonic, so the measured durations cannot be
        # distorted by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        pipeline.fit(X_train_k, y_train_k)
        training_time = time.perf_counter() - start_time

        start_time = time.perf_counter()
        y_prob = pipeline.predict_proba(X_test_k)
        prediction_time = time.perf_counter() - start_time

        log_model, fold_metric = report_prediction(log_model, y_test_k, y_prob, label_encoder, kfold)

        fold_metric["training_time"] = training_time
        fold_metric["prediction_time"] = prediction_time

        dump_pkl(pipeline, os.path.join(save_dir, f"{pipeline_name}_fold{kfold}.pkl"))

        return fold_metric, y_prob, y_test_k
    except Exception as e:
        # Deliberate best-effort boundary: the caller skips folds that return
        # None. Log the traceback too, otherwise the failure site is lost.
        log_model.error(f"Pipeline {pipeline_id} failed")
        log_model.exception(e)
        return None, None, None

def predict_dataset(
    x,
    targets,
    fold_groups,
    output_dir,
    pipeline_params,
    label_encoder,
    log_results,
    save=True,
    n_jobs=7,
):
    """Cross-validate every configured pipeline and collect its predictions.

    For each name in ``pipeline_params["pipeline_names"]`` a fresh pipeline is
    built with ``parse_pipeline`` for every fold, fitted and evaluated through
    ``fit_predict_fold`` (folds run in parallel via joblib), and the per-fold
    metrics are summarised with ``save_metrics``. Once all pipelines are done,
    box plots and ROC curves are written and the requested metrics are
    reported through ``report_metric_from_log``.

    Args:
        x: Samples; indexed with the train/test index arrays of each fold
            (presumably a NumPy array -- confirm against the caller).
        targets: Targets, indexed the same way as ``x``.
        fold_groups: Iterable of ``(train_index, test_index)`` pairs. It is
            materialised once, so one-shot iterators are accepted.
        output_dir: Base output directory. Models and per-pipeline logs go to
            ``models/<pipeline_name>/``, plots to ``results/plots/``.
        pipeline_params: Mapping providing at least ``"pipeline_names"`` and
            ``"metrics_to_report"``; also handed to ``parse_pipeline``.
        label_encoder: Forwarded to ``fit_predict_fold``.
        log_results: Logger handed to ``report_metric_from_log``.
        save: When true, the list of fold metrics of each pipeline is pickled
            to ``metrics.pkl`` in that pipeline's model directory.
        n_jobs: Number of joblib workers used to process the folds of one
            pipeline. Defaults to 7, the value previously hard-coded.

    Returns:
        Dict mapping each pipeline name to ``{"y_true": [...], "y_est": [...]}``
        with the targets and predicted probabilities of all successful folds,
        concatenated in fold order.
    """
    # The folds are replayed once per pipeline. A one-shot iterator (such as
    # the generator returned by a scikit-learn splitter's ``split``) would be
    # exhausted after the first pipeline and leave the others with no folds.
    fold_groups = list(fold_groups)

    y_est_save, metrics = {}, {}

    for count, pipeline_name in enumerate(pipeline_params["pipeline_names"]):
        save_dir = os.path.join(output_dir, f"models/{pipeline_name}/")
        log_model, _ = init_logger(save_dir, f"{pipeline_name}_results")

        log_model.info(f"================== Fitting model {pipeline_name} ==================")

        y_est_save[pipeline_name] = {"y_true": [], "y_est": []}
        fold_metrics = []

        def fit_predict_fold_wrap(fold, train_index, test_index):
            # Slice the data and build the pipeline inside the task so that
            # every fold works on its own, freshly parsed pipeline instance.
            X_train_k, y_train_k = x[train_index], targets[train_index]
            X_test_k, y_test_k = x[test_index], targets[test_index]

            return fit_predict_fold(
                parse_pipeline(pipeline_params, count),
                X_train_k, y_train_k,
                X_test_k, y_test_k,
                log_model,
                label_encoder,
                fold,
                pipeline_name,
                save_dir
            )

        results = Parallel(n_jobs=n_jobs)(
            delayed(fit_predict_fold_wrap)(kfold, train_index, test_index)
            for kfold, (train_index, test_index) in enumerate(fold_groups)
        )

        for (fold_metric, y_prob, y_test_k) in results:
            # Failed folds come back as (None, None, None) and are skipped.
            if fold_metric is not None:
                fold_metrics.append(fold_metric)
                y_est_save[pipeline_name]["y_est"].extend(y_prob)
                y_est_save[pipeline_name]["y_true"].extend(y_test_k)

        save_metrics(log_model, fold_metrics, pipeline_name)

        if save:
            dump_pkl(fold_metrics, os.path.join(save_dir, "metrics.pkl"))
        metrics[pipeline_name] = fold_metrics

    results_dir = os.path.join(output_dir, "results/plots/")
    plot_boxplots(metrics, save_dir=results_dir)
    plot_roc_curves(metrics, save_dir=results_dir)
    report_metric_from_log(log_results, metrics, pipeline_params["metrics_to_report"])

    return y_est_save