In [1]:
%load_ext autoreload
%autoreload 2


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accept any arguments, do nothing."""
    return None
import warnings
warnings.warn = warn

import sys
import os
import time


#needed to import utils.py
sys.path.append('../') 

import utils
import utils_preprocessing
import utils_exec_models
import utils_exec_models_new

import numpy as np
import pandas as pd
#to view the entire text of the columns
pd.set_option('display.max_colwidth', None) 

import sklearn as sk

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import HTML

from IPython.display import clear_output

# Import module for data visualization
from plotnine import *
import plotnine

import pickle

%matplotlib inline  

In [12]:
# Train / validation sets requested with scaled=True, INCLUDING the Diagnosis_Delay feature
X_train_all, y_train_all, X_valid_all, y_valid_all = (
    utils.get_train_and_validation_data(scaled=True, use_diagnosis_delay=True)
)

# Train / validation sets requested with scaled=True, REMOVING the Diagnosis_Delay feature
X_train_ndd, y_train_ndd, X_valid_ndd, y_valid_ndd = (
    utils.get_train_and_validation_data(scaled=True, use_diagnosis_delay=False)
)


# One entry per feature configuration, laid out as
# [features_config label, X_train, y_train, X_valid, y_valid]
datasets = []
datasets.append(['All_Features', X_train_all, y_train_all, X_valid_all, y_valid_all])
datasets.append(['without_Diagnosis_Delay', X_train_ndd, y_train_ndd, X_valid_ndd, y_valid_ndd])



# Absolute path of the directory handed to the grid-search helper as `dir_dest`
# (presumably where each GridSearchCV object is serialized -- confirm in utils_exec_models_new)
dir_dest = os.path.abspath('exec_results')


# Cross-validation strategy shared by the grid searches in this cell
CV_N_SPLITS = 4
CV_N_REPEATS = 3
RANDOM_STATE = 42

cv = sk.model_selection.RepeatedStratifiedKFold(
    n_splits=CV_N_SPLITS,
    n_repeats=CV_N_REPEATS,
    random_state=RANDOM_STATE,
)


grids_executed = []


# Flag forwarded to every grid factory and to the grid-search helper.
# Change to True for a test run (presumably a reduced param_grid -- confirm in utils_exec_models_new).
testing = False

# Each factory call yields one (classifier, param_grid) pair; the factories are
# called in the order listed here.
#
# Currently disabled factories (append them to the tuple to enable):
#   create_models_DT_grid, create_models_NB_Complement_grid,
#   create_models_RadiusNN_grid, create_models_SVM_grid, create_models_RF_grid
# create_models_NN_grid is disabled as well; it was called with
#   qty_features=X_train.shape[1], so it needs its own explicit call
#   (NOTE(review): X_train is not assigned anywhere above this point in the cell).
grid_configs = [
    create_grid(testing=testing)
    for create_grid in (
        utils_exec_models_new.create_models_NB_Gaussian_grid,
        utils_exec_models_new.create_models_kNN_grid,
    )
]



# ======================================================================
# ======================================================================
# ======================================================================
# ======================================================================

def _run_scenario_grid_search(estimator, param_grid, scenario, features_config,
                              X_train, y_train, X_valid, y_valid):
    """Build a GridSearchCV for `estimator` and run it through the project helper.

    Both scenarios (Single_Model and Ensemble_Imbalance) use exactly the same
    GridSearchCV settings, so the construction lives here instead of being
    copy-pasted in the loop.

    Reads the notebook-level names `cv`, `dir_dest` and `testing` at call time.

    Parameters
    ----------
    estimator : the classifier to tune; also forwarded as `classifier` to the helper.
    param_grid : the hyper-parameter grid passed to GridSearchCV.
    scenario : scenario label forwarded to the helper (e.g. 'Single_Model').
    features_config : feature-configuration label forwarded to the helper.
    X_train, y_train, X_valid, y_valid : data forwarded unchanged to the helper.

    Returns
    -------
    Whatever `utils_exec_models_new.exec_grid_search_and_save_performances`
    returns; the loop below unpacks it as (grid, performances frame).
    """
    grid = sk.model_selection.GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring=utils_exec_models_new.get_default_scoring(),
        cv=cv,
        verbose=1,
        n_jobs=utils_exec_models_new.N_JOBS,  # 7 according to the original note
        return_train_score=True,
        refit=utils_exec_models_new.DEFAULT_SCORE,  # balanced accuracy according to the original note
    )

    # fit the grid and save the training and validation performances
    return utils_exec_models_new.exec_grid_search_and_save_performances(
        dir_dest=dir_dest,
        testing=testing,
        grid=grid,
        classifier=estimator,
        scenario=scenario,
        features_config=features_config,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
    )


# Reset the result names so that the summary at the end of the cell never shows
# stale frames left in the kernel by a previous run (or fails with NameError)
# when `datasets` or `grid_configs` is empty.
df_validation_performances = None
es_df_validation_performances = None

# for each features_config and its train/validation data
for features_config, X_train, y_train, X_valid, y_valid in datasets:
    # for each ML algorithm and param_grid
    for classifier, param_grid in grid_configs:
        model_desc = utils.get_model_short_description(classifier).replace('-', '')
        utils.print_string_with_separators(f'{model_desc} - {features_config}')

        # ====================================================
        # execute gridSearch in the Single-Model scenario
        # ====================================================
        scenario = 'Single_Model'
        print(f'   Executing {scenario}')

        grid, df_validation_performances = _run_scenario_grid_search(
            estimator=classifier,
            param_grid=param_grid,
            scenario=scenario,
            features_config=features_config,
            X_train=X_train,
            y_train=y_train,
            X_valid=X_valid,
            y_valid=y_valid,
        )

        print(f' FINISHED !!! [{model_desc} - {scenario} - {features_config}]')
        print()

        # ====================================================
        # execute gridSearch in the Ensemble_Imbalance scenario
        # ====================================================
        scenario = 'Ensemble_Imbalance'
        print(f'   Executing {scenario}')

        # the single-model performances are used to create the model instances
        # that serve as base estimators of the Balanced Bagging grid
        models_to_use_as_estimator = utils_exec_models_new.create_model_instances_from_performances(
            df=df_validation_performances
        )

        es_classifier, es_estimator, es_param_grid = utils_exec_models_new.create_models_BalancedBagging_grid(
            estimator=models_to_use_as_estimator,
            testing=testing,
        )

        es_grid, es_df_validation_performances = _run_scenario_grid_search(
            estimator=es_classifier,
            param_grid=es_param_grid,
            scenario=scenario,
            features_config=features_config,
            X_train=X_train,
            y_train=y_train,
            X_valid=X_valid,
            y_valid=y_valid,
        )

        # wipe the output accumulated so far; only the messages printed after
        # this call remain visible for this iteration
        clear_output()

        print(f' FINISHED !!! [{model_desc} - {scenario} - {features_config}]')
        print()


print(' FINISHED ALL !!!')

# Show the performances of the LAST (features_config, classifier) combination only
if df_validation_performances is None:
    print(' No grid search was executed (datasets or grid_configs is empty).')
else:
    display(df_validation_performances.head(2))
    display(es_df_validation_performances)

# ======================================================================
# ======================================================================
# ======================================================================
# ======================================================================
        
        


 FINISHED !!! [kNN - Ensemble_Imbalance - without_Diagnosis_Delay]

 FINISHED ALL !!!


Unnamed: 0,Scenario,Features,Model,balanced_accuracy,sensitivity,specificity,f1_score,AUC,accuracy,precision,...,Estimator_Class,Estimator_Hyperparams,fit_time,train_balanced_accuracy,train_sensitivity,train_specificity,train_f1_score,train_AUC,train_accuracy,train_precision
0,Single_Model,without_Diagnosis_Delay,k-NN,0.62,0.28,0.95,0.34,0.74,0.87,0.45,...,,,0.01,0.62,0.29,0.96,0.37,0.73,0.87,0.51
1,Single_Model,without_Diagnosis_Delay,k-NN,0.62,0.28,0.96,0.35,0.73,0.88,0.46,...,,,0.02,0.62,0.28,0.97,0.37,0.73,0.88,0.55


Unnamed: 0,Scenario,Features,Model,balanced_accuracy,sensitivity,specificity,f1_score,AUC,accuracy,precision,...,Estimator_Class,Estimator_Hyperparams,fit_time,train_balanced_accuracy,train_sensitivity,train_specificity,train_f1_score,train_AUC,train_accuracy,train_precision
0,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.79,0.77,0.81,0.48,0.88,0.80,0.35,...,KNeighborsClassifier,"{'algorithm':'auto','leaf_size':30,'metric':'euclidean','metric_params':None,'n_jobs':None,'n_neighbors':15,'p':2,'weights':'distance'}",0.07,0.76,0.69,0.83,0.49,0.83,0.81,0.38
1,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.79,0.77,0.81,0.48,0.88,0.80,0.35,...,KNeighborsClassifier,"{'algorithm':'auto','leaf_size':30,'metric':'euclidean','metric_params':None,'n_jobs':None,'n_neighbors':15,'p':2,'weights':'distance'}",0.07,0.76,0.69,0.83,0.49,0.83,0.81,0.38
2,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.79,0.77,0.81,0.48,0.88,0.80,0.35,...,KNeighborsClassifier,"{'algorithm':'auto','leaf_size':30,'metric':'euclidean','metric_params':None,'n_jobs':None,'n_neighbors':15,'p':2,'weights':'distance'}",0.07,0.76,0.69,0.83,0.49,0.83,0.81,0.38
3,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.79,0.77,0.81,0.48,0.88,0.80,0.35,...,KNeighborsClassifier,"{'algorithm':'auto','leaf_size':30,'metric':'euclidean','metric_params':None,'n_jobs':None,'n_neighbors':15,'p':2,'weights':'distance'}",0.07,0.76,0.69,0.83,0.49,0.83,0.81,0.38
4,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.79,0.77,0.81,0.48,0.88,0.80,0.35,...,KNeighborsClassifier,"{'algorithm':'auto','leaf_size':30,'metric':'euclidean','metric_params':None,'n_jobs':None,'n_neighbors':15,'p':2,'weights':'distance'}",0.07,0.76,0.69,0.83,0.49,0.83,0.81,0.38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.52,0.06,0.98,0.11,0.57,0.87,0.33,...,KNeighborsClassifier,"{'algorithm':'auto','leaf_size':30,'metric':'chebyshev','metric_params':None,'n_jobs':None,'n_neighbors':15,'p':2,'weights':'uniform'}",0.08,0.50,0.01,0.99,0.03,0.52,0.86,0.17
1724,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.50,0.02,0.99,0.04,0.57,0.87,0.17,...,KNeighborsClassifier,"{'algorithm':'auto','leaf_size':30,'metric':'chebyshev','metric_params':None,'n_jobs':None,'n_neighbors':15,'p':2,'weights':'uniform'}",0.05,0.50,0.01,0.99,0.02,0.52,0.87,0.18
1725,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.50,0.02,0.99,0.04,0.57,0.87,0.17,...,KNeighborsClassifier,"{'algorithm':'auto','leaf_size':30,'metric':'chebyshev','metric_params':None,'n_jobs':None,'n_neighbors':15,'p':2,'weights':'uniform'}",0.05,0.50,0.01,0.99,0.02,0.52,0.87,0.18
1726,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.52,0.04,0.99,0.08,0.56,0.88,0.40,...,KNeighborsClassifier,"{'algorithm':'auto','leaf_size':30,'metric':'chebyshev','metric_params':None,'n_jobs':None,'n_neighbors':15,'p':2,'weights':'uniform'}",0.08,0.50,0.01,0.99,0.02,0.52,0.87,0.13


In [None]:
# Inspect a previously serialized grid-search object.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files that were produced by this project.
pickle_path = 'exec_results/serialized_data/grid_search__TESTING__DT__All_Features__Ensemble_Imbalance__BalBagging.pickle'

# the context manager closes the file handle even if unpickling fails
with open(pickle_path, 'rb') as file:
    # load the object stored in that file
    obj = pickle.load(file)

obj.cv_results_