In [1]:
%load_ext autoreload
%autoreload 2


def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

import sys
import os
import time


#needed to import utils.py
sys.path.append('../') 

import utils
import utils_preprocessing
import utils_exec_models
import utils_exec_models_new

import numpy as np
import pandas as pd
#to view entire text of the comuns
pd.set_option('display.max_colwidth', None) 

import sklearn as sk

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import HTML

from IPython.display import clear_output

# Import module for data visualization
from plotnine import *
import plotnine

import pickle

%matplotlib inline  

In [8]:
# get train and tezt sets INCLUDING the Diagnosis_Delay feature
X_train_all, y_train_all, X_valid_all, y_valid_all = utils.get_train_and_validation_data(
    scaled=True,
    use_diagnosis_delay=True,
)

# get train and tezt sets REMOVING the Diagnosis_Delay feature
X_train_ndd, y_train_ndd, X_valid_ndd, y_valid_ndd = utils.get_train_and_validation_data(
    scaled=True,
    use_diagnosis_delay=False,
)



datasets = [
    ['All_Features', X_train_all, y_train_all, X_valid_all, y_valid_all],
    ['without_Diagnosis_Delay', X_train_ndd, y_train_ndd, X_valid_ndd, y_valid_ndd],
] 



# pattern to save the instance for each gridSearch object generated
dir_dest = os.path.abspath('exec_results')


# define the Cross-Validation strategy
CV_N_SPLITS = 4
CV_N_REPEATS = 3
RANDOM_STATE = 42

cv = sk.model_selection.RepeatedStratifiedKFold(n_splits=CV_N_SPLITS, n_repeats=CV_N_REPEATS, random_state=RANDOM_STATE)


grids_executed = []


# get all param_grid combinations for each classifier
testing = True
testing = False

grid_configs = [
#     utils_exec_models_new.create_models_DT_grid(testing=testing),
#     utils_exec_models_new.create_models_NB_Gaussian_grid(testing=testing),
#     utils_exec_models_new.create_models_NB_Complement_grid(testing=testing),
#     utils_exec_models_new.create_models_kNN_grid(testing=testing),
#     utils_exec_models_new.create_models_RadiusNN_grid(testing=testing),
#     utils_exec_models_new.create_models_SVM_grid(testing=testing),    
#     utils_exec_models_new.create_models_RF_grid(testing=testing),
#     utils_exec_models_new.create_models_NN_grid(qty_features=X_train.shape[1], testing=testing),
]



# ======================================================================
# ======================================================================
# ======================================================================
# ======================================================================

# for each features_config and datasets
for features_config, X_train, y_train, X_valid, y_valid in datasets:
    # for each ML algorithm and param_grid
    for classifier, param_grid in grid_configs: 
        model_desc = utils.get_model_short_description(classifier).replace('-', '')
        utils.print_string_with_separators(f'{model_desc} - {features_config}')


        # ====================================================
        # execute gridSearch in the Single-Model scenario
        # ====================================================
        scenario = 'Single_Model'
        print(f'   Executing {scenario}')
        
        grid = sk.model_selection.GridSearchCV(
            estimator=classifier, 
            param_grid=param_grid, 
            scoring=utils_exec_models_new.get_default_scoring(), 
            cv=cv,
            verbose=1,
            n_jobs=utils_exec_models_new.N_JOBS, #7
            return_train_score=True,
            refit=utils_exec_models_new.DEFAULT_SCORE # balanced accuracy
        )

        # fit the grid and save the trainning and validation performances
        grid, df_validation_performances = utils_exec_models_new.exec_grid_search_and_save_performances(
            dir_dest=dir_dest, 
            testing=testing, 
            grid=grid, 
            classifier=classifier, 
            scenario=scenario, 
            features_config=features_config, 
            X_train=X_train, 
            y_train=y_train, 
            X_valid=X_valid, 
            y_valid=y_valid,
        )

        print(f' FINISHED !!! [{model_desc} - {scenario} - {features_config}]')
        print()

        
        # ====================================================
        # execute gridSearch in the Ensemble_Imbalance scenario
        # ====================================================
        scenario = 'Ensemble_Imbalance'
        print(f'   Executing {scenario}')

        models_to_use_as_estimator = utils_exec_models_new.create_model_instances_from_performances(
            df=df_validation_performances
        )

        es_classifier, es_estimator, es_param_grid = utils_exec_models_new.create_models_BalancedBagging_grid(
            estimator=models_to_use_as_estimator,
            testing=testing,
        )

        es_grid = sk.model_selection.GridSearchCV(
            estimator=es_classifier, 
            param_grid=es_param_grid, 
            scoring=utils_exec_models_new.get_default_scoring(), 
            cv=cv,
            verbose=1,
            n_jobs=utils_exec_models_new.N_JOBS, #7
            return_train_score=True,
            refit=utils_exec_models_new.DEFAULT_SCORE # balanced accuracy
        )

        # fit the grid and save the trainning and validation performances
        es_grid, es_df_validation_performances = utils_exec_models_new.exec_grid_search_and_save_performances(
            dir_dest=dir_dest, 
            testing=testing, 
            grid=es_grid, 
            classifier=es_classifier, 
            scenario=scenario, 
            features_config=features_config, 
            X_train=X_train, 
            y_train=y_train, 
            X_valid=X_valid, 
            y_valid=y_valid,
        )
        
        
        clear_output() 

        print(f' FINISHED !!! [{model_desc} - {scenario} - {features_config}]')
        print()
        
#         break
#     break
    

    
print(f' FINISHED ALL !!!')
    
display(df_validation_performances.head(2))    
display(es_df_validation_performances)    

# ======================================================================
# ======================================================================
# ======================================================================
# ======================================================================
        
        


 FINISHED !!! [NN - Ensemble_Imbalance - without_Diagnosis_Delay]

 FINISHED ALL !!!


Unnamed: 0,Scenario,Features,Model,balanced_accuracy,sensitivity,specificity,f1_score,AUC,accuracy,precision,...,Estimator_Class,Estimator_Hyperparams,fit_time,train_balanced_accuracy,train_sensitivity,train_specificity,train_f1_score,train_AUC,train_accuracy,train_precision
0,Single_Model,without_Diagnosis_Delay,Neural Networks,0.66,0.36,0.96,0.44,0.85,0.89,0.57,...,,,0.01,0.67,0.4,0.94,0.45,0.8,0.87,0.52
1,Single_Model,without_Diagnosis_Delay,Neural Networks,0.7,0.45,0.95,0.48,0.87,0.89,0.52,...,,,0.01,0.67,0.39,0.95,0.46,0.82,0.88,0.57


Unnamed: 0,Scenario,Features,Model,balanced_accuracy,sensitivity,specificity,f1_score,AUC,accuracy,precision,...,Estimator_Class,Estimator_Hyperparams,fit_time,train_balanced_accuracy,train_sensitivity,train_specificity,train_f1_score,train_AUC,train_accuracy,train_precision
0,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.83,0.91,0.74,0.48,0.89,0.76,0.32,...,MLPClassifier,"{'activation':'relu','alpha':0.3,'batch_size':'auto','beta_1':0.9,'beta_2':0.999,'early_stopping':False,'epsilon':1e-08,'hidden_layer_sizes':22,'learning_rate':'constant','learning_rate_init':0.01,'max_fun':15000,'max_iter':2000,'momentum':0.9,'n_iter_no_change':10,'nesterovs_momentum':True,'power_t':0.5,'random_state':42,'shuffle':True,'solver':'sgd','tol':0.0001,'validation_fraction':0.1,'verbose':False,'warm_start':False}",0.01,0.79,0.79,0.78,0.48,0.85,0.78,0.35
1,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.83,0.91,0.74,0.48,0.89,0.76,0.32,...,MLPClassifier,"{'activation':'relu','alpha':0.3,'batch_size':'auto','beta_1':0.9,'beta_2':0.999,'early_stopping':False,'epsilon':1e-08,'hidden_layer_sizes':22,'learning_rate':'constant','learning_rate_init':0.01,'max_fun':15000,'max_iter':2000,'momentum':0.9,'n_iter_no_change':10,'nesterovs_momentum':True,'power_t':0.5,'random_state':42,'shuffle':True,'solver':'sgd','tol':0.0001,'validation_fraction':0.1,'verbose':False,'warm_start':False}",0.01,0.79,0.79,0.78,0.48,0.85,0.78,0.35
2,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.82,0.89,0.75,0.47,0.89,0.76,0.32,...,MLPClassifier,"{'activation':'relu','alpha':0.3,'batch_size':'auto','beta_1':0.9,'beta_2':0.999,'early_stopping':False,'epsilon':1e-08,'hidden_layer_sizes':22,'learning_rate':'constant','learning_rate_init':0.01,'max_fun':15000,'max_iter':2000,'momentum':0.9,'n_iter_no_change':10,'nesterovs_momentum':True,'power_t':0.5,'random_state':42,'shuffle':True,'solver':'sgd','tol':0.0001,'validation_fraction':0.1,'verbose':False,'warm_start':False}",0.01,0.78,0.78,0.78,0.48,0.85,0.78,0.35
3,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.82,0.89,0.75,0.47,0.89,0.76,0.32,...,MLPClassifier,"{'activation':'relu','alpha':0.3,'batch_size':'auto','beta_1':0.9,'beta_2':0.999,'early_stopping':False,'epsilon':1e-08,'hidden_layer_sizes':22,'learning_rate':'constant','learning_rate_init':0.01,'max_fun':15000,'max_iter':2000,'momentum':0.9,'n_iter_no_change':10,'nesterovs_momentum':True,'power_t':0.5,'random_state':42,'shuffle':True,'solver':'sgd','tol':0.0001,'validation_fraction':0.1,'verbose':False,'warm_start':False}",0.01,0.78,0.78,0.78,0.48,0.85,0.78,0.35
4,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.82,0.89,0.74,0.47,0.88,0.76,0.32,...,MLPClassifier,"{'activation':'relu','alpha':0.3,'batch_size':'auto','beta_1':0.9,'beta_2':0.999,'early_stopping':False,'epsilon':1e-08,'hidden_layer_sizes':22,'learning_rate':'constant','learning_rate_init':0.01,'max_fun':15000,'max_iter':2000,'momentum':0.9,'n_iter_no_change':10,'nesterovs_momentum':True,'power_t':0.5,'random_state':42,'shuffle':True,'solver':'sgd','tol':0.0001,'validation_fraction':0.1,'verbose':False,'warm_start':False}",0.01,0.78,0.78,0.78,0.47,0.85,0.78,0.34
5,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.82,0.89,0.74,0.47,0.88,0.76,0.32,...,MLPClassifier,"{'activation':'relu','alpha':0.3,'batch_size':'auto','beta_1':0.9,'beta_2':0.999,'early_stopping':False,'epsilon':1e-08,'hidden_layer_sizes':22,'learning_rate':'constant','learning_rate_init':0.01,'max_fun':15000,'max_iter':2000,'momentum':0.9,'n_iter_no_change':10,'nesterovs_momentum':True,'power_t':0.5,'random_state':42,'shuffle':True,'solver':'sgd','tol':0.0001,'validation_fraction':0.1,'verbose':False,'warm_start':False}",0.01,0.78,0.78,0.78,0.47,0.85,0.78,0.34
6,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.82,0.89,0.74,0.47,0.89,0.76,0.32,...,MLPClassifier,"{'activation':'relu','alpha':0.3,'batch_size':'auto','beta_1':0.9,'beta_2':0.999,'early_stopping':False,'epsilon':1e-08,'hidden_layer_sizes':22,'learning_rate':'constant','learning_rate_init':0.1,'max_fun':15000,'max_iter':2000,'momentum':0.9,'n_iter_no_change':10,'nesterovs_momentum':True,'power_t':0.5,'random_state':42,'shuffle':True,'solver':'sgd','tol':0.0001,'validation_fraction':0.1,'verbose':False,'warm_start':False}",0.01,0.77,0.77,0.77,0.47,0.84,0.77,0.34
7,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.83,0.89,0.76,0.49,0.89,0.78,0.34,...,MLPClassifier,"{'activation':'relu','alpha':0.1,'batch_size':'auto','beta_1':0.9,'beta_2':0.999,'early_stopping':False,'epsilon':1e-08,'hidden_layer_sizes':22,'learning_rate':'constant','learning_rate_init':0.01,'max_fun':15000,'max_iter':2000,'momentum':0.9,'n_iter_no_change':10,'nesterovs_momentum':True,'power_t':0.5,'random_state':42,'shuffle':True,'solver':'sgd','tol':0.0001,'validation_fraction':0.1,'verbose':False,'warm_start':False}",0.01,0.77,0.77,0.77,0.47,0.84,0.77,0.34
8,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.83,0.89,0.76,0.49,0.89,0.78,0.34,...,MLPClassifier,"{'activation':'relu','alpha':0.1,'batch_size':'auto','beta_1':0.9,'beta_2':0.999,'early_stopping':False,'epsilon':1e-08,'hidden_layer_sizes':22,'learning_rate':'constant','learning_rate_init':0.01,'max_fun':15000,'max_iter':2000,'momentum':0.9,'n_iter_no_change':10,'nesterovs_momentum':True,'power_t':0.5,'random_state':42,'shuffle':True,'solver':'sgd','tol':0.0001,'validation_fraction':0.1,'verbose':False,'warm_start':False}",0.01,0.77,0.77,0.77,0.47,0.84,0.77,0.34
9,Ensemble_Imbalance,without_Diagnosis_Delay,Balanced Bagging,0.82,0.89,0.75,0.48,0.89,0.77,0.33,...,MLPClassifier,"{'activation':'relu','alpha':0.1,'batch_size':'auto','beta_1':0.9,'beta_2':0.999,'early_stopping':False,'epsilon':1e-08,'hidden_layer_sizes':22,'learning_rate':'constant','learning_rate_init':0.01,'max_fun':15000,'max_iter':2000,'momentum':0.9,'n_iter_no_change':10,'nesterovs_momentum':True,'power_t':0.5,'random_state':42,'shuffle':True,'solver':'sgd','tol':0.0001,'validation_fraction':0.1,'verbose':False,'warm_start':False}",0.01,0.77,0.77,0.77,0.47,0.84,0.77,0.34


In [None]:
file = open(
    'exec_results/serialized_data/grid_search__TESTING__DT__All_Features__Ensemble_Imbalance__BalBagging.pickle', 
    'rb',
)

# dump information to that file
obj = pickle.load(file)

obj.cv_results_