In [1]:
%load_ext autoreload
%autoreload 2


def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Installed below as a replacement for ``warnings.warn`` so that calls
    going through ``warnings.warn(...)`` become silent no-ops.
    """
    return None


import warnings

# Monkey-patch the module attribute: from here on, code that looks up
# ``warnings.warn`` gets the no-op above (presumably to keep the long
# grid-search output readable).
warnings.warn = warn

import sys
import os
import time


#needed to import utils.py
sys.path.append('../') 

import utils
import utils_preprocessing
import utils_exec_models
import utils_exec_models_new

import numpy as np
import pandas as pd
# to view the entire text of the columns
pd.set_option('display.max_colwidth', None) 

import sklearn as sk

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import HTML

from IPython.display import clear_output

# Import module for data visualization
from plotnine import *
import plotnine

import pickle

%matplotlib inline  

In [21]:
# Load the scaled train/validation split twice through the project helper:
# once keeping the Diagnosis_Delay feature ...
X_train_all, y_train_all, X_valid_all, y_valid_all = (
    utils.get_train_and_validation_data(scaled=True, use_diagnosis_delay=True)
)

# ... and once with the Diagnosis_Delay feature removed.
X_train_ndd, y_train_ndd, X_valid_ndd, y_valid_ndd = (
    utils.get_train_and_validation_data(scaled=True, use_diagnosis_delay=False)
)

# One row per feature configuration:
# [features_config label, X_train, y_train, X_valid, y_valid]
datasets = [
    ['All_Features', X_train_all, y_train_all, X_valid_all, y_valid_all],
    ['without_Diagnosis_Delay', X_train_ndd, y_train_ndd, X_valid_ndd, y_valid_ndd],
]



# Absolute path of the results directory; passed as `dir_dest` to the helper
# that saves the performances and the grid-search objects.
dir_dest = os.path.abspath('exec_results')

# Cross-validation strategy: repeated stratified k-fold with a fixed seed.
CV_N_SPLITS = 4
CV_N_REPEATS = 3
RANDOM_STATE = 42

cv = sk.model_selection.RepeatedStratifiedKFold(
    n_splits=CV_N_SPLITS,
    n_repeats=CV_N_REPEATS,
    random_state=RANDOM_STATE,
)

# Collector for executed grids (nothing in the visible cells appends to it).
grids_executed = []


# `testing` is forwarded to every grid factory below and to the grid-search
# execution helper. Set it to True for a trial run (presumably the factories
# then return a reduced grid - TODO confirm in utils_exec_models_new).
testing = False

# (classifier, param_grid) pairs, one per ML algorithm to search.
# Lines tagged "OK" are presumably the configurations that were already run.
grid_configs = [
# OK    utils_exec_models_new.create_models_NB_Gaussian_grid(testing=testing),
# OK    utils_exec_models_new.create_models_kNN_grid(testing=testing),

# OK    utils_exec_models_new.create_models_NB_Complement_grid(testing=testing),
    utils_exec_models_new.create_models_RadiusNN_grid(testing=testing),

#     utils_exec_models_new.create_models_DT_grid(testing=testing),
#     utils_exec_models_new.create_models_SVM_grid(testing=testing),
#     utils_exec_models_new.create_models_RF_grid(testing=testing),
# NOTE(review): in the visible cells `X_train` is only bound as the loop variable
# of the dataset loop, so it is not available here on a fresh kernel; choose the
# feature count explicitly before enabling the next line.
#     utils_exec_models_new.create_models_NN_grid(qty_features=X_train.shape[1], testing=testing),
]



# ======================================================================
# ======================================================================
# ======================================================================
# ======================================================================

# Number of rows taken from the Single_Model performance table to seed the
# Balanced-Bagging ensemble (rows are assumed to be sorted best-first - TODO confirm).
N_BEST_SINGLE_MODELS = 10


def _run_grid_search(estimator, param_grid, scenario, features_config,
                     X_train, y_train, X_valid, y_valid):
    """Build a GridSearchCV for `estimator`, then fit it and save its results.

    Both scenarios of this notebook share the same search settings (scoring,
    CV splitter, verbosity, parallelism, refit metric); only the estimator,
    its parameter grid and the scenario label differ, so the shared part
    lives here instead of being copy-pasted.

    Args:
        estimator: estimator instance to tune; also forwarded as `classifier`
            to the execution helper.
        param_grid: parameter grid handed to GridSearchCV.
        scenario: scenario label forwarded to the execution helper.
        features_config: feature-configuration label forwarded to the helper.
        X_train, y_train, X_valid, y_valid: data forwarded to the helper.

    Returns:
        The return value of
        ``utils_exec_models_new.exec_grid_search_and_save_performances``,
        which the caller unpacks as ``(grid, performances_dataframe)``.
    """
    grid_search = sk.model_selection.GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring=utils_exec_models_new.get_default_scoring(),
        cv=cv,
        verbose=1,
        n_jobs=utils_exec_models_new.N_JOBS,
        return_train_score=True,
        # per the original author's note, the default score is balanced accuracy
        refit=utils_exec_models_new.DEFAULT_SCORE,
    )

    # fit the grid and save the training and validation performances
    return utils_exec_models_new.exec_grid_search_and_save_performances(
        dir_dest=dir_dest,
        testing=testing,
        grid=grid_search,
        classifier=estimator,
        scenario=scenario,
        features_config=features_config,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
    )


# for each feature configuration / dataset
for features_config, X_train, y_train, X_valid, y_valid in datasets:
    # for each ML algorithm and its param_grid
    for classifier, param_grid in grid_configs:
        model_desc = utils.get_model_short_description(classifier).replace('-', '')
        utils.print_string_with_separators(f'{classifier} - {features_config}')

        # ====================================================
        # grid search in the Single_Model scenario
        # ====================================================
        scenario = 'Single_Model'
        print(f'   Executing {scenario}')

        grid, df_validation_performances = _run_grid_search(
            estimator=classifier,
            param_grid=param_grid,
            scenario=scenario,
            features_config=features_config,
            X_train=X_train,
            y_train=y_train,
            X_valid=X_valid,
            y_valid=y_valid,
        )

        print(f' FINISHED !!! [{model_desc} - {scenario} - {features_config}]')
        print()

        # ====================================================
        # grid search in the Ensemble_Imbalance scenario
        # ====================================================
        scenario = 'Ensemble_Imbalance'
        print(f'   Executing {scenario}')

        # rebuild model instances from the top rows of the Single_Model table
        # and use them as base estimators for Balanced-Bagging
        models_to_use_as_estimator = utils_exec_models_new.create_model_instances_from_performances(
            df=df_validation_performances.head(N_BEST_SINGLE_MODELS)
        )

        es_classifier, es_estimator, es_param_grid = utils_exec_models_new.create_models_BalancedBagging_grid(
            estimator=models_to_use_as_estimator,
            testing=testing,
        )

        es_grid, es_df_validation_performances = _run_grid_search(
            estimator=es_classifier,
            param_grid=es_param_grid,
            scenario=scenario,
            features_config=features_config,
            X_train=X_train,
            y_train=y_train,
            X_valid=X_valid,
            y_valid=y_valid,
        )

        # wipe the verbose fitting log of this iteration before reporting completion
        clear_output()

        print(f' FINISHED !!! [{model_desc} - {scenario} - {features_config}]')
        print()


print(' FINISHED ALL !!!')

# show the tables produced by the LAST (dataset, classifier) iteration only
display(df_validation_performances.head(2))
display(es_df_validation_performances)

# ======================================================================
# ======================================================================
# ======================================================================
# ======================================================================
        
        


------------------------------------------
RadiusNeighborsClassifier() - All_Features
------------------------------------------
   Executing Single_Model
Fitting 12 folds for each of 240 candidates, totalling 2880 fits
SAVING PERFORMANCE RESULTS...
240 samples were saved
SAVING GRID-SEARCH OBJECT...
 FINISHED !!! [kNN - Single_Model - All_Features]

   Executing Ensemble_Imbalance
Fitting 12 folds for each of 960 candidates, totalling 11520 fits


KeyboardInterrupt: 

In [14]:
# Render the performance table most recently returned by
# exec_grid_search_and_save_performances for the Single_Model scenario.
df_validation_performances

Unnamed: 0,Scenario,Features,Model,balanced_accuracy,sensitivity,specificity,f1_score,AUC,accuracy,precision,...,Estimator_Class,Estimator_Hyperparams,fit_time,train_balanced_accuracy,train_sensitivity,train_specificity,train_f1_score,train_AUC,train_accuracy,train_precision
0,Single_Model,All_Features,k-NN,0.72,0.81,0.63,0.36,0.73,0.65,0.23,...,,,0.03,0.71,0.81,0.6,0.36,0.70,0.63,0.23
1,Single_Model,All_Features,k-NN,0.72,0.81,0.63,0.36,0.73,0.65,0.23,...,,,0.03,0.71,0.81,0.6,0.36,0.70,0.63,0.23
2,Single_Model,All_Features,k-NN,0.72,0.81,0.63,0.36,0.73,0.65,0.23,...,,,0.03,0.71,0.81,0.6,0.36,0.70,0.63,0.23
3,Single_Model,All_Features,k-NN,0.72,0.81,0.63,0.36,0.73,0.65,0.23,...,,,0.03,0.71,0.81,0.6,0.36,0.70,0.63,0.23
4,Single_Model,All_Features,k-NN,0.72,0.81,0.63,0.36,0.73,0.65,0.23,...,,,0.03,0.71,0.81,0.6,0.36,0.70,0.63,0.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,Single_Model,All_Features,k-NN,0.50,0.00,1.00,0.00,0.80,0.88,0.00,...,,,0.06,0.50,0.00,1.0,0.01,0.74,0.87,0.17
236,Single_Model,All_Features,k-NN,0.50,0.00,1.00,0.00,0.80,0.88,0.00,...,,,0.06,0.50,0.00,1.0,0.01,0.74,0.87,0.17
237,Single_Model,All_Features,k-NN,0.50,0.00,1.00,0.00,0.80,0.88,0.00,...,,,0.06,0.50,0.00,1.0,0.01,0.74,0.87,0.17
238,Single_Model,All_Features,k-NN,0.50,0.00,1.00,0.00,0.80,0.88,0.00,...,,,0.06,0.50,0.00,1.0,0.01,0.74,0.87,0.17


In [None]:
# Reload a previously serialized object (presumably a fitted GridSearchCV, given
# the `cv_results_` access below) to inspect its results.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load files that were written by this project.
# The `with` block guarantees the file handle is closed even if loading fails.
with open(
    'exec_results/serialized_data/grid_search__TESTING__DT__All_Features__Ensemble_Imbalance__BalBagging.pickle',
    'rb',
) as file:
    # load the stored object back from that file
    obj = pickle.load(file)

# last expression of the cell: shown as the cell output
obj.cv_results_