In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
import time


#needed to import utils.py
sys.path.append('../') 

import utils
import utils_preprocessing
import utils_exec_models

import numpy as np
import pandas as pd

from IPython.core.display import HTML

from IPython.display import clear_output


# Suppress warnings.
# `warnings.warn` is swapped for a no-op, so any code that looks the function
# up through the `warnings` module at call time emits nothing. Names that were
# already bound with `from warnings import warn` before this point still refer
# to the real function and are not affected by the swap.
import warnings
from warnings import simplefilter


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (stand-in for `warnings.warn`)."""
    return None


warnings.warn = warn

# Additionally ignore every FutureWarning through the regular filter machinery.
simplefilter(action='ignore', category=FutureWarning)


%matplotlib inline  



# Get and save the *n* best performances obtained by GridSearch for each ML algorithm (*n* = `n_to_save`)

In [3]:
# Read the SINGLE-MODEL grid-search results
csv_file = 'exec_results/results_Single_Model.csv'
df_single_model = utils.read_csv(csv_file)

# Read the ENSEMBLE-IMBALANCE grid-search results
csv_file = 'exec_results/results_Ensemble_Imbalance.csv'
df_ens_imb = utils.read_csv(csv_file)


# Every algorithm and every classifier class present in the SINGLE-MODEL results
algorithms = list(df_single_model.Model.unique())
model_classes = list(df_single_model.Classifier.unique())

# Classifier classes that belong to each algorithm. The two `unique()` lists
# above are computed independently, so they cannot be paired positionally:
# an algorithm may map to several classifier classes.
classes_by_algorithm = {
    algorithm: [
        str(model_class)
        for model_class in df_single_model.loc[
            df_single_model.Model == algorithm, 'Classifier'
        ].dropna().unique()
    ]
    for algorithm in algorithms
}

dir_dest = os.path.abspath('exec_results/')

# Store the model objects of the best results of both scenarios
best_models = list()

# Number of rows kept per algorithm and scenario
n_to_save = 25


def _save_best_performances(df_results, algorithm_column, algorithm, scenario_tag, n_best, dest_dir):
    """Save the first `n_best` rows of one algorithm to a CSV file and return them.

    Parameters
    ----------
    df_results : pandas.DataFrame
        Results of one scenario.
        NOTE(review): rows are taken with `head`, so the frame is presumably
        already ordered from best to worst by `utils.read_csv` or by the file
        itself -- confirm.
    algorithm_column : str
        Column of `df_results` holding the algorithm name.
    algorithm : str
        Algorithm whose rows are selected.
    scenario_tag : str
        Suffix used in the CSV file name (e.g. 'SINGLE_MODEL').
    n_best : int
        Maximum number of rows to keep.
    dest_dir : str
        Directory where the CSV file is written.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected rows.
    """
    df_best = df_results.loc[df_results[algorithm_column] == algorithm].head(n_best).copy()

    # Replace spaces only in the file name: applying it to the full path would
    # corrupt `dest_dir` whenever one of its parent folders contains a space.
    file_name = f'best_performances_{algorithm}_in_{scenario_tag}.csv'.replace(' ', '_')
    utils.save_to_csv(df=df_best, csv_file=os.path.join(dest_dir, file_name))

    return df_best


for algorithm in algorithms:
    classes_label = ', '.join(classes_by_algorithm[algorithm])
    utils.print_string_with_separators(f'{algorithm} - {classes_label}')

    # ==============================================================
    # get the "n" best performances for the SINGLE-MODEL scenario
    # ==============================================================
    print(f'   > SINGLE-MODEL ({n_to_save} best)')
    df_best_single_model = _save_best_performances(
        df_results=df_single_model,
        algorithm_column='Model',
        algorithm=algorithm,
        scenario_tag='SINGLE_MODEL',
        n_best=n_to_save,
        dest_dir=dir_dest,
    )

    # Create model instances from the hyperparameters of the best results
    best_models.extend(
        utils_exec_models.get_models_object_from_results(df_best_single_model)
    )

    # ==============================================================
    # get the "n" best performances for the ENSEMBLE-IMBALANCE scenario
    # ==============================================================
    print(f'   > ENSEMBLE-IMBALANCE ({n_to_save} best)')
    df_best_ens_imb = _save_best_performances(
        df_results=df_ens_imb,
        algorithm_column='Estimator_Desc',
        algorithm=algorithm,
        scenario_tag='ENSEMBLE_IMBALANCE',
        n_best=n_to_save,
        dest_dir=dir_dest,
    )

    # Create model instances from the hyperparameters of the best results
    best_models.extend(
        utils_exec_models.get_models_object_from_results(df_best_ens_imb)
    )

    


---------
SVM - SVC
---------
   > SINGLE-MODEL (25 best)
25 samples were saved
   > ENSEMBLE-IMBALANCE (25 best)
25 samples were saved
--------------------------
Naïve Bayes - ComplementNB
--------------------------
   > SINGLE-MODEL (25 best)
19 samples were saved
   > ENSEMBLE-IMBALANCE (25 best)
25 samples were saved
-------------------------------
Neural Networks - MLPClassifier
-------------------------------
   > SINGLE-MODEL (25 best)
25 samples were saved
   > ENSEMBLE-IMBALANCE (25 best)
25 samples were saved
--------------------------------------
Decision Tree - DecisionTreeClassifier
--------------------------------------
   > SINGLE-MODEL (25 best)
25 samples were saved
   > ENSEMBLE-IMBALANCE (25 best)
25 samples were saved
-----------------
k-NN - GaussianNB
-----------------
   > SINGLE-MODEL (25 best)
25 samples were saved
   > ENSEMBLE-IMBALANCE (25 best)
25 samples were saved
-----------------------------------------
Random Forest - RadiusNeighborsClassifier
--------

---
---
# Re-execute each best model as follows:

 - ### Retrain the model using the full $Training$ set
 - ### Validate the model using the $Validation$ set 
 - ### Save the $Validation$ $performances$
 

In [5]:
%%time

csv_validation_performance = os.path.abspath('exec_results/validation_results.csv')



# verify if already exists an CSV with the results
overwrite_results_saved_previously = False

if os.path.exists(csv_validation_performance) and overwrite_results_saved_previously==False:
    print('Reading results saved previously...')
    df_validation_performance = utils.read_csv(csv_file=csv_validation_performance)
else:
    df_validation_performance = None


# Get the scaled Training and Validation subsets¶
X_train, y_train, X_valid, y_valid = utils.get_train_and_validation_data(scaled=True)

y_train = y_train[utils.CLASS_COLUMN].ravel()
y_valid = y_valid[utils.CLASS_COLUMN].ravel()


i = 0
tot = len(best_models)



## For each best model:
#    1. Retrain using using the Training data
#    2. Validate using the Validation data


for model_info in best_models:

    
    try:    

        # get the classifier name (without the parameters)
        model_instance = model_info['model_instance']
        scenario = model_info['Scenario']
        model = model_info['Model']
        model_params = model_info['Hyperparams']
        estimator = model_info['Estimator']
        estimator_params = model_info['Estimator_Hyperparams']

        model_str = f'{model}' + ('' if scenario=='Single-Model' else f'({estimator})')

        i += 1
        
        # check if model was already executed
        if df_validation_performance is not None:
            if scenario == 'Single-Model':
                df_executed = df_validation_performance.loc[
                    (df_validation_performance.Scenario == scenario)
                   &(df_validation_performance.Model == model) 
                   &(df_validation_performance.Model_Hyperparams == model_params) 
                   &(df_validation_performance.Estimator.isnull()) 
                   &(df_validation_performance.Estimator_Hyperparams.isnull()) 
                ].copy()
            else:
                df_executed = df_validation_performance.loc[
                    (df_validation_performance.Scenario == scenario)
                   &(df_validation_performance.Model == model) 
                   &(df_validation_performance.Model_Hyperparams == model_params) 
                   &(df_validation_performance.Estimator == estimator) 
                   &(df_validation_performance.Estimator_Hyperparams == estimator_params) 
                ].copy()
            
            if df_executed.shape[0] > 0:
                print(f'{i:>3} was already executed: {model_str} in "{scenario}" scenario')
                continue

        
        print(f'{i:>3}/{tot}: Executing {model_str} in "{scenario}" scenario',)

        # Retrain using the full traning set
        model_instance.fit(X_train, y_train)

        #predict using the Validation set
        y_pred = model_instance.predict(X_valid)

        #get Validation performance
        bal_acc, sens, spec, auc, acc, prec, f1 = utils_exec_models.get_scores_from_predict(
            y_validation=y_valid, 
            y_pred=y_pred, 
        )

        # Store the Validation and Training performances
        performance_to_save = {
            'Scenario': scenario,
            'Model': model,
            'Estimator': estimator,
            # Validation performance
            'Valid_BalAcc': bal_acc,
            'Valid_Sens'  : sens,
            'Valid_Spec'  : spec,
            'Valid_f1'    : f1,
            'Valid_AUC'   : auc,
            'Valid_Acc'   : acc,
            'Valid_Prec'  : prec,
            #
            'Model_Hyperparams': model_params,
            'Estimator_Hyperparams': estimator_params,
        }

        # create a dataFrame to store the performances
        df_aux = pd.DataFrame([performance_to_save])

        if df_validation_performance is None:
            df_validation_performance = df_aux
        else:
            df_validation_performance = pd.concat(
                [df_validation_performance, df_aux],
                ignore_index=True,
            )

    except Exception as ex:
        print('Instance')
        print(model_instance)
        print('INFO')
        print(model_info)
        print('ERROR')
        raise Exception(ex)
       


# sort the validation performances
df_validation_performance = utils_exec_models.sort_performances_results(
    df=df_validation_performance,
    cols_order_to_sort=[
        'Scenario', 
#         'Model', 
#         'Estimator', 
        'Valid_BalAcc', 
        'Valid_Sens', 
        'Valid_Spec'],
)



# save validation performances
utils.save_to_csv(
    df=df_validation_performance, 
    csv_file=csv_validation_performance
)

Reading results saved previously...
  1/294: Executing SVM in "Single-Model" scenario
  2/294: Executing SVM in "Single-Model" scenario
  3/294: Executing SVM in "Single-Model" scenario
  4/294: Executing SVM in "Single-Model" scenario
  5/294: Executing SVM in "Single-Model" scenario
  6/294: Executing SVM in "Single-Model" scenario
  7/294: Executing SVM in "Single-Model" scenario
  8/294: Executing SVM in "Single-Model" scenario
  9/294: Executing SVM in "Single-Model" scenario
 10/294: Executing SVM in "Single-Model" scenario
 11/294: Executing SVM in "Single-Model" scenario
 12/294: Executing SVM in "Single-Model" scenario
 13/294: Executing SVM in "Single-Model" scenario
 14/294: Executing SVM in "Single-Model" scenario
 15/294: Executing SVM in "Single-Model" scenario
 16/294: Executing SVM in "Single-Model" scenario
 17/294: Executing SVM in "Single-Model" scenario
 18/294: Executing SVM in "Single-Model" scenario
 19/294: Executing SVM in "Single-Model" scenario
 20/294: Execu

135/294: Executing Balanced-Bagging(NN) in "Ensemble-Imbalance" scenario
136/294: Executing Balanced-Bagging(NN) in "Ensemble-Imbalance" scenario
137/294: Executing Balanced-Bagging(NN) in "Ensemble-Imbalance" scenario
138/294: Executing Balanced-Bagging(NN) in "Ensemble-Imbalance" scenario
139/294: Executing Balanced-Bagging(NN) in "Ensemble-Imbalance" scenario
140/294: Executing Balanced-Bagging(NN) in "Ensemble-Imbalance" scenario
141/294: Executing Balanced-Bagging(NN) in "Ensemble-Imbalance" scenario
142/294: Executing Balanced-Bagging(NN) in "Ensemble-Imbalance" scenario
143/294: Executing Balanced-Bagging(NN) in "Ensemble-Imbalance" scenario
144/294: Executing Balanced-Bagging(NN) in "Ensemble-Imbalance" scenario
145/294: Executing DT in "Single-Model" scenario
146/294: Executing DT in "Single-Model" scenario
147/294: Executing DT in "Single-Model" scenario
148/294: Executing DT in "Single-Model" scenario
149/294: Executing DT in "Single-Model" scenario
150/294: Executing DT in 

  warn(
  warn(


175/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario
176/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario


  warn(


177/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario
178/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario


  warn(


179/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario
180/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario


  warn(


181/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario
182/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario


  warn(


183/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario
184/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario
185/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario
186/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario
187/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario
188/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario
189/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario
190/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario
191/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario
192/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario
193/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario
194/294: Executing Balanced-Bagging(DT) in "Ensemble-Imbalance" scenario
195/294: Executing k-NN in "Single-Model" scenario
196/294: Executing k-NN in "Single-Model" scenario
197/294: Executing k-N

---
---
---
# OTHERS


In [29]:
# Scratch check: flatten the class column of `y_train` and display the result.
# NOTE(review): the validation cell earlier in this notebook rebinds `y_train`
# to the already flattened labels, so this line presumably only works while
# `y_train` is still the object returned by
# `utils.get_train_and_validation_data` -- confirm, or remove this cell.
y_train[utils.CLASS_COLUMN].ravel()

array([0., 0., 0., ..., 0., 0., 0.])