In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
import time


#needed to import utils.py
sys.path.append('../') 

import utils
import utils_preprocessing
import utils_exec_models

import numpy as np
import pandas as pd

from IPython.core.display import HTML

from IPython.display import clear_output

%matplotlib inline  

---
# $Ensemble$-$Imbalance$ scenario
---

# Get the scaled Training and Validation subsets 

In [2]:
# Fetch the scaled subsets in one call, then unpack them into the
# feature matrices (X_*) and target vectors (y_*) used by the cells below.
train_valid_subsets = utils.get_train_and_validation_data(scaled=True)
X_train, y_train, X_valid, y_valid = train_valid_subsets

# Preview the first rows of the training features (rendered as the cell output).
X_train.head()



Unnamed: 0,Sex_Male,Site_Onset,Diagnosis_Delay,Age_at_Onset,Riluzole,FVC_at_Diagnosis,BMI_at_Diagnosis,Q1_Speech_slope_at_Diagnosis,Q2_Salivation_slope_at_Diagnosis,Q3_Swallowing_slope_at_Diagnosis,...,Q7_Turning_in_Bed_slope_at_Diagnosis,Q8_Walking_slope_at_Diagnosis,Q9_Climbing_Stairs_slope_at_Diagnosis,Q10_Respiratory_slope_at_Diagnosis,Qty_Regions_Involved_at_Diagnosis,Region_Involved_Bulbar_at_Diagnosis,Region_Involved_Upper_Limb_at_Diagnosis,Region_Involved_Lower_Limb_at_Diagnosis,Region_Involved_Respiratory_at_Diagnosis,Patient_with_Gastrostomy_at_Diagnosis
0,1.0,1.0,0.0,0.5,0.0,1.0,0.67,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.67,1.0,1.0,1.0,0.0,0.0
1,1.0,1.0,0.5,0.75,0.0,0.0,0.67,0.0,0.0,0.0,...,0.0,0.0,0.5,0.0,0.67,1.0,1.0,1.0,0.0,0.0
2,1.0,1.0,0.0,0.25,0.0,1.0,0.33,0.5,0.0,0.0,...,0.0,0.0,0.5,0.0,0.67,1.0,1.0,1.0,0.0,0.0
3,0.0,0.0,0.5,0.5,0.0,1.0,1.0,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.67,1.0,0.0,1.0,1.0,0.0
4,1.0,0.0,1.0,0.75,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.5,0.0,1.0,1.0,1.0,1.0,1.0,0.0


---
# Read the Single-Model results file and get all unique $Model+Hyperparameters$
### NOTE: DO NOT re-execute all models, see `classif_filtered` variable

In [3]:
%%time

# Build the path of the Single-Model results CSV inside `exec_results/`.
dir_dest = os.path.abspath('exec_results/')
results_csv_file = os.path.join(dir_dest, 'results_Single_Model.csv')

# Get the models (estimator + hyperparameters) listed in the results CSV,
# without repetition.
classifiers = utils_exec_models.get_models_set_from_results(results_csv_file=results_csv_file)

# ==================================================================
# Optional filter: restrict which estimators will be executed.
#
# Leave the tuple empty to keep every model read from the CSV.
# Otherwise a model is kept when ANY of the given substrings occurs in
# its string representation, e.g.:
#     ESTIMATORS_TO_EXECUTE = ('SVC',)                   # only SVM
#     ESTIMATORS_TO_EXECUTE = ('RandomForest',)          # only RF
#     ESTIMATORS_TO_EXECUTE = ('SVC', 'RandomForest')    # both
# NOTE: must be a tuple/list of strings; a bare string would be iterated
# character by character.
# ==================================================================
ESTIMATORS_TO_EXECUTE = ()

classif_filtered = [
    clf
    for clf in classifiers
    if any(name in str(clf) for name in ESTIMATORS_TO_EXECUTE)
]

# Only replace the full collection when a filter was actually requested.
if ESTIMATORS_TO_EXECUTE:
    classifiers = classif_filtered.copy()

utils.print_array_as_list(classifiers)


[
    'ComplementNB(alpha=0.1)',
    'ComplementNB(alpha=0.5)',
    'DecisionTreeClassifier(class_weight='balanced', max_depth=5, random_state=42)',
    'GaussianNB()',
    'KNeighborsClassifier(metric='manhattan', weights='distance')',
    'MLPClassifier(alpha=0.1, hidden_layer_sizes=23, learning_rate_init=0.7,
              max_iter=300, random_state=42, solver='sgd')',
    'RadiusNeighborsClassifier(leaf_size=50, metric='manhattan', outlier_label=1,
                          radius=0.3, weights='distance')',
    'RandomForestClassifier(class_weight='balanced', max_depth=5, n_estimators=50,
                       random_state=42)',
    'SVC(C=0.1, gamma='auto', kernel='linear', probability=True, random_state=42)',
    'SVC(C=0.1, gamma='auto', probability=True, random_state=42)',
    'SVC(C=0.3, gamma='auto', kernel='linear', probability=True, random_state=42)',
    'SVC(C=0.3, gamma='auto', probability=True, random_state=42)',
]
CPU times: user 6.88 ms, sys: 0 ns, total: 6.88 ms
Wal

# Train the models with GridSearch, using the inputs and outputs created in the previous step

### Store the `param_grid`s that will be executed

In [4]:
%%time

## Collect one GridSearch `param_grid` per classifier.
param_grids = []

# Forwarded to the grid builder as `testing=...` and reused later to pick
# the pause between executions.
# NOTE(review): presumably True selects a reduced grid - confirm in
# utils_exec_models. Set to False for the full run.
TESTING = True

for classifier in classifiers:

    # Start from an empty grid; the helper is expected to fill this list in
    # place (its return value is not used).
    grid_for_classifier = []

    utils_exec_models.create_models_BalancedBagging_grid(
        classifiers=[classifier],
        param_grid=grid_for_classifier,
        testing=TESTING,
    )

    # keep the grid so it can be executed in the training step
    param_grids.append(grid_for_classifier)


# utils.print_array_as_list(param_grids)

# report how many grids were prepared, followed by an empty line
print(f'A total of {len(param_grids)} param_grids will be executed', end='\n\n')


A total of 12 param_grids will be executed

CPU times: user 432 µs, sys: 45 µs, total: 477 µs
Wall time: 456 µs


### Train the models added to `param_grids`

In [12]:
%%time

# CSV file where the results of this scenario are stored.
csv_results_saved = os.path.abspath('exec_results/results_Ensemble_Imbalance.csv')

# True  -> ignore any CSV saved previously; only the results of this run are saved.
# False -> if the CSV exists, load it and append the results of this run to it.
overwrite_results_saved_previously = True

# All result frames (previous + one per executed grid) are collected here and
# concatenated once after the loop, instead of re-concatenating on every iteration.
results_frames = []

if os.path.exists(csv_results_saved) and not overwrite_results_saved_previously:
    print('Reading results saved previously...')
    results_frames.append(utils.read_csv(csv_file=csv_results_saved))


kfold = utils_exec_models.get_kfold_splits()


# execute GridSearch for each param_grid (one per base estimator)
for run_idx, p_grid in enumerate(param_grids):

    # pause between consecutive executions (shorter when TESTING)
    if run_idx > 0:
        time.sleep(1 if TESTING else 10)

    # get the base estimator used by the Bagging ensemble, as text,
    # e.g. "SVC(C=0.1, gamma='auto')"
    estimator = str(p_grid[0]['classifier__estimator'][0])

    # class name = text before the first '('
    estimator_class = estimator.split('(')[0]
    estimator_desc = utils.get_model_description(estimator_class)

    # hyperparameters = text between the first '(' and the next '(' (if any),
    # with every ')' removed.
    # NOTE(review): a nested '(' in the repr (e.g. a tuple-valued
    # hyperparameter) would truncate this text - confirm it cannot occur.
    # NOTE(review): the text is passed through utils.get_model_description()
    # exactly as before - confirm this is intended for hyperparameter text.
    estimator_hyperparams = utils.get_model_description(estimator.split('(')[1]).replace(')', '')

    # flatten multi-line reprs: drop line breaks and the 22-space indentation run
    estimator_hyperparams = estimator_hyperparams.replace('\n', '').replace('                      ', '')
    estimator_hyperparams = utils_exec_models.convert_hyperparams_to_dict(estimator_hyperparams)

    print(f'Executing {estimator_class}...', end=' ')

    ## execute GridSearch
    # `grid` is rebound on every iteration, so after the loop it refers to the
    # LAST executed grid only.
    grid, df_results_aux = utils_exec_models.exec_grid_search(
        param_grid=p_grid,
        X=X_train,
        y=y_train,
        cv=kfold,
        verbose=1,
        return_train_score=False,
        sort_results=False,
        dataset_info='Ensemble-Imbalance',
        features_info='All Features',
        n_jobs=8,
    )

    # annotate the results with information about the base estimator
    df_results_aux['Estimator_Desc'] = str(estimator_desc)
    df_results_aux['Estimator_Class'] = str(estimator_class)
    df_results_aux['Estimator_Hyperparams'] = str(estimator_hyperparams)

    results_frames.append(df_results_aux)


# single concatenation; None when nothing was loaded nor executed
df_results = pd.concat(results_frames) if results_frames else None

if df_results is None:
    # nothing to sort/save: avoids passing None to the sort/save helpers
    print('No results to save: no param_grids were executed and no previous results were loaded.')
else:
    print('saving and waiting...')

    # sort performances results and show results
    df_results = utils_exec_models.sort_performances_results(df=df_results)
    display(df_results)

    # save the results
    utils.save_to_csv(df=df_results, csv_file=csv_results_saved)


print()
print('FINISHED !!!')


aaaa
"alpha": "0.1",
Executing ComplementNB... Fitting 5 folds for each of 3 candidates, totalling 15 fits
aaaa
"alpha": "0.5",
Executing ComplementNB... Fitting 5 folds for each of 3 candidates, totalling 15 fits
aaaa
"class_weight": "balanced","max_depth": "5","random_state": "42",
Executing DecisionTreeClassifier... Fitting 5 folds for each of 3 candidates, totalling 15 fits
Executing GaussianNB... Fitting 5 folds for each of 3 candidates, totalling 15 fits
aaaa
"metric": "manhattan","weights": "distance",
Executing KNeighborsClassifier... Fitting 5 folds for each of 3 candidates, totalling 15 fits
aaaa
"alpha": "0.1","hidden_layer_sizes": "23","learning_rate_init": "0.7","max_iter": "300","random_state": "42","solver": "sgd",
Executing MLPClassifier... Fitting 5 folds for each of 3 candidates, totalling 15 fits
aaaa
"leaf_size": "50","metric": "manhattan","outlier_label": "1","radius": "0.3","weights": "distance",
Executing RadiusNeighborsClassifier... Fitting 5 folds for each of 3

Unnamed: 0,Dataset,Features,Model,BalAcc,Sens,Spec,f1,AUC,Acc,Prec,Classifier,Hyperparams,Estimator_Desc,Estimator_Class,Estimator_Hyperparams
0,Ensemble-Imbalance,All Features,Balanced Bagging,0.82,0.81,0.83,0.54,0.89,0.82,0.41,BalancedBaggingClassifier,"{'estimator': SVC(C=0.3, gamma='auto', probabi...",SVM,SVC,"{'C': '0.3', 'gamma': 'auto', 'probability': '..."
1,Ensemble-Imbalance,All Features,Balanced Bagging,0.82,0.81,0.83,0.54,0.89,0.82,0.41,BalancedBaggingClassifier,"{'estimator': SVC(C=0.3, gamma='auto', probabi...",SVM,SVC,"{'C': '0.3', 'gamma': 'auto', 'probability': '..."
2,Ensemble-Imbalance,All Features,Balanced Bagging,0.82,0.81,0.83,0.54,0.89,0.82,0.41,BalancedBaggingClassifier,"{'estimator': SVC(C=0.3, gamma='auto', probabi...",SVM,SVC,"{'C': '0.3', 'gamma': 'auto', 'probability': '..."
0,Ensemble-Imbalance,All Features,Balanced Bagging,0.82,0.79,0.85,0.57,0.9,0.84,0.45,BalancedBaggingClassifier,"{'estimator': SVC(C=0.3, gamma='auto', kernel=...",SVM,SVC,"{'C': '0.3', 'gamma': 'auto', 'kernel': 'linea..."
1,Ensemble-Imbalance,All Features,Balanced Bagging,0.82,0.79,0.85,0.57,0.9,0.84,0.45,BalancedBaggingClassifier,"{'estimator': SVC(C=0.3, gamma='auto', kernel=...",SVM,SVC,"{'C': '0.3', 'gamma': 'auto', 'kernel': 'linea..."
2,Ensemble-Imbalance,All Features,Balanced Bagging,0.82,0.79,0.85,0.57,0.9,0.84,0.45,BalancedBaggingClassifier,"{'estimator': SVC(C=0.3, gamma='auto', kernel=...",SVM,SVC,"{'C': '0.3', 'gamma': 'auto', 'kernel': 'linea..."
0,Ensemble-Imbalance,All Features,Balanced Bagging,0.81,0.78,0.84,0.54,0.89,0.83,0.42,BalancedBaggingClassifier,"{'estimator': ComplementNB(alpha=0.1), 'n_esti...",Naïve Bayes,ComplementNB,{'alpha': '0.1'}
1,Ensemble-Imbalance,All Features,Balanced Bagging,0.81,0.78,0.84,0.54,0.89,0.83,0.42,BalancedBaggingClassifier,"{'estimator': ComplementNB(alpha=0.1), 'n_esti...",Naïve Bayes,ComplementNB,{'alpha': '0.1'}
2,Ensemble-Imbalance,All Features,Balanced Bagging,0.81,0.78,0.84,0.54,0.89,0.83,0.42,BalancedBaggingClassifier,"{'estimator': ComplementNB(alpha=0.1), 'n_esti...",Naïve Bayes,ComplementNB,{'alpha': '0.1'}
0,Ensemble-Imbalance,All Features,Balanced Bagging,0.81,0.78,0.84,0.55,0.9,0.84,0.43,BalancedBaggingClassifier,"{'estimator': SVC(C=0.1, gamma='auto', kernel=...",SVM,SVC,"{'C': '0.1', 'gamma': 'auto', 'kernel': 'linea..."


36 samples were saved

FINISHED !!!
CPU times: user 738 ms, sys: 59.8 ms, total: 797 ms
Wall time: 13.7 s


---
---
---
# OTHERS

### Show other grid properties

In [None]:
# NOTE: `grid` is whatever the training loop bound last, i.e. the GridSearch
# of the last executed param_grid - not the best one across all estimators.

# best cross-validated score found by that search
print('Best Bal.Acc.: {:.2f}'.format(grid.best_score_))

# estimator selected for the 'classifier' entry of the best parameter set
print('        Model: {} '.format(grid.best_params_["classifier"]))

# score of that search on the validation subset
print('Performance using the Validation set:  {:.2f}'.format(grid.score(X_valid, y_valid)))


