In [5]:
%load_ext autoreload
%autoreload 2

import sys
import os
import time
import gc


#needed to import utils.py
sys.path.append('../') 

import utils
import utils_preprocessing
import utils_exec_models

import numpy as np
import pandas as pd

from IPython.core.display import HTML

from IPython.display import clear_output

%matplotlib inline  

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


---
# $Ensemble$-$Imbalance$ scenario
---

# Get the scaled Training and Validation subsets 

In [2]:
# Load the scaled training and validation subsets through the project helper.
# The X_* / y_* pairs are used below as the GridSearch inputs (X_train, y_train)
# and for the final validation score (X_valid, y_valid).
X_train, y_train, X_valid, y_valid = utils.get_train_and_validation_data(scaled=True)

# Preview the first rows of the training features.
X_train.head()



Unnamed: 0,Sex_Male,Site_Onset,Diagnosis_Delay,Age_at_Onset,Riluzole,FVC_at_Diagnosis,BMI_at_Diagnosis,Q1_Speech_slope_at_Diagnosis,Q2_Salivation_slope_at_Diagnosis,Q3_Swallowing_slope_at_Diagnosis,...,Q7_Turning_in_Bed_slope_at_Diagnosis,Q8_Walking_slope_at_Diagnosis,Q9_Climbing_Stairs_slope_at_Diagnosis,Q10_Respiratory_slope_at_Diagnosis,Qty_Regions_Involved_at_Diagnosis,Region_Involved_Bulbar_at_Diagnosis,Region_Involved_Upper_Limb_at_Diagnosis,Region_Involved_Lower_Limb_at_Diagnosis,Region_Involved_Respiratory_at_Diagnosis,Patient_with_Gastrostomy_at_Diagnosis
0,1.0,1.0,0.0,0.5,0.0,1.0,0.67,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.67,1.0,1.0,1.0,0.0,0.0
1,1.0,1.0,0.5,0.75,0.0,0.0,0.67,0.0,0.0,0.0,...,0.0,0.0,0.5,0.0,0.67,1.0,1.0,1.0,0.0,0.0
2,1.0,1.0,0.0,0.25,0.0,1.0,0.33,0.5,0.0,0.0,...,0.0,0.0,0.5,0.0,0.67,1.0,1.0,1.0,0.0,0.0
3,0.0,0.0,0.5,0.5,0.0,1.0,1.0,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.67,1.0,0.0,1.0,1.0,0.0
4,1.0,0.0,1.0,0.75,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.5,0.0,1.0,1.0,1.0,1.0,1.0,0.0


---
# Read the Single-Model results file and get all unique $Model+Hyperparameters$
### NOTE: DO NOT re-execute all models, see `classif_filtered` variable

In [3]:
%%time

# Load every distinct model (+ hyperparameters) recorded in the Single-Model results CSV.
dir_dest = os.path.abspath('exec_results/')
results_csv_file = f'{dir_dest}/results_Single_Model.csv'

classifiers = utils_exec_models.get_models_set_from_results(
    results_csv_file=results_csv_file,
)

# Model classes to run in this session. Only the uncommented names are kept.
# Entries tagged "OK" were marked that way by the author (presumably already
# executed in an earlier session -- confirm before re-enabling them).
model_classes = [
    'SVC',
    # 'GaussianNB',                  # OK
    # 'ComplementNB',                # OK
    # 'MLPClassifier',               # OK
    # 'DecisionTreeClassifier',
    # 'RadiusNeighborsClassifier',   # OK
    # 'KNeighborsClassifier',        # OK
    # 'RandomForestClassifier',
]

# The class name is whatever precedes the first "(" in the model's string form,
# e.g. "SVC(C=0.1, ...)" -> "SVC".
classif_filtered = [
    candidate
    for candidate in classifiers
    if str(candidate).split('(')[0] in model_classes
]

# From here on, `classifiers` holds only the selected models.
classifiers = list(classif_filtered)

utils.print_array_as_list(classifiers)


[
    'SVC(C=0.1, class_weight='balanced', gamma='auto', kernel='linear',
    probability=True, random_state=42)',
    'SVC(C=0.1, class_weight='balanced', gamma='auto', probability=True,
    random_state=42)',
    'SVC(C=0.1, class_weight='balanced', kernel='linear', probability=True,
    random_state=42)',
    'SVC(C=0.1, class_weight='balanced', probability=True, random_state=42)',
    'SVC(C=0.1, gamma='auto', kernel='linear', probability=True, random_state=42)',
    'SVC(C=0.1, gamma='auto', probability=True, random_state=42)',
    'SVC(C=0.1, kernel='linear', probability=True, random_state=42)',
    'SVC(C=0.1, probability=True, random_state=42)',
    'SVC(C=0.3, class_weight='balanced', gamma='auto', kernel='linear',
    probability=True, random_state=42)',
    'SVC(C=0.3, class_weight='balanced', gamma='auto', probability=True,
    random_state=42)',
    'SVC(C=0.3, class_weight='balanced', kernel='linear', probability=True,
    random_state=42)',
    'SVC(C=0.3, class_weight='

# Train the models with GridSearch, using the inputs and outputs created in the previous step

### Store the `param_grid`s that will be executed

In [4]:
%%time

## Build one GridSearch param_grid per selected classifier.

# Switch to True for a quick smoke run: the flag is forwarded to the grid
# builder and the loop below stops after 5 grids.
TESTING = False

param_grids = []

for base_model in classifiers:

    # The helper receives an empty list and populates it; its return value is
    # not needed here, so it is discarded.
    model_grid = []
    _ = utils_exec_models.create_models_BalancedBagging_grid(
        classifiers=[base_model],
        param_grid=model_grid,
        testing=TESTING,
    )

    # keep the grid so the training cell can execute it later
    param_grids.append(model_grid)

    if TESTING and len(param_grids) >= 5:
        break


print(f'A total of {len(param_grids)} param_grids will be executed')
print()

utils.print_array_as_list(param_grids)

A total of 112 param_grids will be executed

[
    '[{'classifier__estimator': [SVC(C=0.1, class_weight='balanced', gamma='auto', kernel='linear',
    probability=True, random_state=42)], 'classifier__n_estimators': [11, 15, 51, 75, 101, 201, 301], 'classifier__sampling_strategy': ['all', 'majority', 'auto'], 'classifier__warm_start': [False, True], 'classifier__random_state': [42], 'classifier': [BalancedBaggingClassifier()]}]',
    '[{'classifier__estimator': [SVC(C=0.1, class_weight='balanced', gamma='auto', probability=True,
    random_state=42)], 'classifier__n_estimators': [11, 15, 51, 75, 101, 201, 301], 'classifier__sampling_strategy': ['all', 'majority', 'auto'], 'classifier__warm_start': [False, True], 'classifier__random_state': [42], 'classifier': [BalancedBaggingClassifier()]}]',
    '[{'classifier__estimator': [SVC(C=0.1, class_weight='balanced', kernel='linear', probability=True,
    random_state=42)], 'classifier__n_estimators': [11, 15, 51, 75, 101, 201, 301], 'classif

### Train the models added to the `param_grid`s

In [12]:
%%time

# Run the GridSearch for every stored param_grid, skipping the ones whose base
# estimator (description + class + hyperparameters) is already present in the
# results CSV, and persist the accumulated results after each executed grid.

csv_results_saved = os.path.abspath('exec_results/results_Ensemble_Imbalance.csv')


# Set to True to ignore the results saved previously and start from scratch.
overwrite_results_saved_previously = False

if os.path.exists(csv_results_saved) and not overwrite_results_saved_previously:
    print('Reading results saved previously...')
    df_results = utils.read_csv(csv_file=csv_results_saved)
else:
    df_results = None


kfold = utils_exec_models.get_kfold_splits()

tot = len(param_grids)

# True when the grid handled in the previous iteration was skipped because its
# results were already saved (i.e. nothing was fitted for it).
was_executed = False

for i, p_grid in enumerate(param_grids, start=1):

    # After a grid that was really fitted (never before the first one), pause
    # briefly so the last progress message can be read, then clear the output.
    if i > 1 and not was_executed:
        time.sleep(1 if TESTING else 5)
        clear_output()

    was_executed = False

    # Each param_grid wraps exactly one base estimator (see how the grids are built).
    estimator_class = p_grid[0]['classifier__estimator'][0]

    estimator_name = str(estimator_class).split('(')[0]

    estimator_desc = utils.get_model_description(estimator_name)

    estimator_params = estimator_class.get_params()
    estimator_params = utils_exec_models.convert_hyperparams_to_dict(estimator_params)

    # Skip the grid when a saved row already matches this estimator.
    if df_results is not None:
        already_saved = (
            (df_results.Estimator_Desc == str(estimator_desc))
            & (df_results.Estimator_Class == str(estimator_name))
            & (df_results.Estimator_Hyperparams == str(estimator_params))
        ).any()

        if already_saved:
            print(f'{i:>3} was already executed')
            was_executed = True
            continue

    print(f'({i}/{tot}) Executing {estimator_name}...', end=' ')

    ## execute GridSearch
    grid, df_results_aux = utils_exec_models.exec_grid_search(
        param_grid=p_grid,
        X=X_train,
        y=y_train,
        cv=kfold,
        verbose=1,
        return_train_score=False,
        sort_results=False,
        dataset_info='Ensemble-Imbalance',
        features_info='All Features',
        n_jobs=12,
    )

    # Tag the new rows with the base estimator so the skip check above can
    # recognise them on a later run.
    df_results_aux['Estimator_Desc'] = str(estimator_desc)
    df_results_aux['Estimator_Class'] = str(estimator_name)
    df_results_aux['Estimator_Hyperparams'] = str(estimator_params)

    # NOTE(review): concat keeps each frame's own index labels; confirm that
    # remove_rows / sort_performances_results cope with duplicated labels.
    if df_results is None:
        df_results = df_results_aux
    else:
        df_results = pd.concat([df_results, df_results_aux])

    # delete results with NaN in the Balanced Accuracy
    to_delete = df_results.loc[(df_results.BalAcc.isnull())]
    df_results = utils.remove_rows(df=df_results, to_delete=to_delete)

    print('saving and waiting...')

    # sort the performance results before saving
    df_results = utils_exec_models.sort_performances_results(df=df_results)

    # save after every executed grid so an interrupted run can be resumed
    utils.save_to_csv(df=df_results, csv_file=csv_results_saved)

    gc.collect()


# df_results is still None when no CSV was loaded and no grid was executed;
# calling .head() on it would raise AttributeError.
if df_results is not None:
    display(df_results.head(10))
else:
    print('No results to show: nothing was saved previously and no param_grid was executed.')


print()
print('FINISHED !!!')



(112/112) Executing SVC... Fitting 5 folds for each of 42 candidates, totalling 210 fits
  -  Previous=24613, To delete=0, After=24613
saving and waiting...
24613 samples were saved


Unnamed: 0,Dataset,Features,Model,BalAcc,Sens,Spec,f1,AUC,Acc,Prec,Classifier,Hyperparams,Estimator_Desc,Estimator_Class,Estimator_Hyperparams
0,Ensemble-Imbalance,All Features,Balanced Bagging,0.84,0.85,0.83,0.57,0.91,0.83,0.43,BalancedBaggingClassifier,"{'n_estimators': 51, 'random_state': 42, 'samp...",Neural Networks,MLPClassifier,"{'activation': 'relu', 'alpha': 0.1, 'batch_si..."
1,Ensemble-Imbalance,All Features,Balanced Bagging,0.84,0.85,0.83,0.57,0.91,0.83,0.43,BalancedBaggingClassifier,"{'n_estimators': 51, 'random_state': 42, 'samp...",Neural Networks,MLPClassifier,"{'activation': 'relu', 'alpha': 0.1, 'batch_si..."
2,Ensemble-Imbalance,All Features,Balanced Bagging,0.84,0.84,0.84,0.58,0.91,0.84,0.45,BalancedBaggingClassifier,"{'n_estimators': 75, 'random_state': 42, 'samp...",Neural Networks,MLPClassifier,"{'activation': 'relu', 'alpha': 0.1, 'batch_si..."
3,Ensemble-Imbalance,All Features,Balanced Bagging,0.84,0.84,0.84,0.58,0.91,0.84,0.45,BalancedBaggingClassifier,"{'n_estimators': 75, 'random_state': 42, 'samp...",Neural Networks,MLPClassifier,"{'activation': 'relu', 'alpha': 0.1, 'batch_si..."
4,Ensemble-Imbalance,All Features,Balanced Bagging,0.83,0.85,0.81,0.54,0.91,0.81,0.4,BalancedBaggingClassifier,"{'n_estimators': 101, 'random_state': 42, 'sam...",Neural Networks,MLPClassifier,"{'activation': 'relu', 'alpha': 0.0001, 'batch..."
5,Ensemble-Imbalance,All Features,Balanced Bagging,0.83,0.85,0.81,0.54,0.91,0.81,0.4,BalancedBaggingClassifier,"{'n_estimators': 101, 'random_state': 42, 'sam...",Neural Networks,MLPClassifier,"{'activation': 'relu', 'alpha': 0.0001, 'batch..."
6,Ensemble-Imbalance,All Features,Balanced Bagging,0.83,0.85,0.81,0.54,0.91,0.81,0.4,BalancedBaggingClassifier,"{'n_estimators': 101, 'random_state': 42, 'sam...",Neural Networks,MLPClassifier,"{'activation': 'relu', 'alpha': 0.0001, 'batch..."
7,Ensemble-Imbalance,All Features,Balanced Bagging,0.83,0.85,0.81,0.54,0.91,0.81,0.4,BalancedBaggingClassifier,"{'n_estimators': 101, 'random_state': 42, 'sam...",Neural Networks,MLPClassifier,"{'activation': 'relu', 'alpha': 0.0001, 'batch..."
8,Ensemble-Imbalance,All Features,Balanced Bagging,0.83,0.85,0.81,0.54,0.91,0.81,0.4,BalancedBaggingClassifier,"{'n_estimators': 101, 'random_state': 42, 'sam...",Neural Networks,MLPClassifier,"{'activation': 'relu', 'alpha': 0.0001, 'batch..."
9,Ensemble-Imbalance,All Features,Balanced Bagging,0.83,0.85,0.81,0.54,0.91,0.81,0.4,BalancedBaggingClassifier,"{'n_estimators': 101, 'random_state': 42, 'sam...",Neural Networks,MLPClassifier,"{'activation': 'relu', 'alpha': 0.0001, 'batch..."



FINISHED !!!
CPU times: user 1h 40min 47s, sys: 2.66 s, total: 1h 40min 50s
Wall time: 22h 24min 38s


---
---
---
# OTHERS

## Test: create a classifier using the model + hyperparams from the results

In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import ComplementNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.svm import SVC

from imblearn.ensemble import BalancedBaggingClassifier

# Collect, for every result row, the four string fields needed to rebuild the
# model: ensemble class, ensemble hyperparams, base estimator class and base
# estimator hyperparams.
dd = [
    [rec.Classifier, rec.Hyperparams, rec.Estimator_Class, rec.Estimator_Hyperparams]
    for rec in df_results.itertuples(index=False)
]


# Rebuild each model from its textual description and print it.
# The models are only instantiated here; they are not fitted or used to predict.
for clf_name, clf_params, base_name, base_params in dd:

    model = utils_exec_models.create_model_from_string(
        model=clf_name,
        hyperparams=clf_params,
        estimator_model=base_name,
        estimator_hyperparams=base_params
    )

    print(model)
    print()
    

BalancedBaggingClassifier(estimator=MLPClassifier(hidden_layer_sizes=(23, 23),
                                                  learning_rate_init=0.7,
                                                  max_iter=1000,
                                                  random_state=42,
                                                  solver='sgd'),
                          n_estimators=3, random_state=42,
                          sampling_strategy='all')

BalancedBaggingClassifier(estimator=MLPClassifier(hidden_layer_sizes=(23, 23),
                                                  learning_rate='adaptive',
                                                  learning_rate_init=0.7,
                                                  max_iter=1000,
                                                  random_state=42,
                                                  solver='sgd'),
                          n_estimators=3, random_state=42,
                          sampling_strategy='majority'

### Show other grid properties

In [None]:
# `grid` is only bound by the training loop when at least one param_grid is
# actually fitted in the current kernel session. When every grid is skipped as
# "already executed", the name does not exist, so look it up defensively
# instead of failing with a NameError.
fitted_grid = globals().get('grid')

if fitted_grid is None:
    print('No fitted GridSearch in this session (all param_grids were skipped or none was run).')
else:
    print(f'Best Bal.Acc.: {fitted_grid.best_score_:.2f}')
    print(f'        Model: {fitted_grid.best_params_["classifier"]} ')
    print(f'Performance using the Validation set:  {fitted_grid.score(X_valid, y_valid):.2f}')


