In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
import time


#needed to import utils.py
sys.path.append('../') 

import utils
import utils_preprocessing
import utils_exec_models

import numpy as np
import pandas as pd

from IPython.core.display import HTML

from IPython.display import clear_output

%matplotlib inline  

# Get the scaled Training and Validation subsets 

In [2]:
# Load the training and validation subsets, requesting the scaled version
X_train, y_train, X_valid, y_valid = utils.get_train_and_validation_data(scaled=True)

# Preview the first rows of the training inputs
X_train.head()



Unnamed: 0,Sex_Male,Site_Onset,Diagnosis_Delay,Age_at_Onset,Riluzole,FVC_at_Diagnosis,BMI_at_Diagnosis,Q1_Speech_slope_at_Diagnosis,Q2_Salivation_slope_at_Diagnosis,Q3_Swallowing_slope_at_Diagnosis,...,Q7_Turning_in_Bed_slope_at_Diagnosis,Q8_Walking_slope_at_Diagnosis,Q9_Climbing_Stairs_slope_at_Diagnosis,Q10_Respiratory_slope_at_Diagnosis,Qty_Regions_Involved_at_Diagnosis,Region_Involved_Bulbar_at_Diagnosis,Region_Involved_Upper_Limb_at_Diagnosis,Region_Involved_Lower_Limb_at_Diagnosis,Region_Involved_Respiratory_at_Diagnosis,Patient_with_Gastrostomy_at_Diagnosis
0,1.0,1.0,0.0,0.5,0.0,1.0,0.67,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.67,1.0,1.0,1.0,0.0,0.0
1,1.0,1.0,0.5,0.75,0.0,0.0,0.67,0.0,0.0,0.0,...,0.0,0.0,0.5,0.0,0.67,1.0,1.0,1.0,0.0,0.0
2,1.0,1.0,0.0,0.25,0.0,1.0,0.33,0.5,0.0,0.0,...,0.0,0.0,0.5,0.0,0.67,1.0,1.0,1.0,0.0,0.0
3,0.0,0.0,0.5,0.5,0.0,1.0,1.0,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.67,1.0,0.0,1.0,1.0,0.0
4,1.0,0.0,1.0,0.75,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.5,0.0,1.0,1.0,1.0,1.0,1.0,0.0


# Train the models with GridSearch, using the inputs and outputs created in the previous step

In [6]:
%%time

# --- Run configuration -------------------------------------------------
# True  -> ignore any CSV saved by a previous run
# False -> load the previously saved CSV; new results are appended to it
overwrite_results_saved_previously = True

# Forwarded as `testing=` to every grid builder; switch to False for a
# non-testing run
TESTING = True

# Cross-validation splits, later handed to the grid search as `cv`
kfold = utils_exec_models.get_kfold_splits()


#=======================================================================
# SINGLE-MODEL SCENARIO
#=======================================================================
# Reuse the CSV from an earlier run only when overwriting is disabled
csv_results_saved = os.path.abspath('exec_results/results_Single_Model.csv')
df_results = None
if not overwrite_results_saved_previously and os.path.exists(csv_results_saved):
    print('Reading results saved previously...')
    df_results = utils.read_csv(csv_file=csv_results_saved)

## Collect the models and hyperparameters for the GridSearch.
## Every builder below receives this same list (presumably filling it in
## place -- confirm in utils_exec_models).
param_grid = []

# Builders that share the (param_grid, testing=...) call shape
grid_builders = (
    utils_exec_models.create_models_SVM_grid,
    utils_exec_models.create_models_NB_grid,
    utils_exec_models.create_models_DT_grid,
    utils_exec_models.create_models_kNN_grid,
    utils_exec_models.create_models_RF_grid,
)
for build_grid in grid_builders:
    build_grid(param_grid, testing=TESTING)

# The neural-network builder additionally takes the number of input features
utils_exec_models.create_models_NN_grid(
    qty_features=X_train.shape[1],
    param_grid=param_grid,
    testing=TESTING,
)


#=======================================================================
# ENSEMBLE-IMBALANCE SCENARIO
#=======================================================================
# Reuse the CSV from an earlier run only when overwriting is disabled
csv_results_saved_Ens_Imb = os.path.abspath('exec_results/results_Ensemble_Imbalance.csv')
df_results_Ens_Imb = None
if not overwrite_results_saved_previously and os.path.exists(csv_results_saved_Ens_Imb):
    print('Reading results saved previously...')
    df_results_Ens_Imb = utils.read_csv(csv_file=csv_results_saved_Ens_Imb)
    
    
#display(param_grid)

# Execute the grid searches for both scenarios.
#
# `param_grid` holds the single-model grids at this point; nothing is
# executed (for either scenario) when it is empty.
#
# Results are accumulated per scenario:
#   * Single-Model       -> df_results          (saved by the code below this block)
#   * Ensemble-Imbalance -> df_results_Ens_Imb  (saved here, to csv_results_saved_Ens_Imb)
if len(param_grid) > 0:

    #=======================================================================
    # SINGLE-MODEL SCENARIO
    #=======================================================================
    # One grid search over every single-model entry
    grid, df_results_aux = utils_exec_models.exec_grid_search(
        param_grid=param_grid,
        X=X_train,
        y=y_train,
        cv=kfold,
        verbose=1,
        return_train_score=False,
        sort_results=False,
        dataset_info='Single-Model',
        features_info='All Features',
        n_jobs=8,
    )

    # Append to the results loaded from disk, if there are any
    if df_results is None:
        df_results = df_results_aux
    else:
        df_results = pd.concat([df_results, df_results_aux])


    #=======================================================================
    # ENSEMBLE-IMBALANCE SCENARIO
    #=======================================================================
    # NOTE(review): `classifiers` receives the list that was just emptied, so
    # the builder gets no base estimators from this cell -- confirm what
    # create_models_BalancedBagging_grid expects and pass it here.
    param_grid = []

    _ = utils_exec_models.create_models_BalancedBagging_grid(
        classifiers=param_grid,
        param_grid=param_grid,
        testing=TESTING,
    )

    # Store the param_grid's that will be executed. An empty grid is skipped
    # because the loop below indexes its first entry.
    param_grids = []
    if len(param_grid) > 0:
        param_grids.append(param_grid)

    tot = len(param_grids)

    for i, p_grid in enumerate(param_grids, start=1):

        # Base estimator wrapped by the ensemble in this grid
        estimator_class = p_grid[0]['classifier__estimator'][0]

        # Text before the first '(' of the estimator's string form
        estimator_name = str(estimator_class).split('(')[0]

        estimator_desc = utils.get_model_description(estimator_name)

        estimator_params = estimator_class.get_params()
        estimator_params = utils_exec_models.convert_hyperparams_to_dict(estimator_params)

        print(f'({i}/{tot}) Executing {estimator_name}...', end=' ')

        ## execute GridSearch
        grid, df_results_aux = utils_exec_models.exec_grid_search(
            param_grid=p_grid,
            X=X_train,
            y=y_train,
            cv=kfold,
            verbose=1,
            return_train_score=False,
            sort_results=False,
            dataset_info='Ensemble-Imbalance',
            features_info='All Features',
            n_jobs=8,
        )

        # Record which base estimator produced these rows
        df_results_aux['Estimator_Desc'] = str(estimator_desc)
        df_results_aux['Estimator_Class'] = str(estimator_name)
        df_results_aux['Estimator_Hyperparams'] = str(estimator_params)

        # Keep the ensemble results apart from the single-model ones so the
        # Single-Model table/CSV is not mixed with rows of another scenario
        if df_results_Ens_Imb is None:
            df_results_Ens_Imb = df_results_aux
        else:
            df_results_Ens_Imb = pd.concat([df_results_Ens_Imb, df_results_aux])

    # Drop the verbose grid-search output before showing the final results
    clear_output()

    time.sleep(2)

    if tot > 0:
        # sort, show and save the Ensemble-Imbalance results
        df_results_Ens_Imb = utils_exec_models.sort_performances_results(df=df_results_Ens_Imb)
        display(df_results_Ens_Imb)
        utils.save_to_csv(df=df_results_Ens_Imb, csv_file=csv_results_saved_Ens_Imb)
    else:
        print('No Ensemble-Imbalance grid was created; scenario skipped.')

    
# End-of-run banner: an empty line followed by the message
print('', 'FINISHED !!!', sep='\n')


# Pass the accumulated results through sort_performances_results, then render them
df_results = utils_exec_models.sort_performances_results(df=df_results)
display(df_results)

# Write the sorted results to the Single-Model CSV
utils.save_to_csv(df=df_results, csv_file=csv_results_saved)



FINISHED !!!


Unnamed: 0,Dataset,Features,Model,BalAcc,Sens,Spec,f1,AUC,Acc,Prec,Classifier,Hyperparams
21,Single-Model,All Features,SVM,0.83,0.82,0.84,0.57,0.91,0.84,0.44,SVC,"{'C': 0.5, 'class_weight': 'balanced', 'gamma'..."
23,Single-Model,All Features,SVM,0.83,0.82,0.84,0.57,0.91,0.84,0.44,SVC,"{'C': 0.5, 'class_weight': 'balanced', 'gamma'..."
29,Single-Model,All Features,SVM,0.83,0.82,0.84,0.57,0.91,0.84,0.44,SVC,"{'C': 0.7, 'class_weight': 'balanced', 'gamma'..."
31,Single-Model,All Features,SVM,0.83,0.82,0.84,0.57,0.91,0.84,0.44,SVC,"{'C': 0.7, 'class_weight': 'balanced', 'gamma'..."
37,Single-Model,All Features,SVM,0.83,0.82,0.84,0.58,0.91,0.84,0.45,SVC,"{'C': 1, 'class_weight': 'balanced', 'gamma': ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
710,Single-Model,All Features,Neural Networks,0.50,0.00,1.00,0.00,0.50,0.87,0.00,MLPClassifier,"{'activation': 'relu', 'alpha': 0.5, 'hidden_l..."
712,Single-Model,All Features,Neural Networks,0.50,0.00,1.00,0.00,0.50,0.87,0.00,MLPClassifier,"{'activation': 'relu', 'alpha': 0.5, 'hidden_l..."
714,Single-Model,All Features,Neural Networks,0.50,0.00,1.00,0.00,0.50,0.87,0.00,MLPClassifier,"{'activation': 'relu', 'alpha': 0.5, 'hidden_l..."
716,Single-Model,All Features,Neural Networks,0.50,0.00,1.00,0.00,0.50,0.87,0.00,MLPClassifier,"{'activation': 'relu', 'alpha': 0.5, 'hidden_l..."


719 samples were saved
CPU times: user 3.08 s, sys: 203 ms, total: 3.29 s
Wall time: 3min 39s


---
---
---
# OTHERS

## Test: create a classifier from the model + hyperparameters stored in the results

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import ComplementNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.svm import SVC

# Rebuild every (Classifier, Hyperparams) pair stored in the results table,
# fit it on the training subset and print its predictions for the
# validation subset.
model_specs = list(zip(df_results.Classifier, df_results.Hyperparams))

# The target vector is the same for every model, so extract it once.
# `.to_numpy()` replaces `Series.ravel()`, which is deprecated since pandas 2.2.
y_train_values = y_train[utils.CLASS_COLUMN].to_numpy()

for classifier_name, hyperparams in model_specs:

    model = utils_exec_models.create_model_from_string(
        model=classifier_name,
        hyperparams=hyperparams,
    )

    model.fit(X_train, y_train_values)

    y_pred = model.predict(X_valid)

    print(model)
    print(y_pred)
    print()
    

ComplementNB(alpha=0.5)
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0.
 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0.
 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1.
 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1.
 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1.
 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1.
 0. 1. 0. 0. 0. 0. 0. 0. 0.

SVC(C=0.1, gamma='auto', probability=True, random_state=42)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 

In [None]:
# Display the results table; `df_results` must already exist in the kernel
df_results

### Show other grid properties

In [None]:
# Summarise the most recently executed grid search: best score, the
# classifier selected for it, and the score on the validation subset.
best_score = grid.best_score_
best_classifier = grid.best_params_["classifier"]
validation_score = grid.score(X_valid, y_valid)

print(f'Best Bal.Acc.: {best_score:.2f}')
print(f'        Model: {best_classifier} ') 
print(f'Performance using the Validation set:  {validation_score:.2f}')


