In [17]:
#this is to auto-reload modules (like utils.py)
%load_ext autoreload
%autoreload 2

from IPython.core.display import display, HTML
from IPython.display import clear_output

import os
import sys
import time
import gc
import json
import glob
import ast

import shutil

import utils

import numpy as np
import pandas as pd
# Pandas display options: show every column, keep wide frames on a single
# line instead of wrapping them, and never truncate cell contents (needed to
# read the long hyperparameter strings in the results tables).
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)


import matplotlib.pyplot as plt
from matplotlib.colors import BoundaryNorm, ListedColormap

import seaborn as sns
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old aliases. Pick whichever name this
# installation provides; fall back to the original name so that a missing
# style still fails loudly instead of silently using another theme.
WHITEGRID_STYLE = next(
    (
        style_name
        for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
        if style_name in plt.style.available
    ),
    'seaborn-whitegrid',
)
plt.style.use(WHITEGRID_STYLE)

import plotly as ply
import plotly.express as px

from itertools import permutations, combinations

from tabulate import tabulate

import pickle
from joblib import dump, load


import sklearn
from sklearn.model_selection import StratifiedKFold

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import ComplementNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
# import warnings
# warnings.simplefilter(action='ignore', category=FutureWarning)
# warnings.filterwarnings('always')

# Suppress warnings:
# ``warnings.warn`` is replaced by the no-op below, so any code that calls
# ``warnings.warn(...)`` through the module attribute emits nothing.
# NOTE(review): this is a global monkeypatch and also hides warnings that may
# matter (e.g. convergence warnings) — confirm this is intended.
def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``; accepts and ignores any arguments."""
    pass
import warnings
warnings.warn = warn


# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)


# Font sizes shared by every figure in this notebook.
SMALL_SIZE = 12
MEDIUM_SIZE = 14
BIGGER_SIZE = 16

# Apply all size settings in one update of matplotlib's rcParams.
plt.rcParams.update({
    'font.size': SMALL_SIZE,         # default text size
    'axes.titlesize': BIGGER_SIZE,   # axes title
    'axes.labelsize': SMALL_SIZE,    # x and y axis labels
    'xtick.labelsize': MEDIUM_SIZE,  # x tick labels
    'ytick.labelsize': MEDIUM_SIZE,  # y tick labels
    'legend.fontsize': MEDIUM_SIZE,  # legend entries
    'figure.titlesize': BIGGER_SIZE, # figure title
})

%matplotlib inline  



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Read the performances obtained by GridSearch


In [18]:
# Read the GridSearch performance table.
csv_file = 'exec_results/results.csv'
df_results = utils.read_csv(csv_file)

# Keep only the runs on the imbalanced dataset that used all features.
is_selected = (
    (df_results.Dataset == 'Imbalanced')
    & (df_results.Features == 'All Features')
)

# One row per model, ordered by balanced accuracy, then sensitivity, then
# specificity (best first).
# NOTE(review): GroupBy.first() takes the first non-null value of each column
# within a group, so this yields the best run per model only if results.csv
# is already ordered best-first per model — confirm against utils / the
# GridSearch notebook.
df_best = (
    df_results.loc[is_selected]
    .groupby(by=['Model'])
    .first()
    .reset_index(drop=False)
    .sort_values(by=['BalAcc', 'Sens', 'Spec'], ascending=False)
    .reset_index(drop=True)
)

# Map each model name through utils.get_model_short_description.
df_best['Model'] = df_best.Model.apply(utils.get_model_short_description)

display(df_best)


Unnamed: 0,Model,Dataset,Features,BalAcc,Sens,Spec,f1,AUC,Acc,Prec,Classifier,Hyperparams
0,SVM,Imbalanced,All Features,0.83,0.82,0.85,0.57,0.91,0.84,0.44,SVC,"{'probability': True, 'C': 3, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf', 'random_state': 42}"
1,NB,Imbalanced,All Features,0.79,0.78,0.79,0.49,0.84,0.79,0.36,ComplementNB,"{'alpha': 3.5, 'norm': False}"
2,DT,Imbalanced,All Features,0.78,0.83,0.74,0.46,0.85,0.75,0.32,DecisionTreeClassifier,"{'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 4, 'random_state': 42}"
3,NN,Imbalanced,All Features,0.77,0.61,0.94,0.59,0.91,0.9,0.6,MLPClassifier,"{'activation': 'relu', 'alpha': 0.3, 'hidden_layer_sizes': (23,), 'learning_rate': 'constant', 'learning_rate_init': 0.7, 'max_iter': 1000, 'random_state': 42, 'solver': 'sgd'}"
4,k-NN,Imbalanced,All Features,0.74,0.8,0.68,0.41,0.75,0.7,0.27,RadiusNeighborsClassifier,"{'leaf_size': 50, 'metric': 'manhattan', 'outlier_label': 1, 'radius': 1.0, 'weights': 'uniform'}"
5,XGBoost,Imbalanced,All Features,0.72,0.49,0.95,0.53,0.88,0.89,0.59,XGBClassifier,"{'eval_metric': 'mlogloss', 'learning_rate': 0.5, 'max_depth': 25, 'n_estimators': 100, 'random_state': 42, 'use_label_encoder': False}"
6,RF,Imbalanced,All Features,0.7,0.44,0.96,0.52,0.88,0.89,0.64,RandomForestClassifier,"{'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 100, 'random_state': 42}"
7,CatBoost,Imbalanced,All Features,0.68,0.38,0.98,0.49,0.9,0.9,0.72,CatBoostClassifier,"{'depth': 10, 'gpu_ram_part': 0.8, 'iterations': 100, 'l2_leaf_reg': 3, 'learning_rate': 0.03, 'logging_level': 'Silent', 'max_bin': 32, 'random_state': 42}"


### Get Training, Validation, Balanced sets 

##### NOTE: `training_sets` contains the training and validation sets used to run the models. Each entry holds the following information:
 - `Dataset Info `  ["Training", "Training Balanced"]
 - `Features Info`  ["All Features", "Feature Selection"]
 - `"X" to train`   
 - `"y" to train` 
 - `"X" to validation`


In [19]:
# Unpack the ten values returned by utils.get_train_and_validation_data.
# ``training_sets`` is returned because return_training_sets=True is passed.
# NOTE(review): the ``_fs`` suffix presumably means "feature selection" and
# ``_balanced`` the balanced training data — confirm in utils.
(
    X_train, X_train_fs, y_train,
    X_train_balanced, X_train_fs_balanced, y_train_balanced,
    X_valid, X_valid_fs, y_valid,
    training_sets,
) = utils.get_train_and_validation_data(return_training_sets=True)


#### Auxiliary function to get X and y according to the datasets (training, balanced...)

In [20]:
def get_X_y(dset_info, feat_info, sets=None):
    """Return the training and validation data registered for a dataset/feature pair.

    Parameters
    ----------
    dset_info : str
        Dataset label to look for (first element of each entry).
    feat_info : str
        Features label to look for (second element of each entry).
    sets : iterable of 5-tuples, optional
        Entries shaped ``(dataset_info, features_info, X, y, X_valid)``.
        Defaults to the notebook-level ``training_sets``.

    Returns
    -------
    tuple
        ``(X, y, X_valid)`` of the matching entry, or ``(None, None, None)``
        when no entry matches. If several entries match, the last one wins.
    """
    if sets is None:
        # Fall back to the notebook-level list built when the data was loaded.
        sets = training_sets

    match = (None, None, None)
    for dataset_info, features_info, X, y, X_valid in sets:
        if dataset_info == dset_info and features_info == feat_info:
            # Deliberately no early exit: a later duplicate overrides an earlier one.
            match = (X, y, X_valid)

    return match
        

### Get the best models and create instances using their hyperparameters

In [21]:
# Metric columns copied from the GridSearch table for every model.
METRIC_COLS = ['BalAcc', 'Sens', 'Spec', 'f1', 'AUC', 'Acc', 'Prec']
# Upper bound on the rows kept per (Model, Dataset, Features) group.
MAX_ROWS_PER_GROUP = 20

# Keep at most MAX_ROWS_PER_GROUP rows per group, ordered by balanced
# accuracy, then sensitivity, then specificity (best first).
df_best_by_model = df_best.groupby(
    by=['Model', 'Dataset', 'Features']
).head(MAX_ROWS_PER_GROUP).reset_index(drop=True).sort_values(
    by=['BalAcc', 'Sens', 'Spec'],
    ascending=False,
).reset_index(drop=True)

# best_models: [model, dataset, features, classifier instance, hyperparameter
# string, grid-search metrics] per row; consumed by the re-execution loop.
best_models = []
# One plain dict per row; turned into a DataFrame once, after the loop.
best_models_records = []

for _row_idx, row in df_best_by_model.iterrows():

    model = row.Model
    dataset = row.Dataset
    features = row.Features
    classif = row.Classifier
    params = row.Hyperparams

    performance_gridsearch = {metric: row[metric] for metric in METRIC_COLS}

    # Hyperparams is stored as the text of a Python dict literal.
    # Fail immediately on a malformed value: continuing would either crash on
    # an undefined name or silently reuse the previous model's parameters.
    try:
        params_dict = ast.literal_eval(params)
    except Exception as ex:
        raise ValueError(
            f'Could not parse the hyperparameters of {model}: {params!r}'
        ) from ex

    # Resolve the classifier class by name among the classes imported in this
    # notebook (raises KeyError if the class was not imported).
    klass = globals()[classif]
    clf = klass(**params_dict)

    best_models.append([model, dataset, features, clf, params, performance_gridsearch])

    best_models_records.append({
        'Model': model,
        'Dataset': dataset,
        'Features': features,
        **performance_gridsearch,
        'Classifier': classif,
        'Hyperparams': params,
    })

# Build the frame once instead of appending row by row (DataFrame.append is
# quadratic and no longer exists in pandas >= 2.0).
df_best_models = pd.DataFrame(best_models_records)

print(f'Qty. of Models: {len(best_models)}')

csv_file = os.path.abspath('exec_results/best_results_by_model.csv')
utils.save_to_csv(df=df_best_models, csv_file=csv_file)

Qty. of Models: 8
Example:
8 samples were saved


---
---
## Re-execute each best model as follows:

 - ### $Create$ a model instance using the best hyperparameters
 - ### $Fit$ the model using the Training set
 - ### $Validate$ the model using the Validation set 
 
 
<img src="slides/figures/grid_search_workflow.png" width="45%" >

[scikit-learn cross-validation documentation](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation)

In [25]:
%%time


models_results = []

tot = len(best_models)

## For each best model:
#    1. Fit using the Training data
#    2. Validate using the Validation data

for i, (model_desc, dataset, features, clf, params, performance_grid) in enumerate(
    best_models, start=1
):

    # get the classifier name (without the parameters)
    classifier_name = clf.__class__.__name__

    print(f'{i}/{tot}: Executing {model_desc} using "{dataset}" with "{features}"',
          end='\r'
    )

    # Get the X, y and validation X matching this dataset/feature combination.
    # Loop-local names are used on purpose so the notebook-level X_valid
    # is not overwritten.
    X_fit, y_fit, X_val = get_X_y(dset_info=dataset, feat_info=features)

    # Raised outside the try block: the handler below needs a real frame to show.
    if X_fit is None:
        raise LookupError('"X" and "y" sets not found!')

    try:
        # fit using the training set
        clf.fit(X_fit, y_fit)

        # predict using the validation set
        y_pred = clf.predict(X_val)

        # get performance
        # NOTE(review): scores are computed from hard predictions only; if the
        # AUC should be based on probabilities, confirm how
        # utils.get_scores_from_predict derives it.
        bal_acc, sens, spec, auc, acc, prec, f1 = utils.get_scores_from_predict(
            y_validation=y_valid,
            y_pred=y_pred,
        )
    except Exception:
        # Show a sample of the training data to help debugging, then re-raise
        # the original error.
        display(X_fit.head(3))
        raise

    validation_scores = {
        'BalAcc': bal_acc,
        'Sens': sens,
        'Spec': spec,
        'f1': f1,
        'AUC': auc,
        'Acc': acc,
        'Prec': prec,
    }

    # Store the validation scores next to the GridSearch scores
    # (the latter are saved under the "Train_" prefix).
    performance_to_save = {
        'Model': model_desc,
        'Dataset': dataset,
        'Features': features,
        # Validation performance
        **{f'Valid_{name}': value for name, value in validation_scores.items()},
        # GridSearch performance
        **{f'Train_{name}': performance_grid[name] for name in validation_scores},
        'Classifier': classifier_name,
        'Hyperparams': params,
    }
    models_results.append(performance_to_save)


clear_output(wait=False)

# create a dataFrame to store the performances and sort them
df_validation_performance = pd.DataFrame(models_results)

df_validation_performance = utils.sort_performances_results(
    df=df_validation_performance,
    cols_order_to_sort=['Valid_BalAcc', 'Valid_Sens', 'Valid_Spec'],
)


# save validation performance
csv_file = os.path.abspath('exec_results/validation_results_for_best_models.csv')
utils.save_to_csv(
    df=df_validation_performance,
    csv_file=csv_file
)

8 samples were saved
CPU times: user 9.23 s, sys: 827 ms, total: 10.1 s
Wall time: 1.71 s


In [23]:
# Force a garbage-collection pass; the number shown as the cell output is the
# value returned by gc.collect().
gc.collect()

18901

---
---
---
# OTHERS
