In [1]:
#this is to auto-reload modules (like utils.py)
%load_ext autoreload
%autoreload 2

# `display`, `HTML` and `clear_output` are all public names of IPython.display.
# Importing `display`/`HTML` from IPython.core.display has been deprecated since
# IPython 7.14 and is no longer supported on newer IPython releases.
from IPython.display import HTML, clear_output, display

import os
import sys
import time
import gc
import json
import glob

import shutil

import utils

import numpy as np
import pandas as pd
# config to not reduce column width
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)


import matplotlib.pyplot as plt
from matplotlib.colors import BoundaryNorm, ListedColormap

import seaborn as sns
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old aliases; use whichever name this matplotlib provides.
_whitegrid_style = (
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
plt.style.use(_whitegrid_style)

import plotly as ply
import plotly.express as px

from itertools import permutations, combinations

from tabulate import tabulate

import pickle
from joblib import dump, load


import sklearn

# import warnings
# warnings.simplefilter(action='ignore', category=FutureWarning)
# warnings.filterwarnings('always')

# Suppress warnings: replace warnings.warn with a function that does nothing,
# so any later call made through `warnings.warn(...)` is silently dropped.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and return None; stand-in for warnings.warn."""
    return None


warnings.warn = warn

# Font sizes applied to every matplotlib figure created in this notebook.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 12, 14, 16

plt.rcParams.update({
    'font.size': SMALL_SIZE,           # default text size
    'axes.titlesize': BIGGER_SIZE,     # axes title
    'axes.labelsize': SMALL_SIZE,      # x and y axis labels
    'xtick.labelsize': MEDIUM_SIZE,    # x tick labels
    'ytick.labelsize': MEDIUM_SIZE,    # y tick labels
    'legend.fontsize': MEDIUM_SIZE,    # legend entries
    'figure.titlesize': BIGGER_SIZE,   # figure title
})

%matplotlib inline  



### Get Data (Training, Validation, Balanced, etc.)

In [3]:
# Unpack every split returned by the project helper. Because
# return_training_sets=True is passed, the last returned item is bound to
# `training_sets`, which the GridSearch loop iterates over.
(
    X_train, X_train_fs, y_train,
    X_train_balanced, X_train_fs_balanced, y_train_balanced,
    X_valid, X_valid_fs, y_valid,
    training_sets,
) = utils.get_train_and_validation_data(return_training_sets=True)



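# For reference only (not executed): approximate layout of the `training_sets`
# list returned by utils.get_train_and_validation_data above.
# NOTE: the GridSearch loop unpacks five fields per entry, so each entry
# presumably carries one more trailing item than the four shown here.
# training_sets = [
#     # Training
#     ['Training', 'All Features'     , X_train   , y_train],
#     ['Training', 'Feature Selection', X_train_fs, y_train],
#     # Training Balanced
#     ['Training Balanced', 'All Features'     , X_train_balanced   , y_train_balanced],
#     ['Training Balanced', 'Feature Selection', X_train_fs_balanced, y_train_balanced],
# ]

# validation_set = [X_valid, y_valid]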


### Create and Execute the Models with GridSearch, using the _n_ inputs and outputs created in the previous step

In [18]:
%%time

# Run one GridSearch per entry of `training_sets`, accumulate the per-run result
# frames (together with any results persisted by a previous run), then sort and
# save the combined table to `csv_results_saved`.

csv_results_saved = os.path.abspath('exec_results/results.csv')

# Set to True to ignore a results CSV left by a previous run and start from scratch.
overwrite_results_saved_previously = False

# Forwarded unchanged to every utils.create_models_*_grid helper.
testing = False

# Seconds to pause after each GridSearch run before moving on to the next training set.
pause_between_runs_seconds = 10

# Every results frame is collected here and concatenated once after the loop,
# instead of re-concatenating a growing DataFrame on each iteration.
results_frames = []

# Check whether a CSV with previously saved results already exists.
if os.path.exists(csv_results_saved) and not overwrite_results_saved_previously:
    print('Reading results saved previously...')
    df_previous = utils.read_csv(csv_file=csv_results_saved)
    if df_previous is not None:
        results_frames.append(df_previous)

kfold = utils.get_kfold_splits()

total = len(training_sets)

# The fifth field of each entry is not used here; it is bound to `_extra` rather
# than `_` so that IPython's "last output" variable is not shadowed.
for i, (dataset_info, features_info, X, y, _extra) in enumerate(training_sets, start=1):
    print()
    utils.print_string_with_separators(f'{i}/{total}: Executing using "{dataset_info}" set  with "{features_info}"...')

    ## define the models and hyperparameters for the GridSearch
    ## (uncomment the model families that should be searched)
    param_grid = []

#     utils.create_models_SVM_grid(param_grid, testing=testing)
#     utils.create_models_NB_grid(param_grid, testing=testing)
#     utils.create_models_DT_grid(param_grid, testing=testing)
#     utils.create_models_kNN_grid(param_grid, testing=testing)
#     utils.create_models_RF_grid(param_grid, testing=testing)
#     utils.create_models_NN_grid(qty_features=X.shape[1],  param_grid=param_grid, testing=testing)
#     utils.create_models_XGB_grid(param_grid, testing=testing)
#     utils.create_models_CatBoost_grid(param_grid, testing=testing)

    if len(param_grid) == 0:
        # Nothing to search for this training set: no model grid is enabled above.
        print('No model grid enabled; skipping GridSearch for this set.')
        continue

    ## execute GridSearch
    grid, df_results_aux = utils.exec_grid_search(
        param_grid=param_grid,
        X=X,
        y=y,
        cv=kfold,
        verbose=1,
        return_train_score=False,
        sort_results=False,
        dataset_info=dataset_info,
        features_info=features_info,
        #
    #     n_jobs=utils.N_JOBS,
    #     scoring='roc_auc',
    )

    results_frames.append(df_results_aux)

    clear_output(wait=True)

    time.sleep(pause_between_runs_seconds)

print()
print('FINISHED !!!')

if results_frames:
    df_results = pd.concat(results_frames)

    # sort performances results and show results
    df_results = utils.sort_performances_results(df=df_results)
    # display(df_results)

    # save the results (make sure the target directory exists first)
    os.makedirs(os.path.dirname(csv_results_saved), exist_ok=True)
    utils.save_to_csv(df=df_results, csv_file=csv_results_saved)
else:
    # No previous CSV was loaded and no GridSearch ran, so there is nothing to
    # sort or save; avoid handing None to the utils helpers.
    df_results = None
    print('No results to sort or save: no previous CSV was loaded and no model grid is enabled.')



FINISHED !!!
3180 samples were saved
CPU times: user 5.66 s, sys: 493 ms, total: 6.16 s
Wall time: 1min 6s


---
---
---
# OTHERS

### Show other grid properties

In [None]:
# `grid` is assigned inside the GridSearch loop, so it only exists when at least
# one model grid was enabled there; it is overwritten on every iteration and thus
# refers to the search run on the last training set processed.
_last_grid = globals().get('grid')

if _last_grid is None:
    print('No GridSearch object available: enable at least one model grid and run the GridSearch cell first.')
else:
    print(f'Best Bal.Acc.: {_last_grid.best_score_:.2f}')
    print(f'        Model: {_last_grid.best_params_["classifier"]} ')
    print(f'Performance using the Validation set:  {_last_grid.score(X_valid, y_valid):.2f}')


