In [None]:
import os
import sys
import argparse

import json
from datetime import datetime

from IPython.display import HTML

import pandas as pd
from scipy.io import arff
import matplotlib

from sklearn.model_selection import StratifiedKFold, train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,\
                             ExtraTreesClassifier, GradientBoostingClassifier

# over-sampling techniques
from imblearn.over_sampling import RandomOverSampler, SMOTE, \
                                   ADASYN, BorderlineSMOTE, \
                                   KMeansSMOTE, SVMSMOTE, \
                                   SMOTENC, SMOTEN

# under-sampling techniques
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, \
                                    NearMiss, CondensedNearestNeighbour, \
                                    TomekLinks, EditedNearestNeighbours, \
                                    OneSidedSelection, NeighbourhoodCleaningRule


from estimators import EstimatorSelectionHelper
from graphics import generate_graphics_from_gridsearchcv_results

In [None]:
# The notebook takes its "command line" from a text file: the file's
# whitespace-separated tokens replace sys.argv for the rest of the session.
CONFIG_FILE = '.config_ipynb'
with open(CONFIG_FILE) as config_handle:
    config_tokens = config_handle.read().split()
sys.argv = config_tokens

parser = argparse.ArgumentParser()
parser.add_argument(
    '--dataset_path',
    type=str,
    default='',
    help='path of dataset',
)

# The first token is skipped (it sits in the argv[0] slot) and options the
# parser does not know are tolerated rather than rejected.
args, _ = parser.parse_known_args(sys.argv[1:])
dataset_path = args.dataset_path

In [None]:
# Lift every pandas display limit so frames are rendered in full.
for display_option in (
    'display.max_columns',
    'display.max_colwidth',
    'display.max_rows',
):
    pd.set_option(display_option, None)

# Figures are configured for LaTeX (pgf) output.
# NOTE(review): the `%matplotlib inline` magic that follows this block
# presumably switches the active backend again — confirm which one is intended.
matplotlib.use("pgf")
pgf_settings = {
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
}
matplotlib.rcParams.update(pgf_settings)
%matplotlib inline

In [None]:
def create_dataset_results_folder(dataset_name):
    """Create the results directory tree for one dataset.

    Ensures that ``results/<dataset_name>`` exists, together with its
    ``imgs``, ``pkls`` and ``test_results`` sub-folders. Paths are relative
    to the current working directory. Calling the function again for the
    same dataset changes nothing.

    Args:
        dataset_name: Name of the folder created under ``results``.

    Raises:
        FileExistsError: If one of the target paths already exists but is
            not a directory.
    """
    results_dataset_folder = os.path.join('results', dataset_name)
    target_folders = [
        results_dataset_folder,
        *(
            os.path.join(results_dataset_folder, subfolder)
            for subfolder in ('imgs', 'pkls', 'test_results')
        ),
    ]

    for folder in target_folders:
        # exist_ok=True replaces the separate "exists?" check, so there is no
        # window between checking and creating; missing parents are created too.
        os.makedirs(folder, exist_ok=True)

In [None]:
# Date stamp of the current run in compact YYYYMMDD form.
TODAY = f'{datetime.today():%Y%m%d}'

### Criação das etapas de processamento

- Algoritmos de pré-processamento
- Parâmetros dos algoritmos de pré-processamento


- Algoritmos de classificação
- Parâmetros dos algoritmos de classificação


- Métricas

In [None]:
# pre-processing
# Maps each resampling step (an imbalanced-learn sampler instance) to the grid
# of hyper-parameter values to try for it. The mapping is handed to
# EstimatorSelectionHelper together with `models`.
# NOTE(review): the `None: None` entry presumably stands for "no resampling"
# (baseline) — confirm against EstimatorSelectionHelper, which is not visible here.
# Insertion order is significant: the parameter table built later from
# `transformers | models` lists the algorithms in this order.
transformers = {
    None: None,
    SMOTE(): {
        'sampling_strategy': [0.5, 0.8, 1],
        'k_neighbors': [3, 7, 9],
    },
    BorderlineSMOTE(): {
        'sampling_strategy': [0.5, 0.8, 1],
        'kind': ['borderline-1', 'borderline-2'],
    },
    ClusterCentroids(): {
        'sampling_strategy': [0.5, 0.8, 1],        
    },
    NearMiss():{
        'version': [1, 2],
        'n_neighbors': [3, 7, 9],
    }
}

# learning models
# Maps each classifier instance to the grid of hyper-parameter values to try
# for it. All classifiers are created with default arguments except AdaBoost,
# which is pinned to the "SAMME" algorithm.
# NOTE(review): no random_state is set on any estimator, so repeated runs are
# presumably not bit-for-bit reproducible — confirm whether that is acceptable.
models = {
    ExtraTreesClassifier(): {
        'n_estimators': [16, 32, 64, 128],
        'criterion': ['gini', 'entropy'],
        'max_depth': [8, 16, 32, None],
    },
    RandomForestClassifier(): {
        'n_estimators': [16, 32, 64, 128],
        'criterion': ['gini', 'entropy'],
        'max_depth': [8, 16, 32, None],
    },
    DecisionTreeClassifier(): { 
        'criterion': ['gini', 'entropy'],
        'max_depth': [8, 16, 32, None],
    },
    AdaBoostClassifier(algorithm="SAMME"): {
        'n_estimators': [8, 16, 32, 64, 128],
    },
    GradientBoostingClassifier(): {
        'n_estimators': [16, 32, 64, 128],
        'learning_rate': [0.01, 0.1, 0.5, 0.8],
    },
    # Disabled candidate, kept for reference. SVC is not among the names
    # imported in the import cell, so an import is needed before re-enabling.
    # SVC(): [
    #     {'kernel': ['linear'], 'C': [1, 10]},
    #     {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
    # ],
}

# Display name -> scorer identifier. The display names are what the rest of
# the notebook refers to (e.g. refit='AUPRC', 'mean_test_AUPRC_score').
scoring = dict(
    AUPRC='average_precision',
    AUROC='roc_auc',
    f1='f1',
)

In [None]:
def cell_to_latex(c):
    """Render a hyper-parameter grid as a LaTeX ``\\makecell`` snippet.

    Args:
        c: A mapping of parameter name to values, a JSON string encoding such
            a mapping, or an empty marker (``None``, ``''`` or ``{}``).

    Returns:
        An empty string for the empty markers; otherwise a left-aligned
        ``\\makecell`` with one ``name = values`` entry per parameter,
        separated by LaTeX line breaks.
    """
    if c in (None, '', {}):
        return ''

    # Strings are expected to hold JSON; anything else is used as a mapping.
    params = json.loads(c) if isinstance(c, str) else c

    entries = []
    for name, values in params.items():
        entries.append(f'{name} = {values}')

    return r'\makecell[l]{' + ' \\\\ '.join(entries) + '}'

def class_to_repr(c):
    """Return the ``repr`` string of ``c``.

    Uses the built-in ``repr`` instead of calling ``c.__repr__()`` directly,
    so the function also works when ``c`` is a class object (a direct
    ``SomeClass.__repr__()`` call is unbound and raises ``TypeError``).

    Args:
        c: Any object, instance or class.

    Returns:
        The string produced by ``repr(c)``.
    """
    return repr(c)

HIPER_PARAMS = 'Hiper-parâmetros'

# One row per algorithm (resamplers first, then classifiers) with its
# hyper-parameter grid in a single column. Entries without a grid become ''.
param_df = (
    pd.DataFrame.from_dict(
        transformers | models,
        orient='index',
        columns=[HIPER_PARAMS]
    )
    .fillna('')
    .reset_index()
    .rename(columns={'index': 'Algoritmo'})
)

# Replace the estimator objects by their class names and the grids by
# LaTeX-ready cells.
param_df['Algoritmo'] = param_df['Algoritmo'].apply(lambda x: x.__class__.__name__)
param_df[HIPER_PARAMS] = param_df[HIPER_PARAMS].apply(cell_to_latex)

# The per-dataset folders are only created in a later cell, so make sure the
# top-level output directory exists before writing into it.
os.makedirs('results', exist_ok=True)
param_df.to_csv('results/params_df.csv', sep='\t')

# LaTeX version of the table, with underscores escaped.
print(param_df.style.hide(axis='index').to_latex().replace('_', r'\_'))
param_df

### Leitura e tratamento dos dados

In [None]:
# ARFF files are read with scipy's loader; any other path is treated as CSV.
if dataset_path.endswith('.arff'):
    data = arff.loadarff(dataset_path)
    df = pd.DataFrame(data[0])

    # features go to x, the class label (cast to int) goes to y
    x = df.drop(columns='Class')
    y = df['Class'].astype(int)
else:
    df = pd.read_csv(dataset_path)

    # features go to x (the time column is dropped as well), label goes to y
    x = df.drop(columns=['Class', 'Time'])
    y = df['Class']

# The dataset name is the file name without directory and extension; it
# names the results folder created here.
dataset_file = os.path.basename(dataset_path)
dataset_name = os.path.splitext(dataset_file)[0]
create_dataset_results_folder(dataset_name)

In [None]:
# For quick experiments, reduce the amount of training data by raising test_size.
# NOTE(review): none of the four split outputs is referenced again in the
# visible cells — the fitting cell passes the full x, y to fit_predict.
# Confirm whether the held-out part is meant to be used.
split_parts = train_test_split(x, y, test_size=0.15, stratify=y)
x_train, x_test, y_train, y_test = split_parts

# Search helper over every resampler / classifier grid defined above.
helper = EstimatorSelectionHelper(transformers, models)

### Fitting the estimators

In [None]:
t_initial = datetime.now()

# 5-fold stratified cross-validation; the search refits on AUPRC.
stratified_kfold = StratifiedKFold(n_splits=5)
helper.fit_predict(
    x, y,
    scoring=scoring,
    cv=stratified_kfold,
    n_jobs=5,
    refit='AUPRC',
    verbose=2,
    dataset_name=dataset_name,
)

metadata_summary = helper.generate_metadata_summary()
metadata_path = f'results/{dataset_name}/metadata_summary.txt'
metadata_summary.to_csv(metadata_path, sep='\t')

# Give every distinct estimator a one-character label: 'a', 'b', 'c', ...
estimators = pd.unique(metadata_summary.index.unique(level='estimator'))
map_estimators = {
    estimator: chr(ord('a') + position)
    for position, estimator in enumerate(estimators)
}

score_summary = helper.generate_score_summary()
score_summary['Caractere'] = score_summary.estimator.map(map_estimators)

# The file on disk is ordered best-AUPRC-first; the in-memory frame keeps its order.
ranked_scores = score_summary.sort_values(by='mean_test_AUPRC_score', ascending=False)
ranked_scores.to_csv(f'results/{dataset_name}/score_summary.txt', sep='\t', index=False)

t_final = datetime.now()

# Elapsed time covers fitting and summary export, not the plotting below.
delta = t_final - t_initial
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f'[{now}] Total execution time is {delta} ({delta.total_seconds()} seconds).')

generate_graphics_from_gridsearchcv_results(dataset_name, score_summary, scoring)

In [None]:
# `display` is imported here so the preview below can be rendered from the
# middle of the cell (only a cell's last expression is shown automatically).
from IPython.display import display

# Attach the one-character estimator label to the metadata summary. The frame
# is rebuilt through a chain instead of being mutated in place, so the cell
# gives the same result when it is re-run.
metadata_summary = (
    metadata_summary
    .rename_axis(index=['estimator', 'n'])
    .reset_index()
    .assign(Caractere=lambda frame: frame['estimator'].map(map_estimators))
    .set_index(['estimator', 'n'])
)

# Previously a bare `.head()` in the middle of the cell, whose value was
# silently discarded; show it explicitly.
display(metadata_summary.head())

# Lookup table estimator -> label, printed as LaTeX.
tabela_estimadores = pd.Series(map_estimators).to_frame('Caractere')
tabela_estimadores.index.name = 'Estimador'
tabela_estimadores = tabela_estimadores.reset_index()

print(tabela_estimadores.style.hide(axis="index").to_latex())