# Treinamento e Avaliação de Modelos nas bases utilizadas

In [1]:
!pip install -qqU openml

## Imports

In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from tqdm.notebook import tqdm
import numpy as np
import openml
import pandas as pd 

from config import dataset_ids

## Avaliando Modelos

### Funções Auxiliares

In [23]:
def evaluate_clf(clf):
    """Evaluate one classifier on every dataset listed in ``dataset_ids``.

    The datasets are fetched through ``openml.datasets.get_datasets`` and each
    one is handed to ``evaluate_data`` while a ``tqdm`` progress bar tracks the
    iteration.

    Parameters
    ----------
    clf : estimator
        Classifier forwarded unchanged to ``evaluate_data`` for each dataset.

    Returns
    -------
    pandas.DataFrame
        The table built by ``gen_result_df`` from the per-dataset score
        mappings, in the order the datasets were returned.
    """
    fetched = openml.datasets.get_datasets(dataset_ids)
    per_dataset_scores = [evaluate_data(clf, item) for item in tqdm(fetched)]
    return gen_result_df(per_dataset_scores)
    
def evaluate_data(clf, dataset):
    """Cross-validate ``clf`` on a single dataset with 5 stratified folds.

    Features and target are read with ``dataset.get_data`` using the dataset's
    ``default_target_attribute``. Columns flagged by the returned categorical
    indicator are one-hot encoded; every other column is passed through
    untouched before reaching the classifier.

    Parameters
    ----------
    clf : estimator
        Classifier used as the final step of the pipeline.
    dataset : object
        Must expose ``get_data``, ``default_target_attribute`` and ``id``
        (presumably an OpenML dataset, as fetched in ``evaluate_clf``).

    Returns
    -------
    dict
        The mapping produced by ``cross_validate`` for the metrics
        accuracy, f1_micro, f1_macro and f1_weighted, plus a
        ``'dataset_id'`` entry holding ``dataset.id``.

    Notes
    -----
    ``error_score='raise'`` is passed to ``cross_validate``, so a failure in
    any fold propagates to the caller instead of being recorded as a score.
    """
    target_name = dataset.default_target_attribute
    # get_data yields four values; the attribute names are not needed here.
    X, y, is_categorical, _ = dataset.get_data(target=target_name)

    encoder = OneHotEncoder(handle_unknown='infrequent_if_exist',
                            sparse_output=False)
    column_steps = [('one-hot-encoder', encoder, is_categorical)]
    preprocessor = ColumnTransformer(column_steps, remainder='passthrough')
    model = make_pipeline(preprocessor, clf)

    metric_names = ['accuracy', 'f1_micro', 'f1_macro', 'f1_weighted']
    folds = StratifiedKFold(n_splits=5)

    cv_results = cross_validate(model, X, y,
                                scoring=metric_names,
                                cv=folds,
                                error_score='raise')
    # Tag the scores with the dataset identifier; it is inserted last, which
    # makes it the final key of the mapping.
    cv_results['dataset_id'] = dataset.id
    return cv_results


def gen_result_df(result):
    """Collapse per-fold score arrays into mean/std summary columns.

    Parameters
    ----------
    result : list of dict
        One record per dataset. Every column except the last is expected to
        hold an object with ``mean()`` and ``std()`` methods (for example a
        NumPy array of per-fold values). The last column is left as is; for
        the records built by ``evaluate_data`` that column is ``'dataset_id'``.

    Returns
    -------
    pandas.DataFrame
        The untouched last column followed by ``<name>_mean`` and
        ``<name>_std`` for each summarised column; the original per-fold
        columns are removed.
    """
    frame = pd.DataFrame.from_records(result)
    # Positional selection: everything but the final column gets summarised.
    fold_columns = frame.columns[:-1]

    def _mean(values):
        return values.mean()

    def _std(values):
        return values.std()

    for name in fold_columns:
        per_dataset = frame[name]
        frame[name + '_mean'] = per_dataset.map(_mean)
        frame[name + '_std'] = per_dataset.map(_std)

    return frame.drop(columns=fold_columns)

### Modelos Utilizados

In [27]:
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Fixed seed so that estimators with internal randomness give repeatable
# scores from one run of the notebook to the next.
RANDOM_STATE = 42

# (name, estimator) pairs. Each name becomes the prefix of the CSV file
# written for that model, so the keys are kept exactly as they were.
models = [
    ('linear_svm', LinearSVC(dual='auto', random_state=RANDOM_STATE)),
    ('rbf_svm', SVC()),
    ('random_forest', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('knn', KNeighborsClassifier()),
    ('mpl', MLPClassifier(random_state=RANDOM_STATE)),
    # NOTE(review): the recorded run shows "iterations reached limit"
    # convergence warnings for this model; consider raising max_iter or
    # scaling the features - confirm before changing, as it alters results.
    ('logistic_regression', LogisticRegression()),
    ('decision_tree', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('gaussian_nb', GaussianNB()),
]

### Teste de Modelos em Bases de Dados

In [28]:
import warnings

# Silence FutureWarning messages for the remainder of the session so they do
# not flood the cell output during the long evaluation loop.
warnings.simplefilter('ignore', FutureWarning)

# Evaluate each configured model on all datasets and persist one CSV per
# model, named after the model's key in `models`.
for model_name, model in models:
    print(f'Obtendo scores para {model_name}...')
    scores = evaluate_clf(model)
    scores.to_csv(
        f'{model_name}_scores.csv'
    )

Obtendo scores para linear_svm...


  0%|          | 0/70 [00:00<?, ?it/s]



Obtendo scores para rbf_svm...


  0%|          | 0/70 [00:00<?, ?it/s]

Obtendo scores para random_forest...


  0%|          | 0/70 [00:00<?, ?it/s]

Obtendo scores para knn...


  0%|          | 0/70 [00:00<?, ?it/s]

Obtendo scores para mpl...


  0%|          | 0/70 [00:00<?, ?it/s]



Obtendo scores para logistic_regression...




  0%|          | 0/70 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Obtendo scores para decision_tree...


  0%|          | 0/70 [00:00<?, ?it/s]

Obtendo scores para gaussian_nb...


  0%|          | 0/70 [00:00<?, ?it/s]