Importando funções necessárias

In [2]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from scipy import stats
import pandas as pd
import numpy as np

Função para adicionar as medidas na tabela de acurácia

In [3]:
def add_result_measure(met, df, scores):
    inf, sup = stats.norm.interval(0.95, loc=scores.mean(), 
                               scale=scores.std()/np.sqrt(len(scores)))
    new_row = pd.DataFrame([[met, round(scores.mean(), 2), round(scores.std(), 2), round(inf, 2), round(sup, 2)]],
                            columns=['Método', 'Média', 'Desvio Padrão', 'Limite Inferior', 'Limite Superior'])
    df = pd.concat([df, new_row])
    return df

Função que cria a tabela de p-valores

In [4]:
def table_htest(list_scores):
    from scipy.stats import ttest_rel, wilcoxon
    list_estimators = ['ZR', 'NBG', 'KMC','KNN', 'AD']

    for row in range(len(list_estimators)):
        for col in range(len(list_estimators)):
            if(row == col):
                print(list_estimators[row], end = "\t")
            else:
                if(col > row):
                    s,p = ttest_rel(list_scores[row],list_scores[col])
                    print(round(p, 6), end = "\t")
                else:
                    s,p = wilcoxon(list_scores[row],list_scores[col])
                    print(round(p, 6), end = "\t")
        print ("\n")

Inicializando a tabela de medidas para ir adicionando os resultados

In [5]:
table_result_measure = pd.DataFrame(columns=['Método', 'Média', 'Desvio Padrão', 'Limite Inferior', 'Limite Superior'])

Importando a base de dados

In [6]:
from sklearn import datasets

data = datasets.load_breast_cancer()
data_X = data.data
data_y = data.target

Parte I
- Importando os Classificadores ZeroR e Naive Bayes Gaussiano
- Inicializando o escalar e o repetidor estratificado

In [7]:
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB

scalar = StandardScaler()
rkf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234)

- Chamando os construtores de cada classificador da parte I
- Realizando as validações cruzadas
- Adicionando os resultados na tabela de medidas

In [8]:
zR = DummyClassifier()
pipeline_zR = Pipeline([('transformer', scalar), ('estimator', zR)])

gNB = GaussianNB()
pipeline_gNB = Pipeline([('transformer', scalar), ('estimator', gNB)])

zr_scores = cross_val_score(pipeline_zR, data_X, data_y, scoring='accuracy', cv=rkf)
table_result_measure = add_result_measure('ZR', table_result_measure, cross_val_score(pipeline_zR, data_X, data_y, scoring='accuracy', cv=rkf))

nbg_scores = cross_val_score(pipeline_gNB, data_X, data_y, scoring='accuracy', cv=rkf)
table_result_measure = add_result_measure('NBG', table_result_measure, cross_val_score(pipeline_gNB, data_X, data_y, scoring='accuracy', cv=rkf))

Parte II
- Importando os Classificadores KNN, KMeans e Arvore de Decisão
- Importando outras funções necessárias

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y
from sklearn.model_selection import GridSearchCV

Classe do classificador KMC
- Init com os parâmetros de classe:
    - K
    - Lista de centróides
    - Lista de classificação das centróides

- Função de treino:
    - Aplicando o Kmeans em cada grupo de classe
    - Encontrando os centróides de cada grupo

- Função de predição:
    - Aplicando o KNN para encontrar a centróide mais próxima

In [10]:
class KMCClassifier(BaseEstimator):
    def __init__(self, k=1):
        super().__init__()
        self.k = k
        self.cent = []
        self.label = []

    def fit(self, X_train, y_train):
        X_train, y_train = check_X_y(X_train, y_train)
        for classe in range(len(np.unique(y_train))):
            kmeans = KMeans(n_clusters=self.k)
            kmeans.fit(X_train [y_train == classe])
            self.cent.append(kmeans.cluster_centers_)
            self.cent.append(classe)

    def predict(self, X_test):
        knn = KNeighborsClassifier(n_neighbors=1)
        knn.fit(self.cent, self.label)
        return knn.predict(X_test)

Definindo os hiperparâmetros

In [11]:
parameters_KMC = {'estimator__k':[1,3,5,7]}
parameters_KNN = {'estimator__n_neighbors':[1,3,5,7]}
parameters_AD = {'estimator__max_depth':[None,3,5,10]}

- Chamando os construtores de cada classificador da parte II
- Realizando as validações cruzadas com ciclos aninhados
- Adicionando os resultados na tabela de medidas

In [12]:
kMC = KMCClassifier()
pipeline_kMC = Pipeline([('transformer', scalar), ('estimator', kMC)])
p_KMC = GridSearchCV(pipeline_kMC, parameters_KMC, scoring='accuracy', cv=4)

kNN = KNeighborsClassifier()
pipeline_kNN = Pipeline([('transformer', scalar), ('estimator', kNN)])
p_KNN = GridSearchCV(pipeline_kNN, parameters_KNN, scoring='accuracy', cv=4)

aD = DecisionTreeClassifier()
pipeline_AD = Pipeline([('transformer', scalar), ('estimator', aD)])
p_AD = GridSearchCV(pipeline_AD, parameters_AD, scoring='accuracy', cv=4)

kmc_scores = cross_val_score(p_KMC, data_X, data_y, scoring='accuracy', cv=rkf)
table_result_measure = add_result_measure('KMC', table_result_measure, kmc_scores)

knn_scores = cross_val_score(p_KNN, data_X, data_y, scoring='accuracy', cv=rkf)
table_result_measure = add_result_measure('KNN', table_result_measure, knn_scores)

ad_scores = cross_val_score(p_AD, data_X, data_y, scoring='accuracy', cv=rkf)
table_result_measure = add_result_measure('AD', table_result_measure, ad_scores)

  array = np.asarray(array, order=order, dtype=dtype)
Traceback (most recent call last):
  File "C:\Users\quiel\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\quiel\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_scorer.py", line 219, in __call__
    return self._score(
  File "C:\Users\quiel\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\quiel\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_scorer.py", line 71, i

KeyboardInterrupt: 

Tabela de resultados das medidas dos classificadores

In [None]:
table_result_measure.reset_index(drop=True, inplace=True)
print(table_result_measure)

Tabela triangular dos p-valores

In [None]:
list_scores = [zr_scores, nbg_scores, kmc_scores, knn_scores, ad_scores]
table_htest(list_scores)