In [1]:
import os
import glob

import os.path as path
import pandas  as pd
import numpy   as np

from itertools         import product
from sklearn.metrics   import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import GridSearchCV

from IPython.display         import display, Markdown, Latex

# Folder, relative to the notebook, from which the dataset files are read.
datasets_folder = '../datasets/'

# Directory the kernel is currently running in.
cur_folder = os.getcwd()

print('Done')

Done


In [2]:
# Builds the hyper-parameter grid from an upper bound on the neighbour count.
def create_gridsearch_params(max_n_neighbors):
    """Return the parameter grid (a dict of candidate-value lists) for the grid search.

    Parameters
    ----------
    max_n_neighbors : int
        Exclusive upper bound for ``n_neighbors``: the candidates are
        ``1, 2, ..., max_n_neighbors - 1``. A value of 1 or less therefore
        yields an empty candidate list.

    Returns
    -------
    dict
        Keys ``'n_jobs'``, ``'weights'`` and ``'n_neighbors'`` (in that
        order), each mapped to the list of values to try.
    """
    neighbor_counts = [k for k in range(1, max_n_neighbors)]

    param_grid = dict(
        n_jobs=[None],          # keep the regressor itself single-process
        weights=['distance'],   # only distance weighting is explored
        n_neighbors=neighbor_counts,
    )
    return param_grid

# Error function handed to the grid search.
def RMSE(yhat, y):
    """Root mean squared error between two equally shaped arrays.

    The value is symmetric in ``yhat`` and ``y`` (only the squared
    difference is used), so the order of the arguments does not matter.
    """
    residuals = yhat - y
    mean_sq_err = (residuals * residuals).mean()
    return np.sqrt(mean_sq_err)
    
# Smoke test of the grid builder ----------------------------
# With an upper bound of 10 the n_neighbors candidates are 1..9.
confs = create_gridsearch_params(max_n_neighbors=10)
print(confs)

{'n_jobs': [None], 'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9]}


In [3]:
n_folds         = 5
max_n_neighbors = 30 # Upper bound for n_neighbors, as a percentage (0-100) of the training-set size

# kNN is deterministic, so there is no need for repeated runs
datasets = [
    'airfoil',
    'concrete',
    'energyCooling',
    'energyHeating',
    'GeographicalOriginalofMusic',
    'towerData',
    'tecator',
    'wineRed',
    'wineWhite',
    'yacht',
]

# ---------------------------
columns = ['dataset','conf','Fold','RMSE_train','RMSE_test']

fname = '../docs/knn-resultsregression.csv'
results = {c:[] for c in columns}

# Start from previously saved results, if any, so new rows are appended to them.
if os.path.isfile(fname):
    resultsDF = pd.read_csv(fname)
    results   = resultsDF.to_dict('list')

for ds in datasets:
    print(f'Gridsearch para base {ds}')

    for fold in range(n_folds):

        # Skip folds whose files cannot be opened. Only OSError (which includes
        # FileNotFoundError) is caught: malformed files and KeyboardInterrupt
        # must surface instead of being silently ignored.
        try:
            dataset_train = np.loadtxt(f'{datasets_folder}/{ds}-train-{fold}.dat', delimiter=',')
            dataset_test  = np.loadtxt(f'{datasets_folder}/{ds}-test-{fold}.dat', delimiter=',')
        except OSError as err:
            print(f'skipping {ds}-{fold}: {err}')
            continue

        # Resume support: reload what is on disk and skip folds already evaluated.
        if os.path.isfile(fname):
            resultsDF = pd.read_csv(fname)
            results   = resultsDF.to_dict('list')

            # `.any()` rather than `== 1`: a fold that already has duplicated
            # rows must not be evaluated (and appended) yet again.
            already_done = ((resultsDF['dataset']==ds) & (resultsDF['Fold']==fold)).any()
            if already_done:
                print(f'already evaluated {ds}-{fold}')
                continue

        # Last column is the target, the remaining ones are the features.
        X_train, y_train = dataset_train[:, :-1], dataset_train[:, -1]
        X_test,  y_test  = dataset_test[:, :-1],  dataset_test[:, -1]

        # Build the grid for this dataset. The bound passed on is exclusive, so
        # it is clamped to at least 2 to guarantee a non-empty n_neighbors list
        # (an empty list makes GridSearchCV raise).
        neighbors_bound = max(2, (len(X_train)*max_n_neighbors)//100)
        confs = create_gridsearch_params(neighbors_bound)

        # Print a summary only: the full candidate list can hold hundreds of values.
        ks = confs['n_neighbors']
        print(f"grid: weights={confs['weights']}, n_neighbors={ks[0]}..{ks[-1]} ({len(ks)} values)")

        print(f'evaluating {ds}-{fold}')

        # cv=5: the training data is split into 5 folds for the inner cross-validation.
        # greater_is_better=False flips the sign of the score so the search can
        # maximise it; the sign must be flipped back when reporting.
        grid = GridSearchCV(
            KNeighborsRegressor(),
            confs,
            cv=5,
            verbose=1,
            scoring=make_scorer(RMSE, greater_is_better=False),
            return_train_score=True
        ).fit(X_train, y_train)

        #display(pd.DataFrame(grid.cv_results_))

        # GridSearchCV refits the best configuration on the whole training set
        # by default (refit=True), so the fitted estimator is reused directly.
        regressor = grid.best_estimator_

        # NOTE: best_score_ is the mean *cross-validated* score of the best
        # configuration (held-out inner folds), not an error measured on the
        # data the model was fitted on. The column name is kept for
        # compatibility with the existing CSV.
        RMSE_train = -1*grid.best_score_

        # sqrt(MSE) instead of `squared=False`, which is deprecated/removed in
        # recent scikit-learn releases.
        y_pred    = regressor.predict(X_test).ravel()
        RMSE_test = np.sqrt(mean_squared_error(y_test.ravel(), y_pred))

        results['dataset'].append(ds)
        results['conf'].append(grid.best_params_)
        results['RMSE_train'].append(RMSE_train)
        results['RMSE_test'].append(RMSE_test)
        results['Fold'].append(fold)

        # Checkpoint after every fold so an interrupted run can be resumed.
        df = pd.DataFrame(results)
        df.to_csv(fname, index=False)

print('done')

Gridsearch para base airfoil
already evaluated airfoil-0
already evaluated airfoil-1
already evaluated airfoil-2
already evaluated airfoil-3
already evaluated airfoil-4
Gridsearch para base concrete
already evaluated concrete-0
already evaluated concrete-1
already evaluated concrete-2
already evaluated concrete-3
already evaluated concrete-4
Gridsearch para base energyCooling
already evaluated energyCooling-0
already evaluated energyCooling-1
already evaluated energyCooling-2
already evaluated energyCooling-3
already evaluated energyCooling-4
Gridsearch para base energyHeating
already evaluated energyHeating-0
already evaluated energyHeating-1
already evaluated energyHeating-2
already evaluated energyHeating-3
already evaluated energyHeating-4
Gridsearch para base GeographicalOriginalofMusic
already evaluated GeographicalOriginalofMusic-0
already evaluated GeographicalOriginalofMusic-1
already evaluated GeographicalOriginalofMusic-2
already evaluated GeographicalOriginalofMusic-3
alrea

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2396 out of 2396 | elapsed: 27.2min finished


{'n_jobs': [None], 'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2396 out of 2396 | elapsed: 27.2min finished


{'n_jobs': [None], 'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2396 out of 2396 | elapsed: 27.1min finished


{'n_jobs': [None], 'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2396 out of 2396 | elapsed: 27.1min finished


{'n_jobs': [None], 'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2398 out of 2398 | elapsed: 31.1min finished


Gridsearch para base tecator
already evaluated tecator-0
already evaluated tecator-1
already evaluated tecator-2
already evaluated tecator-3
already evaluated tecator-4
Gridsearch para base wineRed
already evaluated wineRed-0
already evaluated wineRed-1
already evaluated wineRed-2
already evaluated wineRed-3
already evaluated wineRed-4
Gridsearch para base wineWhite
already evaluated wineWhite-0
already evaluated wineWhite-1
already evaluated wineWhite-2
already evaluated wineWhite-3
already evaluated wineWhite-4
Gridsearch para base yacht
already evaluated yacht-0
already evaluated yacht-1
already evaluated yacht-2
already evaluated yacht-3
already evaluated yacht-4
done


In [4]:
fname = '../docs/knn-resultsregression.csv'

resultsDF = pd.read_csv(fname)

pd.set_option('display.max_colwidth', None) # do not truncate column contents when using display

display(Markdown('## Tabela dos resultados'))
display(resultsDF)

# Aggregate the per-fold errors of each dataset.
# The metric columns are selected by name: aggregating the whole frame would
# also try to reduce the non-numeric 'conf' column (a TypeError on pandas >= 2.0)
# and relied on dropping 'Fold' by position.
metric_cols = ['RMSE_train', 'RMSE_test']
per_dataset = resultsDF.groupby('dataset')[metric_cols]

# Mean over folds (the variable name says "median" for historical reasons,
# but the statistic computed here is the mean).
resultsDF_median = per_dataset.mean()
resultsDF_median.columns = ['RMSE_train_mean', 'RMSE_test_mean']
display(Markdown('## Médias dos folds para cada dataset'))
display(resultsDF_median)

# Standard deviation over folds
resultsDF_std = per_dataset.std()
resultsDF_std.columns = ['RMSE_train_std', 'RMSE_test_std']
display(Markdown('## Desvios padrões dos folds para cada dataset'))
display(resultsDF_std)

# Join means and standard deviations side by side, matching rows on the dataset index
resultsDF_ = pd.merge(resultsDF_median, resultsDF_std, left_index=True, right_index=True)
display(Markdown('## Juntando tudo'))
display(resultsDF_)

## Tabela dos resultados

Unnamed: 0,dataset,conf,Fold,RMSE_train,RMSE_test
0,airfoil,"{'n_jobs': None, 'n_neighbors': 179, 'weights': 'distance'}",0,6.085912,5.991677
1,airfoil,"{'n_jobs': None, 'n_neighbors': 179, 'weights': 'distance'}",1,6.251632,5.761461
2,airfoil,"{'n_jobs': None, 'n_neighbors': 179, 'weights': 'distance'}",2,6.281535,5.719953
3,airfoil,"{'n_jobs': None, 'n_neighbors': 179, 'weights': 'distance'}",3,6.292133,6.006404
4,airfoil,"{'n_jobs': None, 'n_neighbors': 179, 'weights': 'distance'}",4,6.126346,6.341319
5,yacht,"{'n_jobs': None, 'n_neighbors': 5, 'weights': 'distance'}",0,11.60012,7.192731
6,yacht,"{'n_jobs': None, 'n_neighbors': 5, 'weights': 'distance'}",1,11.428934,9.963663
7,yacht,"{'n_jobs': None, 'n_neighbors': 1, 'weights': 'distance'}",2,10.758165,8.443382
8,yacht,"{'n_jobs': None, 'n_neighbors': 3, 'weights': 'distance'}",3,10.660353,10.070486
9,yacht,"{'n_jobs': None, 'n_neighbors': 1, 'weights': 'distance'}",4,11.668508,6.238448


## Médias dos folds para cada dataset

Unnamed: 0_level_0,RMSE_train_mean,RMSE_test_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
GeographicalOriginalofMusic,33.492014,30.431291
airfoil,6.207512,5.964163
concrete,9.360119,7.827606
energyCooling,2.552068,2.22188
energyHeating,3.137259,2.989505
tecator,1.662343,1.373888
towerData,17.336751,14.710986
wineRed,0.705017,0.656625
wineWhite,0.750946,0.688085
yacht,11.223216,8.381742


## Desvios padrões dos folds para cada dataset

Unnamed: 0_level_0,RMSE_train_std,RMSE_test_std
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
GeographicalOriginalofMusic,0.684362,3.128171
airfoil,0.094817,0.247747
concrete,0.266988,0.558481
energyCooling,0.0457,0.159352
energyHeating,0.084212,0.509814
tecator,0.064324,0.331096
towerData,0.368153,0.998657
wineRed,0.022614,0.047068
wineWhite,0.005578,0.020861
yacht,0.478474,1.685644


## Juntando tudo

Unnamed: 0_level_0,RMSE_train_mean,RMSE_test_mean,RMSE_train_std,RMSE_test_std
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GeographicalOriginalofMusic,33.492014,30.431291,0.684362,3.128171
airfoil,6.207512,5.964163,0.094817,0.247747
concrete,9.360119,7.827606,0.266988,0.558481
energyCooling,2.552068,2.22188,0.0457,0.159352
energyHeating,3.137259,2.989505,0.084212,0.509814
tecator,1.662343,1.373888,0.064324,0.331096
towerData,17.336751,14.710986,0.368153,0.998657
wineRed,0.705017,0.656625,0.022614,0.047068
wineWhite,0.750946,0.688085,0.005578,0.020861
yacht,11.223216,8.381742,0.478474,1.685644
