In [1]:
import os
import glob

import os.path as path
import pandas  as pd
import numpy   as np

from itertools         import product
from sklearn.metrics   import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import GridSearchCV

from IPython.display         import display, Markdown, Latex

cur_folder = os.getcwd() # Diretório atual

datasets_folder = '../datasets/'

print('Done')

Done


In [2]:
# Recebe o número máximo de vizinhos e cria as configurações
def create_gridsearch_params(max_n_neighbors):

    # Listas com valores diferentes para testar no gridsearch.
    return {
        'n_jobs'      : [None], #não rodar gridsearch e o fit do regressor em paralelo para não dar problema
        'weights'     : ['distance'],
        'n_neighbors' : list(range(1, max_n_neighbors))
    }

# Função de erro para o gridsearch 
def RMSE(yhat, y):
    return np.sqrt(np.square(yhat - y).mean())
    
# Teste da criação de configurações -----------------------
confs = create_gridsearch_params(10)
print(confs)

{'n_jobs': [None], 'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9]}


In [6]:
n_folds         = 5
max_n_neighbors = 30 # Porcentagem máxima de vizinhos (0-100)

# kNN é determinístico, não precisamos fazer repetições
datasets = [
    'airfoil',
    'concrete',
    'energyCooling',
    'energyHeating',
    'Geographical',
    'towerData',
    'tecator',
    'wineRed',
    'wineWhite',
    'yacht',
]    

# ---------------------------
columns = ['dataset','conf','Fold','RMSE_train','RMSE_test']

fname = '../docs/knn-resultsregression.csv'
results = {c:[] for c in columns}

if os.path.isfile(fname):
    resultsDF = pd.read_csv(fname)
    results   = resultsDF.to_dict('list')

for ds in datasets:
    print(f'Gridsearch para base {ds}')
    
    for fold in range(n_folds):
        
        dataset_train = None
        dataset_test  = None
        
        # evitar tentar abrir arquivos que não existem
        try:
            dataset_train = np.loadtxt(f'{datasets_folder}/{ds}-train-{fold}.dat', delimiter=',')
            dataset_test  = np.loadtxt(f'{datasets_folder}/{ds}-test-{fold}.dat', delimiter=',')
        except:
            continue
            
        # Retomar testes se interrompido
        if os.path.isfile(fname):
            resultsDF = pd.read_csv(fname)
            results   = resultsDF.to_dict('list')

            if len(resultsDF[
                (resultsDF['dataset']==ds) &
                (resultsDF['Fold']==fold)
            ])==1:
                print(f'already evaluated {ds}-{fold}')

                continue
            
        X_train, y_train = dataset_train[:, :-1], dataset_train[:, -1]
        X_test,  y_test  = dataset_test[:, :-1],  dataset_test[:, -1]
        
        # Criando as configurações para um dataset específico.
        # Save_file salva em um arquivo as configurações, que são numeradas no
        # report de resultados
        confs = create_gridsearch_params((len(X_train)*max_n_neighbors)//100)
        print(confs)
        # Usando o gridsearch para obtenção da melhor configuração
        
        print(f'evaluating {ds}-{fold}')
        
        #cv=5 especifica que os dados devem ser divididos em 2 folds, e realizar uma validação cruzada
        grid = GridSearchCV(
            KNeighborsRegressor(),
            confs,
            cv=5,
            verbose=1,
            scoring=make_scorer(RMSE, greater_is_better=False), # Greater is better vai trocar o sinal e transformar em um problema de maximização. Na prática, isso significa que temos que trocar o sinal na hora de reportar o melhor resultado retornado
            return_train_score=True
        ).fit(X_train, y_train)
        
        #display(pd.DataFrame(grid.cv_results_))

        # Utilizando a melhor configuração para treinar o modelo e obter os scores        
        regressor = KNeighborsRegressor(**grid.best_params_).fit(X_train, y_train)  

        RMSE_train = -1*grid.best_score_ #Melhor score no gridsearch corresponde ao treino
        RMSE_test  = mean_squared_error(regressor.predict(X_test).ravel(), y_test.ravel(), squared=False)

        # Vamos salvar o número da configuração para ficar mais sucinto
        results['dataset'].append(ds)
        results['conf'].append(grid.best_params_)
        results['RMSE_train'].append(RMSE_train)
        results['RMSE_test'].append(RMSE_test)
        results['Fold'].append(fold)

        df = pd.DataFrame(results)
        df.to_csv(fname, index=False)

print('done')

Gridsearch para base airfoil
already evaluated airfoil-0
already evaluated airfoil-1
already evaluated airfoil-2
already evaluated airfoil-3
already evaluated airfoil-4
Gridsearch para base concrete
already evaluated concrete-0
already evaluated concrete-1
already evaluated concrete-2
already evaluated concrete-3
already evaluated concrete-4
Gridsearch para base energyCooling
already evaluated energyCooling-0
already evaluated energyCooling-1
already evaluated energyCooling-2
already evaluated energyCooling-3
already evaluated energyCooling-4
Gridsearch para base energyHeating
already evaluated energyHeating-0
already evaluated energyHeating-1
already evaluated energyHeating-2
already evaluated energyHeating-3
already evaluated energyHeating-4
Gridsearch para base Geographical
{'n_jobs': [None], 'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 4

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1265 out of 1265 | elapsed:  1.9min finished


{'n_jobs': [None], 'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1265 out of 1265 | elapsed:  1.8min finished


{'n_jobs': [None], 'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1265 out of 1265 | elapsed:  1.8min finished


{'n_jobs': [None], 'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1265 out of 1265 | elapsed:  1.8min finished


{'n_jobs': [None], 'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1265 out of 1265 | elapsed:  1.9min finished


Gridsearch para base towerData
already evaluated towerData-0
already evaluated towerData-1
already evaluated towerData-2
already evaluated towerData-3
already evaluated towerData-4
Gridsearch para base tecator
already evaluated tecator-0
already evaluated tecator-1
already evaluated tecator-2
already evaluated tecator-3
already evaluated tecator-4
Gridsearch para base wineRed
already evaluated wineRed-0
already evaluated wineRed-1
already evaluated wineRed-2
already evaluated wineRed-3
already evaluated wineRed-4
Gridsearch para base wineWhite
already evaluated wineWhite-0
already evaluated wineWhite-1
already evaluated wineWhite-2
already evaluated wineWhite-3
already evaluated wineWhite-4
Gridsearch para base yacht
already evaluated yacht-0
already evaluated yacht-1
already evaluated yacht-2
already evaluated yacht-3
already evaluated yacht-4
done


In [7]:
fname = '../docs/knn-resultsregression.csv'

resultsDF = pd.read_csv(fname)

pd.set_option('display.max_colwidth', None) #não truncar colunas usando display

display(Markdown('## Tabela dos resultados'))
display(resultsDF)

# Obtendo a melhor configuração para cada dataset

# Calculando as medianas e tirando a coluna de fold (configuração já some pois não é numérico)
resultsDF_median = resultsDF.groupby('dataset').mean().iloc[:, 1:]
resultsDF_median.columns = ['RMSE_train_mean', 'RMSE_test_mean']
display(Markdown('## Médias dos folds para cada dataset'))
display(resultsDF_median)

# Colocando o desvio padrão 
resultsDF_std = resultsDF.groupby('dataset').std().iloc[:, 1:]
resultsDF_std.columns = ['RMSE_train_std', 'RMSE_test_std']
display(Markdown('## Desvios padrões dos folds para cada dataset'))
display(resultsDF_std)

# juntando tudo em um só
resultsDF_ = pd.merge(resultsDF_median, resultsDF_std, left_index=True, right_index=True)
display(Markdown('## Juntando tudo'))
display(resultsDF_)

## Tabela dos resultados

Unnamed: 0,dataset,conf,Fold,RMSE_train,RMSE_test
0,airfoil,"{'n_jobs': None, 'n_neighbors': 359, 'weights': 'distance'}",0,6.017243,5.990185
1,airfoil,"{'n_jobs': None, 'n_neighbors': 359, 'weights': 'distance'}",1,6.186459,5.761461
2,airfoil,"{'n_jobs': None, 'n_neighbors': 219, 'weights': 'distance'}",2,6.183848,5.721287
3,airfoil,"{'n_jobs': None, 'n_neighbors': 355, 'weights': 'distance'}",3,5.992385,6.006452
4,airfoil,"{'n_jobs': None, 'n_neighbors': 351, 'weights': 'distance'}",4,5.871803,6.341234
5,concrete,"{'n_jobs': None, 'n_neighbors': 4, 'weights': 'distance'}",0,8.354459,6.650786
6,concrete,"{'n_jobs': None, 'n_neighbors': 3, 'weights': 'distance'}",1,8.276878,7.938719
7,concrete,"{'n_jobs': None, 'n_neighbors': 6, 'weights': 'distance'}",2,8.119726,8.395398
8,concrete,"{'n_jobs': None, 'n_neighbors': 6, 'weights': 'distance'}",3,8.257371,8.486236
9,concrete,"{'n_jobs': None, 'n_neighbors': 6, 'weights': 'distance'}",4,8.326966,7.795772


## Médias dos folds para cada dataset

Unnamed: 0_level_0,RMSE_train_mean,RMSE_test_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
Geographical,31.514253,31.170637
airfoil,6.050348,5.964124
concrete,8.26708,7.853382
energyCooling,2.204574,2.172953
energyHeating,2.866991,2.817327
tecator,1.39158,1.293588
towerData,15.113805,14.806532
wineRed,0.678045,0.657223
wineWhite,0.707626,0.687557
yacht,8.751017,7.563321


## Desvios padrões dos folds para cada dataset

Unnamed: 0_level_0,RMSE_train_std,RMSE_test_std
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
Geographical,0.520696,0.86705
airfoil,0.134798,0.247348
concrete,0.091001,0.733336
energyCooling,0.040885,0.124064
energyHeating,0.057281,0.374745
tecator,0.036197,0.086805
towerData,0.142306,0.851318
wineRed,0.013981,0.049284
wineWhite,0.009144,0.021037
yacht,0.684408,1.305219


## Juntando tudo

Unnamed: 0_level_0,RMSE_train_mean,RMSE_test_mean,RMSE_train_std,RMSE_test_std
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Geographical,31.514253,31.170637,0.520696,0.86705
airfoil,6.050348,5.964124,0.134798,0.247348
concrete,8.26708,7.853382,0.091001,0.733336
energyCooling,2.204574,2.172953,0.040885,0.124064
energyHeating,2.866991,2.817327,0.057281,0.374745
tecator,1.39158,1.293588,0.036197,0.086805
towerData,15.113805,14.806532,0.142306,0.851318
wineRed,0.678045,0.657223,0.013981,0.049284
wineWhite,0.707626,0.687557,0.009144,0.021037
yacht,8.751017,7.563321,0.684408,1.305219
