In [2]:
import os
import glob

import os.path as path
import pandas  as pd
import numpy   as np

from itertools         import product
from sklearn.metrics   import mean_squared_error, mean_absolute_error, make_scorer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# Folder holding the `<dataset>-train-<fold>.dat` / `<dataset>-test-<fold>.dat`
# files that the experiment loop reads with np.loadtxt.
datasets_folder = '../datasets/'


# ------------------------------------------------------------------
# Fabrício's code - everything except the main
import dcgpy
import pygmo as pg
# Show the installed dcgpy version so the run can be reproduced later.
print(dcgpy.__version__)
    
def plog1p(x):
    """Protected ``log1p``: ``log(1 + x)`` where it is defined, 0.0 elsewhere.

    Parameters
    ----------
    x : scalar or array-like exposing a ``shape`` attribute
        Input value(s).

    Returns
    -------
    float or numpy.ndarray
        ``np.log1p(x)`` for entries that are not ``<= -1.0`` and ``0.0`` for
        the entries that are. The array path always yields a floating-point
        result, so integer inputs are not truncated, and NaN entries propagate
        as NaN exactly like on the scalar path (instead of being left as
        uninitialised memory).
    """
    if hasattr(x, "shape"):
        # np.log1p is evaluated on every entry before the mask is applied, so
        # silence the divide/invalid warnings raised by entries that are
        # replaced with 0.0 anyway.
        with np.errstate(divide="ignore", invalid="ignore"):
            return np.where(x <= -1.0, 0.0, np.log1p(x))
    if x<=-1.0:
        return 0.0
    return np.log1p(x)
    
def pmul(x, y):
    """Protected multiplication: ``x * y`` with non-finite results sanitised.

    Parameters
    ----------
    x, y : scalar or array-like
        Factors, multiplied element-wise with ``np.multiply``.

    Returns
    -------
    numpy.ndarray, numpy scalar or float
        When the product exposes a ``shape`` attribute (arrays and NumPy
        scalars) it is passed through ``np.nan_to_num``, which maps NaN to 0.0
        and infinities to very large finite numbers. Otherwise a NaN or
        infinite product becomes 0.0 and a finite product is returned as is.
    """
    z = np.multiply(x,y)
    if hasattr(z, "shape"):
        return np.nan_to_num(z)
    if np.isnan(z) or np.isinf(z):
        return 0.0
    # The original fell off the end here and implicitly returned None for a
    # finite product; return the product itself.
    return z
         
def safe_mean_square(y, yhat):
    """Mean squared difference between ``y`` and ``yhat``, guarded against NaN/inf.

    Returns the mean of ``(y - yhat) ** 2``; when that mean is NaN or infinite
    the penalty value ``1e+30`` is returned instead.
    """
    mse = np.mean(np.square(y - yhat))
    is_invalid = np.isnan(mse) or np.isinf(mse)
    return 1e+30 if is_invalid else mse
    
    
def RMSE(yhat, y):
    """Root mean squared error between predictions ``yhat`` and targets ``y``.

    Parameters
    ----------
    yhat, y : scalar or array-like
        Predictions and reference values.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((yhat - y) ** 2))``.

    Notes
    -----
    When the two inputs differ only by singleton axes (for example an
    ``(n, 1)`` column of predictions against an ``(n,)`` vector of labels) both
    are squeezed first. Without this, NumPy broadcasting would silently build
    an ``(n, n)`` matrix of pairwise differences and report an inflated error.
    Inputs with equal shapes, and genuine scalar-vs-array broadcasting, behave
    exactly as before.
    """
    yhat = np.asarray(yhat)
    y = np.asarray(y)
    if yhat.shape != y.shape:
        yhat_flat, y_flat = np.squeeze(yhat), np.squeeze(y)
        if yhat_flat.shape == y_flat.shape:
            yhat, y = yhat_flat, y_flat
    return np.sqrt(np.square(yhat - y).mean())

print('Done')

1.4.1
Done


In [4]:
# Criação das configurações

# gridsearch_configuration é um dicionário, onde cada key é um parâmetro
# e o seu valor pode ser uma das duas opções:
# - lista (nativa): contém os valores que serão utilizados na busca (colocar só 1 se for fixar)
# - funções lambda: usada para parâmetros que assumem o valor baseado em outros

# Criação dos parâmetros: será feito um produto cartesiano
# sobre todas as listas passadas, e então as funções lambda serão aplicadas
# sobre cada configuração obtida pelo prod. cartesiano.

gridsearch_configurations = {
    'cols' : [100, 250, 500],
    'gen'  : lambda conf:  100000//conf['cols']
}

keys, values, varying = [], [], []
for k,v in gridsearch_configurations.items():
    if isinstance(v, list): 
        values.append(v)
        if len(v) > 1: # Salvando quem varia para printar dicionário colorido
            varying.append(k)
    elif callable(v): 
        continue
    else:
        raise Exception('o que é isso?')
    keys.append(k)
        
confs = [dict(zip(keys,items)) for items in product(*values)]

for conf in confs:
    for k,v in gridsearch_configurations.items():
        if callable(v):
            conf[k] = v(conf)
            varying.append(k)
                
# Verificar se temos em todas as confs o mesmo número
# que deveriamos ter do dicionário de valores
for conf in confs:
    if set(conf.keys()) != set(gridsearch_configurations.keys()):
        raise Exception(f'Parâmetros de busca e da configuração específica não batem. Configuração:\n{conf}')

# Criando um dataframe para enumerar e visualizar melhor as configurações
confs_df = pd.DataFrame(confs, index=[f'conf {i}' for i in range(len(confs))]).T
confs_df.index.names = ['Parameters']
confs_df.to_csv('gridsearch_configurations.csv')

# Destacando apenas os parâmetros que são diferentes entre algumas configurações
confs_df.style.apply(
    lambda x: ['background: lightgreen' if x.name in varying else '' for i in x], 
    axis=1
)

Unnamed: 0_level_0,conf 0,conf 1,conf 2
Parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cols,100,250,500
gen,1000,400,200


In [None]:
# Receives a dataset split and a configuration and runs the algorithm
def run(dataset_train, dataset_test, cols, gen):
    """Evolve one dCGP symbolic-regression model and score it with RMSE.

    Parameters
    ----------
    dataset_train, dataset_test : 2-D numpy.ndarray
        Every column except the last is used as a feature; the last column is
        the target.
    cols : int
        Number of columns of the cartesian program (passed to
        ``dcgpy.symbolic_regression``).
    gen : int
        Number of generations (passed to ``dcgpy.es4cgp``).

    Returns
    -------
    tuple
        ``(RMSE_train, RMSE_test)`` of the population champion.
    """
    # kernels: https://darioizzo.github.io/dcgp/docs/cpp/kernel_list.html
    ss = dcgpy.kernel_set_double(["sum", "diff", "mul", "pdiv", "sin", "cos", "tanh", "log", "exp", "psqrt"])

    Xtrain, ytrain = dataset_train[:, :-1], dataset_train[:, -1]
    Xtest,  ytest  = dataset_test[:, :-1],  dataset_test[:, -1]

    udp = dcgpy.symbolic_regression(
        points = Xtrain, labels = ytrain[:,np.newaxis], kernels=ss(),
        rows=1, cols=cols, levels_back=21, arity=2,
        n_eph=3, multi_objective=False, parallel_batches = 0
    )

    uda  = dcgpy.es4cgp(gen = gen, mut_n = 1)

    algo = pg.algorithm(uda)
    pop = pg.population(udp, 4)
    pop = algo.evolve(pop)

    # NOTE(review): the labels are given to the UDP as an (n, 1) column, and
    # predict() presumably returns the same layout for 2-D points - confirm
    # against the dcgpy docs. Flattening keeps the predictions aligned with the
    # 1-D targets; otherwise (n, 1) - (n,) broadcasts to an (n, n) matrix and
    # inflates the RMSE. It is a no-op if predict() already returns 1-D output.
    yhat_train = np.asarray(udp.predict(Xtrain, pop.champion_x)).reshape(-1)
    yhat_test  = np.asarray(udp.predict(Xtest,  pop.champion_x)).reshape(-1)

    return RMSE(yhat_train, ytrain), RMSE(yhat_test, ytest)
    
    
# Searches for the best configuration:
def gridsearch(dataset_train, confs, n_splits=5, random_state=None):
    """Pick the configuration with the lowest mean cross-validated test RMSE.

    Parameters
    ----------
    dataset_train : 2-D numpy.ndarray
        Training data; it is split row-wise into folds and each split is
        handed to ``run``.
    confs : list of dict
        Candidate configurations; each dict is expanded as keyword arguments
        of ``run``.
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int or None, default None
        Forwarded to ``KFold``. ``None`` keeps the previous unseeded shuffling;
        pass an integer to make the fold assignment reproducible.

    Returns
    -------
    tuple
        ``(mean RMSE_cv, configuration, configuration index)``. If no
        configuration scores below infinity (for example ``confs`` is empty or
        every mean is NaN) the initial ``(np.inf, {}, -1)`` is returned.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # (rmse_cv, configuration, configuration index)
    best_conf = (np.inf, {}, -1)

    for i, conf in enumerate(confs):
        print(f'Testando configuração {i}/{len(confs)}', end='')

        RMSE_cv = []
        for train_index, test_index in kf.split(dataset_train):
            # Only the held-out error is used for model selection
            _, RMSE_test = run(dataset_train[train_index, :], dataset_train[test_index, :], **conf)
            RMSE_cv.append(RMSE_test)

        mean_RMSE_cv = np.mean(RMSE_cv)
        print(f': {mean_RMSE_cv}, {RMSE_cv}')
        if mean_RMSE_cv < best_conf[0]:
            best_conf = (mean_RMSE_cv, conf,  i)

    return best_conf

In [5]:
# Gridsearch
#
# For every dataset and fold: select a configuration with gridsearch() on the
# training split (or reuse the one stored in the results file), then evaluate
# it runs_per_fold times on the train/test split. The results file is rewritten
# after every evaluation so an interrupted execution can be resumed.

n_folds       = 5
n_runs        = 30
runs_per_fold = n_runs//n_folds

datasets = [
    'airfoil',
    'concrete',
    'energyCooling',
    'energyHeating',
    'GeographicalOriginalofMusic',
    'towerData',
    'tecator',
    'wineRed',
    'wineWhite',
    'yacht',
]

# ---------------------------
columns   = ['dataset','conf','Fold','Rep','RMSE_cv','RMSE_train','RMSE_test']

fname     = '../docs/dCartesian-resultsregression.csv'

results   = {c:[] for c in columns}
resultsDF = pd.DataFrame(columns=columns)

# Resume from the results of a previous execution, if any
if os.path.isfile(fname):
    resultsDF = pd.read_csv(fname)
    results   = resultsDF.to_dict('list')

for ds in datasets:
    print(f'Executando agora para o dataset {ds}')
    for fold in range(n_folds):
        train_path = os.path.join(datasets_folder, f'{ds}-train-{fold}.dat')
        test_path  = os.path.join(datasets_folder, f'{ds}-test-{fold}.dat')

        # Skip splits whose files are missing or unreadable. Only I/O and
        # parsing errors are caught, so Ctrl-C and programming errors still
        # surface instead of being silently swallowed.
        try:
            dataset_train = np.loadtxt(train_path, delimiter=',')
            dataset_test  = np.loadtxt(test_path, delimiter=',')
        except (OSError, ValueError) as err:
            print(f'Dataset {train_path} / {test_path} could not be loaded: {err}')
            continue

        print(f'Executando para o fold {fold}')

        # Rows already stored for this dataset/fold (snapshot loaded before the loop)
        fold_resultsDF = resultsDF[(resultsDF['dataset']==ds) & (resultsDF['Fold']==fold)]

        RMSE_cv, conf, conf_id = None, None, None
        if len(fold_resultsDF)>0:
            # This fold was already evaluated in some repetition: reuse the
            # configuration stored for it (they are all the same within a
            # fold, so any repetition will do)
            conf_id = fold_resultsDF['conf'].values[0]
            RMSE_cv = fold_resultsDF['RMSE_cv'].values[0]
            conf    = confs[conf_id]

            print(f'Pegando configuração avaliada anteriormente: {RMSE_cv}, {conf_id}')
        else:
            # Find the best configuration for this train-test split
            print('Obtendo a melhor configuração utilizando 5-fold cv no treino')
            RMSE_cv, conf, conf_id = gridsearch(dataset_train, confs)

        print('Começando a testar a melhor configuração obtida')
        for rep in range(runs_per_fold):
            # Any stored row (even a duplicated one) means this repetition is done
            if (fold_resultsDF['Rep']==rep).any():
                print(f'already evaluated {ds}-{fold}-{rep}')

                continue

            print(f'evaluating config {conf_id} for {ds}-{fold}-{rep}')

            RMSE_train, RMSE_test = run(dataset_train, dataset_test, **conf)

            results['dataset'].append(ds)
            results['conf'].append(conf_id)
            results['RMSE_cv'].append(RMSE_cv)
            results['RMSE_train'].append(RMSE_train)
            results['RMSE_test'].append(RMSE_test)
            results['Fold'].append(fold)
            results['Rep'].append(rep)

            # Checkpoint after every evaluation
            df = pd.DataFrame(results)
            df.to_csv(fname, index=False)

print('done')

Executando agora para o dataset airfoil
Executando para o fold 0
Pegando configuração avaliada anteriormente: 6.897974362581838, 0
Começando a testar a melhor configuração obtida
already evaluated airfoil-0-0
already evaluated airfoil-0-1
already evaluated airfoil-0-2
already evaluated airfoil-0-3
already evaluated airfoil-0-4
already evaluated airfoil-0-5
Executando para o fold 1
Pegando configuração avaliada anteriormente: 14.024675150633335, 0
Começando a testar a melhor configuração obtida
already evaluated airfoil-1-0
already evaluated airfoil-1-1
already evaluated airfoil-1-2
already evaluated airfoil-1-3
already evaluated airfoil-1-4
already evaluated airfoil-1-5
Executando para o fold 2
Pegando configuração avaliada anteriormente: 8.911016073392071, 0
Começando a testar a melhor configuração obtida
already evaluated airfoil-2-0
already evaluated airfoil-2-1
already evaluated airfoil-2-2
already evaluated airfoil-2-3
already evaluated airfoil-2-4
already evaluated airfoil-2-5
Ex

Executando para o fold 0
Pegando configuração avaliada anteriormente: 106.68666652297605, 0
Começando a testar a melhor configuração obtida
already evaluated towerData-0-0
already evaluated towerData-0-1
already evaluated towerData-0-2
already evaluated towerData-0-3
already evaluated towerData-0-4
already evaluated towerData-0-5
Executando para o fold 1
Pegando configuração avaliada anteriormente: 110.20409813866029, 0
Começando a testar a melhor configuração obtida
already evaluated towerData-1-0
already evaluated towerData-1-1
already evaluated towerData-1-2
already evaluated towerData-1-3
already evaluated towerData-1-4
already evaluated towerData-1-5
Executando para o fold 2
Pegando configuração avaliada anteriormente: 104.1227960494684, 0
Começando a testar a melhor configuração obtida
already evaluated towerData-2-0
already evaluated towerData-2-1
already evaluated towerData-2-2
evaluating config 0 for towerData-2-3


NameError: name 'run' is not defined

In [17]:
# documentation: https://darioizzo.github.io/dcgp/installation.html

fname = '../docs/dCartesian-resultsregression.csv'

resultsDF = pd.read_csv(fname)

pd.set_option('display.max_colwidth', None) # do not truncate columns when using display

# NOTE(review): the original note here read "it is crashing on the number of
# random constants" - unclear what it refers to; confirm or remove.
display(resultsDF)

# Getting the best configuration for each dataset


# For each dataset-fold-rep, take the row with the lowest RMSE_cv
resultsDF_ = resultsDF.loc[resultsDF.groupby(['dataset', 'Fold', 'Rep'])['RMSE_cv'].idxmin()]
resultsDF_ = resultsDF_.set_index(['dataset', 'Fold', 'Rep'])
display(resultsDF_)

# Aggregate only the RMSE columns, selected by name. The positional
# `.iloc[:, 2:]` used before silently depended on the CSV column order and
# aggregated every other column first.
rmse_columns    = ['RMSE_train', 'RMSE_test']
rmse_by_dataset = resultsDF_.groupby(['dataset'])[rmse_columns]

# Mean over all folds/repetitions of each dataset
# (the variable keeps its historical name, but it holds means, not medians)
resultsDF_median = rmse_by_dataset.mean()
resultsDF_median.columns = ['RMSE_train_mean', 'RMSE_test_mean']
display(resultsDF_median)

# Standard deviation over the same rows
resultsDF_std = rmse_by_dataset.std()
resultsDF_std.columns = ['RMSE_train_std', 'RMSE_test_std']
display(resultsDF_std)

# Join everything into a single table
resultsDF_ = pd.merge(resultsDF_median, resultsDF_std, left_index=True, right_index=True)
display(resultsDF_)

Unnamed: 0,dataset,conf_id,Fold,Rep,RMSE_cv,RMSE_train,RMSE_test
0,airfoil,0,0,0,9.144841,8.477716,8.278631
1,airfoil,0,0,1,9.144841,7.870977,7.695525
2,airfoil,0,0,2,9.144841,6.896727,6.895256
3,airfoil,0,0,3,9.144841,6.908958,6.924614
4,airfoil,0,0,4,9.144841,7.170106,7.133389
...,...,...,...,...,...,...,...
295,yacht,2,4,1,16.681005,15.314454,14.625565
296,yacht,2,4,2,16.681005,18.140060,16.719771
297,yacht,2,4,3,16.681005,15.240160,14.726183
298,yacht,2,4,4,16.681005,18.226529,16.797193


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,conf_id,RMSE_cv,RMSE_train,RMSE_test
dataset,Fold,Rep,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GeographicalOriginalofMusic,0,0,0,52.440069,61.634917,311.384187
GeographicalOriginalofMusic,0,1,0,52.440069,52.649652,54.107464
GeographicalOriginalofMusic,0,2,0,52.440069,53.306399,54.701052
GeographicalOriginalofMusic,0,3,0,52.440069,60.263786,59.991457
GeographicalOriginalofMusic,0,4,0,52.440069,52.764772,54.267429
...,...,...,...,...,...,...
yacht,4,1,2,16.681005,15.314454,14.625565
yacht,4,2,2,16.681005,18.140060,16.719771
yacht,4,3,2,16.681005,15.240160,14.726183
yacht,4,4,2,16.681005,18.226529,16.797193


Unnamed: 0_level_0,RMSE_train_mean,RMSE_test_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
GeographicalOriginalofMusic,55.597305,63.745581
airfoil,7.563105,7.545861
concrete,19.447081,19.499374
energyCooling,12.894605,12.840859
energyHeating,13.735053,13.669918
tecator,4.102555,4.101387
towerData,106.337122,106.933651
wineRed,1.141357,1.140067
wineWhite,1.142949,1.144954
yacht,17.085201,17.00525


Unnamed: 0_level_0,RMSE_train_std,RMSE_test_std
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
GeographicalOriginalofMusic,3.411872,46.898324
airfoil,1.334553,1.381212
concrete,2.418705,2.270004
energyCooling,2.346743,2.246996
energyHeating,2.017241,1.880084
tecator,0.353487,0.426294
towerData,24.033633,24.386087
wineRed,0.973346,0.965135
wineWhite,0.840545,0.833901
yacht,1.764258,2.203958


Unnamed: 0_level_0,RMSE_train_mean,RMSE_test_mean,RMSE_train_std,RMSE_test_std
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GeographicalOriginalofMusic,55.597305,63.745581,3.411872,46.898324
airfoil,7.563105,7.545861,1.334553,1.381212
concrete,19.447081,19.499374,2.418705,2.270004
energyCooling,12.894605,12.840859,2.346743,2.246996
energyHeating,13.735053,13.669918,2.017241,1.880084
tecator,4.102555,4.101387,0.353487,0.426294
towerData,106.337122,106.933651,24.033633,24.386087
wineRed,1.141357,1.140067,0.973346,0.965135
wineWhite,1.142949,1.144954,0.840545,0.833901
yacht,17.085201,17.00525,1.764258,2.203958
