# Notebook to perform gridsearch in GSGP

The original implementation did not have the same transformation functions as those used in the ITEA algorithm, and used MAE as a cost function. The code used in this paper was modified from its original version.

## Input format:

The program has a somewhat annoying file format, and it is necessary to convert the datasets that we use in other programs to an appropriate form. From the documentation:

> Input files are .txt files where values are separated by a TAB. The first two rows of each file represent the number of independent variables of the problem and the number of instances in the dataset. Each row represents an instance, while each column contains the values of a variable. The last column contains the target values. The library comes with two example input files (training and test files).

> NOTE: in test mode the test file must not have the number of instances in the dataset and also the target column must be removed.

## New transformation functions

The following functions were added: {id, sin, cos, tanh, sqrt(|.|), log, exp}

## Gridsearch

Since the implementation is in C++ but Python is more practical for automating the tests, this notebook implements functions that manipulate the files related to the program.


In [None]:
import os.path   as path
import os
import glob
import pandas as pd
import numpy as np
from itertools import product
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold

# Working directory of the notebook kernel at the time this cell runs; the
# other cells build their absolute paths from it.
cur_folder = os.getcwd()

# Relative locations of the datasets: the comma-separated originals and the
# tab-separated copies that are generated for the GSGP executable.
original_datasets_folder = '../../../datasets/commaSeparated'
adapted_datasets_folder  = '../../../datasets/tabSeparated'

# Compile the GSGP source code into the executable called by the cells below.
# NOTE(review): IPython runs each `!` line in its own subshell, so this `cd`
# presumably does not affect the `g++` call on the next line. Since
# `cur_folder` is the current working directory anyway it looks harmless --
# confirm.
! cd "{cur_folder}/"
! g++ GP.cc -o GP.exe

print('Done')

## Generating the databases

You should run the cell below only if the tabSeparated folder is empty

In [None]:
def _write_gsgp_file(frame, out_path, n_vars):
    """Write `frame` to `out_path` in the tab-separated GSGP input layout.

    The file starts with two header rows (the number of independent variables,
    then the number of rows in `frame`) followed by one tab-separated row per
    instance, with floats formatted as '%1.9f'.

    Parameters:
        frame: pandas DataFrame with the rows to export.
        out_path: destination file; it is overwritten if it already exists.
        n_vars: number of independent variables, written in the first row.
    """
    frame.to_csv(
        out_path, sep='\t', index=None, header=None, float_format='%1.9f'
    )

    with open(out_path, 'r') as f:
        body = f.read()

    # body[:-1] drops the last character pandas wrote (the final line break),
    # so the generated file does not end with an empty line.
    with open(out_path, 'w') as f:
        f.write(f'{n_vars}\n{len(frame)}\n' + body[:-1])


source_dir = f'{cur_folder}/{original_datasets_folder}'
target_dir = f'{cur_folder}/{adapted_datasets_folder}'

# Creating the output folder (if it does not exist)
os.makedirs(target_dir, exist_ok=True)

# For each dataset
for dataset in glob.glob(f'{source_dir}/*.dat'):

    # Extracting the file name without folder and extension. Using os.path
    # keeps this correct whatever path separator glob returns.
    file_name = os.path.splitext(os.path.basename(dataset))[0]

    print(f'{target_dir}/{file_name}.txt')

    # Loading with pandas
    df = pd.read_csv(dataset, header=None, sep=',')

    # Every column except the last one (the target) is an independent variable
    nvar = df.shape[1] - 1

    # Saving the whole partition, separated with tabs and preceded by its size
    _write_gsgp_file(df, f'{target_dir}/{file_name}.txt', nvar)

    # Train partitions are additionally divided into 5 folds
    if 'train' in file_name:
        for i, (train_index, test_index) in enumerate(KFold(n_splits=5).split(df.iloc[:, :-1])):
            _write_gsgp_file(
                df.iloc[train_index, :],
                f'{target_dir}/{file_name}-train-{i}.txt',
                nvar,
            )
            _write_gsgp_file(
                df.iloc[test_index, :],
                f'{target_dir}/{file_name}-validation-{i}.txt',
                nvar,
            )

## Functions that perform gridsearch

In [None]:
# Update the configuration file with a new configuration to run
def update_config(conf, pred=False):
    """Rewrite `{cur_folder}/configuration.ini` with the values of `conf`.

    On every non-blank line the first whitespace-separated token is taken as
    the parameter name and the last token as its current value. Only that
    trailing value is replaced; the rest of the line is kept as it is.

    Parameters:
        conf: mapping from parameter name to the value to write. It must have
            an entry for every parameter in the file except the two reserved
            ones below.
        pred: the reserved parameters `expression_file` and `USE_TEST_SET`
            are written as 1 when True and as 0 when False.

    Raises:
        KeyError: if the file has a non-reserved parameter missing in `conf`.
    """
    config_path = f'{cur_folder}/configuration.ini'
    reserved = ('expression_file', 'USE_TEST_SET')

    with open(config_path, 'r') as f:
        lines = f.read().splitlines()

    updated = []
    for line in lines:
        tokens = line.split()

        # Blank lines carry no parameter: keep them instead of failing
        if not tokens:
            updated.append('\n')
            continue

        param, old_value = tokens[0], tokens[-1]
        new_value = (1 if pred else 0) if param in reserved else conf[param]

        # Cut the old value off the end of the line rather than using
        # str.replace, which would also rewrite the same text when it shows
        # up inside the parameter name.
        stripped = line.rstrip()
        prefix = stripped[:len(stripped) - len(old_value)]
        updated.append(f'{prefix}{new_value}\n')

    with open(config_path, 'w') as f:
        f.writelines(updated)

        
# function that takes a train and test dataset, the parameters to use in the
# algorithm, and performs a single run.
def run(dataset_train, dataset_test, conf=None):
    update_config(conf, pred=False)
    
    ! ./GP.exe -train_file {dataset_train} -test_file {dataset_test}

    # Getting the best fitness on train and test
    fitness_train    = pd.read_csv(f'{cur_folder}/fitnesstrain.txt', header=None)
    fitness_test     = pd.read_csv(f'{cur_folder}/fitnesstest.txt', header=None)
    
    return (
        fitness_train.iloc[-1, 0],
        fitness_test.iloc[-1, 0],
    )


# Our gridsearch implementation.
# Function that receives training and validation partitions (5-fold), and a
# list of settings, and determines the best setting.
# Returns the average RMSE of the best configuration, a dictionary of the best
# configuration, and the id of the best configuration.
# The configuration found will be used in all repetitions of a training-test division.
# The last 2 parameters are for saving information
def gridsearch(dataset_train_cv, dataset_validation_cv, confs, ds, fold):

    # Creating checkpoints during the gridsearch
    gridDF = pd.DataFrame(columns = ['dataset', 'Fold', 'conf'] + [f'RMSE_{i}' for i in range(5)])
    gridDF = gridDF.set_index(['dataset', 'Fold', 'conf'])

    if os.path.isfile('../../../results/gridsearch/GSGP-gridsearch.csv'):
        gridDF = pd.read_csv('../../../results/gridsearch/GSGP-gridsearch.csv')
        gridDF = gridDF.set_index(['dataset', 'Fold', 'conf'])
    
    # (rmse_cv, configuration, configuration index)
    best_conf = (np.inf, None, -1)
    
    for i, conf in enumerate(confs):
        
        update_config(conf, pred=False)
        
        if gridDF.index.isin([(ds, fold, i)]).any():
            print(f'successfully loaded result for configuration {i}')
        else:
            print(f'Testing configuration {i}/{len(confs)}', end='')

            gridDF.loc[(ds, fold, i), :] = (np.nan, np.nan, np.nan, np.nan, np.nan)

            gridDF.to_csv('../../../results/gridsearch/GSGP-gridsearch.csv', index=True)
        
        RMSE_cv = []
        for j, (train_cv, validation_cv) in enumerate(zip(dataset_train_cv, dataset_validation_cv)):
            if not np.isnan(gridDF.loc[(ds, fold, i), f'RMSE_{j}']):
                RMSE_cv.append(gridDF.loc[(ds, fold, i), f'RMSE_{j}'])
                print(f'recovered information for fold {j}: {RMSE_cv[-1]}')

                continue
            else:
                print(f'evauating fold {j} on cross-validation...')
                    
            ! ./GP.exe -train_file {train_cv} -test_file {validation_cv}
            
            cv_test = pd.read_csv(f'{cur_folder}/fitnesstest.txt', header=None)
            RMSE_cv.append(cv_test.iloc[-1, 0])

            # Here we know that this line exists
            gridDF.loc[(ds, fold, i), f'RMSE_{j}'] = cv_test.iloc[-1, 0]
            
            gridDF.to_csv('../../../results/gridsearch/GSGP-gridsearch.csv', index=True)

        if np.mean(RMSE_cv) < best_conf[0]:
            best_conf = (np.mean(RMSE_cv), conf,  i)
            
    return best_conf

In [None]:
# gridsearch_configurations is a dictionary, where each key is a parameter
# and its value can be one of two options:
# - list (python native):
#       contains the values that will be used in the search
# - lambda functions:
#       used for dynamic parameters, that assumes the value based on others

# Creation of parameters: a Cartesian product will be made
# over all passed lists, then the lambda functions will be applied
# about each configuration obtained.

gridsearch_configurations = {
    'population_size'        : [100, 250, 500],
    'max_number_generations' : lambda conf:  100000//conf['population_size'],
    'init_type'              : 2,
    'p_crossover'            : [0.2, 0.5, 0.8],
    'p_mutation'             : lambda conf: 1 - conf['p_crossover'],
    'max_depth_creation'     : 5,
    'tournament_size'        : 4,
    'zero_depth'             : 0,
    'mutation_step'          : 1.0, 
    'num_random_constants'   : 0,
    'min_random_constant'    : -100,
    'max_random_constant'    : 100,
    'minimization_problem'   : 1,
    'random_tree'            : 500
    
    # As de baixo ficam reservadas para o script controlar. Elas são relacionadas a
    # executar a evolução ou usar um resultado encontrado
    
    #'expression_file'        : 0,
    #'USE_TEST_SET'           : 0
}

varying = []

keys, values = [], []
for k,v in gridsearch_configurations.items():
    if isinstance(v, list):
        values.append(v)
        varying.append(k)
    elif (isinstance(v, int) or isinstance(v, float)): 
        values.append([v])
    elif callable(v):
        continue
    else:
        raise Exception('Error creating the configurations')
    keys.append(k)
        
confs = [dict(zip(keys,items)) for items in product(*values)]

for conf in confs:
    for k,v in gridsearch_configurations.items():
        if callable(v):
            conf[k] = v(conf)
            varying.append(k)

# Saving the configuration informations
confs_df = pd.DataFrame(confs, index=[f'conf {i}' for i in range(len(confs))]).T
confs_df.index.names = ['Parameters']
confs_df.to_csv('GSGP-new-gridsearch_configurations.csv')

confs_df.style.apply(
    lambda x: ['background: lightgreen' if x.name in varying else '' for i in x], 
    axis=1
)

## GRIDSEARCH

The cell below uses everything the previous cells created to run a gridsearch. Note that this process involves several terminal calls and manipulation of the files that GSGP creates in the folder.

The code is written so that it can be interrupted, but I am not sure whether this can cause side effects (because of open files, etc.).

Ideally, if you know beforehand that the code will need to be interrupted for exceeding the time limit, one option is to run one dataset at a time.

In [None]:
# Experiment layout: n_runs executions per dataset, spread evenly over the folds
n_folds       = 5
n_runs        = 30
runs_per_fold = n_runs//n_folds

datasets = [
    'airfoil',
    'concrete',
    'energyCooling',
    'energyHeating',
    'Geographical',
    'towerData',
    'tecator',
    'wineRed',
    'wineWhite',
    'yacht',
]    

columns = ['dataset','conf','Fold','Rep','RMSE_cv','RMSE_train','RMSE_test']


# Results file; it is also the checkpoint that allows resuming the experiment
fname = '../../../results/rmse/GSGP-resultsregression.csv'
results = {c:[] for c in columns}

# Create the results folder up front, so the first save (which happens only
# after a complete gridsearch and run) cannot fail because it is missing.
os.makedirs(os.path.dirname(fname), exist_ok=True)

if os.path.isfile(fname):
    resultsDF = pd.read_csv(fname)
    results   = resultsDF.to_dict('list')

for ds in datasets:
    print(f'Gridsearch --- data set: {ds}')
        
    for fold in range(n_folds):
        dataset_train_cv      = [f'{adapted_datasets_folder}/{ds}-train-{fold}-train-{i}.txt' for i in range(5)]
        dataset_validation_cv = [f'{adapted_datasets_folder}/{ds}-train-{fold}-validation-{i}.txt' for i in range(5)]
        
        dataset_train         = f'{adapted_datasets_folder}/{ds}-train-{fold}.txt'
        dataset_test          = f'{adapted_datasets_folder}/{ds}-test-{fold}.txt'

        if not os.path.isfile(dataset_train):
            print(f'Dataset {dataset_train} does not exist.')
            continue
            
        # Reuse the configuration chosen for this dataset/fold if a previous
        # execution already saved at least one result for it
        RMSE_cv, conf, conf_id = None, None, None
        if os.path.isfile(fname):
            resultsDF = pd.read_csv(fname)
            results   = resultsDF.to_dict('list')

            aux_resultsDF = resultsDF[
                (resultsDF['dataset']==ds) &
                (resultsDF['Fold']==fold)
            ]
            if len(aux_resultsDF) > 0:
                conf_id = aux_resultsDF['conf'].values[0]
                RMSE_cv = aux_resultsDF['RMSE_cv'].values[0]
                conf = confs[conf_id]

                print(f'Using previously configuration: {RMSE_cv}, {conf_id}')

        # The three values are always set together, so testing one is enough;
        # `is None` avoids comparing a float, a dict and an id with `==`.
        if conf is None:
            print('Evaluating fold in gridsearch')  
            RMSE_cv, conf, conf_id = gridsearch(dataset_train_cv, dataset_validation_cv, confs, ds, fold)

        for rep in range(runs_per_fold):
            # Reload the file on every repetition so finished runs are skipped
            if os.path.isfile(fname):
                resultsDF = pd.read_csv(fname)
                results   = resultsDF.to_dict('list')

                if len(resultsDF[
                    (resultsDF['dataset']==ds) &
                    (resultsDF['Fold']==fold)  &
                    (resultsDF['Rep']==rep)
                ])==1:
                    print(f'already evaluated {ds}-{fold}-{rep}')

                    continue

            print(f'evaluating config {conf_id} for {ds}-{fold}-{rep}')
            
            RMSE_train, RMSE_test = run(dataset_train, dataset_test, conf)

            results['dataset'].append(ds)
            results['conf'].append(conf_id)
            results['RMSE_cv'].append(RMSE_cv)
            results['RMSE_train'].append(RMSE_train)
            results['RMSE_test'].append(RMSE_test)
            results['Fold'].append(fold)
            results['Rep'].append(rep)

            # Save after every run, so an interruption loses one run at most
            df = pd.DataFrame(results)
            df.to_csv(fname, index=False)

print('done')


# Cleaning auxiliary files
files = ['fitnesstest.txt', 'fitnesstrain.txt', 'trace.txt']
for f in files:
    aux_path = f'{cur_folder}/{f}'
    # A file may be missing (for instance when no run was executed in this
    # session); skip it instead of failing with FileNotFoundError.
    if os.path.isfile(aux_path):
        os.remove(aux_path)