In [None]:
import logging
import sys
import os
import pandas as pd
import numpy as np

# to include the SR code without installing in the environment
sys.path.append('../')

from symbolic_regression.SymbolicRegressor import SymbolicRegressor

# The operations

Here we define the list of allowed operations. In this project we implemented most of the arithmetic operations we expect to need in normal use. Please have a look at `symbolic_regression/operators.py` to see how they are defined and how to define your own operators.

In [None]:
# Wildcard import: the OPERATOR_* names used below are not imported anywhere
# else in this notebook, so they presumably all come from
# `symbolic_regression/operators.py`.
from symbolic_regression.operators import *

# Operators that are handed to `sr.fit(operations=...)` further down.
# Entries that are commented out are simply not part of the list; to enable
# one, uncomment its line (assuming that operator is defined in operators.py).
operations = [
    OPERATOR_ADD,
    OPERATOR_SUB,
    OPERATOR_MUL,
    OPERATOR_DIV,
    # OPERATOR_ABS,
    # OPERATOR_MOD,
    # OPERATOR_NEG,
    # OPERATOR_INV,
    OPERATOR_LOG,
    OPERATOR_EXP,
    OPERATOR_POW,
    OPERATOR_SQRT,
    # OPERATOR_MAX,
    # OPERATOR_MIN
]


# The example dataset: counterfeit banknotes classification

This is a very simple binary classification task: predict whether a banknote is counterfeit based on four characteristics. The dataset is publicly available on [Kaggle](https://www.kaggle.com/datasets/ritesaluja/bank-note-authentication-uci-data).

In [None]:
# Fixed seed for the row shuffle so that "Restart Kernel -> Run All" yields
# the same row order every time. Change it to obtain a different shuffle.
SHUFFLE_SEED = 42

data = pd.read_csv('./banknotes.csv')

# frac=1 draws every row exactly once, i.e. this is a pure shuffle.
data = data.sample(frac=1, random_state=SHUFFLE_SEED)

# Balanced sample weights: w = 1 / (2 * class frequency). Assuming `y` only
# takes the values 0 and 1, each class then carries half of the total weight.
positive_rate = data['y'].mean()
if not 0 < positive_rate < 1:
    # With a single class one of the two weights would be a division by zero.
    raise ValueError(
        f"Column 'y' must contain both classes to compute balanced weights "
        f"(mean of 'y' is {positive_rate})"
    )

data['w'] = np.where(data['y'] == 1,
                     1. / (2 * positive_rate),
                     1. / (2 * (1 - positive_rate)))

# Column names referenced by the fitness functions and by `sr.fit` below.
features = ['x1', 'x2', 'x3', 'x4']
target = 'y'
weights = 'w'

print(f'Dataset {data.shape}')


Here we define the base range from which the constants in the individuals are generated. We also define how those constants are optimized, so that they converge to the best value they can take within their expression.

We are using ADAM with the following configuration parameters.

In [None]:
# Base range for the constants generated inside the individuals.
const_range = (0, 1)

# Name of the optimizer used to tune those constants, followed by its
# configuration. Both values are handed to the BinaryCrossentropy fitness
# function in a later cell.
constants_optimization = 'ADAM'
constants_optimization_conf = dict(
    task='binary:logistic',  # or 'regression:wmse'
    learning_rate=1e-4,
    batch_size=64,
    epochs=50,
    verbose=0,
    gradient_clip=False,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-7,
    l1_param=1e-1,
    l2_param=0,
)


In [None]:
from symbolic_regression.multiobjective.fitness.Classification import AUC, BinaryCrossentropy
from symbolic_regression.multiobjective.fitness.Regression import NotConstant

# Objectives handed to `sr.fit(fitness_functions=...)`. All three are declared
# with minimize=True; only the cross-entropy objective receives the constants
# optimizer configuration.
fitness_functions = [
    BinaryCrossentropy(
        label='bce',
        target=target,
        weights=weights,
        logistic=True,
        constants_optimization=constants_optimization,
        constants_optimization_conf=constants_optimization_conf,
        minimize=True,
        hypervolume_reference=1.1,
    ),
    AUC(
        label='1-auc',
        target=target,
        weights=weights,
        logistic=True,
        one_minus=True,
        minimize=True,
        hypervolume_reference=1.1,
    ),
    NotConstant(
        label='not_constant',
        epsilon=.01,
        minimize=True,
        hypervolume_reference=1.1,
    ),
]

# Use this to modulate the relative frequency of genetic operations.
# E.g., with
#     {
#         'crossover': 2,
#         'mutation': 1,
#         # etc...
#     }
# crossover is chosen 2 times more frequently than mutation.
#
# NOTE(review): this mapping is not passed to SymbolicRegressor(...) or to
# sr.fit(...) in the cells visible here - confirm where it is consumed.
genetic_operators_frequency = dict(
    crossover=1,
    randomize=1,
    mutation=1,
    insert_node=1,
    delete_node=1,
    mutate_leaf=1,
    mutate_operator=1,
    recalibrate=1,
)


Here we define a population size of 300 individuals and a tournament size of 3 for the genetic operations. The training process, launched in the last cell, runs for 100 generations.

Setting the checkpoint file allows us to progressively save the population, either to resume training in a future session or to share the population with other participants.

In [None]:
# Size of the evolved population and of each selection tournament.
POPULATION_SIZE = 300
TOURNAMENT_SIZE = 3

# NOTE(review): no logging configuration is visible in this notebook; with
# Python's default WARNING threshold these INFO records would not be
# displayed - confirm whether the library configures logging on import.
logging.info(f'Running with POPULATION_SIZE {POPULATION_SIZE}')
logging.info(f'Running with TOURNAMENT_SIZE {TOURNAMENT_SIZE}')

# Build the regressor. Keyword arguments are grouped by concern:
# population/selection, expression generation, then checkpointing.
sr = SymbolicRegressor(
    client_name='banknotes',
    population_size=POPULATION_SIZE,
    tournament_size=TOURNAMENT_SIZE,
    const_range=const_range,
    parsimony=.8,
    parsimony_decay=.85,  # Expected depth = parsimony / (1-parsimony_decay)
    checkpoint_file='./banknotes_checkpoint.save',
    checkpoint_frequency=10,
)


In [None]:
# Number of generations requested from this training session.
GENERATIONS = 100

# Launch the training on the dataset, columns, operators and objectives
# prepared in the previous cells.
sr.fit(
    data=data,
    features=features,
    operations=operations,
    fitness_functions=fitness_functions,
    generations_to_train=GENERATIONS,
    stop_at_convergence=False,
    n_jobs=-1,
    verbose=2,
)

# Marks the end of the run in the cell output.
print('End')
