In [None]:
import logging
import sys
import pandas as pd

sys.path.append('../')  # to include the SR code without installing in the environment

from symbolic_regression.SymbolicRegressor import SymbolicRegressor

# The operations

Here we define the list of allowed operations. This project implements most of the arithmetic operations we expect to need in normal use. Have a look at `symbolic_regression/operators.py` to see how they are defined and how to define your own operators.

In [None]:
# The star import brings every OPERATOR_* constant into scope, including the
# ones that are currently disabled below.
from symbolic_regression.operators import *

# Operators the regressor is allowed to use when building expressions.
# The order of the enabled entries is kept exactly as originally listed.
operations = [
    # Basic arithmetic
    OPERATOR_ADD, OPERATOR_SUB, OPERATOR_MUL, OPERATOR_DIV,
    # Disabled for this example (move back into the list to enable):
    #   OPERATOR_ABS, OPERATOR_MOD, OPERATOR_NEG, OPERATOR_INV
    # Logarithm, exponential, power and square root
    OPERATOR_LOG, OPERATOR_EXP, OPERATOR_POW, OPERATOR_SQRT,
    # Maximum / minimum
    OPERATOR_MAX, OPERATOR_MIN,
]

# The example dataset: Body Fat Index

This example builds a score that predicts the Body Fat Index of a person.

In [None]:
def min_max_normalize(df):
    """Rescale every column of ``df`` linearly onto the [0, 1] interval.

    Each column is transformed as ``(x - min) / (max - min)`` using that
    column's own minimum and maximum. The input frame is not modified.

    A constant column has ``max - min == 0``; dividing by it would turn the
    whole column into NaN, so such a column is mapped to 0.0 instead.

    Args:
        df: pandas DataFrame whose columns are all numeric.

    Returns:
        A new DataFrame with the same index and columns as ``df``.
    """
    result = df.copy()
    for feature_name in df.columns:
        column = df[feature_name]
        min_value = column.min()
        value_range = column.max() - min_value
        shifted = column - min_value
        if value_range == 0:
            # Constant column: every shifted value is already 0; skip the
            # 0/0 division that would otherwise yield NaN.
            result[feature_name] = shifted.astype(float)
        else:
            result[feature_name] = shifted / value_range
    return result

def std_normalize(df):
    """Standardise every column of ``df`` to zero mean and unit deviation.

    Each column is transformed as ``(x - mean) / std`` using that column's
    own mean and standard deviation (pandas' ``Series.std``, i.e. the sample
    standard deviation with ``ddof=1`` by default). The input frame is not
    modified.

    A constant column has a standard deviation of 0; dividing by it would
    turn the whole column into NaN, so such a column is mapped to 0.0 instead.

    Args:
        df: pandas DataFrame whose columns are all numeric.

    Returns:
        A new DataFrame with the same index and columns as ``df``.
    """
    result = df.copy()
    for feature_name in df.columns:
        column = df[feature_name]
        centered = column - column.mean()
        std_val = column.std()
        if std_val == 0:
            # Constant column: the centred values are already 0; skip the
            # 0/0 division that would otherwise yield NaN.
            result[feature_name] = centered.astype(float)
        else:
            result[feature_name] = centered / std_val
    return result

In [None]:
# Load the raw measurements (plain string: the path has no placeholders).
data = pd.read_csv('./body_fat.csv')
data = data.drop(41)  # Drop the outlier (the row whose index label is 41)

### Data engineering and normalization

# BMI needs the raw measurements, so it is derived *before* the columns are
# standardised below.
Weight_lb_to_kg = data['Weight'] * 0.453592    # pounds -> kilograms
Height_inches_to_m = data['Height'] * 0.0254   # inches -> metres
BMI = Weight_lb_to_kg / (Height_inches_to_m ** 2)

# Standardised (z-scored) BMI.
proxy = (BMI - BMI.mean()) / BMI.std()

# Standardise every column, the target included.
data = std_normalize(data)

bins = 16

# Every column except the target and 'Density' is used as a feature.
# NOTE(review): 'Density' is presumably excluded because the body-fat value is
# derived from it — confirm.
target = 'BodyFat'
features = list(data.columns)
features.remove(target)
features.remove('Density')

Here we define the base range from which the constants in the individuals are generated. We also define how those constants are optimized so that they converge to the best value they can take within their expression.

We are using ADAM with the following configuration parameters.

In [None]:
# Base range used when generating the constants of the individuals.
const_range = (0, 1)

# Optimiser applied to those constants, followed by its settings.
constants_optimization = 'ADAM'
constants_optimization_conf = {
    # Objective of the optimisation; the alternative is 'binary:logistic'.
    'task': 'regression:wmse',
    # Training-loop settings.
    'learning_rate': 1e-4,
    'batch_size': 64,
    'epochs': 50,
    'verbose': 0,
    'gradient_clip': False,
    # ADAM hyper-parameters.
    'beta_1': 0.9,
    'beta_2': 0.999,
    'epsilon': 1e-7,
}

In [None]:
from symbolic_regression.multiobjective.fitness.OrderPreserving import Wasserstein
from symbolic_regression.multiobjective.fitness.Correlation import KendallTauCorrelation
from symbolic_regression.multiobjective.fitness.Regression import WeightedMeanSquaredError

# Objectives of the multi-objective search; all three are minimised.
# NOTE(review): `weights='w'` presumably refers to a column of `data`, but no
# visible cell creates a 'w' column — confirm it exists before fitting.
# NOTE(review): `bins=10` below is hard-coded while the data cell defines
# `bins = 16` — confirm which value is intended.
fitness_functions = [
    WeightedMeanSquaredError(
        label='wmse',
        target=target,
        weights='w',
        minimize=True,
        hypervolume_reference=100,
    ),
    KendallTauCorrelation(
        label='1-kendalltau',
        target=target,
        one_minus=True,
        minimize=True,
        hypervolume_reference=1.1,
    ),
    Wasserstein(
        label='wasserstein',
        target=target,
        weights='w',
        bins=10,
        minimize=True,
        hypervolume_reference=1.1,
    ),
]


# Relative frequency of each genetic operation.
#
# Raise a value to have that operation chosen more often than the others.
# For example, this makes crossover twice as frequent as mutation:
#
#     {
#         'crossover': 2,
#         'mutation': 1,
#         # etc...
#     }
#
# Here every operation carries the same weight.
genetic_operators_frequency = {
    'crossover': 1,
    'randomize': 1,
    'mutation': 1,
    'insert_node': 1,
    'delete_node': 1,
    'mutate_leaf': 1,
    'mutate_operator': 1,
    'recalibrate': 1,
}


Here we define a population size of 300 individuals and a tournament size of 3 for the genetic operations. The training process itself, launched in the last cell, runs for 100 generations.

Setting the checkpoint file allows us to progressively save the population to recover the training in future training sessions or to share the population with other participants.

In [None]:
# Size of the evolving population and of each selection tournament.
POPULATION_SIZE = 300
TOURNAMENT_SIZE = 3

# NOTE(review): with Python's default logging configuration the root logger
# only emits WARNING and above, so these INFO records are not shown unless
# logging has been configured elsewhere.
logging.info(f'Running with POPULATION_SIZE {POPULATION_SIZE}')
logging.info(f'Running with TOURNAMENT_SIZE {TOURNAMENT_SIZE}')


sr = SymbolicRegressor(
    client_name='body_fat',
    # Population and selection
    population_size=POPULATION_SIZE,
    tournament_size=TOURNAMENT_SIZE,
    # Generation of the individuals.
    # Original author's note: expected depth = parsimony / (1 - parsimony_decay)
    const_range=const_range,
    parsimony=0.8,
    parsimony_decay=0.85,
    # Checkpointing of the population
    checkpoint_file='./body_fat_checkpoint.save',
    checkpoint_frequency=10,
)

In [None]:
# Number of generations requested from this training call.
GENERATIONS = 100

# Run the evolutionary search with the dataset, operators and objectives
# configured in the cells above.
sr.fit(
    data=data,
    features=features,
    operations=operations,
    fitness_functions=fitness_functions,
    generations_to_train=GENERATIONS,
    stop_at_convergence=False,
    n_jobs=-1,
    verbose=2,
)

print('End')