In [None]:
import logging
import sys

import numpy as np
import pandas as pd

sys.path.append('../')
# to include the SR code without installing in the environment
from symbolic_regression.SymbolicRegressor import SymbolicRegressor

# The operations

Here we define the list of allowed operations. In this project we implemented most of the arithmetic operations we expect to need in normal use. Please have a look at `symbolic_regression/operators.py` to see how they are defined and how to define your own operators.

In [None]:
# Import the operators module under an alias instead of `import *`, so the
# notebook namespace is not polluted and the origin of each operator is explicit.
import symbolic_regression.operators as ops

# Operations the symbolic regressor is allowed to use (passed to `sr.fit`).
# Uncomment an entry to enable the corresponding operator.
operations = [
    ops.OPERATOR_ADD,
    ops.OPERATOR_SUB,
    ops.OPERATOR_MUL,
    ops.OPERATOR_DIV,
    # ops.OPERATOR_ABS,
    # ops.OPERATOR_MOD,
    # ops.OPERATOR_NEG,
    # ops.OPERATOR_INV,
    ops.OPERATOR_LOG,
    ops.OPERATOR_EXP,
    ops.OPERATOR_POW,
    ops.OPERATOR_SQRT,
    ops.OPERATOR_MAX,
    ops.OPERATOR_MIN
]

# The example dataset: counterfeit banknotes classification

This is a very simple binary classification task: predict whether a banknote is counterfeit based on four characteristics. The dataset is publicly available on [Kaggle](https://www.kaggle.com/datasets/ritesaluja/bank-note-authentication-uci-data).

In [None]:
# Column names used throughout the notebook.
features = ['x1', 'x2', 'x3', 'x4']
target = 'y'
weights = 'w'

# Fixed seed so the shuffled row order is the same on every
# "Restart Kernel -> Run All".
SHUFFLE_SEED = 42

data = pd.read_csv('./banknotes.csv')

data = data.sample(frac=1, random_state=SHUFFLE_SEED)  # To shuffle the dataset.

# Balanced sample weights (assuming the target is coded 0/1): every sample of a
# class gets 1 / (2 * class_frequency), so each class carries half of the
# total weight regardless of how imbalanced the dataset is.
positive_rate = data[target].mean()
data[weights] = np.where(data[target] == 1,
                         1. / (2 * positive_rate),
                         1. / (2 * (1 - positive_rate)))

print(f'Dataset {data.shape}')

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition is identical on every run.
SPLIT_SEED = 42

# Hold out 20% of the rows for testing; stratifying on the target keeps the
# class proportions the same in both splits.
train, test = train_test_split(
    data,
    test_size=0.2,
    stratify=data[target],
    random_state=SPLIT_SEED,
)

print(f'Train {train.shape}')
print(f'Test {test.shape}')

Here we define the base range from which the constants of the individuals are generated. We also define how those constants are optimized, so that they converge to the best value they can take within their expression.

The constants are optimized with the `'scipy'` strategy, using the following configuration parameters.

In [None]:
# Base interval from which the constants of the individuals are generated.
const_range = (0, 1)

# How the constants are optimized, and the configuration for that optimizer.
constants_optimization_conf = dict(task='binary:logistic')
constants_optimization = 'scipy'

In [None]:
from symbolic_regression.multiobjective.fitness.Classification import (
    Accuracy,
    BCEAkaike,
    BinaryCrossentropy,
    AUC,
    ClassificationMinimumDescriptionLength,
    F1Score,
    Precision,
    Recall,
    Specificity,
)
from symbolic_regression.multiobjective.fitness.Regression import NotConstant

# Keyword arguments shared by every fitness function in this notebook.
_shared = dict(target=target, weights=weights, logistic=True)

# Metrics added with minimize=False are not used for the optimization, only
# for visualization.
_display = dict(_shared, one_minus=False, minimize=False)
_display_at_half = dict(_display, threshold=.5)

fitness_functions = [
    # The objective that is actually optimized.
    BinaryCrossentropy(label='BCE', minimize=True,
                       constants_optimization=constants_optimization,
                       constants_optimization_conf=constants_optimization_conf,
                       **_shared),

    # Visualization-only metrics. To make the model prioritize one of them,
    # add the same metric a second time with minimize=True and one_minus=True
    # (this algorithm works on minimization problems); that way you keep both
    # the optimized version and the one used for visualization.
    Accuracy(label='Accuracy', **_display_at_half),
    Precision(label='Precision', **_display_at_half),
    Recall(label='Sensitivity (Recall)', **_display_at_half),
    Specificity(label='Specificity', **_display_at_half),
    F1Score(label='F1', **_display_at_half),
    AUC(label='AUC', convergence_threshold=0.1, **_display),

    # For demonstration purposes only. We need the 1-F1 score to be minimized.
    # Use this in imbalanced datasets.
    F1Score(label='1-F1', threshold=.5, one_minus=True, minimize=True,
            **_shared),
]

# Use this to modulate the relative frequency of genetic operations.
# E.g., crossover is chosen 2 times more frequently than mutation with
#     {
#         'crossover': 2,
#         'mutation': 1,
#         # etc...
#     }
genetic_operators_frequency = dict(
    crossover=1,
    randomize=1,
    mutation=1,
    insert_node=1,
    delete_node=1,
    mutate_leaf=1,
    mutate_operator=1,
    recalibrate=1,
)

In [None]:
from symbolic_regression.callbacks.CallbackSave import MOSRCallbackSaveCheckpoint
from symbolic_regression.callbacks.CallbackStatistics import MOSRHistory, MOSRStatisticsComputation

# Path handed to the checkpoint callback as `checkpoint_file`.
# (Plain string: the previous f-string had no placeholders.)
file_name = './banknotes'

# Callbacks passed to the regressor: checkpoint saving, statistics computation
# and history tracking.
callbacks = [
    MOSRCallbackSaveCheckpoint(
        checkpoint_file=file_name, checkpoint_frequency=1, checkpoint_overwrite=True),
    MOSRStatisticsComputation(),
    MOSRHistory(history_fpf_frequency=5),
]

In [None]:
# Size of the population and of each selection tournament.
TOURNAMENT_SIZE = 3
POPULATION_SIZE = 100

logging.info(f'Running with POPULATION_SIZE {POPULATION_SIZE}')
logging.info(f'Running with TOURNAMENT_SIZE {TOURNAMENT_SIZE}')

# Build the regressor from the settings defined in the previous cells.
sr = SymbolicRegressor(
    client_name='client',
    population_size=POPULATION_SIZE,
    tournament_size=TOURNAMENT_SIZE,
    const_range=const_range,
    # Expected depth = parsimony / (1-parsimony_decay)
    parsimony=.80,
    parsimony_decay=.85,
    genetic_operators_frequency=genetic_operators_frequency,
    callbacks=callbacks,
)

In [None]:
# Number of generations requested from the training run.
GENERATIONS = 100

# The output could be very verbose. Consider using 0, 1, or 2 for `verbose`
# to reduce the verbosity.
sr.fit(
    data=train, val_data=test,
    features=features,
    operations=operations,
    fitness_functions=fitness_functions,
    generations_to_train=GENERATIONS,
    stop_at_convergence=True, convergence_rolling_window=5,
    n_jobs=-1,
    verbose=1,
)

print('End')

### How to access the models and use them

You can access the models from ```sr.population: List``` or from ```sr.first_pareto_front: List```. The first one contains all the models generated during the evolution process, while the second one contains only the models that are in the Pareto front.

E.g., 
```python
model = sr.population[0]  # OR model = sr.first_pareto_front[0]
```

To see the model expression, use
```python
>>> str(model.program)  # It is only the string representation
```

Some relevant attributes of the model are
```python
>>> model.features_used
>>> model.fitness
>>> model.fitness_validation
```

To evaluate the model, use
```python
>>> model.evaluate(data)  # data should be a Dict, pd.Series or pd.DataFrame
```

In [None]:
# Pick the first individual of the population as the example model
# (sr.first_pareto_front[0] can be used in the same way).
model = sr.population[0]

# Last expression of the cell: shows the string representation of the program.
str(model.program)

In [None]:
# Report the complexity and the fitness stored on the selected model.
print(f"\nModel complexity:\n\t{model.complexity}")
print(f"\nModel fitness:\n\t{model.fitness}")
# Optional: uncomment the next line to also print the validation fitness.
# It is empty if no validation set is provided.
# print(f"\nModel fitness_validation:\n\t{model.fitness_validation}")

In [None]:
# Evaluate the model on the feature columns of the whole dataset and round the
# result to 0 decimals.
# NOTE(review): presumably logistic=True yields probabilities, so rounding
# gives 0/1 class labels — confirm against the library.
model.evaluate(data=data[features], logistic=True).round(0)