In [None]:
import logging
import sys

import numpy as np
import pandas as pd

# to include the SR code without installing in the environment
sys.path.append('../')

from symbolic_regression.SymbolicRegressor import SymbolicRegressor

# The operations

Here we define the list of allowed operations. This project implements most of the arithmetic operations needed in normal use. Have a look at `symbolic_regression/operators.py` to see how they are defined and how to define your own operators.

In [None]:
# Import the operators by name rather than with `import *`, so that it is
# clear where every name comes from and nothing else leaks into the notebook
# namespace. To enable one of the commented-out operators, uncomment it both
# here and in the `operations` list below.
from symbolic_regression.operators import (
    OPERATOR_ADD,
    OPERATOR_SUB,
    OPERATOR_MUL,
    OPERATOR_DIV,
    # OPERATOR_ABS,
    # OPERATOR_MOD,
    # OPERATOR_NEG,
    # OPERATOR_INV,
    OPERATOR_LOG,
    OPERATOR_EXP,
    OPERATOR_POW,
    OPERATOR_SQRT,
    OPERATOR_MAX,
    OPERATOR_MIN,
)

# Operators handed to `sr.fit(operations=...)` further down.
operations = [
    OPERATOR_ADD,
    OPERATOR_SUB,
    OPERATOR_MUL,
    OPERATOR_DIV,
    # OPERATOR_ABS,
    # OPERATOR_MOD,
    # OPERATOR_NEG,
    # OPERATOR_INV,
    OPERATOR_LOG,
    OPERATOR_EXP,
    OPERATOR_POW,
    OPERATOR_SQRT,
    OPERATOR_MAX,
    OPERATOR_MIN,
]


# The example dataset: used car prices

This is a regression task to predict the price of a used car based on its features. The dataset is available on Kaggle: https://www.kaggle.com/datasets/aishwaryamuthukumar/cars-dataset-audi-bmw-ford-hyundai-skoda-vw

In [None]:
def min_max_normalize(df):
    """Linearly rescale every column of ``df`` to the [0, 1] range.

    Each column is transformed as ``(x - min) / (max - min)`` using that
    column's own minimum and maximum.

    Args:
        df: DataFrame whose columns are all numeric.

    Returns:
        A new DataFrame with the same index and columns as ``df``; the input
        is not modified. A constant column (``max == min``) is mapped to 0
        instead of the NaN that a 0/0 division would produce.
    """
    col_min = df.min()
    col_range = df.max() - col_min
    # Guard against zero-width columns: dividing by 1 leaves the (all-zero)
    # numerator untouched. NaN ranges compare unequal to 0 and are kept as-is.
    col_range = col_range.where(col_range != 0, 1)
    return (df - col_min) / col_range


def std_normalize(df):
    """Standardize every column of ``df`` to zero mean and unit variance.

    Each column is transformed as ``(x - mean) / std`` using that column's
    own mean and pandas' default (sample) standard deviation.

    Args:
        df: DataFrame whose columns are all numeric.

    Returns:
        A new DataFrame with the same index and columns as ``df``; the input
        is not modified. A column whose standard deviation is exactly 0 is
        mapped to 0 instead of the NaN that a 0/0 division would produce.
    """
    col_std = df.std()
    # Guard against zero-variance columns: dividing by 1 leaves the (all-zero)
    # centered values untouched. NaN deviations compare unequal to 0 and are
    # kept as-is.
    col_std = col_std.where(col_std != 0, 1)
    return (df - df.mean()) / col_std


In [None]:
# Load the raw dataset; the path is relative to the current working directory.
data = pd.read_csv(filepath_or_buffer='./audi.csv')

In [None]:
import re

# Split every `model` string into its letters (the "serie", e.g. "A", "RS")
# and its digits (the model number). Models without digits get the number 1.
serie_names = pd.Series(
    ["".join(re.findall("[a-zA-Z]+", model)) for model in data['model']],
    index=data.index,
)
model_numbers = pd.Series(
    [
        float(digits) if digits != '' else 1.0
        for digits in ("".join(re.findall("[0-9]+", model)) for model in data['model'])
    ],
    index=data.index,
    dtype=float,
)

# One `serie_<letters>` column per serie, holding the model number on the rows
# of that serie and NaN elsewhere. Assigning whole columns replaces the
# row-by-row `iterrows`/`.at` loop; `unique()` keeps first-appearance order,
# so the columns are created in the same order as before.
for serie in serie_names.unique():
    data[f'serie_{serie}'] = model_numbers.where(serie_names == serie)

# Every remaining NaN in the frame (including the ones just created) becomes 0.
data = data.fillna(0)

# Preview only the first rows instead of rendering the whole frame.
data.head()

In [None]:
from symbolic_regression.multiobjective.fitness.DistributionPreserving import get_cumulant_hist
from symbolic_regression.multiobjective.fitness.Regression import create_regression_weights

# Column roles used throughout the rest of the notebook.
target = 'price'
weights = 'w'
bins = 15

# Numeric columns are standardized; the serie_* columns are min-max scaled.
to_normalize_std = ['year', 'mileage', 'tax', 'mpg', 'engineSize']
to_normalize_min_max = [
    'serie_A',
    'serie_Q',
    'serie_RS',
    'serie_TT',
    'serie_S',
    'serie_SQ',
    'serie_R',
]
features = to_normalize_std + to_normalize_min_max

# The target is standardized together with the numeric features.
data[to_normalize_std + [target]] = std_normalize(data[to_normalize_std + [target]])
data[to_normalize_min_max] = min_max_normalize(data[to_normalize_min_max])

# Decile index of the price, used to stratify the train/test split.
data['price_bin'] = pd.qcut(data['price'], 10, labels=False).astype('int')

# F_y is passed to the Wasserstein fitness; the weights column is used by the
# weighted fitness functions.
F_y = get_cumulant_hist(data=data, target=target, bins=bins)
data[weights] = create_regression_weights(data=data, target=target, bins=bins)

print(f'Dataset {data.shape}')


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the price decile so that both parts cover the
# whole price range; the weights column travels with the features.
X_train, X_test, y_train, y_test = train_test_split(
    data[features + [weights]],
    data[target],
    test_size=0.2,
    random_state=42,
    stratify=data['price_bin'],
)

# Put features, weights and target back together, one frame per split.
train = pd.concat([X_train, y_train], axis='columns')
test = pd.concat([X_test, y_test], axis='columns')

print(f'Train {train.shape}')
print(f'Test {test.shape}')


Here we define the base range from which the constants in the individuals are generated. Furthermore, we also define how those constants are optimized so that they converge to the best value they can have in their expression.

We are using ADAM with the following configuration parameters.

In [None]:
# Range handed to the regressor as `const_range`.
const_range = (0, 1)

# Optimizer name and its settings, passed to the weighted MSE fitness below.
constants_optimization = 'ADAM'
constants_optimization_conf = dict(
    task='regression:wmse',
    learning_rate=1e-4,
    batch_size=int(np.ceil(len(data)/10)),  # a tenth of the dataset, rounded up
    epochs=50,
    verbose=0,
    gradient_clip=False,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-7,
    l1_param=0,
    l2_param=0,
)


In [None]:
from symbolic_regression.multiobjective.fitness.DistributionPreserving import Wasserstein
from symbolic_regression.multiobjective.fitness.Correlation import KendallTauCorrelation
from symbolic_regression.multiobjective.fitness.Regression import WeightedMeanSquaredError

# The objectives of the multi-objective search; all three are minimized.
fitness_functions = [
    # Weighted MSE; the only objective that also carries the constants
    # optimization settings.
    WeightedMeanSquaredError(
        label='wmse',
        target=target,
        weights=weights,
        minimize=True,
        convergence_threshold=0.02,
        hypervolume_reference=data[target].abs().max(),
        constants_optimization=constants_optimization,
        constants_optimization_conf=constants_optimization_conf,
    ),
    KendallTauCorrelation(
        label='1-kendalltau',
        target=target,
        one_minus=True,
        minimize=True,
        hypervolume_reference=1,
    ),
    # `bins` is the same constant that was used to build F_y, so the two
    # cannot get out of sync when the number of bins is changed.
    Wasserstein(
        label='wasserstein',
        target=target,
        weights=weights,
        F_y=F_y,
        bins=bins,
        minimize=True,
        hypervolume_reference=1,
    ),
]

# Use this to modulate the relative frequency of genetic operations.
# E.g., crossover is chosen 2 times more frequently than mutation with
#     {
#         'crossover': 2,
#         'mutation': 1,
#         # etc...
#     }
genetic_operators_frequency = {
    'crossover': 1,
    'mutation': 1,
    'insert_node': 1,
    'delete_node': 1,
    'mutate_leaf': 1,
    'mutate_operator': 1,
    'recalibrate': 1,
}


In [None]:
from symbolic_regression.callbacks.CallbackSave import MOSRCallbackSaveCheckpoint
from symbolic_regression.callbacks.CallbackStatistics import MOSRHistory, MOSRStatisticsComputation

# Path passed to the checkpoint callback as `checkpoint_file`.
file_name = './audi'

# Callback objects: checkpointing, statistics computation and history.
callbacks = [
    MOSRCallbackSaveCheckpoint(
        checkpoint_file=file_name,
        checkpoint_frequency=25,
        checkpoint_overwrite=False,
    ),
    MOSRStatisticsComputation(),
    MOSRHistory(history_fpf_frequency=5),
]

In [None]:
# Size of the population and of each selection tournament.
POPULATION_SIZE = 300
TOURNAMENT_SIZE = 3

logging.info(f'Running with POPULATION_SIZE {POPULATION_SIZE}')
logging.info(f'Running with TOURNAMENT_SIZE {TOURNAMENT_SIZE}')

# Build the regressor from the settings defined in the cells above.
sr = SymbolicRegressor(
    client_name='client',
    population_size=POPULATION_SIZE,
    tournament_size=TOURNAMENT_SIZE,
    const_range=const_range,
    parsimony=0.8,
    parsimony_decay=0.85,  # Expected depth = parsimony / (1-parsimony_decay)
    genetic_operators_frequency=genetic_operators_frequency,
)


In [None]:
# Number of generations to evolve in this call.
GENERATIONS = 100

# NOTE(review): the regressor is fitted on the full `data` frame, not on the
# `train` split created earlier - confirm that this is intended.
sr.fit(
    data=data,
    features=features,
    fitness_functions=fitness_functions,
    operations=operations,
    generations_to_train=GENERATIONS,
    stop_at_convergence=False,
    n_jobs=-1,
    # The output could be very verbose. Consider using 0, 1, or 2 to reduce
    # the verbosity.
    verbose=3,
)

print('End')


### How to access the models and use them

You can access the models from ```sr.population: List``` or from ```sr.first_pareto_front: List```. The first one contains all the models generated during the evolution process, while the second one contains only the models that are in the Pareto front.

E.g., 
```python
model = sr.population[0]  # OR model = sr.first_pareto_front[0]
```

To see the model expression, use
```python
>>> str(model.program)  # It is only the string representation
```

Some relevant attributes of the model are
```python
>>> model.features_used
>>> model.fitness
>>> model.fitness_validation
```

To evaluate the model, use
```python
>>> model.evaluate(data)  # data should be a Dict, pd.Series or pd.DataFrame
```

In [None]:
# Take the first individual of the population as the example model.
model = sr.population[0]

# Cell output: the string representation of the model's program.
str(model.program)

In [None]:
# One print call with a newline separator produces the same text as three
# consecutive prints.
print(
    f"\nModel complexity:\n\t{model.complexity}",
    f"\nModel fitness:\n\t{model.fitness}",
    # Is empty if no validation set is provided
    f"\nModel fitness_validation:\n\t{model.fitness_validation}",
    sep="\n",
)

In [None]:
# Evaluate the example model on the feature columns of the training split;
# the result is shown as the cell output.
model.evaluate(data=train[features])