In [1]:
from nsga2.estimator import NSGAIIRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv('../data/lexicase_paper/d_yacht.txt', sep=',')

# DEAP interface requires X and y to be numpy arrays, not pandas dataframes
X = df.drop('label', axis=1).values
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y)

estimator = NSGAIIRegressor(**{
    'pop_size'        : 4, 
    'max_gen'         : 5,
    'max_depth'       : 7,  # 8
    'max_size'        : 64, # 75
    'objectives'      : ['error', 'size'],
    'cx_prob'         : 1/7,
    'initialization'  : 'uniform',
    'pick_criteria'   : 'MCDM', # error, MCDM
    'validation_size' : 0.33,
    'simplify'        : True,
    'simplification_method' : 'bottom_up',
    'verbosity'       : 1,
    'functions'       :[
        'div', 'add', 'sub', 'mul',
        'maximum', 'minimum',
        'sin', 'cos', 'tan',
        'sqrtabs', 'log1p', 'log', 'exp', 'square', 'abs'
    ]
}).fit(X_train, y_train)

simplifying ind idx0 multiply(add(tan(add(ARG1, ARG5)), sqrtabs(exp(ARG1))), cos(exp(multiply(ARG0, ARG4))))
 - list of indexes [(13, <deap.gp.Terminal object at 0x7f13f1aa6480>), (12, <deap.gp.Terminal object at 0x7f141edfee80>), (11, <deap.gp.Primitive object at 0x7f13f1aa16c0>), (10, <deap.gp.Primitive object at 0x7f13f1aa1ad0>), (9, <deap.gp.Primitive object at 0x7f13f1aa1940>), (8, <deap.gp.Terminal object at 0x7f13f1aa5ec0>), (7, <deap.gp.Primitive object at 0x7f13f1aa1ad0>), (6, <deap.gp.Primitive object at 0x7f13f1aa19e0>), (5, <deap.gp.Terminal object at 0x7f13f1aa6540>), (4, <deap.gp.Terminal object at 0x7f13f1aa5ec0>), (3, <deap.gp.Primitive object at 0x7f13f1aa1620>), (2, <deap.gp.Primitive object at 0x7f13f1aa1990>), (1, <deap.gp.Primitive object at 0x7f13f1aa1620>), (0, <deap.gp.Primitive object at 0x7f13f1aa16c0>)]
   - 13, ARG4
   - subtree [<deap.gp.Terminal object at 0x7f13f1aa6480>]
     - skipping
   - 12, ARG0
   - subtree [<deap.gp.Terminal object at 0x7f141edfee8



In [2]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse

model      = str(estimator.best_estimator_).replace("ARG", "x_")
size       = len(estimator.best_estimator_)
complexity = size
depth      = estimator.best_estimator_.height

print(model)
print(size)
print(complexity)
print(depth)

for metric, fn, (data_X, data_y) in [
    ('train_r2',  r2_score, (X_train, y_train)),
    ('test_r2',   r2_score, (X_test,  y_test )),
    ('train_mse', mse,      (X_train, y_train)),
    ('test_mse',  mse,      (X_test,  y_test )),
]:
    score = np.nan
    try:
        score = fn(estimator.predict(data_X), data_y)
        print(f"{metric} : {score}")
    except ValueError:
        print(f"(Failed to calculate {metric}")

multiply(add(tan(add(x_3, x_5)), x_3), cos(exp(log(square(x_0)))))
12
12
5
train_r2 : -11.641422562487234
test_r2 : -6.379123382153574
train_mse : 240.99518436186833
test_mse : 246.80919913271975




In [3]:
n_keys = 10
print( list(estimator.simplifier.pop_hash.keys())[:n_keys] )

for key in list(estimator.simplifier.pop_hash.keys())[:n_keys]:
    print(key)
    for ind in estimator.simplifier.pop_hash[key]:
        print(" -", ind)

# 11001111011111110110111010010100
# 11010101000100110111001000101100

['11100000011101001011111110110010', '00011110110011010100000101011001', '11100001011101001011111110110010', '11101001010001001011111110111011']
11100000011101001011111110110010
 - ARG3
 - exp(ARG1)
 - sqrtabs(ARG1)
 - sqrtabs(ARG1)
 - exp(ARG1)
 - sqrtabs(ARG3)
 - tan(ARG1)
 - exp(ARG1)
 - sqrtabs(ARG3)
 - sqrtabs(ARG3)
 - tan(ARG1)
 - sqrtabs(ARG3)
 - tan(ARG1)
 - log1p(132.5496948900636)
 - sqrtabs(exp(ARG1))
 - add(ARG3, ARG2)
 - sqrtabs(cos(ARG4))
 - maximum(ARG5, ARG2)
 - add(ARG3, ARG5)
 - add(ARG3, ARG5)
 - cos(cos(ARG3))
 - cos(cos(ARG3))
 - log(add(ARG4, ARG4))
 - add(tan(ARG1), ARG5)
 - sqrtabs(sin(cos(ARG4)))
 - exp(cdiv(ARG5, ARG4))
 - sqrtabs(exp(exp(ARG5)))
 - log1p(add(ARG4, sin(ARG0)))
 - sqrtabs(sqrtabs(sin(cos(ARG4))))
 - cos(exp(cdiv(ARG5, ARG4)))
 - add(log1p(132.5496948900636), square(ARG3))
 - cdiv(add(ARG3, ARG2), log1p(ARG5))
 - log1p(sqrtabs(sqrtabs(sin(cos(ARG4)))))
 - cos(log1p(sqrtabs(sqrtabs(sin(cos(ARG4))))))
 - add(tan(add(ARG1, ARG5)), sqrtabs(exp(ARG1)