In [1]:
from nsga2.estimator import NSGAIIRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv('../data/lexicase_paper/d_concrete.txt', sep=',')

# DEAP interface requires X and y to be numpy arrays, not pandas dataframes
X = df.drop('label', axis=1).values
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y)

estimator = NSGAIIRegressor(**{
    'pop_size'        : 4, 
    'max_gen'         : 40,
    'max_depth'       : 3,  # 8
    'max_size'        : 2**3, # 75
    'objectives'      : ['error', 'size'],
    'cx_prob'         : 1/5,
    'initialization'  : 'uniform',
    'pick_criteria'   : 'error', # error, MCDM
    'validation_size' : 0.33,
    'simplify'        : True,
    'simplification_method' : 'bottom_up',
    'verbosity'       : 1,
    'survival'       : 'tournament',
    'functions'       :[]
}).fit(X_train, y_train)

simplifying ind idx0 arccos(arccos(subtract(expm1(ARG2), arccos(ARG7))))
 - list of indexes [(6, <deap.gp.Terminal object at 0x7f5031a24800>), (5, <deap.gp.Primitive object at 0x7f5031a225c0>), (4, <deap.gp.Terminal object at 0x7f5031a24240>), (3, <deap.gp.Primitive object at 0x7f5031a22700>), (2, <deap.gp.Primitive object at 0x7f5031a22160>), (1, <deap.gp.Primitive object at 0x7f5031a225c0>), (0, <deap.gp.Primitive object at 0x7f5031a225c0>)]
   - 6, ARG7
   - subtree [<deap.gp.Terminal object at 0x7f5031a24800>]
     - skipping
   - 5, arccos
   - subtree [<deap.gp.Primitive object at 0x7f5031a225c0>, <deap.gp.Terminal object at 0x7f5031a24800>]
     - cast into ind arccos(ARG7)
     - semantics [nan nan nan]
     - bad semantics
   - 4, ARG2
   - subtree [<deap.gp.Terminal object at 0x7f5031a24240>]
     - skipping
   - 3, expm1
   - subtree [<deap.gp.Primitive object at 0x7f5031a22700>, <deap.gp.Terminal object at 0x7f5031a24240>]
     - cast into ind expm1(ARG2)
     - semantics [

In [2]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse

model      = str(estimator.best_estimator_).replace("ARG", "x_")
size       = len(estimator.best_estimator_)
complexity = size
depth      = estimator.best_estimator_.height

print(model)
print(size)
print(complexity)
print(depth)

for metric, fn, (data_X, data_y) in [
    ('train_r2',  r2_score, (X_train, y_train)),
    ('test_r2',   r2_score, (X_test,  y_test )),
    ('train_mse', mse,      (X_train, y_train)),
    ('test_mse',  mse,      (X_test,  y_test )),
]:
    score = np.nan
    try:
        score = fn(estimator.predict(data_X), data_y)
        print(f"{metric} : {score}")
    except ValueError:
        print(f"(Failed to calculate {metric}")

for ind in estimator.archive_:
    print(ind.fitness, ind)

subtract(x_4, -29.657408123791107)
3
3
1
train_r2 : -5.191834473290887
test_r2 : -8.031305715401688
train_mse : 234.234050745994
test_mse : 263.1154100967376
(242.11685865935374, 3.0) subtract(ARG4, -29.657408123791107)


In [3]:
print( len(list(estimator.simplifier.pop_hash.keys())) )

n_keys =  len(list(estimator.simplifier.pop_hash.keys()))

for key in list(estimator.simplifier.pop_hash.keys())[:n_keys]:
    print(key)
    for ind in estimator.simplifier.pop_hash[key]:
        print(" -", ind)

# 11001111011111110110111010010100
# 11010101000100110111001000101100

32
0010001110111010
 - ARG6
 - maximum(ARG6, ARG1)
 - maximum(ARG0, subtract(ARG4, -29.6574081237911))
0011011000111000
 - ARG1
0110001110111011
 - ARG2
0010011110111010
 - ARG5
0010001010111010
 - ARG4
0010010110111010
 - ARG7
0000000000000000
 - 36.16243713733075
 - arctan(63.284264990328836)
 - absolute(36.16243713733075)
 - arctan(-177.94444874274666)
 - sqrtabs(0.0)
 - subtract(ARG4, ARG4)
 - subtract(26.76062797761844, -9.401808863411494)
 - subtract(24.23051316756485, -11.931923969765855)
1111001011010110
 - expm1(ARG2)
0011010000111000
 - add4(arctan(ARG7), -51.56879009014728, ARG1, ARG4)
1000010000111000
 - mul4(add4(arctan(ARG7), -51.56879009014728, ARG1, ARG4), ARG0, ARG5, ARG7)
1101001101101000
 - expm1(ARG3)
1001110111011100
 - sin(ARG0)
0011010111111100
 - expm1(sin(ARG0))
0111010111111110
 - mul4(expm1(sin(ARG0)), ARG5, ARG0, ARG2)
0101011001111010
 - expm1(sqrtabs(multiply(ARG3, ARG4)))
0010111010111010
 - cos(ARG2)
1101100001000101
 - subtract(ARG4, 36.16243713733075)


In [15]:
display(df.describe())

A = np.maximum(df['x0'], df['x4']+29.657).values
B = df['x6'].values

print(np.std(A) * (A - np.mean(A))[:5])
print(np.std(B) * (B - np.mean(B))[:5])

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,label
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


[27036.47147454 27036.47147454  5361.93013146  5361.93013146
 -8624.67751114]
[ -7819.81227097  -7819.81227097 -14391.05039658 -14391.05039658
   4160.67675074]
