In [1]:
from nsga2.estimator import NSGAIIRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv('../data/lexicase_paper/d_airfoil.txt', sep=',')

# DEAP interface requires X and y to be numpy arrays, not pandas dataframes
X = df.drop('label', axis=1).values
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y)

estimator = NSGAIIRegressor(**{
    'pop_size'        : 40, 
    'max_gen'         : 50,
    'max_depth'       : 7,  # 8
    'max_size'        : 2**7, # 75
    'objectives'      : ['error', 'size'],
    'initialization'  : 'uniform',
    'pick_criteria'   : 'error', # error, MCDM
    'validation_size' : 0.33,
    'simplify'        : True,
    
    # Either you use smart variation (just 1 cx and 1 mutation)
    'smart_variation' : True,

    # Or you use mabs (4 mutations)
    'use_mab'         : False,
    'use_context'     : False,

    'simplification_method' : 'bottom_up',
    'simplification_tolerance' : 1e-0,
    'verbosity'       : 1,
    'survival'       : 'tournament'
}).fit(X_train, y_train)

hashtable will have dimensions 256 x 755
starting to index. 755, 0
starting to index. 755, 1
starting to index. 755, 2
starting to index. 755, 3
starting to index. 755, 4
starting to index. 755, 5
initialized 6 keys
gen	evals	best_size	best_error	n_simplifications	n_new_hashes	avg train error	avg train size	avg val error	avg val size	med train error	med train size	med val error	med val size	std train error	std train size	std val error	std val size	min train error	min train size	min val error	min val size	max train error	max train size	max val error	max val size
0  	40   	26       	-47.8211  	88               	248         	               	              	             	            	               	              	             	            	               	              	             	            	               	              	             	            	               	              	             	            
1  	40   	26       	-46.0309  	36               	68          	               	  

In [2]:
# Should not give erros even without mabs
pd.DataFrame(estimator.variator.mab.pull_history).iloc[:10]

Unnamed: 0,t,arm,reward,update,delta_error,gen
0,0,subtree,1.0,0,"[inf, 13.0]",1
1,1,cx,1.0,0,"[862266.052762957, 10.0]",1
2,2,subtree,1.0,0,"[6517.7614848678895, -2.0]",1
3,3,lsh_mutate,0.0,0,"[-11.032106063885294, 1.0]",1
4,4,cx,1.0,0,"[inf, 14.0]",1
5,5,cx,1.0,0,"[inf, 7.0]",1
6,6,lsh_mutate,0.0,0,"[-0.1456142680362973, 1.0]",1
7,7,lsh_mutate,1.0,0,"[15536.407423740751, 7.0]",1
8,8,subtree,0.5,0,"[-0.0, -0.0]",1
9,9,lsh_mutate,1.0,0,"[15334.615850986036, -20.0]",1


In [3]:
pd.DataFrame(estimator.variator.mab.pull_history)['arm'].value_counts().sort_values()

lsh_mutate    626
subtree       651
cx            683
Name: arm, dtype: int64

In [4]:
pd.DataFrame(estimator.variator.mab.pull_history).groupby('arm')['reward'].value_counts().sort_values()

arm         reward
subtree     0.0        62
cx          0.0        79
subtree     0.5       124
lsh_mutate  0.0       145
            0.5       151
cx          0.5       171
lsh_mutate  1.0       330
cx          1.0       433
subtree     1.0       465
Name: reward, dtype: int64

In [5]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse

model      = str(estimator.best_estimator_).replace("ARG", "x_")
size       = len(estimator.best_estimator_)
complexity = size
depth      = estimator.best_estimator_.height

print(model)
print(size)
print(complexity)
print(depth)

for metric, fn, (data_X, data_y) in [
    ('train_r2',  r2_score, (X_train, y_train)),
    ('test_r2',   r2_score, (X_test,  y_test )),
    ('train_mse', mse,      (X_train, y_train)),
    ('test_mse',  mse,      (X_test,  y_test )),
]:
    score = np.nan
    try:
        score = fn(estimator.predict(data_X), data_y)
        print(f"{metric} : {score}")
    except ValueError:
        print(f"(Failed to calculate {metric}")

for ind in estimator.archive_:
    print(ind.fitness, ind)

cdiv(128.42494154495984, mul3(0.9389090576554695, mul3(arccos(x_4), sqrt(x_4), log(add3(log1p(add4(x_4, x_1, x_3, x_3)), cos(cos(x_0)), add3(-24.25485073482981, -179.909079373228, x_0)))), add(add(expm1(x_2), 1167.6751592094372), -1167.5748291969091)))
30
30
7
train_r2 : -0.7331080811828405
test_r2 : -1.5155470750228135
train_mse : 29.078281550143284
test_mse : 35.702209734176286
(30.267660207107685, 32.0) cdiv(128.42494154495984, mul3(0.9389090576554695, mul3(arccos(ARG4), sqrt(ARG4), log(add3(log1p(add4(ARG4, ARG1, ARG3, ARG3)), mul4(ARG0, ARG4, ARG2, 2.388412300447648), add3(-24.25485073482981, -179.909079373228, ARG0)))), add(add(expm1(ARG2), 1167.6751592094372), -1167.5748291969091)))
(30.279727120887078, 30.0) cdiv(128.42494154495984, mul3(0.9389090576554695, mul3(arccos(ARG4), sqrt(ARG4), log(add3(log1p(add4(ARG4, ARG1, ARG3, ARG3)), cos(cos(ARG0)), add3(-24.25485073482981, -179.909079373228, ARG0)))), add(add(expm1(ARG2), 1167.6751592094372), -1167.5748291969091)))


In [6]:
if False:
    print( len(list(estimator.simplifier.pop_hash.keys())) )

    n_keys =  len(list(estimator.simplifier.pop_hash.keys()))

    for key in list(estimator.simplifier.pop_hash.keys())[:n_keys]:
        print(key)
        for ind in estimator.simplifier.pop_hash[key]:
            print(" -", ind)

In [7]:
if True:
    print( len(list(estimator.variator.variator_.pop_hash.keys())) )

    n_keys =  len(list(estimator.variator.variator_.pop_hash.keys()))

    for key in list(estimator.variator.variator_.pop_hash.keys())[:n_keys]:
        print(key)
        for ind in estimator.variator.variator_.pop_hash[key]:
            print(" -", ind)

1649
0
 - 1.0
1
 - ARG0
 - add(ARG0, ARG4)
 - add(sqrtabs(add3(arctan(ARG2), add3(ARG2, ARG2, ARG3), square(ARG0))), tan(multiply(minimum(ARG2, ARG1), mul4(ARG0, ARG2, ARG3, -13.722483109612554))))
 - sqrtabs(add3(arctan(ARG2), add3(ARG2, ARG2, ARG3), square(ARG0)))
 - subtract(ARG0, ARG4)
 - maximum(ARG0, ARG3)
 - add3(ARG0, ARG4, ARG4)
 - add4(ARG1, ARG1, ARG2, ARG0)
 - add4(ARG2, ARG3, ARG0, ARG2)
 - add(add4(-61.46098204457394, ARG0, ARG4, ARG1), arctan(ARG0))
 - add4(-61.46098204457394, ARG0, ARG4, ARG1)
 - multiply(log(maximum(sqrt(ARG3), -23.81914682386126)), ARG0)
 - add4(ARG0, ARG2, 16.246036438442914, ARG3)
 - add3(28.50464933399948, ARG0, ARG0)
 - add3(ARG0, -82.20587735844047, ARG1)
 - subtract(mul3(ARG0, 75.68934996593757, 85.26035307849841), sqrtabs(ARG0))
 - mul3(ARG0, 75.68934996593757, 85.26035307849841)
 - add4(-15.361920455023778, ARG0, -16.457837839787203, ARG1)
 - add3(ARG1, ARG0, ARG0)
 - maximum(ARG0, ARG1)
 - maximum(ARG2, ARG0)
 - add4(-34.80527063460069, -61.3

In [8]:
display(df.describe())

A = np.maximum(df['x0'], df['x4']+29.657).values
B = df['x6'].values

print(np.std(A) * (A - np.mean(A))[:5])
print(np.std(B) * (B - np.mean(B))[:5])

Unnamed: 0,x0,x1,x2,x3,x4,label
count,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0
mean,2886.380572,6.782302,0.136548,50.860745,0.01114,124.835943
std,3152.573137,5.918128,0.093541,15.572784,0.01315,6.898657
min,200.0,0.0,0.0254,31.7,0.000401,103.38
25%,800.0,2.0,0.0508,39.6,0.002535,120.191
50%,1600.0,5.4,0.1016,39.6,0.004957,125.721
75%,4000.0,9.9,0.2286,71.3,0.015576,129.9955
max,20000.0,22.2,0.3048,71.3,0.058411,140.987


KeyError: 'x6'