In [1]:
from nsga2.estimator import NSGAIIRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv('../data/lexicase_paper/d_airfoil.txt', sep=',')

# DEAP interface requires X and y to be numpy arrays, not pandas dataframes
X = df.drop('label', axis=1).values
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y)

estimator = NSGAIIRegressor(**{
    'pop_size'        : 40, 
    'max_gen'         : 20,
    'max_depth'       : 6,  # 8
    'max_size'        : 2**6, # 75
    'objectives'      : ['error', 'size'],
    'initialization'  : 'uniform',
    'pick_criteria'   : 'error', # error, MCDM
    'validation_size' : 0.33,
    'simplify'        : False,
    
    # Either you use smart variation (just 1 cx and 1 mutation)
    'smart_variation' : False,
    'warm_up'         : False,

    # Or you use mabs (4 mutations)
    'use_mab'         : False,
    'use_context'     : False,

    'simplification_method' : 'bottom_up',
    'simplification_tolerance' : 1e-0,
    'verbosity'       : 1,
    'survival'       : 'tournament'
}).fit(X_train, y_train)

gen	evals	best_size	best_error	n_simplifications	n_new_hashes	avg train error	avg train size	avg val error	avg val size	med train error	med train size	med val error	med val size	std train error	std train size	std val error	std val size	min train error	min train size	min val error	min val size	max train error	max train size	max val error	max val size
0  	40   	8        	-1416.39  	61               	189         	               	              	             	            	               	              	             	            	               	              	             	            	               	              	             	            	               	              	             	            
1  	40   	1        	-47.306   	28               	48          	               	              	             	            	               	              	             	            	               	              	             	            	               	              	             	            	  

In [2]:
# Should not give erros even without mabs
pd.DataFrame(estimator.variator.mab.pull_history).iloc[:10]

Unnamed: 0,t,arm,reward,update,delta_error,gen,ball_id
0,0,point,1.0,0,"[166.72867862049134, -0.0]",1,1
1,1,point,1.0,0,"[13735.92090156993, -5.0]",1,1
2,2,point,0.0,0,"[-209.14030979132258, -0.0]",1,1
3,3,delete,1.0,0,"[14222.136759494624, -6.0]",1,1
4,4,delete,0.0,0,"[-11853.907811850224, -0.0]",1,1
5,5,subtree,1.0,0,"[inf, 44.0]",1,1
6,6,subtree,0.0,0,"[-0.0, -0.0]",1,1
7,7,insert,1.0,0,"[inf, 1.0]",1,1
8,8,insert,0.0,0,"[-10605.588558588204, 2.0]",1,1
9,9,cx,0.0,0,"[-15522.685708984027, -2.0]",1,1


In [3]:
pd.DataFrame(estimator.variator.mab.pull_history)['arm'].value_counts().sort_values()

cx         100
insert     126
point      164
subtree    171
delete     199
Name: arm, dtype: int64

In [4]:
pd.DataFrame(estimator.variator.mab.pull_history).groupby('arm')['reward'].value_counts().sort_values()

arm      reward
delete   0.0        22
cx       0.0        34
insert   0.0        34
subtree  0.0        34
point    0.0        40
cx       1.0        66
insert   1.0        92
point    1.0       124
subtree  1.0       137
delete   1.0       177
Name: reward, dtype: int64

In [9]:
variation_log = pd.DataFrame(estimator.variator.log)

print("statistics (without nans and infs)")
for col in ['delta error', 'delta size', 'euclid dist prediction']:
    print(col)
    print(variation_log
            .groupby('variation')[col]
            # .apply (lambda c: np.nanmean(c.replace([np.inf, -np.inf], np.nan).dropna()) )
            .apply (lambda c: np.nanmedian(c) )
            .sort_values()
    )
    print('-'*40)

statistics (without nans and infs)
delta error
variation
cx             2.198263
delete         2.295486
point          2.472891
insert        81.759586
subtree    15389.506117
Name: delta error, dtype: float64
----------------------------------------
delta size
variation
delete    -2.0
cx         0.0
point      0.0
insert     1.0
subtree    6.0
Name: delta size, dtype: float64
----------------------------------------
euclid dist prediction
variation
subtree    32.309526
cx         40.834866
delete     40.895770
insert     43.937488
point      45.788917
Name: euclid dist prediction, dtype: float64
----------------------------------------


In [6]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse

model      = str(estimator.best_estimator_).replace("ARG", "x_")
size       = len(estimator.best_estimator_)
complexity = size
depth      = estimator.best_estimator_.height

print(model)
print(size)
print(complexity)
print(depth)

for metric, fn, (data_X, data_y) in [
    ('train_r2',  r2_score, (X_train, y_train)),
    ('test_r2',   r2_score, (X_test,  y_test )),
    ('train_mse', mse,      (X_train, y_train)),
    ('test_mse',  mse,      (X_test,  y_test )),
]:
    score = np.nan
    try:
        score = fn(estimator.predict(data_X), data_y)
        print(f"{metric} : {score}")
    except ValueError:
        print(f"(Failed to calculate {metric}")

for ind in estimator.archive_:
    print(ind.fitness, ind)

subtract(124.9112286629668, subtract(cdiv(mul4(0.9411760897924463, x_2, x_0, x_4), x_4), square(arctan(add3(2843781.80219572, x_0, x_1)))))
16
16
5
train_r2 : -0.35691782112193815
test_r2 : -0.13493308968100837
train_mse : 26.164647301273668
test_mse : 24.13754296311155
(26.485483057186894, 16.0) subtract(124.9112286629668, subtract(cdiv(mul4(0.9411760897924463, ARG2, ARG0, ARG4), ARG4), square(arctan(add3(2843781.80219572, ARG0, ARG1)))))
(33.30673712978105, 13.0) subtract(-9236.100111111264, cdiv(cdiv(-110710.81655608509, ARG4), exp(square(arctan(add3(2290.9174664408274, ARG0, ARG1))))))
(35.34732435102061, 12.0) subtract(-9377.602363401324, cdiv(cdiv(-25291.37907274315, ARG4), square(arctan(add3(957.5179493520319, ARG0, ARG1)))))


In [7]:
if False:
    print( len(list(estimator.simplifier.pop_hash.keys())) )

    n_keys =  len(list(estimator.simplifier.pop_hash.keys()))

    for key in list(estimator.simplifier.pop_hash.keys())[:n_keys]:
        print(key)
        for ind in estimator.simplifier.pop_hash[key]:
            print(" -", ind)

In [8]:
if True:
    print( len(list(estimator.variator.variator_.pop_hash.keys())) )

    n_keys =  len(list(estimator.variator.variator_.pop_hash.keys()))

    for key in list(estimator.variator.variator_.pop_hash.keys())[:n_keys]:
        print(key)
        for ind in estimator.variator.variator_.pop_hash[key]:
            print(" -", ind)

AttributeError: 'Variator' object has no attribute 'variator_'

In [None]:
display(df.describe())

A = np.maximum(df['x0'], df['x4']+29.657).values
B = df['x4'].values

print(np.std(A) * (A - np.mean(A))[:5])
print(np.std(B) * (B - np.mean(B))[:5])