In [1]:
from nsga2.estimator import NSGAIIRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv('../data/lexicase_paper/d_concrete.txt', sep=',')

# DEAP interface requires X and y to be numpy arrays, not pandas dataframes
X = df.drop('label', axis=1).values
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y)

estimator = NSGAIIRegressor(**{
    'pop_size'        : 80, 
    'max_gen'         : 100,
    'max_depth'       : 6,  # 8
    'max_size'        : 2**6, # 75
    'objectives'      : ['error', 'size'],
    'cx_prob'         : 1/5,
    'initialization'  : 'uniform',
    'pick_criteria'   : 'error', # error, MCDM
    'validation_size' : 0.33,
    'simplify'        : True,
    'simplification_method' : 'bottom_up',
    'verbosity'       : 1,
    'survival'       : 'tournament',
    'functions'       :[
                    'div', 'add', 'sub', 'mul',
                    'add3', 'add4', 'mul3', 'mul4',
                    'maximum', 'minimum',
                    'sin', 'cos', 'tan', 'arcsin', 'arccos', 'arctan',
                    'log1p', 'expm1', 'log', 'exp', 
                    'sqrt', 'sqrtabs', 'square', 'abs' 
                    ]
}).fit(X_train, y_train)

gen	evals	best_size	n_simplifications	n_new_hashes	avg train error	avg train size	avg val error	avg val size	med train error	med train size	med val error	med val size	std train error	std train size	std val error	std val size	min train error	min train size	min val error	min val size	max train error	max train size	max val error	max val size
0  	80   	1        	101              	367         	               	              	             	            	               	              	             	            	               	              	             	            	               	              	             	            	               	              	             	            
1  	80   	1        	10               	62          	               	              	             	            	               	              	             	            	               	              	             	            	               	              	             	            	               	              	    

In [2]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse

model      = str(estimator.best_estimator_).replace("ARG", "x_")
size       = len(estimator.best_estimator_)
complexity = size
depth      = estimator.best_estimator_.height

print(model)
print(size)
print(complexity)
print(depth)

for metric, fn, (data_X, data_y) in [
    ('train_r2',  r2_score, (X_train, y_train)),
    ('test_r2',   r2_score, (X_test,  y_test )),
    ('train_mse', mse,      (X_train, y_train)),
    ('test_mse',  mse,      (X_test,  y_test )),
]:
    score = np.nan
    try:
        score = fn(estimator.predict(data_X), data_y)
        print(f"{metric} : {score}")
    except ValueError:
        print(f"(Failed to calculate {metric}")

for ind in estimator.archive_:
    print(ind.fitness, ind)

add4(sqrt(x_0), multiply(mul3(log(x_7), add4(cdiv(-38.39720799869897, sqrt(x_1)), cdiv(-23492.096503973815, sqrt(x_5)), x_1, add4(x_1, x_0, x_2, x_6)), x_5), 1.8515059312702405e-05), add4(sqrt(x_0), -46.92870895231074, arctan(sqrt(exp(x_2))), sqrt(x_0)), log1p(mul3(120.9839388616722, sqrt(x_0), arctan(x_4))))
41
41
6
train_r2 : 0.815719100179013
test_r2 : 0.8008617761817125
train_mse : 43.72593177379217
test_mse : 49.37644033329807
(43.671051100195015, 41.0) add4(sqrt(ARG0), multiply(mul3(log(ARG7), add4(cdiv(-38.39720799869897, sqrt(ARG1)), cdiv(-23492.096503973815, sqrt(ARG5)), ARG1, add4(ARG1, ARG0, ARG2, ARG6)), ARG5), 1.8515059312702405e-05), add4(sqrt(ARG0), -46.92870895231074, arctan(sqrt(exp(ARG2))), sqrt(ARG0)), log1p(mul3(120.9839388616722, sqrt(ARG0), arctan(ARG4))))
(43.781020146127744, 39.0) add4(sqrt(ARG0), multiply(mul3(log(ARG7), add4(-833.3587905476098, cdiv(17.29003715385261, sqrt(ARG5)), ARG1, add4(ARG1, ARG0, ARG2, ARG6)), square(ARG5)), 2.1339400830083158e-08), add

In [3]:
print( len(list(estimator.simplifier.pop_hash.keys())) )

n_keys =  len(list(estimator.simplifier.pop_hash.keys()))

for key in list(estimator.simplifier.pop_hash.keys())[:n_keys]:
    print(key)
    for ind in estimator.simplifier.pop_hash[key]:
        print(" -", ind)

# 11001111011111110110111010010100
# 11010101000100110111001000101100

7691
1010110111001101001111000000001001111101001010101111110010110100000011101111100100101100000110001101100001100111111110110011111100011110110010100011101001100100000011100010010101011100000001011110100111111101000010000101100001100101001010101001101100110110
 - ARG0
 - absolute(ARG0)
 - maximum(ARG4, ARG0)
 - maximum(ARG0, ARG4)
 - square(sqrt(ARG0))
 - maximum(ARG0, -15.862814808154681)
 - maximum(ARG0, -65.37853341943791)
 - maximum(ARG0, ARG0)
 - maximum(-80.04097589838605, ARG0)
 - sqrt(square(ARG0))
 - log1p(expm1(ARG0))
 - minimum(ARG0, ARG0)
 - maximum(ARG0, 0.0)
 - maximum(ARG0, -43.93960259111314)
 - minimum(ARG0, ARG5)
 - maximum(ARG0, -76.40862531636154)
 - minimum(ARG0, ARG6)
 - maximum(tan(ARG4), ARG0)
 - maximum(sqrt(ARG0), ARG0)
 - maximum(subtract(ARG6, ARG5), ARG0)
 - maximum(ARG0, add3(ARG7, -117.8580242919264, -23.43992711282685))
 - maximum(ARG0, add(mul4(ARG3, 0.00017144078155443955, ARG4, ARG7), 29.11592537590536))
1001011101111111100011100000010000001010011100

In [4]:
display(df.describe())

A = np.maximum(df['x0'], df['x4']+29.657).values
B = df['x6'].values

print(np.std(A) * (A - np.mean(A))[:5])
print(np.std(B) * (B - np.mean(B))[:5])

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,label
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


[27036.47147454 27036.47147454  5361.93013146  5361.93013146
 -8624.67751114]
[ -7819.81227097  -7819.81227097 -14391.05039658 -14391.05039658
   4160.67675074]
