In [1]:
from symreg import Regressor
import pandas as pd
import numpy as np
import random

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

In [2]:
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so
# this cell needs an older scikit-learn to run.
X, y = load_boston(return_X_y=True)

# Rescale features and target by their means, then hold out 40% for testing.
# NOTE(review): X.mean() is called without an axis, so every feature is divided
# by one scalar (the mean over all entries) rather than by its own column
# mean -- confirm this is intended.
X, Xt, y, yt = train_test_split(
    X / X.mean(),
    y / y.mean(),
    test_size=0.4,
    random_state=0,
)

# Accumulator for the random-search outcomes appended by the search cell.
results = []
Xt.shape

(203, 13)

In [None]:
# Search budget: how many random settings to try, and how many seconds each
# Regressor fit is allowed to run. Uncomment one of the presets below to
# override the default.
diff_data = 360
duration_per = 5  # 30 minutes
# diff_data, duration_per = 180, 5 # 15 minutes
# diff_data, duration_per = 120, 1 # 2 minutes
# diff_data, duration_per   = 50, .1 # instant


def time_left(diff_data):
    """Return the estimated time needed for `diff_data` runs.

    The estimate is `diff_data * duration_per` seconds converted to minutes
    and rendered as '<minutes> minutes'. `duration_per` is read from the
    module-level setting at call time.
    """
    remaining_minutes = diff_data * duration_per / 60
    return '{} minutes'.format(remaining_minutes)

print('Will take ' + time_left(diff_data))

def eval_settings(**settings):
    """Fit one Regressor with the given settings and score it on the test split.

    Args:
        **settings: keyword arguments forwarded unchanged to `Regressor`
            (in addition to `duration=duration_per` and `verbose=True`).

    Returns:
        A new dict holding every entry of `settings` plus an 'error' key:
        the median of the squared differences between `yt` and the
        predictions for `Xt`.
    """
    print()  # blank line separating the verbose output of consecutive fits
    model = Regressor(duration=duration_per, verbose=True, **settings)
    model.fit(X, y)

    squared_residuals = (yt - model.predict(Xt)) ** 2
    outcome = dict(settings)
    outcome['error'] = np.median(squared_residuals)
    return outcome


for run_idx in range(diff_data):
    # Disabled experiment, kept for reference:
    #     muts = [random.random()*.32, random.random()*.42]
    #     hoist_mutation_chance = .13
    #     grow_root_mutation_chance = .13
    #     muts = [m/(sum(muts)+hoist_mutation_chance+grow_root_mutation_chance) for m in muts]
    # muts[0] is pointwise mutation; used only for normalization

    # Draw one random configuration. The draws happen in the same order as the
    # keys are listed, so the sampled sequence is unchanged.
    population_size = random.choice([35, 50, 80])
    subtree_chance = random.random()
    mutation_children = random.random()*2
    crossover_children = random.random()*2
    simplify_chance = random.random()

    args = dict(
        n=population_size,
        complete_tree_as_new_subtree_chance=subtree_chance,
        mutation_children=mutation_children,
        crossover_children=crossover_children,
        simplify_chance=simplify_chance,
    )

    outcome = eval_settings(**args)
    results.append(outcome)
    print(outcome)
    print('------------', time_left(diff_data - (run_idx + 1)), 'left')
    print()

Will take 30.0 minutes

Complete. {'generations': 92, 'stagnated_generations': 0, 'duration': 5.287193775177002}
{'n': 50, 'complete_tree_as_new_subtree_chance': 0.8731163110769317, 'mutation_children': 0.0637212518550343, 'crossover_children': 1.8326634169659481, 'simplify_chance': 0.6772521876110039, 'error': 0.016893869242723816}
------------ 29.916666666666668 minutes left


Complete. {'generations': 448, 'stagnated_generations': 1, 'duration': 5.02456521987915}
{'n': 35, 'complete_tree_as_new_subtree_chance': 2.1383237117644782e-05, 'mutation_children': 0.23275169164959708, 'crossover_children': 1.2115853339317144, 'simplify_chance': 0.32149186163607735, 'error': 0.008418450758307574}
------------ 29.833333333333332 minutes left


Complete. {'generations': 623, 'stagnated_generations': 7, 'duration': 5.032642602920532}
{'n': 35, 'complete_tree_as_new_subtree_chance': 0.9292897622195738, 'mutation_children': 0.022113844747769118, 'crossover_children': 1.4081584885129352, 'simplify_

Complete. {'generations': 293, 'stagnated_generations': 3, 'duration': 5.022019624710083}
{'n': 80, 'complete_tree_as_new_subtree_chance': 0.43955332274509007, 'mutation_children': 0.3428765672057865, 'crossover_children': 0.27838417546984373, 'simplify_chance': 0.3694687984113636, 'error': 0.011768744256896854}
------------ 28.0 minutes left


Complete. {'generations': 1054, 'stagnated_generations': 5, 'duration': 5.008499622344971}
{'n': 35, 'complete_tree_as_new_subtree_chance': 0.3062461541894266, 'mutation_children': 0.37454370201409337, 'crossover_children': 0.17396379308641374, 'simplify_chance': 0.2563456624983893, 'error': 0.014484501950550321}
------------ 27.916666666666668 minutes left


Complete. {'generations': 128, 'stagnated_generations': 2, 'duration': 5.075653553009033}
{'n': 35, 'complete_tree_as_new_subtree_chance': 0.8069796073936458, 'mutation_children': 0.8273742635997168, 'crossover_children': 1.2680043941156824, 'simplify_chance': 0.9719586558285903, 'error': 0

In [None]:
# Persist the collected runs as a tab-separated file, lowest error first,
# without writing the DataFrame index.
(
    pd.DataFrame(results)
    .sort_values('error')
    .to_csv('metaopt.tsv', sep='\t', index=False)
)

In [None]:
# Reload the saved runs (lowest error first), dropping incomplete rows.
resdf = pd.read_csv('metaopt.tsv', sep='\t').sort_values('error').dropna()

# 32 percentile cut points (integer percents from ~3% up to 100%) of the error.
split = list(map(int, np.linspace(0, 100, 33)[1:]))
percs = [s/100 for s in split]
names = [f'{s}%' for s in split]
percentiles = resdf['error'].describe(percentiles=percs).loc[names]
print(names)

# chunk = number of cut points strictly below the run's error. The cut points
# are ascending, so a left-sided searchsorted gives exactly that count
# (same result as incrementing once per cut point the error exceeds).
resdf['chunk'] = np.searchsorted(
    percentiles.to_numpy(), resdf['error'].to_numpy(), side='left'
)

# Per-chunk mean of every column plus the number of runs in the chunk.
# The count is labelled 'count' so the frame does not end up with two columns
# both named 'error'; it stays the last column, with the mean error before it.
by_chunk = resdf.groupby('chunk')
final_res = pd.concat(
    [by_chunk.mean(), by_chunk['error'].count().rename('count')],
    axis=1,
)
final_res

In [None]:
# Per-chunk mean settings only: drop the last two columns of final_res
# (mean error and per-chunk run count).
# Take an explicit copy: the columns below are modified/added, and doing that
# on an uncopied slice raises SettingWithCopyWarning and can write the rescaled
# `n` back into final_res.
noerror = final_res.iloc[:, :-2].copy()

# Bring n down by a factor of 100 so it can share a y-axis with the
# other settings.
noerror['n'] = noerror['n'] / 100

# Two columns of uniform noise in [0.5, 0.7)
# (presumably a visual baseline for "no trend" -- confirm).
noerror['rand'] = np.random.random(len(noerror))/5+.5
noerror['rand2'] = np.random.random(len(noerror))/5+.5

noerror.plot(figsize=(15, 10))
noerror

In [None]:
# Spearman rank correlation between the per-chunk means (settings and error);
# the trailing per-chunk run-count column is excluded.
final_res.iloc[:, :-1].corr(method='spearman')

In [None]:
# Meta-regression: fit a symbolic regressor that predicts a run's error from
# the settings it was run with.
r2 = Regressor(duration=10, simplify_chance=0.7, verbose=True)

# Use dedicated names instead of rebinding X / y: eval_settings reads those
# globals as the training data, so overwriting them here would corrupt any
# later re-run of the search.
# Select columns by label rather than position so the split is still right
# when the helper 'chunk' column has not been added to resdf.
meta_X = resdf.drop(columns=['error', 'chunk'], errors='ignore')
meta_y = resdf['error']

r2.fit(meta_X, meta_y)

In [None]:
# Show what the meta-regression fit produced.
r2.results()

# Author's baseline note (the original comment is cut off after "1-simplif"):
# constant prog error: 0.0018655041589379236 before 0-simplification and 1-simplif

In [None]:
from symreg.ga import Program

# Evaluate a hand-copied program over a sweep of n and plot the error it
# predicts. A dedicated frame name is used so the global X (read by
# eval_settings as training data) is not overwritten.
# NOTE(review): the sweep covers n in [0, 1], whereas the search cell samples
# n from {35, 50, 80} -- confirm the intended scale.
n_sweep = pd.DataFrame({'n': np.linspace(0, 1, 1001)})

# The program text names its input '$n'; it is rewritten to '$0' so it refers
# to the first (and only) input passed to eval below.
p = Program(
    'mul add 1.3919445013961234 0.004935158262587003 mul pow $n 0.017308843547676457 exp -4.774142744886527'
    .replace('$n', '$0')
)
n_sweep['err'] = p.eval([n_sweep['n']])
n_sweep.plot(x='n', y='err', logy=True)

We chose `mutation_children` to be 1 in production. It seems the other arguments do not matter - the error is very close to the constant program (complexity 1).