In [1]:
"""Collates json-formatted results, cleans them up and saves them as .feather
files."""
# Author: William La Cava, williamlacava@gmail.com
# SRBENCH
# License: GPLv3

################################################################################
# Ground-truth problems
################################################################################
import pandas as pd
import json
import numpy as np
from glob import glob
from tqdm import tqdm
import os
import sys

rdir = '../results_sym_data/'
print('reading results from  directory', rdir)

##########
# load data from json
##########

frames = []
excluded_datasets = [
    'feynman_test_10',
    'feynman_I_26_2',
    'feynman_I_30_5'
]
excluded_cols = [
    'params'
]
fails = []
bad_bsr = []
updated = 0
for f in tqdm(glob(rdir + '/*/*.json')):
    if os.path.exists(f+'.updated'):
        f += '.updated'
        updated += 1
    if 'cv_results' in f: 
        continue
    if 'EHC' in f:
        continue
    if any([ed in f for ed in excluded_datasets]):
        continue
    try: 
        r = json.load(open(f,'r'))
        if isinstance(r['symbolic_model'],list):
            print('WARNING: list returned for model:',f)
            bad_bsr.append(f)
            sm = ['B'+str(i)+'*'+ri for i, ri in enumerate(r['symbolic_model'])]
            sm = '+'.join(sm)
            r['symbolic_model'] = sm
            
        sub_r = {k:v for k,v in r.items() if k not in excluded_cols}
    #     df = pd.DataFrame(sub_r)
        frames.append(sub_r) 
    #     print(f)
    #     print(r.keys())
    except Exception as e:
        fails.append([f,e])
        pass
    
print('{} results files loaded, {} ({:.1f}%) of which are '
	'updated'.format(len(frames), updated, updated/len(frames)*100))
print(len(fails),'fails:')
for f in fails: 
    print(f[0])
print('bad bsr:',bad_bsr)
df_results = pd.DataFrame.from_records(frames)

reading results from  directory ../results_sym_data/


100%|██████████| 3185/3185 [00:12<00:00, 256.21it/s]

3157 results files loaded, 3136 (99.3%) of which are updated
0 fails:
bad bsr: []





In [2]:
##########
# cleanup
##########

df_results = df_results.rename(columns={'time_time':'training time (s)'})
df_results.loc[:,'training time (hr)'] = df_results['training time (s)']/3600

# add modified R2 with 0 floor
df_results['r2_zero_test'] = df_results['r2_test'].apply(lambda x: max(x,0))
for col in ['symbolic_error_is_zero', 'symbolic_error_is_constant', 'symbolic_fraction_is_constant']:
    df_results.loc[:,col] = df_results[col].fillna(False)

print(','.join(df_results.algorithm.unique()))

# remove 'Regressor' from names
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('brush_C_D_UCB1','C-D-UCB1'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('brush_C_D_TS','C-D-TS'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('brush_D_UCB1','D-UCB1'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('brush_D_TS','D-TS'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('brush','Baseline'))

df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('Regressor','')) 
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('tuned.','')) 
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('sembackpropgp','SBP-GP')) 

# rename FE_AFP to AFP_FE
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('FE_AFP','AFP_FE'))

# rename GPGOMEA to GP-GOMEA
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('GPGOMEA','GP-GOMEA'))

# indicator of strogatz or feynman
df_results['data_group'] = df_results['dataset'].apply(lambda x: 'Feynman' if 'feynman' in x else 'Strogatz') 

# filling empty target noise with zeros
# df_results['target_noise']  = df_results['target_noise'].fillna(0)
# df_results['feature_noise'] = df_results['feature_noise'].fillna(0)

##########
# compute symbolic solutions
##########
print(df_results.columns)
display(df_results.sample(3).T)
df_results.loc[:,'symbolic_solution'] = df_results[['symbolic_error_is_zero',
                                                    'symbolic_error_is_constant',
                                                    'symbolic_fraction_is_constant']
                                                   ].apply(any,raw=True, axis=1)
df_results.loc[:,'symbolic_solution'] = df_results['symbolic_solution'] & ~df_results['simplified_symbolic_model'].isna() 
df_results.loc[:,'symbolic_solution'] = df_results['symbolic_solution'] & ~(df_results['simplified_symbolic_model'] == '0')
df_results.loc[:,'symbolic_solution'] = df_results['symbolic_solution'] & ~(df_results['simplified_symbolic_model'] == 'nan')

  df_results.loc[:,col] = df_results[col].fillna(False)


brush,brush_C_D_UCB1,brush_D_UCB1,brush_D_TS,brush_C_D_TS
Index(['dataset', 'algorithm', 'random_state', 'training time (s)',
       'symbolic_model', 'mse_train', 'mae_train', 'r2_train', 'mse_test',
       'mae_test', 'r2_test', 'simplicity', 'model_size', 'target_noise',
       'feature_noise', 'true_model', 'simplified_symbolic_model',
       'simplified_complexity', 'symbolic_error', 'symbolic_fraction',
       'symbolic_error_is_zero', 'symbolic_error_is_constant',
       'symbolic_fraction_is_constant', 'sympy_exception',
       'training time (hr)', 'r2_zero_test', 'data_group'],
      dtype='object')


Unnamed: 0,676,1509,660
dataset,feynman_II_11_3,strogatz_shearflow2,feynman_I_37_4
algorithm,Baseline,Baseline,C-D-TS
random_state,15795,16850,23654
training time (s),431.666361,32.528816,18391.659523
symbolic_model,"Div(Pow(Add(Mul(Sin(0.24*q),51.56*Ef),Sub(Exp(...","0.52*Sin(Add(3.32*Tanh(0.74*x),-0.07))","Mul(Add(Sub(3.31,2.55*Cos(1.00*Add(delta,3.12)..."
mse_train,0.000167,0.013891,0.028426
mae_train,0.009309,0.076592,0.115344
r2_train,0.988527,0.757441,0.99649
mse_test,0.000168,0.012387,0.029664
mae_test,0.009338,0.072156,0.117096


In [3]:
print(df_results.shape)
##########
# save results
##########
df_results.to_feather('../results/ground-truth_results_local.feather')
print('results saved to ../results/ground-truth_results_local.feather')

(3157, 28)
results saved to ../results/ground-truth_results_local.feather


In [4]:
# 3157