In [2]:
"""Collates json-formatted results, cleans them up and saves them as .feather
files."""
# Author: William La Cava, williamlacava@gmail.com
# SRBENCH
# License: GPLv3

################################################################################
# Ground-truth problems
################################################################################
import pandas as pd
import json
import numpy as np
from glob import glob
from tqdm import tqdm
import os
import sys

rdir = '../results_sym_data/'
print('reading results from  directory', rdir)


symbolic_algs = [
    'AFP', 
    'AFP_FE',
    'BSR',
    'DSR',
    'FFX',
    'FEAT',
    'EPLEX',
    'GP-GOMEA',
    'gplearn',
    'ITEA', 
    'MRGP', 
    'Operon',
    'SBP-GP',
    'AIFeynman',

    'Brush',
    'Brush wo split',
    'Brush (D-UCB1)',
    'Brush (C-D-UCB1)',
    'Brush (D-TS)',
    'Brush (C-D-TS)',
    'Brush wo split (D-UCB1)',
]
nongp_algs = [
    'BSR',
    'DSR',
    'AIFeynman'
]
gp_algs = [
    'AFP', 
    'AFP_FE',
    'FFX',
    'FEAT',
    'EPLEX',
    'GP-GOMEA',
    'gplearn',
    'ITEA', 
    'MRGP', 
    'Operon',
    'SBP-GP',

    'Brush',
    'Brush wo split',
    'Brush (D-UCB1)',
    'Brush (C-D-UCB1)',
    'Brush (D-TS)',
    'Brush (C-D-TS)',
    'Brush wo split (D-UCB1)',
]

##########
# load data from json
##########

frames = []
excluded_datasets = [
    'feynman_test_10',
    'feynman_I_26_2',
    'feynman_I_30_5'
]
excluded_cols = [
    'params'
]
fails = []
bad_bsr = []
updated = 0
for f in tqdm(glob(rdir + '/*/*.json')):

    if os.path.exists(f+'.updated'):
        f += '.updated'
        updated += 1
    if 'cv_results' in f: 
        continue
    if 'EHC' in f:
        continue

    if any([ed in f for ed in excluded_datasets]):
        continue

    # Filtering brushes
    # if not any([c in f for c in ['brush_500','brush_D_UCB1_500','brush_wo_split_500','brush_wo_split_D_UCB1_500',]]):
    #     continue

    if "_dso_" not in f:
        continue

    try: 
        r = json.load(open(f,'r'))
        if isinstance(r['symbolic_model'],list):
            print('WARNING: list returned for model:',f)
            bad_bsr.append(f)
            sm = ['B'+str(i)+'*'+ri for i, ri in enumerate(r['symbolic_model'])]
            sm = '+'.join(sm)
            r['symbolic_model'] = sm
            
        sub_r = {k:v for k,v in r.items() if k not in excluded_cols}
    #     df = pd.DataFrame(sub_r)
        frames.append(sub_r) 
    #     print(f)
    #     print(r.keys())
    except Exception as e:
        fails.append([f,e])
        pass
    
print('{} results files loaded, {} ({:.1f}%) of which are '
	'updated'.format(len(frames), updated, updated/len(frames)*100))
print(len(fails),'fails:')
for f in fails: 
    print(f[0])
print('bad bsr:',bad_bsr)
df_results = pd.DataFrame.from_records(frames)

reading results from  directory ../results_blackbox/


100%|██████████| 14016/14016 [00:07<00:00, 1931.15it/s]


803 results files loaded, 0 (0.0%) of which are updated
0 fails:
bad bsr: []


In [3]:
##########
# cleanup
##########

df_results = df_results.rename(columns={'time_time':'training time (s)'})
df_results.loc[:,'training time (hr)'] = df_results['training time (s)']/3600

# add modified R2 with 0 floor
df_results['r2_zero_test'] = df_results['r2_test'].apply(lambda x: max(x,0))
for col in ['symbolic_error_is_zero', 'symbolic_error_is_constant', 'symbolic_fraction_is_constant']:
    df_results.loc[:,col] = df_results[col].fillna(False)

print(','.join(df_results.algorithm.unique()))

# remove 'Regressor' from names
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('Regressor','')) 

#Rename SGD to Linear
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: 'Linear' if x=='SGD' else x)

# rename sembackpropgp to SBP
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('sembackpropgp','SBP-GP'))

# rename FE_AFP to AFP_FE
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('FE_AFP','AFP_FE'))

# rename GPGOMEA to GP-GOMEA
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('GPGOMEA','GP-GOMEA'))

df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('brush_500', 'Brush'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('brush_D_UCB1_500', 'Brush (D-UCB1)'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('brush_wo_split_500','Brush wo split'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('brush_wo_split_D_UCB1_500','Brush wo split (D-UCB1)'))
 
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('brush_wo_split_D_UCB1','Brush wo split (D-UCB1)'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('brush_wo_split','Brush wo split'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('brush_D_UCB1', 'Brush (D-UCB1)'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('brush_C_D_UCB1', 'Brush (C-D-UCB1)'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('brush_D_TS', 'Brush (D-TS)'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('brush_C_D_TS', 'Brush (C-D-TS)'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('brush', 'Brush'))

df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('e2et','E2E'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('tpsr','TPSR+E2E'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('dso','uDSR'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('nesymres10M','NeSymRes 10M'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('nesymres100M','NeSymRes 100M'))

df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('PSTreeRegressor','PS-Tree'))
df_results['algorithm'] = df_results['algorithm'].apply(lambda x: x.replace('pstree','PS-Tree'))

df_results['symbolic_alg'] = df_results['algorithm'].apply(lambda x: x in symbolic_algs)

# indicator of strogatz or feynman
df_results['data_group'] = df_results['dataset'].apply(lambda x: 'Feynman' if 'feynman' in x else 'Strogatz') 

# filling empty target noise with zeros
# df_results['target_noise']  = df_results['target_noise'].fillna(0)
# df_results['feature_noise'] = df_results['feature_noise'].fillna(0)

##########
# compute symbolic solutions
##########
print(df_results.columns)
display(df_results.sample(3).T)
df_results.loc[:,'symbolic_solution'] = df_results[['symbolic_error_is_zero',
                                                    'symbolic_error_is_constant',
                                                    'symbolic_fraction_is_constant']
                                                   ].apply(any,raw=True, axis=1)
df_results.loc[:,'symbolic_solution'] = df_results['symbolic_solution'] & ~df_results['simplified_symbolic_model'].isna() 
df_results.loc[:,'symbolic_solution'] = df_results['symbolic_solution'] & ~(df_results['simplified_symbolic_model'] == '0')
df_results.loc[:,'symbolic_solution'] = df_results['symbolic_solution'] & ~(df_results['simplified_symbolic_model'] == 'nan')

KeyError: 'symbolic_error_is_zero'

In [None]:
for col in ['algorithm','dataset']:
    print(df_results[col].nunique(), col+'s')

print('mean trial count:')
print(df_results.groupby('algorithm')['dataset'].count().sort_values()
      / df_results.dataset.nunique())

In [None]:
print(df_results.shape)
##########
# save results
##########
df_results.to_feather('../results/ground-truth_results_dso2.feather')
print('results saved to ../results/ground-truth_results_dso2.feather')