# Ground Truth problems

These are problems for which the data-generating process is a known model:

$$ y = \phi^*(\mathbf{x}, \theta^*) $$

We assess how well symbolic regression algorithms find the form of the model, $\phi^*$, with some leniency on $\theta^*$: a model still counts as a solution if it differs from the true model only by an additive constant or by a multiplicative scalar.

In [1]:
import pandas as pd
import json
import numpy as np
from glob import glob
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
# Embed fonts in saved PDFs as TrueType (type 42) instead of matplotlib's default Type 3.
matplotlib.rc('pdf', fonttype=42)
import os
# Scale up seaborn's fonts for every figure drawn after this point.
sns.set(font_scale=1.2)
# Directory the result tables are read from and written to.
rdir = '../results/'
# Alternative figure directory (disabled):
# figdir = 'figs/ground-truth/'
# Directory that save() writes figures into.
figdir = '../paper/figs/results_sym_data/'

# Echo the figure directory so it is visible in the notebook output.
print('figdir:',figdir)
def save(h=None,name='tmp'):
    """Write a figure to ``figdir`` as ``<name>.pdf``.

    Parameters
    ----------
    h : object, optional
        Anything exposing ``tight_layout()`` and ``savefig()`` (for example a
        matplotlib Figure or a seaborn grid). Defaults to the current pyplot
        figure.
    name : str
        Base file name. Surrounding whitespace is stripped, spaces become
        ``-`` and ``%`` becomes ``pct``.
    """
    name = name.strip().replace(' ','-').replace('%','pct')
    if h is None:
        h = plt.gcf()
    h.tight_layout()
    print('saving',name+'.pdf')
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(figdir, exist_ok=True)
    # Save through `h` itself: plt.savefig would write whichever figure is
    # current, which is not necessarily the one the caller handed in.
    h.savefig(os.path.join(figdir, name+'.pdf'), dpi=400, bbox_inches='tight')
    

figdir: ../paper/figs/results_sym_data/


# read data from feather

In [2]:
# Main results table.
df_results = pd.read_feather(rdir+'ground-truth_results.feather')

# combine with local results ---------------------------------
# Read through `rdir` like every other result file so the location is
# configured in exactly one place.
df_results_local = pd.read_feather(rdir+'ground-truth_results_local.feather')
# ignore_index: each input frame carries its own row labels; renumbering
# prevents duplicate labels in the combined table.
df_results = pd.concat([df_results_local, df_results], axis=0, ignore_index=True)

# removing feat (it was a sanity check)
# df_results = df_results.loc[df_results['algorithm']!='FEAT(e-lex)']

# Rows per algorithm divided by the number of distinct datasets.
print('mean trial count:')
print(df_results.groupby('algorithm')['dataset'].count().sort_values()
      / df_results.dataset.nunique())

mean trial count:
algorithm
C-D-TS        4.007692
Baseline      4.384615
D-UCB1        4.392308
C-D-UCB1      4.453846
D-TS          4.484615
AIFeynman    35.915385
FEAT         36.169231
gplearn      37.523077
MRGP         37.838462
EPLEX        38.000000
AFP          38.000000
BSR          38.000000
GP-GOMEA     38.000000
FFX          38.000000
ITEA         38.000000
DSR          38.000000
SBP-GP       38.000000
AFP_FE       40.000000
Operon       40.000000
Name: dataset, dtype: float64


In [3]:
# Algorithms present in the combined table.
print(pd.unique(df_results["algorithm"]))
# Show the rows produced by FEAT.
df_results.loc[df_results["algorithm"].eq("FEAT")]

['C-D-UCB1' 'D-TS' 'Baseline' 'D-UCB1' 'C-D-TS' 'FEAT' 'AFP' 'DSR' 'FFX'
 'GP-GOMEA' 'ITEA' 'Operon' 'AIFeynman' 'BSR' 'EPLEX' 'AFP_FE' 'gplearn'
 'MRGP' 'SBP-GP']


Unnamed: 0,dataset,algorithm,random_state,training time (s),symbolic_model,mse_train,mae_train,r2_train,mse_test,mae_test,...,symbolic_fraction,symbolic_error_is_zero,symbolic_error_is_constant,symbolic_fraction_is_constant,sympy_exception,training time (hr),r2_zero_test,data_group,symbolic_solution,process_time
0,feynman_III_10_19,FEAT,29910,7477.133516,16.4356+-9.06*(0.1340*(0.0948*x_0-0.4616*(0.03...,5.158905e-03,4.934706e-02,0.999902,4.657973e-03,4.577807e-02,...,(0.003*Bx*mom - 3.36*Bx + 0.069*By + 1.01*By*e...,False,False,False,,2.076982,0.999911,Feynman,False,7472.385775
9,feynman_III_10_19,FEAT,23654,636.218597,16.3018+6.28*x_0+5.44*(0.8835*x_3+0.2564*x_1)+...,7.122876e+00,2.167380e+00,0.873671,4.012022e+00,1.710611e+00,...,(0.04303*Bx**2 + 0.27802*Bx*mom + 1.854*Bx + 0...,False,False,False,,0.176727,0.924955,Feynman,False,635.630530
191,feynman_III_10_19,FEAT,14423,7576.555284,16.466+-13.74*(0.0375*x_0-0.8222*x_2)+10.10*(0...,1.545038e-03,2.661330e-02,0.999971,1.679500e-03,2.710005e-02,...,,False,False,False,,2.104599,0.999968,Feynman,False,7570.953067
192,feynman_III_10_19,FEAT,14423,7059.683217,16.4656+-6.52*(0.0144*exp(0.5034*(0.2403*x_1-0...,4.635524e-03,4.889632e-02,0.999913,4.473421e-03,4.674296e-02,...,(0.047*Bx*(0.83*By - 0.936*Bz)*(-0.e-3*Bx + 0....,False,False,False,,1.961023,0.999914,Feynman,False,7056.197374
193,feynman_III_10_19,FEAT,14423,2549.147430,16.4629+4.38*(0.3638*x_1+1.0988*x_3)+-3.35*(0....,4.836700e-02,1.729812e-01,0.999097,1.619409e-02,9.277797e-02,...,(-0.003*Bx*Bz*(0.424*Bx - 0.1*mom) + 1.593*Bx ...,False,False,False,,0.708097,0.999689,Feynman,False,2541.692349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68830,strogatz_vdp2,FEAT,5390,148.743288,-0.0494531+-0.09*x_0,2.062118e-17,3.499304e-09,1.000000,2.039534e-17,3.579783e-09,...,-10*(-0.09*x - 0.049)/x,False,False,False,,0.041318,1.000000,Strogatz,False,148.680772
68831,strogatz_vdp2,FEAT,5390,178.787194,-0.0494504+-0.09*x_0,1.073537e-08,8.326731e-05,0.999999,1.766185e-10,1.225463e-05,...,-10*(-0.09*x - 0.049)/x,False,False,False,,0.049663,1.000000,Strogatz,False,178.228066
68832,strogatz_vdp2,FEAT,860,183.814810,-0.050107+-0.09*x_0,1.912654e-17,3.424174e-09,1.000000,2.227791e-17,3.519332e-09,...,-10*(-0.09*x - 0.05)/x,False,False,False,,0.051060,1.000000,Strogatz,False,183.729596
68833,strogatz_vdp2,FEAT,860,195.807367,-0.0501146+-0.09*x_0,9.843030e-09,7.901096e-05,0.999999,7.171177e-11,6.915918e-06,...,-10*(-0.09*x - 0.05)/x,False,False,False,,0.054391,1.000000,Strogatz,False,194.623040


# compute symbolic solutions

In [4]:
# A run counts as a symbolic solution when any of the three checks passed.
solution_flags = df_results[['symbolic_error_is_zero',
                             'symbolic_error_is_constant',
                             'symbolic_fraction_is_constant']]
# Missing flags must count as "not solved". The builtin any() applied to a raw
# row treats NaN as truthy, so replace missing entries with False before the
# (vectorized) row-wise reduction.
df_results.loc[:,'symbolic_solution'] = (
    solution_flags.where(solution_flags.notna(), False).astype(bool).any(axis=1)
)
print(df_results['algorithm'].unique())
display(df_results.sample(3).T)

# clean up any corner cases (constant models, failures):
# a missing simplified model, or one that is literally '0' or 'nan', is never a solution.
simplified_model = df_results['simplified_symbolic_model']
has_real_model = simplified_model.notna() & ~simplified_model.isin(['0', 'nan'])
df_results.loc[:,'symbolic_solution'] = df_results['symbolic_solution'] & has_real_model

display(df_results.sample(3).T)
print(df_results['algorithm'].unique())

# save results for detailed tabulating
df_results.reset_index(drop=True).to_feather(rdir+'ground-truth_solns.feather')

['C-D-UCB1' 'D-TS' 'Baseline' 'D-UCB1' 'C-D-TS' 'FEAT' 'AFP' 'DSR' 'FFX'
 'GP-GOMEA' 'ITEA' 'Operon' 'AIFeynman' 'BSR' 'EPLEX' 'AFP_FE' 'gplearn'
 'MRGP' 'SBP-GP']


Unnamed: 0,34242,41177,66581
dataset,feynman_I_25_13,feynman_I_34_14,strogatz_predprey2
algorithm,gplearn,AFP_FE,BSR
random_state,4426,21962,14423
training time (s),1805.92346,28840.791287,28800.047815
symbolic_model,"div(X0, X1)",((x_2*1.113)+(((sqrt(|x_2|)*(sin(cos((x_1/sqrt...,[-0.50802657]+[-0.02050728]*(x[1]+-0.3137*(x[0...
mse_train,0.000002,0.000942,0.008546
mae_train,0.001151,0.022713,0.054704
r2_train,0.999997,0.999641,0.996639
mse_test,0.0,0.000911,0.037988
mae_test,0.0,0.022435,0.069177


Unnamed: 0,27749,65503,36307
dataset,feynman_I_13_12,strogatz_lv2,feynman_II_34_2a
algorithm,EPLEX,BSR,AFP_FE
random_state,29910,16850,15795
training time (s),10798.808352,17033.168303,28819.53557
symbolic_model,(((((x_2-x_3)+(-0.031+0.456))/(-0.045-x_0))+((...,[-0.06279453]+[-0.32680257]*0.7135*(x[0]+-(-(x...,((x_0*log(x_1))/(x_2+x_2))
mse_train,3.083165,0.002356,0.007395
mae_train,1.106165,0.025383,0.064399
r2_train,0.9655,0.99653,0.965882
mse_test,3.208093,0.071619,0.007616
mae_test,1.115437,0.05708,0.06476


In [None]:
# Distinct algorithm names, in order of first appearance.
pd.unique(df_results["algorithm"])

In [None]:
# Distinct data groups, in order of first appearance.
pd.unique(df_results["data_group"])

## summarize results by dataset, including ranking

In [None]:
# Key sets shared by the aggregations below.
group_keys = ['dataset','target_noise']
trial_keys = ['algorithm','dataset','target_noise']

# number of distinct algorithms per (dataset, noise level) -> 'algorithm_count'
df_results2 = df_results.merge(
    df_results.groupby(group_keys)['algorithm'].nunique().reset_index(),
    on=group_keys, suffixes=('','_count'))

# count repeat trials -> 'random_state_repeats'
df_results2 = df_results2.merge(
    df_results2.groupby(trial_keys)['random_state'].nunique().reset_index(),
    on=trial_keys, suffixes=('','_repeats'))

# accuracy-based exact solutions (1.0 when test R^2 exceeds 0.999, else 0.0)
df_results2['accuracy_solution'] = (df_results2['r2_test'] > 0.999).astype(float)

# get mean solution rates for algs on datasets at specific noise levels, averaged over trials
for soln in ['accuracy_solution','symbolic_solution']:
    df_results2 = df_results2.merge(
        df_results2.groupby(trial_keys)[soln].mean().reset_index(),
        on=trial_keys, suffixes=('','_rate'))

# rankings within each (dataset, noise, random_state) trial
for col in [c for c in df_results2.columns if c.endswith('test') or c.endswith('size')]:
    ascending = 'r2' not in col  # higher R^2 is better; every other metric is lower-is-better
    # transform() keeps the result aligned with the original rows. A plain
    # groupby.apply() prepends the group keys to the index on pandas >= 2.0,
    # which breaks the column assignment.
    df_results2[col+'_rank_per_trial'] = (
        df_results2.groupby(['dataset','target_noise','random_state'])[col]
        .transform(lambda x: x.round(3).rank(ascending=ascending).astype(int)))

# Median over trials. numeric_only=True skips string columns (model text
# etc.), which otherwise raise a TypeError on pandas >= 2.0.
df_sum = df_results2.groupby(['algorithm','dataset','target_noise','data_group'],
                             as_index=False).median(numeric_only=True)

# rankings and normalized scores per dataset
for col in [c for c in df_sum.columns if c.endswith(('test','size','rate'))]:
    ascending = 'r2' not in col and 'solution' not in col
    df_sum[col+'_rank'] = df_sum.groupby(group_keys)[col].transform(
        lambda x: x.round(3).rank(ascending=ascending).astype(int))
    # min-max normalization within each (dataset, noise) group;
    # yields NaN when every value in the group is identical (0/0).
    df_sum[col+'_norm'] = df_sum.groupby(group_keys)[col].transform(
        lambda x: (x-x.min())/(x.max()-x.min()))

# solution rates as percentages
for soln in ['accuracy_solution','symbolic_solution']:
    df_sum[soln +'_rate_(%)'] = df_sum[soln+'_rate'] * 100

df_sum['rmse_test'] = np.sqrt(df_sum['mse_test'])
# log1p keeps precision for tiny errors; np.log(1 + x) collapses to exactly 0
# once x drops below float epsilon.
df_sum['log_mse_test'] = np.log1p(df_sum['mse_test'])
df_sum

# save summary data
The summary table saved here is used for the statistical comparisons.

In [None]:
# Persist the per-dataset summary as a gzip-compressed CSV, without the row index.
summary_csv = rdir+'/symbolic_dataset_results_sum.csv.gz'
df_sum.to_csv(summary_csv, index=False, compression='gzip')

In [None]:
# Algorithms that made it into the summary table.
pd.unique(df_sum["algorithm"])

In [None]:
# Summary rows belonging to the Baseline algorithm.
df_sum.loc[df_sum["algorithm"].eq("Baseline")]

# solution rates by alg/dataset/noise

In [None]:
# Mean symbolic solution rate (%) per algorithm (rows) and noise level (columns).
mean_rate_pct = (
    df_sum
    .groupby(['target_noise','algorithm'])['symbolic_solution_rate_(%)']
    .mean()
    .round(2)
)
mean_rate_pct.unstack('target_noise')

# plot comparisons

In [None]:
sns.set_style('whitegrid')
def compare(df_compare=None, x='r2_test',y='algorithm', row=None, col=None, scale=None, xlim=None, est=np.mean,
            orient='h', hue=None, **kwargs):
    """Draw a seaborn categorical plot comparing ``y`` groups on metric ``x``, then save it.

    Parameters
    ----------
    df_compare : pandas.DataFrame
        Data to plot. Must contain ``x``, ``y`` and a ``target_noise`` column
        (the latter is used to order the categories). It is copied, never
        modified.
    x, y : str
        Metric column and categorical column. Their names are title-cased
        (underscores become spaces) for display.
    row, col : str, optional
        Columns used to facet the plot.
    scale : str, optional
        Axis scale applied to the x axis of every facet (e.g. ``'log'``).
    xlim : sequence of two numbers, optional
        x-axis limits. With ``scale='log'`` and a lower limit of 0 the metric
        is shifted by 1 and the lower limit becomes 1. The caller's sequence
        is not modified.
    est : callable
        Aggregator applied to ``x`` within each (target_noise, y) group; it is
        only used to decide the category order.
    orient : str
        ``'v'`` swaps the roles of x and y; any other value leaves them as is.
    hue : str, optional
        Column used for colour grouping; also used as the legend title.
    **kwargs
        Forwarded to ``sns.catplot``, overriding the defaults set here.

    Raises
    ------
    ValueError
        If ``df_compare`` is None.
    """
    if df_compare is None:
        raise ValueError('compare() requires a DataFrame for df_compare')
    df_compare = df_compare.copy()
    # Private copy: the log-scale branch below rewrites xlim[0], and that must
    # not leak into a list the caller reuses for further calls.
    xlim = [] if xlim is None else list(xlim)
    aspect = 1 if (row is None and col is None) else 0.55

    # Order categories best-first: aggregate x per (noise, y), then average over noise levels.
    score_by_category = df_compare.groupby(['target_noise',y])[x].apply(est).unstack().mean()
    order = score_by_category.sort_values(ascending=False).index

    # Human-readable column names for the axis labels and facet titles.
    for c in [x,y,row,col]:
        if c:
            df_compare = df_compare.rename(columns={c:c.replace('_',' ').title()})

    x = x.replace('_',' ').title()
    y = y.replace('_',' ').title()
    if row:
        row = row.replace('_',' ').title()
    if col:
        col = col.replace('_',' ').title()

    # A log axis cannot start at 0: shift the metric by one and relabel it.
    if scale=='log' and len(xlim)>0 and xlim[0] == 0:
        df_compare.loc[:,x] += 1
        xlim[0] = 1
        xnew = '1 + '+x
        df_compare=df_compare.rename(columns={x:xnew})
        x = xnew
    if orient=='v':
        x, y = y, x
    if col and not row:
        col_wrap = min(4, df_compare[col].nunique())
    else:
        col_wrap=None

    cat_args = dict(
                data=df_compare,
                kind='point',
                y=y,
                x=x,
                order=order,
                row=row,
                col=col,
                col_wrap=col_wrap,
                palette='flare_r',
                margin_titles=True,
                aspect=aspect,
                hue=hue,
                legend_out=False,
    )
    cat_args.update(kwargs)
    g = sns.catplot( **cat_args )
    if hue:
        # Replace seaborn's legend (if one was created) with a compact one on the last facet.
        grid_legend = getattr(g, '_legend', None)
        if grid_legend is not None:
            grid_legend.remove()
        g.axes.flat[-1].legend(title=hue.replace('_',' ').title(),
                               fontsize=10
                              )
    for ax in g.axes.flat:
        ax.yaxis.grid(True)
        ax.set_ylabel('')
        ax.set_xlabel(ax.get_xlabel().replace('Symbolic ',''))
        if col:
            ttl = ax.get_title()
            ax.set_title(ttl.replace(col,'').replace('=',''))
        # Apply limits and scale on every facet explicitly rather than through
        # pyplot's "current axes", which only reaches the other facets when
        # the x axes are shared.
        if len(xlim)>0:
            ax.set_xlim(xlim[0],xlim[1])
        if scale:
            ax.set_xscale(scale)

    sns.despine(left=True, bottom=True)
    savename = '-'.join(['cat-'+cat_args['kind']+'plot',x+ '-by-'+ y])
    if row: savename += '_'+row
    if col: savename += '_'+col

    save(g, savename )

In [None]:
# One overall figure and one figure faceted by data group, for each metric.
for metric in ('symbolic_solution_rate_(%)', 'r2_test', 'accuracy_solution'):
    for kind in ('point',):
        args = {
            'df_compare': df_sum,
            'x': metric,
            'est': np.mean,
            'orient': 'h',
            'kind': kind,
        }
        if kind == 'point':
            # unconnected markers, one marker shape per hue level
            args.update(join=False, markers=['o','s','x','+'])
        if metric == 'r2_test':
            args['xlim'] = [-1, 1]
        for facet_col in (None, 'data_group'):
            compare(hue='target_noise', col=facet_col, **args)

In [None]:
# Point-plot grid: one panel per metric, algorithms on the y axis, coloured by noise level.
df_plot = df_sum.copy()

# Algorithms sorted by symbolic solution rate, averaged over noise levels (best first).
order = (
    df_plot.groupby(['target_noise','algorithm'])['symbolic_solution_rate']
    .mean()
    .unstack()
    .mean()
    .sort_values(ascending=False)
    .index
)
df_plot['size_diff'] = df_plot['model_size'] - df_plot['simplified_complexity'] + 1

# Metrics shown, one panel each.
x_vars = ['symbolic_solution_rate_(%)', 'r2_test', 'simplified_complexity']

g = sns.PairGrid(
    df_plot,
    x_vars=x_vars,
    y_vars=['algorithm'],
    height=6.5,
    aspect=0.7,
    hue='target_noise',
)
pointplot_options = dict(
    orient="h",
    order=order,
    palette="flare_r",
    errwidth=2,
    join=False,
    estimator=np.mean,
    n_boot=1000,
    markers=['x','o','s','+'],
)
g.map(sns.pointplot, **pointplot_options)
plt.legend(title='Target Noise')

titles = [name.replace('_',' ').title().replace('(S)','(s)') for name in x_vars]

for ax, title in zip(g.axes.flat, titles):
    is_plain_r2 = 'R2' in title and 'Rank' not in title
    # The panel title replaces the axis labels; R2 is typeset as math.
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set(title=title.replace('R2','$R^2$') if is_plain_r2 else title)

    # Size-, complexity- and time-like metrics are shown on a log axis.
    if any(keyword in title.lower() for keyword in ('size','complexity','time')):
        ax.set_xscale('log')
    if is_plain_r2:
        ax.set_xlim([0,1])

    # Make the grid horizontal instead of vertical
    ax.yaxis.grid(True)
save(g, 'pairgrid_'+'_'.join(x_vars))