# Benchmark for Symbolic Regression

In [3]:
import os

import numpy as np

## Artificial bee colony programming for symbolic regression

1-D: 20 samples

{2,3}-D: 100 samples

Non-terminals: +, -, *, /, sin, cos, exp, log

MAE

$$
F_1 = x^3 + x^2 + x, x \in [-1,1] \\
F_2 = x^4 + x^3 + x^2 + x, x \in [-1,1] \\
F_3 = x^5 + x^4 + x^3 + x^2 + x, x \in [-1,1] \\
F_4 = x^6 + x^5 + x^4 + x^3 + x^2 + x, x \in [-1,1] \\
F_5 = sin(x^2)cos(x) - 1, x \in [-1,1] \\
F_6 = sin(x) + sin(x + x^2), x \in [-1,1] \\
F_7 = log(x+1) + log(x^2+1), x \in [0,2] \\
F_8 = \sqrt{x}, x \in [0,4] \\
F_9 = sin(x) + sin(y^2), x, y \in [-1,1] \\
F_{10} = 2sin(x)cos(y), x, y \in [-1,1] \\
F_{11} = F_2 \\
F_{12} = sin(x) + x^2 + 1, x \in [-1,1] \\
F_{13} = sin(x^3 + x), x \in [-1,1] \\
F_{14} = \frac{30xz}{(x-10)y^2}, x, z \in [-1,1], y \in [1,2] \\
F_{15} = \frac{x^4 + x^3 + y^2}{2-y}, x, y \in [-3, 3] \\
F_{16} = x^y, x, y \in [0,1]
$$

---


In [4]:
# Bee benchmark: define the target functions, record their metadata in `info`,
# and write one randomly sampled data set per function to Benchmark/Bee/.
#1-D: 20 samples
#{2,3}-D: 100 samples

# Target functions keyed by benchmark number. The 1-D lambdas are applied to the
# whole (samples, 1) array; the multi-dimensional ones read its columns.
# There is no F[11] entry (the description above lists F11 as equal to F2).
F = {}
F[1] = lambda x: x**3 + x**2  + x
F[2] = lambda x: x**4 + x**3 + x**2  + x
F[3] = lambda x: x**5 + x**4 + x**3 + x**2  + x
F[4] = lambda x: x**6 + x**5 + x**4 + x**3 + x**2  + x
F[5] = lambda x: np.sin(x**2)*np.cos(x) - 1
F[6] = lambda x: np.sin(x) + np.sin(x**2 + x)
F[7] = lambda x: np.log(x+1) + np.log(x**2 + 1)
F[8] = lambda x: np.sqrt(x)
F[9] = lambda X: np.sin(X[:,0]) + np.sin(X[:,1]**2)
F[10] = lambda X: 2*np.sin(X[:,0])*np.cos(X[:,1])
F[12] = lambda x: np.sin(x) + x**2 + 1
F[13] = lambda x: np.sin(x**3 + x)
F[14] = lambda X: 30*X[:,0]*X[:,2]/((X[:,0]-10)*X[:,1]**2)
F[15] = lambda X: (X[:,0]**4 + X[:,0]**3 + X[:,1]**2)/(2 - X[:,1])
F[16] = lambda X: X[:,0]**X[:,1]

# Metadata for every benchmark family; later cells add their own entries and
# one of them dumps the whole dict to disk.
info = {}

info['Bee'] = {}
Names = ['F{}'.format(x) for x in sorted(F.keys())]
# One value per name, in the same (sorted) order as Names.
# Presumably the number of terms of each expression -- TODO confirm.
Terms = [3,4,5,6,2,2,2,1,2,1,3,1,1,3,1]
info['Bee']['functions'] = dict(zip(Names,Terms))
info['Bee']['has_test'] = False

# Input dimensionality, indexed by function number (index 0 is unused).
dim = np.ones(17, dtype=int)
dim[9:11] = 2
dim[14] = 3
dim[15:] = 2

# Sampling interval, indexed by function number (index 0 is unused).
domain = [(-1,1)]*17
domain[7:9] = [(0,2),(0,4)]
domain[15:] = [(-3,3), (0,1)]

# np.savetxt does not create missing directories, so make sure the target exists.
os.makedirs('Benchmark/Bee', exist_ok=True)

for n, f in F.items():
    low, hi = domain[n]
    d = dim[n]
    samples = 20 if d == 1 else 100

    X = np.random.uniform(low, hi, (samples, d))
    if n == 14:
        # F14 has a mixed domain: its second input (column 1) is drawn
        # from [1,2] instead of the [-1,1] used for the other two columns.
        X[:,1] = np.random.uniform(1,2, samples)
    # Depending on the lambda, f(X) has shape (samples, 1) or (samples,);
    # store the targets as a single column either way.
    Y = np.reshape(f(X), (samples, 1))

    # One row per sample: the inputs followed by the target value.
    Z = np.hstack((X,Y))
    np.savetxt('Benchmark/Bee/F'+str(n)+'.csv', Z)

# neat Genetic Programming: Controlling bloat naturally

1-D: 20 samples

2-D: 100 samples

Non-terminals: must look at references

RMSE

$$
F_1 = x^4 + x^3 + x^2 + x, x \in [-1,1] \\
F_2 = x^5 + x^4 + x^3 + x^2 + x, x \in [-1,1] \\
F_3 = sin(x^2)cos(x) - 1, x \in [-1,1] \\
F_4 = log(x+1) + log(x^2+1), x \in [0,2] \\
F_{5} = 2sin(x)cos(y), x, y \in [-1,1] \\
F_{6} = \sum_{i=1..x}{\frac{1}{i}}, x \in range(1,50)\\
F_{7} = \sum_{i=1..x}{\frac{1}{i}}, x \in range(1,120) \\
F_{8} = 2 - 2.1cos(9.8x)sin(1.3w), x, w \in [-50,50] \\
F_{9} = \frac{e^{-(x-1)^2}}{1.2 + (y-2.5)^2}, x, y \in [0.3,4] \\
F_{10} = \frac{1}{1+x^{-4}} + \frac{1}{1+y^{-4}}, x, y \in range(-5,5,0.4) 
$$

In [5]:
# Neat benchmark: define the target functions, add their metadata to `info`
# (created by an earlier cell), and write one randomly sampled data set per
# function to Benchmark/Neat/.
F = {}
F[1] = lambda x: x**4 + x**3 + x**2  + x
F[2] = lambda x: x**5 + x**4 + x**3 + x**2  + x
F[3] = lambda x: np.sin(x**2)*np.cos(x) - 1
F[4] = lambda x: np.log(x+1) + np.log(x**2 + 1)

F[5] = lambda X: 2*np.sin(X[:,0])*np.cos(X[:,1])

# F6/F7: partial harmonic sum 1/1 + 1/2 + ... for every sample.
# The samples are flattened first so that each x is a scalar; iterating over
# the rows of the (samples, 1) array would hand np.arange a one-element array,
# which relies on an implicit array-to-scalar conversion that NumPy deprecates.
# NOTE(review): the description gives integer domains (range(1,50) and
# range(1,120)), but the loop below draws real-valued samples; for a
# non-integer x, np.arange(1, x+1) yields ceil(x) terms -- confirm intent.
F[6] = lambda X: np.array([np.sum(1./np.arange(1,x+1)) for x in np.ravel(X)])
F[7] = lambda X: np.array([np.sum(1./np.arange(1,x+1)) for x in np.ravel(X)])

F[8] = lambda X: 2 - 2.1*np.cos(9.8*X[:,0])*np.sin(1.3*X[:,1])
F[9] = lambda X: np.exp(-(X[:,0]-1)**2)/(1.2 + (X[:,1] - 2.5)**2)
# NOTE(review): the description samples F10 on the grid range(-5,5,0.4);
# here the inputs are drawn uniformly from [-5,5] -- confirm intent.
F[10] = lambda X: (1/(1+X[:,0]**-4) + 1/(1 + X[:,1]**-4))

info['Neat'] = {}
Names = ['F{}'.format(x) for x in range(1,11)]
# One value per name, in the same order as Names.
# Presumably the number of terms of each expression -- TODO confirm.
Terms = [4,5,2,2,1,1,1,2,1,2]
info['Neat']['functions'] = dict(zip(Names,Terms))
info['Neat']['has_test'] = False


# Input dimensionality, indexed by function number (index 0 is unused):
# F5 and F8..F10 are 2-D, everything else (including F6 and F7) is 1-D.
dim = np.ones(11, dtype=int)
dim[5:] = 2
dim[6:8] = 1

# Sampling interval, indexed by function number (index 0 is unused).
domain = [(-1,1)]*11
domain[4] = (0,2)
domain[6] = (1,50)
domain[7] = (1,120)
domain[8] = (-50,50)
domain[9] = (0.3,4)
domain[10] = (-5,5)

# np.savetxt does not create missing directories, so make sure the target exists.
os.makedirs('Benchmark/Neat', exist_ok=True)

for n, f in F.items():
    low, hi = domain[n]
    d = dim[n]
    samples = 20 if d == 1 else 100

    X = np.random.uniform(low, hi, (samples, d))
    # Depending on the lambda, f(X) has shape (samples, 1) or (samples,);
    # atleast_2d plus the transpose turns the latter into a column as well.
    Y = np.atleast_2d(f(X))
    if Y.shape[0] < Y.shape[1]:
        Y = Y.T

    # One row per sample: the inputs followed by the target value.
    Z = np.hstack((X,Y))
    np.savetxt('Benchmark/Neat/F'+str(n)+'.csv', Z)

# GP made faster with semantic surrogate modelling

200 samples $\in [-5, 5]$

500 samples for testing

Non-terminals: +, -, *, /, pow, sqrt

MAE

$$
F_{1} = \frac{e^{-(x_1 - 1)^2}}{1.2 + (x_2 - 2.5)^2} \\
F_{2} = \frac{10}{5 + \sum_{i=1..5}{(x_i-3)^2}} \\
F_{3} = 6sin(x_1)cos(x_2) \\
F_{4} = x_1x_2x_3x_4x_5 \\
F_{5} = 32 - 3\frac{\tan(x_1)}{\tan(x_2)}\frac{\tan(x_3)}{\tan(x_4)}
$$


In [7]:
# Surrogate1 benchmark: define the target functions, add their metadata to
# `info` (created by an earlier cell), and write a train and a test file per
# function to Benchmark/Surrogate1/.
# Every function receives a (samples, 5) array; F1 and F3 only read the first
# two columns, F2 sums over all of them.
F = {}
F[1] = lambda X: np.exp(-(X[:,0]-1)**2)/(1.2 + (X[:,1]-2.5)**2)
F[2] = lambda X: 10/(5 + np.sum((X-3)**2,axis=1))
F[3] = lambda X: 6*np.sin(X[:,0])*np.cos(X[:,1])
F[4] = lambda X: X[:,0]*X[:,1]*X[:,2]*X[:,3]*X[:,4]
# NOTE(review): the formula in the description has no x_5 term, while this
# lambda adds X[:,4] -- confirm which of the two is intended.
F[5] = lambda X: 32 - 3*np.tan(X[:,0])*np.tan(X[:,2])/(np.tan(X[:,1])*np.tan(X[:,3])) + X[:,4]

info['Surrogate1'] = {}
Names = ['F{}'.format(x) for x in range(1,6)]
# One value per name, in the same order as Names.
# Presumably the number of terms of each expression -- TODO confirm.
Terms = [1, 1, 1, 1, 3]
info['Surrogate1']['functions'] = dict(zip(Names,Terms))
info['Surrogate1']['has_test'] = True

# All functions are sampled in five dimensions (index 0 is unused).
dim = np.ones(6, dtype=int)*5

# All functions are sampled on [-5,5] (index 0 is unused).
domain = [(-5,5)]*6

# np.savetxt does not create missing directories, so make sure the target exists.
os.makedirs('Benchmark/Surrogate1', exist_ok=True)

for n, f in F.items():
    low, hi = domain[n]
    d = dim[n]

    # 700 samples in one draw: the first 200 rows become the training set,
    # the remaining 500 the test set.
    X = np.random.uniform(low, hi, (700, d))
    # The lambdas return shape (700,); atleast_2d plus the transpose turns
    # that into a (700, 1) column.
    Y = np.atleast_2d(f(X))
    if Y.shape[0] < Y.shape[1]:
        Y = Y.T

    print(X.shape, Y.shape)
    # One row per sample: the inputs followed by the target value.
    Z = np.hstack((X,Y))
    np.savetxt('Benchmark/Surrogate1/F'+str(n)+'_train.csv', Z[:200,:])
    np.savetxt('Benchmark/Surrogate1/F'+str(n)+'_test.csv', Z[200:,:])

(700, 5) (700, 1)
(700, 5) (700, 1)
(700, 5) (700, 1)
(700, 5) (700, 1)
(700, 5) (700, 1)


# Surrogate Genetic Programming: A semantic aware evolutionary search

100 samples $\in [-5, 5]$

50 samples $\in [-10, 10]$ for testing

Non-terminals: +, -, *, /

MAE

$$
F_1 = x^3 + x^2 + 5x \\
F_2 = \frac{x^6}{x^3 + x^2 + 1} \\
F_3 = \frac{x}{1 - \log(x^2 + x + 1)} \\
F_4 = \sin(x^2) \\
F_5 = 5\sqrt{|x|}\\
F_6 = 100 + \log(x^2) + 5\sqrt{|x|}\\
F_7 = 2\tan(x)\cos(x)\\
$$

In [12]:
# Surrogate2 benchmark: define the 1-D target functions, add their metadata to
# `info` (created by an earlier cell), dump `info` to Benchmark/data.info, and
# write a train and a test file per function to Benchmark/Surrogate2/.
import json

F = {}
F[1] = lambda x: x**3 + x**2 + 5*x
F[2] = lambda x: (x**6)/(x**3 + x**2 + 1)
F[3] = lambda x: x/(1 - np.log(x**2 + x + 1))
F[4] = lambda x: np.sin(x**2)
F[5] = lambda x: np.sqrt(np.abs(x))*5
F[6] = lambda x: 100 + np.log(x**2) + 5*np.sqrt(np.abs(x))
F[7] = lambda x: 2*np.tan(x)*np.cos(x)

info['Surrogate2'] = {}
Names = ['F{}'.format(x) for x in range(1,8)]
# One value per name, in the same order as Names.
# Presumably the number of terms of each expression -- TODO confirm.
Terms = [3, 1, 1, 1, 1, 3, 1]
info['Surrogate2']['functions'] = dict(zip(Names,Terms))
info['Surrogate2']['has_test'] = True

# Neither open() nor np.savetxt creates missing directories; this also
# creates the parent 'Benchmark' directory needed for data.info.
os.makedirs('Benchmark/Surrogate2', exist_ok=True)

# Persist the metadata of every benchmark family registered so far. The
# context manager closes the file even if json.dump raises.
with open('Benchmark/data.info', 'w') as fw:
    json.dump(info, fw)

# Not read by the loop below; kept so the notebook namespace is unchanged.
dim = np.ones(8, dtype=int)

for n, f in F.items():
    # Training data: 100 samples on [-5,5]; test data: 50 samples on [-10,10].
    # Train is drawn before test, as two separate draws per function.
    for split, bound, samples in (('train', 5, 100), ('test', 10, 50)):
        X = np.random.uniform(-bound, bound, (samples,))
        Y = f(X)
        # Two columns per row: the input and the target value.
        Z = np.vstack((X,Y)).T
        if split == 'train':
            print(Z.shape)
        np.savetxt('Benchmark/Surrogate2/F'+str(n)+'_'+split+'.csv', Z)


(100, 2)
(100, 2)
(100, 2)
(100, 2)
(100, 2)
(100, 2)
(100, 2)


# Improving Genetic Programming Based Symbolic Regression Using Deterministic Machine Learning

2500 samples $\in [0, 1]$

1250 samples $\in [0, 1]$ for testing

Non-terminals: +, -, *, /

RMSE

csv

In [9]:
# Display the benchmark metadata dict that the cells above filled in.
info

{'Bee': {'functions': {'F1': 3,
   'F10': 1,
   'F12': 3,
   'F13': 1,
   'F14': 1,
   'F15': 3,
   'F16': 1,
   'F2': 4,
   'F3': 5,
   'F4': 6,
   'F5': 2,
   'F6': 2,
   'F7': 2,
   'F8': 1,
   'F9': 2},
  'has_test': False},
 'Neat': {'functions': {'F1': 4,
   'F10': 2,
   'F2': 5,
   'F3': 2,
   'F4': 2,
   'F5': 1,
   'F6': 1,
   'F7': 1,
   'F8': 2,
   'F9': 1},
  'has_test': False},
 'Surrogate1': {'functions': {'F1': 1, 'F2': 1, 'F3': 1, 'F4': 1, 'F5': 3},
  'has_test': True},
 'Surrogate2': {'functions': {'F1': 3,
   'F2': 1,
   'F3': 1,
   'F4': 1,
   'F5': 1,
   'F6': 3,
   'F7': 1},
  'has_test': True}}

In [37]:
import numpy as np

# Toy fitting problem: 100 inputs drawn uniformly from [-1, 1], targets
# generated as sin(10*x), and a starting guess of 3 for the frequency w.
X = np.random.uniform(low=-1, high=1, size=(100, 1))
y = np.sin(X * 10)
w = 3.0

In [66]:
# Apply 100 updates to w, each adding the mean of residual * x * cos(w*x)
# with no step-size factor. The loop starts from the current value of w, so
# re-running this cell continues from where the previous run stopped.
for step in range(100):
    residual = y - np.sin(w*X)
    delta = (residual*X*np.cos(w*X)).mean()
    w += delta

# Report the final w, its mean squared error, and the size of the last update.
mse = ((y-np.sin(w*X))**2).mean()
print(w, mse, delta)

2.55706235747 0.986585883816 1.82492909673e-16


In [84]:
import scipy.optimize as opt


def ferror(w):
    """Return the largest squared residual of sin(w*X) against y."""
    residual = y - np.sin(w*X)
    return np.square(residual).max()


# Scalar minimisation of ferror, starting from the bracket [2, 12].
opt.minimize_scalar(ferror, bracket=[2, 12])

     fun: 6.0926406473121375e-19
    nfev: 14
     nit: 13
 success: True
       x: 9.9999999991763762