# 2022 Flatiron Machine Learning x Science Summer School

## Step 2: Run symbolic regression on generic data

In this step, we investigate whether symbolic regression can discover the functions underlying the generic data created in Step 1. There is no support from deep learning at this point.

We use the symbolic regression package [PySR](https://github.com/MilesCranmer/PySR).

In [1]:
import os
import numpy as np
import joblib
from pysr import PySRRegressor

import warnings
warnings.filterwarnings("ignore")

### Step 2.1: Load data

We load the input and target function data created in Step 1 and create masks to split the data into training, validation and test sets.

In [2]:
# directory that is scanned for data files
data_path = "data"
# only files whose name ends with this extension are loaded
data_ext = ".gz"

In [3]:
# Load every "<name><data_ext>" file found in data_path into data["<name>"].
data = {}

# os.listdir returns entries in arbitrary order; sorting makes the insertion
# order of `data` (and every loop that iterates over it) reproducible.
for file_name in sorted(os.listdir(data_path)):
    if not file_name.endswith(data_ext):
        continue

    # the variable name is the file name without the extension
    var = file_name[:-len(data_ext)]
    var_data = np.loadtxt(os.path.join(data_path, file_name))

    # turn 1-D data into a single column so it can be indexed as [rows, column]
    if var_data.ndim == 1:
        var_data = var_data.reshape(-1, 1)
    data[var] = var_data

    print(f"Loaded {var} data.")

Loaded F01 data.
Loaded F02 data.
Loaded F03 data.
Loaded G01 data.
Loaded G02 data.
Loaded G03 data.
Loaded X01 data.


In [4]:
# random seed for the data split
seed = 0

# fractions of the samples used for training and validation;
# the remaining samples form the test set
train_size = 0.7
val_size = 0.2

In [5]:
# Split the sample indices of every input data set ("X..." variables) into
# disjoint, randomly shuffled training, validation and test index arrays.

# use the configured seed instead of a hard-coded literal so that changing
# `seed` actually changes the split
np.random.seed(seed)

train_mask = {}
val_mask = {}
test_mask = {}

for var in data:
    if var[0] == "X":
        data_size = data[var].shape[0]

        # random permutation of all sample indices
        data_idx = np.arange(data_size)
        np.random.shuffle(data_idx)

        # positions in the permutation where the training and validation parts end
        train_idx = int(data_size*train_size)
        val_idx = train_idx + int(data_size*val_size)

        train_mask[var] = data_idx[:train_idx]
        val_mask[var] = data_idx[train_idx:val_idx]
        # everything after the validation part is the test set
        test_mask[var] = data_idx[val_idx:]

        print(f"Created masks for {var} data.")


Created masks for X01 data.


### Step 2.2: Run PySR

In [6]:
def get_model(X, y):
    """Fit a PySRRegressor with fixed settings on (X, y) and return it.

    Args:
        X: input data passed to ``PySRRegressor.fit``.
        y: target data passed to ``PySRRegressor.fit``.

    Returns:
        The ``PySRRegressor`` instance after ``fit`` has been called on it.
    """
    # search settings shared by all models in this notebook
    settings = dict(
        procs=4,
        populations=30,
        niterations=30,
        maxsize=20,
        binary_operators=["plus", "sub", "mult"],
        unary_operators=["sin", "cos", "exp", "log_abs"],
        model_selection="best",
        verbosity=0,
    )

    regressor = PySRRegressor(**settings)
    regressor.fit(X, y)
    return regressor

In [7]:
# Load previously trained models (checkpoint) if available, else start empty.
model_path = "models"
model_name = "pysr_models_v1.pkl"

# make sure the checkpoint directory exists so that saving models cannot fail
# because of a missing folder
os.makedirs(model_path, exist_ok=True)

# NOTE: joblib.load unpickles the file; only load checkpoints you created yourself.
try:
    models = joblib.load(os.path.join(model_path, model_name))
except FileNotFoundError:
    # no checkpoint yet -> start from scratch. Any other error (e.g. a corrupt
    # file) is raised on purpose: silently starting from an empty dict would
    # overwrite the existing checkpoint at the next save.
    models = {}

In [8]:
# For every target variable F<nn>, fit symbolic models on the training split:
#   * F<nn> as a function of G<nn>
#   * F<nn> as a function of the input data X<mm>
#   * G<nn> as a function of the input data X<mm>
# `models` is saved to disk after every single fit.
for var in sorted(k for k in data if k[0] == "F"):

    # get target dimensions
    f_dim = data[var].shape[1]

    # get input variables
    g_var = "G" + var[1:]
    x_var = "X" + var[1:]
    # if there is no X data with the same number, fall back to the closest
    # lower-numbered X data set
    while x_var not in data:
        x_num = int(x_var[1:]) - 1
        if x_num == 0:
            raise RuntimeError("Input data not loaded.")
        x_var = f"X{x_num:02d}"

    # get intermediate target dimensions
    g_dim = data[g_var].shape[1]

    # Skip targets that are already fully trained. Because a checkpoint is
    # written after every fit, an interrupted run leaves a partially filled
    # entry in `models`; checking only `var in models` would skip such a target
    # forever. Incomplete targets are retrained from scratch.
    f_done = models.get(var, {})
    g_done = models.get(g_var, {})
    if (
        len(f_done.get(g_var, [])) == f_dim
        and len(f_done.get(x_var, [])) == f_dim
        and len(g_done.get(x_var, [])) == g_dim
    ):
        continue

    # get training mask
    mask = train_mask[x_var]

    models[var] = {g_var: [], x_var: []}
    for i in range(f_dim):

        # get target data
        y = data[var][mask,i]

        # learn F from G
        print(f"Learning {var}_{i}({g_var}).")
        X = data[g_var][mask]
        models[var][g_var].append(get_model(X, y))

        joblib.dump(models, os.path.join(model_path, model_name))

        # learn F directly from X
        print(f"Learning {var}_{i}({x_var}).")
        X = data[x_var][mask]
        models[var][x_var].append(get_model(X, y))

        joblib.dump(models, os.path.join(model_path, model_name))

    models[g_var] = {x_var: []}
    for i in range(g_dim):

        # get target data
        y = data[g_var][mask,i]

        # learn G from X
        print(f"Learning {g_var}_{i}({x_var}).")
        X = data[x_var][mask]
        models[g_var][x_var].append(get_model(X, y))

        joblib.dump(models, os.path.join(model_path, model_name))


Learning F03_0(G03).




Learning F03_0(X01).
Learning G03_0(X01).


In [9]:
# Remove the files in the current working directory whose name contains
# del_name (see the "Deleted ..." output of this cell for examples).
del_name = "hall_of_fame_"

# collect the matching entries first, then delete them one by one
stale_files = [entry for entry in os.listdir() if del_name in entry]
for f in stale_files:
    os.remove(f)
    print(f"Deleted {f}.")

Deleted hall_of_fame_2022-06-21_182314.694.csv.
Deleted hall_of_fame_2022-06-21_182314.694.csv.bkup.
Deleted hall_of_fame_2022-06-21_182703.562.csv.
Deleted hall_of_fame_2022-06-21_182703.562.csv.bkup.
Deleted hall_of_fame_2022-06-21_182824.482.csv.
Deleted hall_of_fame_2022-06-21_182824.482.csv.bkup.


### Step 2.3: Check discovery

We check whether the models were identified correctly based on the PySR loss: a model counts as discovered (marked `[X]`) when the loss of its best equation is below `disc_eps`. More advanced checks could consider the validation data or out-of-distribution data, as the correct model would both generalize and extrapolate accurately.

In [14]:
# a model is marked as discovered if the loss of its best equation is below this value
disc_eps = 1e-6

In [20]:
# Print one line per trained model: target, input, loss of the best equation,
# a discovery mark ([X] if the loss is below disc_eps) and the equation itself.
for d_var in models:
    for i_var in models[d_var]:
        for m, model in enumerate(models[d_var][i_var]):
            best = model.get_best()
            # Use a conditional expression instead of indexing a tuple with the
            # comparison result: best.loss may be a NumPy scalar, and using a
            # NumPy bool as an index is deprecated.
            mark = "X" if best.loss < disc_eps else " "
            print(f"{d_var}_{m}({i_var}): {best.loss:.2e} - [{mark}] - {best.equation}")

F01_0(G01): 4.23e-13 - [X] - (x0 * x0)
F01_0(X01): 5.55e+01 - [ ] - exp(log_abs(x7) * 4.0997605)
G01_0(X01): 2.37e-14 - [X] - ((((x7 * x7) + cos(x2)) + x0) + (x6 * x3))
F02_0(G02): 1.87e-10 - [X] - (x0 * ((x0 * 3.9499998) + 1.0000001))
F02_0(X01): 8.08e+03 - [ ] - (exp(exp(log_abs(x5) + 0.96497315)) + exp(x9 * 2.5191934))
G02_0(X01): 1.61e-13 - [X] - (((x5 * (x5 * x5)) + exp(x9)) + (sin(x0) * cos(x1)))
F03_0(G03): 3.29e-09 - [X] - (exp(x0 * 0.2499991) * x0)
F03_0(X01): 1.24e+01 - [ ] - exp(((x5 * -0.5841311) * x3) * x5)
G03_0(X01): 1.15e-09 - [X] - ((log_abs(x8 + x7) + (x3 * (-1.0469586e-5 - exp(log_abs(x5) * 1.9999653)))) + x0)


**Note**: The discovered model `G03_0(X01)` contains one term too many (`-1.0469586e-5 * x3`), even though its loss is below the discovery threshold.