# 2022 Flatiron Machine Learning x Science Summer School

## Step 2: Run symbolic regression on generic data

In this step, we investigate whether symbolic regression can discover the functions underlying the generic data created in Step 1. There is no support from deep learning at this point.

We use the symbolic regression package [PySR](https://github.com/MilesCranmer/PySR).

In [1]:
import os
import numpy as np
import joblib
from pysr import PySRRegressor

import warnings
warnings.filterwarnings("ignore")

### Step 2.1: Load data

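We load the input and target function data created in Step 1 and create index masks that split the data into training, validation and test sets (70% / 20% / 10%). The masks are saved next to the data and reloaded on later runs, so the split stays fixed.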

In [2]:
# Directory that holds the data files created in Step 1.
data_path = "data"
# Only files with this extension are loaded.
data_ext = ".gz"

In [3]:
# Load every data file in `data_path` into `data`, keyed by the file name
# without its extension (e.g. "F01"). Every array is stored 2-D so that the
# number of components can later be read from `shape[1]`.
data = {}
# os.listdir returns entries in arbitrary order; sorting keeps the dict order
# (and everything that iterates over it, such as the seeded mask shuffling)
# reproducible across machines and file systems.
for file_name in sorted(os.listdir(data_path)):
    if not file_name.endswith(data_ext):
        continue

    var = file_name[:-len(data_ext)]
    var_data = np.loadtxt(os.path.join(data_path, file_name))
    if var_data.ndim == 1:
        # 1-D result: turn it into a single-column array
        var_data = var_data.reshape(-1, 1)
    data[var] = var_data

    print(f"Loaded {var} data.")

Loaded F01 data.
Loaded F02 data.
Loaded F03 data.
Loaded F04 data.
Loaded F05 data.
Loaded G01 data.
Loaded G02 data.
Loaded G03 data.
Loaded G04 data.
Loaded G05 data.
Loaded X01 data.
Loaded X02 data.
Loaded X03 data.
Loaded X04 data.
Loaded X05 data.


In [4]:
# Seed for NumPy's global random generator, set before the indices are shuffled.
seed = 0

# Fractions of the samples used for training and validation; the remaining
# samples form the test set.
train_size = 0.7
val_size = 0.2

# Extension of the mask files stored in `data_path`.
mask_ext = ".mask"

In [5]:
# Build (or reload) the train/val/test index masks, one set per input
# variable ("X.."). Each mask is an array of row indices into the data.
np.random.seed(seed)

masks = {}
for var in data:
    if not var.startswith("X"):
        continue

    mask_file = os.path.join(data_path, var + mask_ext)
    try:
        masks[var] = joblib.load(mask_file)
    except FileNotFoundError:
        # Only a missing file triggers a new split. Any other failure (e.g. a
        # corrupt mask file) is raised instead of silently replacing the
        # existing split with a different one.
        data_size = data[var].shape[0]

        data_idx = np.arange(data_size)
        np.random.shuffle(data_idx)

        # positions in the shuffled index array where train and val end
        train_idx = int(data_size*train_size)
        val_idx = train_idx + int(data_size*val_size)

        masks[var] = {
            "train": data_idx[:train_idx],
            "val": data_idx[train_idx:val_idx],
            "test": data_idx[val_idx:],
        }

        # persist the split so later runs reuse exactly the same masks
        joblib.dump(masks[var], mask_file)

        print(f"Created masks for {var} data.")
    else:
        print(f"Loaded masks for {var} data.")

Created masks for X01 data.
Created masks for X02 data.
Created masks for X03 data.
Created masks for X04 data.
Created masks for X05 data.


### Step 2.2: Run PySR

In [6]:
def get_model(X, y, **pysr_kwargs):
    """Fit a PySR symbolic regressor to ``(X, y)`` and return it.

    Args:
        X: Input samples, passed on to ``PySRRegressor.fit``.
        y: Target values, passed on to ``PySRRegressor.fit``.
        **pysr_kwargs: Optional ``PySRRegressor`` keyword arguments that
            override the defaults below (e.g. ``niterations=100``).

    Returns:
        The fitted ``PySRRegressor``.
    """
    settings = dict(
        procs=4,
        populations=30,
        niterations=30,
        maxsize=20,
        binary_operators=["plus", "sub", "mult"],
        unary_operators=["sin", "cos", "exp", "log_abs"],
        model_selection="best",
        verbosity=0,
    )
    # caller-supplied settings take precedence over the defaults
    settings.update(pysr_kwargs)

    model = PySRRegressor(**settings)
    model.fit(X, y)

    return model

In [7]:
# Fitted models are checkpointed in `model_path`/`model_name`.
model_path = "models"
model_name = "pysr_models_v1.pkl"

# Make sure the checkpoint directory exists up front; otherwise the first
# joblib.dump into it would fail only after a complete fit has been run.
os.makedirs(model_path, exist_ok=True)

try:
    models = joblib.load(os.path.join(model_path, model_name))
except FileNotFoundError:
    # no checkpoint yet: start from scratch
    models = {}

In [8]:
# Fit one PySR model per target component and checkpoint after every fit.
# For each F target this learns F(G) and F(X); for the matching G it learns
# G(X). Components already present in `models` are skipped individually, so an
# interrupted run resumes where it stopped instead of leaving a partially
# trained target behind.
checkpoint = os.path.join(model_path, model_name)

for var in sorted(k for k in data if k.startswith("F")):

    # get input variables
    g_var = "G" + var[1:]
    x_var = "X" + var[1:]
    while x_var not in data:
        # no input data with this number: try the next lower number
        x_num = int(x_var[1:]) - 1
        if x_num == 0:
            raise RuntimeError("Input data not loaded.")
        x_var = f"X{x_num:02d}"

    # get training mask
    mask = masks[x_var]["train"]

    # (target, component, input) triples in the order the models are learned
    jobs = []
    for i in range(data[var].shape[1]):
        jobs.append((var, i, g_var))    # learn f(g)
        jobs.append((var, i, x_var))    # learn f(g(x)) directly from x
    for i in range(data[g_var].shape[1]):
        jobs.append((g_var, i, x_var))  # learn g(x)

    for target, i, inputs in jobs:
        fitted = models.setdefault(target, {}).setdefault(inputs, [])
        if i < len(fitted):
            # this component was restored from the checkpoint
            continue

        print(f"Learning {target}_{i}({inputs}).")
        X = data[inputs][mask]
        y = data[target][mask, i]
        fitted.append(get_model(X, y))

        joblib.dump(models, checkpoint)


Learning F01_0(G01).




Learning F01_0(X01).
Learning G01_0(X01).
Learning F02_0(G02).
Learning F02_0(X02).
Learning G02_0(X02).
Learning G02_1(X02).
Learning G02_2(X02).
Learning F03_0(G03).
Learning F03_0(X03).
Learning G03_0(X03).
Learning F04_0(G04).
Learning F04_0(X04).
Learning G04_0(X04).
Learning G04_1(X04).
Learning G04_2(X04).
Learning F05_0(G05).
Learning F05_0(X05).
Learning G05_0(X05).
Learning G05_1(X05).
Learning G05_2(X05).


In [9]:
# Clean up the "hall_of_fame_*" files in the working directory
# (presumably written by PySR during the fits above).
del_name = "hall_of_fame_"

for f in sorted(os.listdir()):
    # Match on the prefix, not on any substring, so unrelated files that merely
    # contain the text are kept; skip directories, which os.remove cannot delete.
    if f.startswith(del_name) and os.path.isfile(f):
        os.remove(f)
        print(f"Deleted {f}.")

Deleted hall_of_fame_2022-06-23_184611.830.csv.
Deleted hall_of_fame_2022-06-23_184611.830.csv.bkup.
Deleted hall_of_fame_2022-06-23_184921.625.csv.
Deleted hall_of_fame_2022-06-23_184921.625.csv.bkup.
Deleted hall_of_fame_2022-06-23_185027.264.csv.
Deleted hall_of_fame_2022-06-23_185027.264.csv.bkup.
Deleted hall_of_fame_2022-06-23_185141.449.csv.
Deleted hall_of_fame_2022-06-23_185141.449.csv.bkup.
Deleted hall_of_fame_2022-06-23_185236.513.csv.
Deleted hall_of_fame_2022-06-23_185236.513.csv.bkup.
Deleted hall_of_fame_2022-06-23_185355.318.csv.
Deleted hall_of_fame_2022-06-23_185355.318.csv.bkup.
Deleted hall_of_fame_2022-06-23_185458.617.csv.
Deleted hall_of_fame_2022-06-23_185458.617.csv.bkup.
Deleted hall_of_fame_2022-06-23_185603.693.csv.
Deleted hall_of_fame_2022-06-23_185603.693.csv.bkup.
Deleted hall_of_fame_2022-06-23_185657.080.csv.
Deleted hall_of_fame_2022-06-23_185657.080.csv.bkup.
Deleted hall_of_fame_2022-06-23_185745.349.csv.
Deleted hall_of_fame_2022-06-23_185745.349.

### Step 2.3: Check discovery

We check whether models were identified correctly based on the PySR loss: a model counts as discovered if the loss of its best equation is below the threshold `disc_eps`. More advanced checks could consider the validation data or out-of-distribution data, as the correct model would both generalize and extrapolate accurately.

In [10]:
# Loss threshold below which a model is marked as discovered.
disc_eps = 1e-6

In [11]:
# Print loss, discovery flag and equation of the best expression of every
# fitted model; "[X]" marks a loss below `disc_eps`.
for d_var, by_input in models.items():
    for i_var, fitted in by_input.items():
        for m, model in enumerate(fitted):
            best = model.get_best()
            # conditional expression instead of indexing a tuple with a boolean
            mark = "X" if best.loss < disc_eps else " "
            print(f"{d_var}_{m}({i_var}): {best.loss:.2e} - [{mark}] - {best.equation}")

F01_0(G01): 9.65e-13 - [X] - (x0 * (x0 + 1.0000001))
F01_0(X01): 6.57e+00 - [ ] - ((x0 * ((x0 * x0) - -3.4709718)) * (x0 + x1))
G01_0(X01): 1.86e-14 - [X] - ((x0 * (x0 + x1)) + cos(x1))
F02_0(G02): 5.83e-10 - [X] - (((x0 + x1) + x2) * (((x0 + 0.9999908) + x1) + x2))
F02_0(X02): 1.91e+01 - [ ] - ((exp(x0 - -0.6688493) - exp(0.5662412 - x0)) * (x1 + x0))
G02_0(X02): 9.49e-15 - [X] - (x0 * x0)
G02_1(X02): 7.52e-16 - [X] - cos(x1)
G02_2(X02): 2.82e-15 - [X] - (x1 * x0)
F03_0(G03): 4.56e-11 - [X] - (x0 * (x0 + 2.745))
F03_0(X03): 9.09e+02 - [ ] - exp(x2 * 2.6427264)
G03_0(X03): 1.87e-13 - [X] - ((exp(x2) + ((x2 * x2) * x2)) + (sin(x0) * cos(x1)))
F04_0(G04): 8.81e-13 - [X] - ((x1 * x2) + (x0 * x0))
F04_0(X04): 8.51e+00 - [ ] - exp(x2 * 2.0940368)
G04_0(X04): 7.09e-16 - [X] - (cos(x1) * sin(x0))
G04_1(X04): 8.45e-14 - [X] - (x2 * (x2 * x2))
G04_2(X04): 1.41e-14 - [X] - exp(x2)
F05_0(G05): 1.57e+03 - [ ] - exp(x1 * 0.68942153)
F05_0(X05): 3.60e+03 - [ ] - exp(((x1 * x3) * 1.6188184) - x2)
G05

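**Note**: Of the `F(G)` fits, only `F05_0(G05)` does not work as intended (its loss stays above `disc_eps`). The direct `F(X)` fits also miss the threshold, but recovering the full composition directly from `X` is not expected to succeed here.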