# 2022 Flatiron Machine Learning x Science Summer School

## Step 2: Run symbolic regression on generic data

In this step, we investigate whether symbolic regression can discover the functions underlying the generic data created in Step 1. There is no support from deep learning at this point.

We use the symbolic regression package [PySR](https://github.com/MilesCranmer/PySR).

In [1]:
import os
import numpy as np
import joblib
from pysr import PySRRegressor

import warnings
warnings.filterwarnings("ignore")

### Step 2.1: Load data

We load the input and target function data created in Step 1 and create masks to split the data into training, validation and test sets.

In [2]:
# Directory that is scanned for the data files created in Step 1.
data_path = "data_1k"
# Only files ending in this extension are loaded; the rest of the file name
# becomes the variable name under which the data is stored.
data_ext = ".gz"

In [3]:
# Load every data file in `data_path` into `data`, keyed by variable name
# (the file name without `data_ext`).
data = {}
# os.listdir() returns entries in arbitrary order. Sorting keeps the contents
# and insertion order of `data` identical across machines, which matters
# because later cells iterate over `data` while drawing from a seeded RNG.
for file_name in sorted(os.listdir(data_path)):
    if not file_name.endswith(data_ext):
        continue

    var = file_name[:len(file_name) - len(data_ext)]
    var_data = np.loadtxt(os.path.join(data_path, file_name))
    # Store everything as a 2-D array (rows = samples, columns = dimensions)
    # so that shape[1] is always defined; this also covers a single-value file.
    if var_data.ndim < 2:
        var_data = var_data.reshape(-1, 1)
    data[var] = var_data

    print(f"Loaded {var} data.")

Loaded F00 data.
Loaded F00_p1 data.
Loaded F01 data.
Loaded F02 data.
Loaded F03 data.
Loaded F04 data.
Loaded F05 data.
Loaded F06 data.
Loaded F07 data.
Loaded F07_p1 data.
Loaded F08 data.
Loaded F09 data.
Loaded G00 data.
Loaded G00_p1 data.
Loaded G01 data.
Loaded G02 data.
Loaded G03 data.
Loaded G04 data.
Loaded G05 data.
Loaded G06 data.
Loaded G07 data.
Loaded G07_p1 data.
Loaded G08 data.
Loaded G09 data.
Loaded X00 data.
Loaded X01 data.
Loaded X02 data.
Loaded X03 data.
Loaded X04 data.
Loaded X05 data.
Loaded X06 data.
Loaded X07 data.
Loaded X08 data.
Loaded X09 data.


In [4]:
# Seed for NumPy's global random number generator used to shuffle the samples.
seed = 0

# Fractions of the samples assigned to the training and validation sets;
# whatever remains after both becomes the test set.
train_size = 0.7
val_size = 0.2

# Suffix appended to an input variable's name to form its mask file name.
mask_ext = ".mask"

In [5]:
# Build (or reload) the train/val/test index masks for every input variable.
np.random.seed(seed)

masks = {}
# Sorted iteration makes the sequence of shuffles below independent of the
# order in which the data happened to be loaded.
for var in sorted(data):
    if not var.startswith("X"):
        continue

    mask_file = os.path.join(data_path, var + mask_ext)
    try:
        masks[var] = joblib.load(mask_file)
    except FileNotFoundError:
        # Only a missing file means "no split saved yet". Any other failure
        # (e.g. a corrupt mask file) is raised instead of silently replacing
        # the split that earlier results may have been computed with.
        data_size = data[var].shape[0]

        # NOTE(review): the shuffle draws from the shared global RNG, so a
        # newly created mask depends on how many masks were created before it
        # in the same run.
        data_idx = np.arange(data_size)
        np.random.shuffle(data_idx)

        train_idx = int(data_size * train_size)
        val_idx = train_idx + int(data_size * val_size)

        masks[var] = {
            "train": data_idx[:train_idx],
            "val": data_idx[train_idx:val_idx],
            "test": data_idx[val_idx:],
        }

        # Persist the split so that later runs reuse exactly the same masks.
        joblib.dump(masks[var], mask_file)

        print(f"Created masks for {var} data.")
    else:
        print(f"Loaded masks for {var} data.")

Loaded masks for X00 data.
Loaded masks for X01 data.
Loaded masks for X02 data.
Loaded masks for X03 data.
Loaded masks for X04 data.
Loaded masks for X05 data.
Loaded masks for X06 data.
Loaded masks for X07 data.
Loaded masks for X08 data.
Created masks for X09 data.


### Step 2.2: Run PySR

In [None]:
def get_model(X, y):
    """Fit a new PySR symbolic regressor on the given data and return it.

    Args:
        X: Input data, passed unchanged to ``PySRRegressor.fit``.
        y: Target data, passed unchanged to ``PySRRegressor.fit``.

    Returns:
        The ``PySRRegressor`` instance after ``fit`` has been called on it.
    """
    # Search settings shared by every fit in this notebook.
    search_settings = dict(
        procs=4,
        populations=30,
        niterations=30,
        maxsize=20,
        binary_operators=["plus", "sub", "mult"],
        unary_operators=["sin", "cos", "exp", "log_abs"],
        model_selection="best",
        verbosity=0,
    )

    regressor = PySRRegressor(**search_settings)
    regressor.fit(X, y)
    return regressor

In [None]:
# Directory and file name under which all fitted models are saved.
model_path = "models"
model_name = "pysr_models_1k.pkl"

# Create the model directory up front so that saving the models after a
# (potentially long) fit cannot fail just because the directory is missing.
os.makedirs(model_path, exist_ok=True)

# Resume from previously saved models if there are any, otherwise start empty.
# Only a missing file means "nothing saved yet": any other load error (e.g. a
# corrupt or incompatible file) is raised, because silently starting from an
# empty dict would lead to the saved models being overwritten later on.
try:
    models = joblib.load(os.path.join(model_path, model_name))
except FileNotFoundError:
    models = {}

In [None]:
# Show which target variables already have an entry in `models`.
models.keys()

In [None]:
def _resolve_input_var(var, data):
    """Return the name of the loaded input ("X") variable that belongs to `var`.

    The exact counterpart ("X" followed by everything after the first
    character of `var`) is preferred. If it is not loaded, the number in the
    name is decremented until a loaded input variable is found.

    Raises:
        RuntimeError: If no input variable down to and including "X00" is loaded.
    """
    x_var = "X" + var[1:]
    if x_var in data:
        return x_var

    # Names such as "F00_p1" carry a suffix after the number, and
    # int("00_p1") would raise ValueError, so only the leading number is
    # parsed. Presumably such variants share the inputs of their base function.
    x_num = int(var[1:].split("_")[0])
    # Stop below zero: "X00" is a valid candidate, and counting on into
    # negative numbers would never terminate.
    while x_num >= 0:
        x_var = f"X{x_num:02d}"
        if x_var in data:
            return x_var
        x_num -= 1
    raise RuntimeError("Input data not loaded.")


def _fit_if_missing(store, target, i, inputs, X, y):
    """Fit and save model `i` of `target` as a function of `inputs`, unless it exists."""
    fitted = store.setdefault(inputs, [])
    if i < len(fitted):
        # Already available from an earlier (possibly interrupted) run.
        return

    print(f"Learning {target}_{i}({inputs}).")
    fitted.append(get_model(X, y))

    # Save after every single fit so that an interruption loses at most one model.
    joblib.dump(models, os.path.join(model_path, model_name))


# Every target variable is visited, but only models that are still missing are
# fitted. Skipping a variable as soon as it has an entry in `models` would
# leave variables from an interrupted run permanently incomplete.
for var in sorted(k for k in data if k.startswith("F")):

    # get input variables
    g_var = "G" + var[1:]
    x_var = _resolve_input_var(var, data)

    # get training mask
    mask = masks[x_var]["train"]

    # training inputs are the same for every target dimension
    X_g = data[g_var][mask]
    X_x = data[x_var][mask]

    f_models = models.setdefault(var, {})
    for i in range(data[var].shape[1]):

        # get target data
        y = data[var][mask, i]

        # learn f as a function of the intermediate variables g
        _fit_if_missing(f_models, var, i, g_var, X_g, y)

        # learn the composite f(g(x)) directly from the inputs x
        _fit_if_missing(f_models, var, i, x_var, X_x, y)

    g_models = models.setdefault(g_var, {})
    for i in range(data[g_var].shape[1]):

        # learn g(x)
        _fit_if_missing(g_models, g_var, i, x_var, X_x, data[g_var][mask, i])


In [None]:
# Remove every entry of the current working directory whose name contains
# `del_name` (presumably the equation files PySR writes while fitting).
del_name = "hall_of_fame_"

leftovers = [entry for entry in os.listdir() if del_name in entry]
for entry in leftovers:
    os.remove(entry)
    print(f"Deleted {entry}.")

### Step 2.3: Check discovery

We check whether models were identified correctly based on the PySR loss: a model is marked as discovered (`X`) if the loss of its best equation is below the threshold `disc_eps`. More advanced checks could consider the validation data or out-of-distribution data, as the correct model would both generalize and extrapolate accurately.

In [None]:
# Loss threshold: a model whose best equation has a loss below this value is
# marked as discovered in the check that follows.
disc_eps = 1e-6

In [None]:
# Print one line per fitted model: target and input variable, loss of the best
# equation, an "X" flag if that loss is below `disc_eps`, and the equation.
for d_var, by_input in models.items():
    for i_var, fitted in by_input.items():
        for m, model in enumerate(fitted):
            best = model.get_best()
            # Conditional expression instead of indexing a tuple with the
            # comparison result: clearer, and it does not rely on a (possibly
            # NumPy) boolean being accepted as a sequence index.
            flag = "X" if best.loss < disc_eps else " "
            print(f"{d_var}_{m}({i_var}): {best.loss:.2e} - [{flag}] - {best.equation}")