# Preprocess and store the data as npz files (for better training)

In [1]:
import os
import json
import numpy as np

def preprocess_symbolic_regression(data_folder, output_folder):
    """
    Preprocesses JSON files for symbolic regression and saves the processed data.

    Every JSON file (except those whose name starts with ``properties``) is
    turned into a feature matrix ``[var_0 .. var_{n_vars-1}, const_0 .. const_{K-1}]``
    and a target column. ``K`` is the largest number of constants found in ANY
    file of ANY split, so that train, val and test are saved with the same
    number of feature columns; files with fewer constants are zero-padded.
    One ``<split>/<split>.npz`` archive with arrays ``X`` and ``y`` is written
    per split.

    Parameters:
        data_folder (str): Path to the folder containing the train, val, and test subfolders with JSON files.
        output_folder (str): Path to the folder where preprocessed files will be saved.

    Raises:
        FileNotFoundError: If one of the split subfolders does not exist.
        ValueError: If a split contains no usable JSON file, or if the files of
            a split do not all yield the same number of columns (np.vstack).
    """
    os.makedirs(output_folder, exist_ok=True)
    splits = ['train', 'val', 'test']

    # First pass over ALL splits: collect the file lists and the global maximum
    # number of constants. Doing this before any split is written guarantees a
    # consistent feature width across the three archives.
    files_by_split = {}
    max_consts = 0
    for split in splits:
        split_folder = os.path.join(data_folder, split)
        # Sorted so that the sample order in the archive is reproducible
        # (os.listdir returns entries in arbitrary order).
        files = sorted(
            name for name in os.listdir(split_folder)
            if name.endswith('.json') and not name.startswith('properties')
        )
        files_by_split[split] = files
        for file in files:
            with open(os.path.join(split_folder, file), 'r') as f:
                data = json.load(f)
            max_consts = max(max_consts, len(data.get('const_value_dict', {})))

    # Second pass: build and save the arrays split by split.
    for split in splits:
        split_folder = os.path.join(data_folder, split)
        output_split_folder = os.path.join(output_folder, split)
        os.makedirs(output_split_folder, exist_ok=True)

        X_all, y_all = [], []
        for file in files_by_split[split]:
            with open(os.path.join(split_folder, file), 'r') as f:
                data = json.load(f)
            points = data['points']

            # Variables: one column per variable, one row per sample point
            vars_array = np.array([points[f'var_{i}'] for i in range(data['n_vars'])]).T

            # Constants: the same values repeated on every row, zero-padded
            # up to the global width
            consts = list(data.get('const_value_dict', {}).values())
            consts_array = np.zeros((len(vars_array), max_consts))
            consts_array[:, :len(consts)] = consts

            # Combine variables and constants
            X = np.hstack([vars_array, consts_array])
            y = np.array(points['target']).reshape(-1, 1)

            print(f"Before preprocessing: {split, X.shape}")

            X_all.append(X)
            y_all.append(y)

        if not X_all:
            # np.vstack([]) would raise a ValueError anyway; say which split is empty.
            raise ValueError(f"No JSON data files found in {split_folder}")

        # Stack all data for this split
        X_all = np.vstack(X_all)
        y_all = np.vstack(y_all)

        # Save preprocessed data
        np.savez_compressed(os.path.join(output_split_folder, f'{split}.npz'), X=X_all, y=y_all)
        print(f"Processed {split} data: {X_all.shape[0]} samples saved.")

# Source folder holding the raw JSON splits, and destination for the .npz archives
data_folder, output_folder = (
    "data_symbolic_regression",
    "preprocessed_symbolic_regression",
)

# Convert every split to a compressed .npz archive
preprocess_symbolic_regression(data_folder=data_folder, output_folder=output_folder)

File: f_473.json, Constants: 0, Max Constants: 3
Before preprocessing: ('train', (100, 6))
File: f_189.json, Constants: 0, Max Constants: 3
Before preprocessing: ('train', (100, 6))
File: f_536.json, Constants: 0, Max Constants: 3
Before preprocessing: ('train', (100, 6))
File: f_166.json, Constants: 0, Max Constants: 3
Before preprocessing: ('train', (100, 6))
File: f_49.json, Constants: 1, Max Constants: 3
Before preprocessing: ('train', (100, 6))
File: f_424.json, Constants: 0, Max Constants: 3
Before preprocessing: ('train', (100, 6))
File: f_8.json, Constants: 0, Max Constants: 3
Before preprocessing: ('train', (100, 6))
File: f_131.json, Constants: 1, Max Constants: 3
Before preprocessing: ('train', (100, 6))
File: f_561.json, Constants: 0, Max Constants: 3
Before preprocessing: ('train', (100, 6))
File: f_648.json, Constants: 0, Max Constants: 3
Before preprocessing: ('train', (100, 6))
File: f_218.json, Constants: 1, Max Constants: 3
Before preprocessing: ('train', (100, 6))
Fi

# Preprocess and store the data as CSV files via a pandas DataFrame (for better visualization)

In [2]:
import pandas as pd

def preprocess_to_dataframe(data_folder, output_folder, min_const_columns=3):
    """
    Flattens the symbolic-regression JSON files into one CSV table per split.

    Each sample point becomes one row with the columns ``var_0 .. var_{n_vars-1}``,
    ``const_0 .. const_{K-1}`` and ``target``. ``K`` is the same for every file
    and every split: the largest number of constants found in any file, but at
    least ``min_const_columns``. Files with fewer constants get 0 in the extra
    columns, so no NaN holes appear when files with different numbers of
    constants are combined. The table is written to ``<split>/<split>.csv``.

    Parameters:
        data_folder (str): Path to the folder containing the train, val, and test subfolders with JSON files.
        output_folder (str): Path to the folder where the CSV files will be saved.
        min_const_columns (int): Minimum number of ``const_k`` columns to emit (default 3).

    Raises:
        FileNotFoundError: If one of the split subfolders does not exist.
    """
    os.makedirs(output_folder, exist_ok=True)
    splits = ['train', 'val', 'test']

    # Load everything first so the number of constant columns can be fixed
    # once, across all splits, before any row is built.
    records_by_split = {}
    n_const_columns = min_const_columns
    for split in splits:
        split_folder = os.path.join(data_folder, split)
        # Sorted for a reproducible row order (os.listdir order is arbitrary).
        files = sorted(
            name for name in os.listdir(split_folder)
            if name.endswith('.json') and not name.startswith('properties')
        )
        records = []
        for file in files:
            with open(os.path.join(split_folder, file), 'r') as f:
                data = json.load(f)
            records.append(data)
            n_const_columns = max(n_const_columns, len(data.get('const_value_dict', {})))
        records_by_split[split] = records

    for split in splits:
        output_split_folder = os.path.join(output_folder, split)
        os.makedirs(output_split_folder, exist_ok=True)

        data_rows = []
        for data in records_by_split[split]:
            points = data['points']
            consts = list(data.get('const_value_dict', {}).values())

            # The constants are identical for every row of a file: build the
            # zero-padded mapping once instead of once per row.
            const_columns = {
                f'const_{k}': consts[k] if k < len(consts) else 0
                for k in range(n_const_columns)
            }
            var_names = [f'var_{j}' for j in range(data['n_vars'])]

            for i, target in enumerate(points['target']):
                row = {name: points[name][i] for name in var_names}
                row.update(const_columns)
                row['target'] = target
                data_rows.append(row)

        # Create a DataFrame
        df = pd.DataFrame(data_rows)
        df.to_csv(os.path.join(output_split_folder, f'{split}.csv'), index=False)
        print(f"Processed {split} data: {len(df)} samples saved.")

# Source folder holding the raw JSON splits, and destination for the CSV tables
data_folder, output_folder = (
    "data_symbolic_regression",
    "preprocessed_df_symbolic_regression",
)

# Write one CSV per split
preprocess_to_dataframe(data_folder=data_folder, output_folder=output_folder)

Processed train data: 74700 samples saved.
Processed val data: 16000 samples saved.
Processed test data: 16100 samples saved.


# Load the data in batches for the Diffusion model

In [3]:
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler

# Dataset class
class SymbolicRegressionDataset(Dataset):
    """
    Torch dataset over one preprocessed ``.npz`` split (arrays ``X`` and ``y``).

    Features and targets are standardized. By default the scalers are fitted on
    the loaded file itself; pass ``scaler_X`` / ``scaler_y`` to reuse scalers
    fitted elsewhere (typically on the training split) so that all splits share
    the same normalization. The scalers in use are exposed as ``self.scaler_X``
    and ``self.scaler_y``.
    """

    def __init__(self, file_path, scaler_X=None, scaler_y=None):
        """
        Parameters:
            file_path (str): Path to an ``.npz`` archive containing ``X`` and ``y``.
            scaler_X: Optional already-fitted scaler exposing ``transform``; when
                None a StandardScaler is fitted on this file's ``X``.
            scaler_y: Optional already-fitted scaler exposing ``transform``; when
                None a StandardScaler is fitted on this file's ``y``.
        """
        # Load the .npz file; the context manager closes the archive handle
        # once both arrays have been read into memory.
        with np.load(file_path) as data:
            X = data['X']
            y = data['y'].reshape(-1, 1)

        # Normalize data (fit only when no fitted scaler was supplied)
        self.scaler_X = scaler_X if scaler_X is not None else StandardScaler().fit(X)
        self.scaler_y = scaler_y if scaler_y is not None else StandardScaler().fit(y)
        self.X = self.scaler_X.transform(X)
        # ravel() keeps y one-dimensional even for a single sample, where
        # squeeze() would collapse it to a 0-d array that cannot be indexed.
        self.y = self.scaler_y.transform(y).ravel()

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair at ``idx`` as float32 tensors."""
        return torch.tensor(self.X[idx], dtype=torch.float32), torch.tensor(self.y[idx], dtype=torch.float32)

# Create data loaders
def create_data_loaders(train_file, val_file, test_file, batch_size=32):
    """
    Build the train/val/test DataLoaders from three preprocessed ``.npz`` files.

    The scalers are fitted on the training file only and reused for the
    validation and test files, so all three splits are normalized with the same
    statistics. Only the training loader shuffles.

    Parameters:
        train_file (str): Path to the training ``.npz`` archive.
        val_file (str): Path to the validation ``.npz`` archive.
        test_file (str): Path to the test ``.npz`` archive.
        batch_size (int): Batch size used by all three loaders.

    Returns:
        tuple: ``(train_loader, val_loader, test_loader)``.
    """
    train_dataset = SymbolicRegressionDataset(train_file)
    train_scalers = {
        'scaler_X': train_dataset.scaler_X,
        'scaler_y': train_dataset.scaler_y,
    }
    val_dataset = SymbolicRegressionDataset(val_file, **train_scalers)
    test_dataset = SymbolicRegressionDataset(test_file, **train_scalers)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return train_loader, val_loader, test_loader