# 1. Creating the Dataset

In [None]:
import json
import sympy
import numpy as np
import random
import logging
import re

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Constants
PI = sympy.pi
E = sympy.E
PHI = (1 + sympy.sqrt(5))/2  # Golden ratio

# Function to generate random constants (you can customize this list)
def random_constants():
    """Randomly return a scientific constant."""
    constants = [PI, E, PHI, sympy.sqrt(2)]
    return random.choice(constants)

# Function to generate random coefficients (both float and integer)
def random_coefficient():
    """Generate a random coefficient (either integer or float)."""
    if random.random() < 0.5:
        return random.randint(1, 10)  # Integer coefficients
    else:
        return random.uniform(0.1, 5)  # Floating-point coefficients

# Function to create a dynamic list of n symbolic variables
def generate_variables(n):
    """Generate a list of n symbolic variables."""
    symbol_list = sympy.symbols(f'x0:{n}')  # Generates x0, x1, ..., xn-1
    return symbol_list

# Function to generate a random symbolic function with scientific relevance and random coefficients
def generate_random_function_with_relevance(variables, num_relevant_vars, complexity_range=(2, 4)):
    """Generate a random mathematical function using a subset of the variables."""
    relevant_vars = random.sample(variables, num_relevant_vars)  # Select relevant variables
    num_ops = random.randint(*complexity_range)  # Dynamically choose number of operations
    function = random.choice(relevant_vars)  # Start with a random relevant variable
    for _ in range(num_ops):
        op = random.choice([sympy.Add, sympy.Mul, sympy.sin, sympy.cos, sympy.exp, sympy.Pow])
        if op in [sympy.Add, sympy.Mul]:
            function = op(function, random.choice(relevant_vars + [random_constants(), random_coefficient()]))
        elif op == sympy.Pow:
            function = op(function, random.choice([2, 3, 4]))
        else:
            function = op(function)
    return function


# Example: Adding scientific equations, using physics-inspired forms
def generate_physics_function(variables):
    """Generate physics-inspired functions, e.g., energy, velocity."""
    functions = [
        random_coefficient() * variables[0]**2 + random_coefficient() * variables[1],  # Quadratic relation with random coefficients
        sympy.sin(variables[0]) + random_coefficient() * sympy.cos(variables[1]),  # Trigonometric functions with a coefficient
        random_coefficient() * variables[0]**3 + random_coefficient() * variables[1]**2 + random_coefficient() * 3*variables[0]*variables[1],  # Polynomial function
        random_coefficient() * (sympy.Mul(variables[0], 9.8))  # Gravitational potential energy or force: F = ma (Newton’s second law)
    ]
    return random.choice(functions)  # Return a random physics-inspired function

# Nguyen Dataset Generation (including physics-based functions)
def nguyen_dataset_dynamic(n=2, num_functions=10):
    """Generate a dataset with dynamic functions, including physics and scientific expressions."""
    variables = generate_variables(n)  # Generate n variables dynamically
    functions = []
    for _ in range(num_functions):
        num_relevant_vars = random.randint(1, n)  # Choose number of relevant variables randomly
        if random.random() > 0.5:  # 50% chance to generate a random or physics-inspired function
            functions.append(generate_physics_function(variables))
        else:
            functions.append(generate_random_function_with_relevance(variables, num_relevant_vars))
    return functions

# Feynman Dataset Generation
def feynman_dataset(n=2,num_functions=10):
    """Generate symbolic regression functions based on Feynman dataset."""
    variables = generate_variables(n)
    functions = [
        random_coefficient()*variables[0]**2 + random_coefficient() * 2*variables[1],            # Linear + quadratic
        random_coefficient()*sympy.sin(variables[0]) + random_coefficient() * sympy.cos(variables[1]),  # Trigonometric functions
        random_coefficient()*variables[0]**3 + random_coefficient() * variables[1]**2 + random_coefficient() * 3*variables[0]*variables[1],  # Polynomial
        random_coefficient()*variables[0]**4 + random_coefficient() * variables[1]**2 + random_coefficient() * variables[0]*variables[1],  # Quartic and product
    ]
    return functions[:num_functions]  # Return the first n functions

# Livermore Dataset Generation
def livermore_dataset(n=2,num_functions=10):
    """Generate symbolic regression functions based on Livermore dataset."""
    variables = generate_variables(n)
    functions = [
        random_coefficient()*variables[0]**6 + random_coefficient()*variables[1]**4 + random_coefficient()*variables[0]**2 + random_coefficient()*2*variables[1],  # Polynomial
        random_coefficient()*variables[0]**8 + random_coefficient()*variables[1]**5 + random_coefficient()*3*variables[0]*variables[1],  # Polynomial
        random_coefficient()*variables[0]**3 + random_coefficient()*5*variables[0]**2 + random_coefficient()*variables[1]**3 + random_coefficient()*2*variables[0]*variables[1],  # Polynomial
        random_coefficient()*variables[0]**5 + random_coefficient()*variables[1]**4 + random_coefficient()*2*variables[0]*variables[1],  # Polynomial
        random_coefficient()*sympy.sin(variables[0]) + random_coefficient() * sympy.cos(variables[1]) + random_coefficient() * variables[0]**2,  # Trigonometric + polynomial
    ]
    return functions[:num_functions]  # Return the first n functions

# Evaluate the generated functions with random inputs
def evaluate_nguyen_dataset_with_relevant_variables(functions, num_samples=100, range_vals=(-1, 1), noise_type='gaussian'):
    """Evaluate the functions with dynamic input variables and add noise, ensuring only relevant variables are considered."""
    dataset = []
    for func in functions:
        data = []
        skeleton = create_skeleton(func)
        num_vars = max([int(str(var)[1:]) for var in func.free_symbols]) + 1  # Get the highest subscript value from func.free_symbols
        all_variables = generate_variables(num_vars)  # Generate variables from x0 to xN (including highest subscript)
        
        for _ in range(num_samples):
            # Generate random values for all the variables from x0 to xN
            inputs = {str(var): np.random.uniform(*range_vals) for var in all_variables}

            # Evaluate the function using only the relevant variables (those in func.free_symbols)
            relevant_inputs = {str(var): inputs[str(var)] for var in func.free_symbols}  # Use only relevant variables for evaluation
            
            try:
                # Evaluate the function with the relevant inputs
                result = func.evalf(subs=relevant_inputs)

                # Check if the result is a real number
                if result.is_real:
                    output = float(result)
                else:
                    # If it's complex, skip or handle it (e.g., set output to NaN)
                    logger.warning(f"Complex result for {str(func)}: {result}. Skipping this data point.")
                    continue  # Skip complex results, or assign output = float('nan') if desired
                
                # Add noise
                if noise_type == 'gaussian':
                    noise = np.random.normal(0, 0.1)  # Gaussian noise with standard deviation of 0.1
                elif noise_type == 'uniform':
                    noise = np.random.uniform(-0.2, 0.2)  # Uniform noise in range [-0.2, 0.2]
                else:
                    noise = 0  # No noise

                output += noise
                data.append({"inputs": inputs, "output": output})

            except (ValueError, ZeroDivisionError, OverflowError) as e:
                logger.warning(f"Invalid expression for {str(func)}: {e}")
                continue  # Skip invalid evaluations

        # Convert the function to string for JSON serialization
        dataset.append({"function": str(func), "skeleton": skeleton, "data": data})
    
    return dataset

# Function to combine and shuffle datasets
def combine_and_shuffle_datasets(datasets):
    """Combine multiple datasets and shuffle them."""
    combined = []
    for dataset in datasets:
        combined.extend(dataset)
    random.shuffle(combined)  # Shuffle the combined dataset to mix all datasets well
    return combined

def create_skeleton(input_function):
    """
    Convert a symbolic function into its skeleton representation by replacing:
    - Numeric coefficients with 'C'
    - Standalone numbers with 'C'
    - Fractions with 'C' (except in exponents)
    - Preserves numeric exponents like **2
    """
    # Convert the symbolic function into a string
    skeleton = str(input_function)

    # Replace common mathematical constants with 'C'
    constants_to_replace = {
        str(sympy.E): 'C',                # Euler's number (e)
        str(sympy.pi): 'C',              # Pi
        str((1 + sympy.sqrt(5)) / 2): 'C', # Golden ratio (PHI)
    }
    for const, replacement in constants_to_replace.items():
        skeleton = skeleton.replace(const, replacement)

    # Replace standalone numbers not part of exponents
    skeleton = re.sub(
        r"(?<![\w^*])([+-]?\d*\.?\d+)(?![*]{2})",  # Excludes numbers after '**'
        "C",
        skeleton,
    )

    # Replace numeric coefficients (including preceding variables and functions)
    def replace_numeric_coeff(match):
        """Replace numeric coefficients with 'C', preserving the context."""
        return f"C{match.group(2)}"

    # Match numeric coefficients directly preceding variables or functions
    skeleton = re.sub(r"([+-]?\d*\.?\d+)(\*?[a-zA-Z(])", replace_numeric_coeff, skeleton)

    return skeleton
    
# Save dataset to JSON file
def save_to_json_line_by_line(data, filename):
    with open(filename, "w") as f:
        for data_point in data:
            json.dump(data_point, f)
            f.write("\n")
    return 
    
# Save dataset to JSON file
def save_to_json_line_by_line(data, filename):
    with open(filename, "w") as f:
        for data_point in data:
            json.dump(data_point, f)
            f.write("\n")
    return 

# Set the random seed (for replicability)
seed = 20777980  # You can change this seed to any integer
random.seed(seed)
np.random.seed(seed)

# Choose the number of variables and the number of functions for each dataset
n_variables = 5  # Set the number of variables dynamically (e.g., 5 variables for testing)
num_functions = 10  # Number of random functions to generate

# Generate datasets for Nguyen, Feynman, and Livermore datasets
nguyen_functions = nguyen_dataset_dynamic(n_variables, num_functions)
feynman_functions = feynman_dataset(n_variables, num_functions)
livermore_functions = livermore_dataset(n_variables, num_functions)

# Evaluate the datasets with different noise models
nguyen_dataset_evaluated = evaluate_nguyen_dataset_with_relevant_variables(nguyen_functions, num_samples=100, range_vals=(-1, 1), noise_type='gaussian')
feynman_dataset_evaluated = evaluate_nguyen_dataset_with_relevant_variables(feynman_functions, num_samples=100, range_vals=(-1, 1), noise_type='uniform')
livermore_dataset_evaluated = evaluate_nguyen_dataset_with_relevant_variables(livermore_functions, num_samples=100, range_vals=(-1, 1), noise_type='gaussian')

# Combine and shuffle all datasets
combined_dataset = combine_and_shuffle_datasets([nguyen_dataset_evaluated,feynman_dataset_evaluated,livermore_dataset_evaluated])

# Save the combined and shuffled dataset to a JSON file
file_path_combined = f"Data\combined_dataset_{n_variables}_variables_dynamic_seed{seed}.json"

save_to_json_line_by_line(combined_dataset, file_path_combined)

print(f"Combined and shuffled dataset saved to {file_path_combined}")


Combined and shuffled dataset saved to Data\combined_dataset_5_variables_dynamic_seed20777980.json


In [None]:
def create_skeleton(input_function):
    """
    Convert a symbolic function into its skeleton representation by replacing:
    - Numeric coefficients with 'C'
    - Standalone numbers with 'C'
    - Fractions with 'C' (except in exponents)
    - Preserves numeric exponents like **2
    """
    # Convert the symbolic function into a string
    skeleton = str(input_function)

    # Replace common mathematical constants with 'C'
    constants_to_replace = {
        str(sympy.E): 'C',                # Euler's number (e)
        str(sympy.pi): 'C',              # Pi
        str((1 + sympy.sqrt(5)) / 2): 'C', # Golden ratio (PHI)
    }
    for const, replacement in constants_to_replace.items():
        skeleton = skeleton.replace(const, replacement)

    # Replace standalone numbers not part of exponents
    skeleton = re.sub(
        r"(?<![\w^*])([+-]?\d*\.?\d+)(?![*]{2})",  # Excludes numbers after '**'
        "C",
        skeleton,
    )

    # Replace numeric coefficients (including preceding variables and functions)
    def replace_numeric_coeff(match):
        """Replace numeric coefficients with 'C', preserving the context."""
        return f"C{match.group(2)}"

    # Match numeric coefficients directly preceding variables or functions
    skeleton = re.sub(r"([+-]?\d*\.?\d+)(\*?[a-zA-Z(])", replace_numeric_coeff, skeleton)

    return skeleton

In [76]:
input_function = "E*x4*(cos(x2) + 7)"

skeleton = create_skeleton(input_function)

print(f"function: {input_function}, skeleton: {skeleton}, correct_skeleton: C*x4*(cos(x2) + C)")

function: E*x4*(cos(x2) + 7), skeleton: C*xC*(cos(x2) + C), correct_skeleton: C*x4*(cos(x2) + C)


# 1 *Improved* - Tree Based Approach

In [None]:
import json
import sympy
import numpy as np
import random
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Constants
PI = sympy.pi
E = sympy.E
PHI = (1 + sympy.sqrt(5)) / 2  # Golden ratio

# Function to generate random constants (you can customize this list)
def random_constants():
    """Randomly return a scientific constant."""
    constants = [PI, E, PHI, sympy.sqrt(2)]
    return random.choice(constants)

# Function to generate random coefficients (both float and integer)
def random_coefficient():
    """Generate a random coefficient (either integer or float)."""
    if random.random() < 0.5:
        return random.randint(1, 10)  # Integer coefficients
    else:
        return random.uniform(0.1, 5)  # Floating-point coefficients

# Function to create a dynamic list of n symbolic variables
def generate_variables(n):
    """Generate a list of n symbolic variables."""
    symbol_list = sympy.symbols(f'x0:{n}')  # Generates x0, x1, ..., xn-1
    return symbol_list

# Function to build a tree-based symbolic expression
def build_expression_tree(variables, max_depth=3):
    """Recursively build a symbolic expression tree."""
    # Base case: leaf nodes are either variables or constants
    if max_depth == 0:
        return random.choice(list(variables) + [random_constants(), random_coefficient()])
    
    # Internal nodes: operators
    op = random.choice([sympy.Add, sympy.Mul, sympy.sin, sympy.cos, sympy.Pow, sympy.exp])
    
    # For unary operations (e.g., sin, cos, exp), only one argument is needed
    if op in [sympy.sin, sympy.cos, sympy.exp]:
        return op(build_expression_tree(variables, max_depth - 1))
    
    # For binary operations (e.g., Add, Mul, Pow), two arguments are needed
    left = build_expression_tree(variables, max_depth - 1)
    right = build_expression_tree(variables, max_depth - 1)
    
    return op(left, right)

# Function to generate a random symbolic function with a tree structure
def generate_random_function_with_tree_structure(variables, max_depth=3):
    """Generate a random mathematical function using a tree-based structure."""
    return build_expression_tree(variables, max_depth)

# Function to generate a random physics-inspired function using a tree-based structure
def generate_physics_function(variables, max_depth=3):
    """Generate physics-inspired functions using a tree-based structure."""
    functions = [
        random_coefficient() * variables[0]**2 + random_coefficient() * variables[1],  # Quadratic relation with random coefficients
        sympy.sin(variables[0]) + random_coefficient() * sympy.cos(variables[1]),  # Trigonometric functions with a coefficient
        random_coefficient() * variables[0]**3 + random_coefficient() * variables[1]**2 + random_coefficient() * 3 * variables[0] * variables[1],  # Polynomial function
        random_coefficient() * (sympy.Mul(variables[0], 9.8))  # Gravitational potential energy or force: F = ma (Newton’s second law)
    ]
    
    return random.choice(functions)  # Return a random physics-inspired function

# Nguyen Dataset Generation (including physics-based functions)
def nguyen_dataset_dynamic(n=2, num_functions=10, max_depth=3):
    """Generate a dataset with dynamic functions, including physics and scientific expressions."""
    variables = generate_variables(n)  # Generate n variables dynamically
    functions = []
    for _ in range(num_functions):
        num_relevant_vars = random.randint(1, n)  # Choose number of relevant variables randomly
        if random.random() > 0.5:  # 50% chance to generate a random or physics-inspired function
            functions.append(generate_physics_function(variables, max_depth))
        else:
            functions.append(generate_random_function_with_tree_structure(variables, max_depth))
    return functions

# Feynman Dataset Generation
def feynman_dataset(n=2, num_functions=10, max_depth=3):
    """Generate symbolic regression functions based on Feynman dataset."""
    variables = generate_variables(n)
    functions = [
        random_coefficient() * variables[0]**2 + random_coefficient() * 2 * variables[1],            # Linear + quadratic
        random_coefficient() * sympy.sin(variables[0]) + random_coefficient() * sympy.cos(variables[1]),  # Trigonometric functions
        random_coefficient() * variables[0]**3 + random_coefficient() * variables[1]**2 + random_coefficient() * 3 * variables[0] * variables[1],  # Polynomial
        random_coefficient() * variables[0]**4 + random_coefficient() * variables[1]**2 + random_coefficient() * variables[0] * variables[1],  # Quartic and product
    ]
    return [generate_random_function_with_tree_structure(variables, max_depth) for _ in range(num_functions)]  # Return tree-based functions

# Livermore Dataset Generation
def livermore_dataset(n=2, num_functions=10, max_depth=3):
    """Generate symbolic regression functions based on Livermore dataset."""
    variables = generate_variables(n)
    functions = [
        random_coefficient() * variables[0]**6 + random_coefficient() * variables[1]**4 + random_coefficient() * variables[0]**2 + random_coefficient() * 2 * variables[1],  # Polynomial
        random_coefficient() * variables[0]**8 + random_coefficient() * variables[1]**5 + random_coefficient() * 3 * variables[0] * variables[1],  # Polynomial
        random_coefficient() * variables[0]**3 + random_coefficient() * 5 * variables[0]**2 + random_coefficient() * variables[1]**3 + random_coefficient() * 2 * variables[0] * variables[1],  # Polynomial
        random_coefficient() * variables[0]**5 + random_coefficient() * variables[1]**4 + random_coefficient() * 2 * variables[0] * variables[1],  # Polynomial
        random_coefficient() * sympy.sin(variables[0]) + random_coefficient() * sympy.cos(variables[1]) + random_coefficient() * variables[0]**2,  # Trigonometric + polynomial
    ]
    return [generate_random_function_with_tree_structure(variables, max_depth) for _ in range(num_functions)]  # Return tree-based functions

# Evaluate the dataset with relevant variables
def evaluate_dataset_with_relevant_variables(functions, num_samples=100, range_vals=(-1, 1), noise_type='gaussian'):
    """Evaluate the functions with dynamic input variables and add noise, ensuring only relevant variables are considered."""
    dataset = []
    
    for func in functions:
        data = []
        num_vars = max([int(str(var)[1:]) for var in func.free_symbols]) + 1  # Get the highest subscript value from func.free_symbols
        all_variables = generate_variables(num_vars)  # Generate variables from x0 to xN (including highest subscript)
        
        for _ in range(num_samples):
            # Generate random values for all the variables from x0 to xN
            inputs = {str(var): np.random.uniform(*range_vals) for var in all_variables}

            # Evaluate the function using only the relevant variables (those in func.free_symbols)
            relevant_inputs = {str(var): inputs[str(var)] for var in func.free_symbols}  # Use only relevant variables for evaluation
            
            try:
                # Evaluate the function with the relevant inputs
                result = func.evalf(subs=relevant_inputs)

                # Check if the result is a real number
                if result.is_real:
                    output = float(result)
                else:
                    # If it's complex, skip or handle it (e.g., set output to NaN)
                    logger.warning(f"Complex result for {str(func)}: {result}. Skipping this data point.")
                    continue  # Skip complex results, or assign output = float('nan') if desired
                
                # Add noise
                if noise_type == 'gaussian':
                    noise = np.random.normal(0, 0.1)  # Gaussian noise with standard deviation of 0.1
                elif noise_type == 'uniform':
                    noise = np.random.uniform(-0.2, 0.2)  # Uniform noise in range [-0.2, 0.2]
                else:
                    noise = 0  # No noise

                output += noise
                data.append({"inputs": inputs, "output": output})

            except (ValueError, ZeroDivisionError, OverflowError) as e:
                logger.warning(f"Invalid expression for {str(func)}: {e}")
                continue  # Skip invalid evaluations

        # Convert the function to string for JSON serialization
        dataset.append({"function": str(func), "data": data})
    
    return dataset

# Function to combine and shuffle datasets
def combine_and_shuffle_datasets(datasets):
    """Combine multiple datasets and shuffle them."""
    combined = []
    for dataset in datasets:
        combined.extend(dataset)
    random.shuffle(combined)  # Shuffle the combined dataset to mix all datasets well
    return combined

# Save dataset to JSON file
def save_to_json_line_by_line(data, filename):
    """Save data to a file in JSON format, line-by-line."""
    with open(filename, "w") as f:
        for data_point in data:
            json.dump(data_point, f)
            f.write("\n")
    return

# Set the random seed (for replicability)
seed = 20777980  # You can change this seed to any integer
random.seed(seed)
np.random.seed(seed)

# Choose the number of variables and the number of functions for each dataset
n_variables = 5  # Set the number of variables (e.g., x0, x1, x2, ...)
num_functions = 50  # Number of functions per dataset
max_depth = 3  # Max depth for the function trees

# Generate each dataset
nguyen_data = nguyen_dataset_dynamic(n=n_variables, num_functions=num_functions, max_depth=max_depth)
feynman_data = feynman_dataset(n=n_variables, num_functions=num_functions, max_depth=max_depth)
livermore_data = livermore_dataset(n=n_variables, num_functions=num_functions, max_depth=max_depth)

# Evaluate datasets
datasets_to_combine = []
datasets_to_combine.append(evaluate_dataset_with_relevant_variables(nguyen_data, num_samples=100, range_vals=(-1, 1), noise_type='gaussian'))
datasets_to_combine.append(evaluate_dataset_with_relevant_variables(feynman_data, num_samples=100, range_vals=(-1, 1), noise_type='gaussian'))
datasets_to_combine.append(evaluate_dataset_with_relevant_variables(livermore_data, num_samples=100, range_vals=(-1, 1), noise_type='gaussian'))

# Combine and shuffle all datasets
combined_data = combine_and_shuffle_datasets(datasets_to_combine)

# Save to a JSON file
save_to_json_line_by_line(combined_data, "Data\combined_dataset_tree.json")
