# 1. Creating the Dataset

In [36]:
import json
import logging
import os
import random
import re

import numpy as np
import sympy

# Module-wide logging: INFO and above is emitted through the root handler.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Symbolic constants (kept exact, not floats) used when building expressions.
PI, E = sympy.pi, sympy.E
PHI = (sympy.sqrt(5) + 1) / 2  # golden ratio, written out in radical form

def random_constants():
    """Pick one symbolic constant uniformly at random.

    The pool holds PI, E, PHI and sqrt(2), followed by the negation of each
    (eight candidates in total).
    """
    positives = [PI, E, PHI, sympy.sqrt(2)]
    pool = positives + [-value for value in positives]
    return random.choice(pool)

def random_coefficient():
    """Draw a random numeric coefficient.

    With probability one half the result is an integer in [-10, 10];
    otherwise it is a float drawn uniformly from [-5, 5].
    """
    wants_integer = random.random() < 0.5
    return random.randint(-10, 10) if wants_integer else random.uniform(-5, 5)

def generate_variables(n):
    """Create the symbolic variables x0, x1, ..., x(n-1).

    The names are produced with sympy's range syntax ('x0:n'); whatever
    sympy.symbols returns for that pattern is handed back unchanged.
    """
    name_pattern = f'x0:{n}'
    return sympy.symbols(name_pattern)

def generate_random_function_with_relevance(variables, num_relevant_vars, complexity_range=(3, 6)):
    """Build a random expression from a random subset of ``variables``.

    Args:
        variables: Sequence of sympy symbols to sample from.
        num_relevant_vars: How many distinct symbols to sample as candidates.
        complexity_range: Inclusive (low, high) bounds for the number of
            operations applied on top of the initial variable.

    Returns:
        A sympy expression. It starts as one of the sampled variables and is
        then wrapped/combined ``num_ops`` times with a randomly chosen
        operation (Add, Mul, sin, cos, exp, Pow or tanh).
    """
    operations = [sympy.Add, sympy.Mul, sympy.sin, sympy.cos, sympy.exp, sympy.Pow, sympy.tanh]
    two_argument_ops = [sympy.Add, sympy.Mul]
    exponents = [2, 3, 4]

    candidates = random.sample(variables, num_relevant_vars)
    depth = random.randint(*complexity_range)
    expression = random.choice(candidates)

    for _ in range(depth):
        operation = random.choice(operations)
        if operation == sympy.Pow:
            # Only small integer powers are used.
            expression = operation(expression, random.choice(exponents))
        elif operation in two_argument_ops:
            # A constant and a coefficient are drawn on every binary step,
            # even when a variable ends up being chosen as the operand.
            constant = random_constants()
            coefficient = random_coefficient()
            operand = random.choice(candidates + [constant, coefficient])
            expression = operation(expression, operand)
        else:
            # Unary function (sin, cos, exp, tanh) wraps the current expression.
            expression = operation(expression)
    return expression

def generate_physics_function(variables):
    """Return one randomly chosen expression from five fixed templates.

    Only ``variables[0]`` and ``variables[1]`` are used, so at least two
    variables must be supplied. All five templates are instantiated (each
    drawing its own random coefficients) before one is picked.
    """
    x, y = variables[0], variables[1]
    templates = []

    # Quadratic in x plus a linear term in y.
    templates.append(random_coefficient() * x**2 + random_coefficient() * y)
    # Sine of x plus a scaled cosine of y.
    templates.append(sympy.sin(x) + random_coefficient() * sympy.cos(y))
    # Polynomial function
    templates.append(
        random_coefficient() * x**3
        + random_coefficient() * y**2
        + random_coefficient() * 3*x*y
    )
    # x scaled by 9.8 and a random coefficient.
    templates.append(random_coefficient() * (sympy.Mul(x, 9.8)))
    # Scaled tanh of a shifted x.
    templates.append(random_coefficient() * sympy.tanh(x + random_coefficient()))

    return random.choice(templates)

def nguyen_benchmark_equations_dynamic(n=2, num_functions=100):
    """Generate ``num_functions`` random expressions over ``n`` variables.

    Each expression comes from either ``generate_physics_function`` or
    ``generate_random_function_with_relevance``, chosen by a coin flip.

    Args:
        n: Number of symbolic variables to create (x0 .. x(n-1)).
        num_functions: Number of expressions to generate.

    Returns:
        List of sympy expressions.
    """
    symbols = generate_variables(n)
    generated = []
    for _ in range(num_functions):
        # The relevant-variable count is drawn on every iteration, including
        # the ones where the physics template ends up being used.
        relevant_count = random.randint(1, n)
        use_physics = random.random() > 0.5
        if use_physics:
            expression = generate_physics_function(symbols)
        else:
            expression = generate_random_function_with_relevance(symbols, relevant_count)
        generated.append(expression)
    return generated

def evaluate_nguyen_benchmark_equations_with_relevant_variables(functions, num_samples=100, range_vals=(-1, 1)):
    """Sample each function at random points and collect (x, y) data, no noise.

    For every function, inputs are drawn uniformly from ``range_vals`` for all
    variables x0 .. x_max, where x_max is the highest-indexed symbol that
    appears in the function. Only the symbols the function actually contains
    are substituted when evaluating it.

    Args:
        functions: Iterable of sympy expressions whose symbols are named
            ``x<index>``.
        num_samples: Number of evaluation attempts per function.
        range_vals: (low, high) bounds passed to ``np.random.uniform``.

    Returns:
        List of dicts with keys ``"function"`` (expression as a string),
        ``"skeleton"`` (output of ``create_skeleton``) and ``"data"``
        (``{"x": {name: [values]}, "y": [values]}``). Samples whose result is
        not real, or whose evaluation raises ValueError, ZeroDivisionError or
        OverflowError, are logged and dropped. Functions with no free symbols
        (the expression collapsed to a constant) are logged and skipped, so
        the result may be shorter than ``functions``.
    """
    dataset = []
    for func in functions:
        free_symbols = list(func.free_symbols)
        if not free_symbols:
            # A constant expression has no inputs to sample; previously this
            # crashed on max() of an empty sequence.
            logger.warning(f"Function {str(func)} has no free variables. Skipping this function.")
            continue

        skeleton = create_skeleton(func)
        # Symbols are named x<index>; cover every index up to the largest used.
        num_vars = max(int(str(var)[1:]) for var in free_symbols) + 1
        var_names = [str(var) for var in generate_variables(num_vars)]
        data = {"x": {name: [] for name in var_names}, "y": []}

        for _ in range(num_samples):
            inputs = {name: np.random.uniform(*range_vals) for name in var_names}
            # evalf's subs mapping is keyed by the Symbol objects themselves.
            relevant_inputs = {var: inputs[str(var)] for var in free_symbols}
            try:
                result = func.evalf(subs=relevant_inputs)
                if result.is_real:
                    output = float(result)
                else:
                    logger.warning(f"Complex result for {str(func)}: {result}. Skipping this data point.")
                    continue
            except (ValueError, ZeroDivisionError, OverflowError) as e:
                logger.warning(f"Invalid expression for {str(func)}: {e}")
                continue

            # Record the sample only once evaluation has succeeded, so the
            # x columns and y stay the same length.
            for name in var_names:
                data["x"][name].append(inputs[name])
            data["y"].append(output)

        dataset.append({"function": str(func), "skeleton": skeleton, "data": data})

    return dataset

def save_to_json_line_by_line(data, filename):
    """Write every item of ``data`` to ``filename`` as one JSON document per line.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(filename, "w") as handle:
        for record in data:
            handle.write(json.dumps(record))
            handle.write("\n")

def create_skeleton(input_function):
    """Turn an expression into a string skeleton with constants written as 'C'.

    The expression is converted with ``str`` and the textual forms of pi, E
    and the golden ratio are replaced by 'C'. A fixed sequence of regex
    rewrites then replaces numeric literals with 'C' and collapses a few
    simple combinations of 'C'. The rules are order-dependent.
    """
    text = str(input_function)
    for constant in (sympy.pi, sympy.E, (1 + sympy.sqrt(5))/2):
        text = text.replace(str(constant), 'C')

    # (pattern, replacement) pairs, applied strictly in this order.
    rewrite_rules = (
        # sqrt(<number>) and exp(<number>) are constants.
        (r"sqrt\(([+-]?\d*\.?\d+)\)", "C"),
        (r"exp\(([+-]?\d*\.?\d+)\)", "C"),
        # A number immediately followed by a letter or an opening parenthesis.
        (r"([+-]?\d*\.?\d+)(?=[a-zA-Z(])", "C"),
        # A number not preceded by a word character, '^' or '*', and not
        # followed by '**'.
        (r"(?<![\w^*])([+-]?\d*\.?\d+)(?![*]{2})", "C"),
        # Fold signs and trivial combinations of constants.
        (r"-\s*C", "+ C"),
        (r"C\s*/\s*C", "C"),
        (r"C\s*\+\s*C", "C"),
        (r"\(C\)", "C"),
        (r"-\s*C", "C"),
        # A remaining leading minus becomes a constant factor.
        (r"^\-", "C*"),
    )
    for pattern, replacement in rewrite_rules:
        text = re.sub(pattern, replacement, text)
    return text

# Seed both RNGs: `random` drives expression generation, numpy drives sampling.
seed = 940
random.seed(seed)
np.random.seed(seed)

n_variables = 5
num_functions = 100

# Generate the symbolic expressions, then evaluate each one at random inputs.
nguyen_functions = nguyen_benchmark_equations_dynamic(n_variables, num_functions)

nguyen_benchmark_equations_evaluated = evaluate_nguyen_benchmark_equations_with_relevant_variables(nguyen_functions, num_samples=100, range_vals=(-1, 1))

file_path_combined = f"Data/combined_dataset_{n_variables}_variables_dynamic_seed{seed}.json"
# Create the output directory up front so the write does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(file_path_combined), exist_ok=True)
# One JSON document is written per line.
save_to_json_line_by_line(nguyen_benchmark_equations_evaluated, file_path_combined)

print(f"Combined and shuffled dataset saved to {file_path_combined}")

Combined and shuffled dataset saved to Data/combined_dataset_5_variables_dynamic_seed20777980.json
