# 1. Creating the Dataset

In [8]:
import json
import sympy
import numpy as np
import random
import logging
import re

# Configure logging at INFO level and create this module's logger; the
# evaluation step uses it to report skipped (complex or invalid) data points.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Symbolic constants that the random expression generator can insert
PI = sympy.pi
E = sympy.E
PHI = (1 + sympy.sqrt(5))/2  # Golden ratio, (1 + sqrt(5)) / 2

# Pick one of the symbolic constants that may appear in generated expressions
def random_constants():
    """Return one constant chosen at random from PI, E, PHI and sqrt(2)."""
    pool = (PI, E, PHI, sympy.sqrt(2))
    return random.choice(pool)

# Draw a numeric coefficient: integer or float, each with probability 1/2
def random_coefficient():
    """Return a random coefficient.

    Half of the time the result is an integer from 1 to 10 (inclusive);
    otherwise it is a float drawn uniformly between 0.1 and 5.
    """
    wants_integer = random.random() < 0.5
    if wants_integer:
        return random.randint(1, 10)
    return random.uniform(0.1, 5)

# Build the symbolic variables x0 .. x(n-1)
def generate_variables(n):
    """Return the symbols x0, x1, ..., x(n-1).

    The names are requested from ``sympy.symbols`` with the range pattern
    ``"x0:<n>"`` and its result is returned unchanged.
    """
    range_spec = "x0:{}".format(n)
    return sympy.symbols(range_spec)

# Build a random symbolic expression over a random subset of the variables
def generate_random_function_with_relevance(variables, num_relevant_vars, complexity_range=(2, 4)):
    """Compose a random expression from a sampled subset of ``variables``.

    ``num_relevant_vars`` variables are sampled without replacement. Starting
    from one of them, a random number of operations (bounds given by
    ``complexity_range``, inclusive) is applied in sequence:

    - Add / Mul combine the expression with a sampled variable, a random
      constant or a random coefficient;
    - Pow raises the expression to the power 2, 3 or 4;
    - sin / cos / exp wrap the expression.
    """
    binary_ops = (sympy.Add, sympy.Mul)
    available_ops = [sympy.Add, sympy.Mul, sympy.sin, sympy.cos, sympy.exp, sympy.Pow]

    chosen_vars = random.sample(variables, num_relevant_vars)
    steps = random.randint(*complexity_range)
    expr = random.choice(chosen_vars)

    for _ in range(steps):
        operation = random.choice(available_ops)

        if operation in binary_ops:
            # Draw the constant first, then the coefficient, then pick the operand.
            constant = random_constants()
            coefficient = random_coefficient()
            operand = random.choice(chosen_vars + [constant, coefficient])
            expr = operation(expr, operand)
            continue

        if operation == sympy.Pow:
            exponent = random.choice([2, 3, 4])
            expr = operation(expr, exponent)
            continue

        # Remaining operations (sin, cos, exp) take a single argument.
        expr = operation(expr)

    return expr

# Physics-inspired expression templates with random coefficients
def generate_physics_function(variables):
    """Return one randomly chosen physics-inspired expression.

    The templates use the first two entries of ``variables``. When only one
    variable is supplied it is used in both positions, so single-variable
    datasets (``n=1``) work instead of failing with ``IndexError``.

    Args:
        variables: Non-empty sequence of sympy symbols.

    Returns:
        A sympy expression built from one of four templates.

    Raises:
        IndexError: If ``variables`` is empty.

    Note:
        All four templates are built before one is chosen, so every call
        consumes the same number of random draws regardless of the choice.
    """
    first = variables[0]
    # Fall back to the first variable when no second one exists.
    second = variables[1] if len(variables) > 1 else first

    candidates = [
        # Quadratic term in the first variable plus a linear term in the second
        random_coefficient() * first**2 + random_coefficient() * second,
        # Sum of a sine and a scaled cosine
        sympy.sin(first) + random_coefficient() * sympy.cos(second),
        # Cubic/quadratic polynomial with a mixed term
        random_coefficient() * first**3 + random_coefficient() * second**2 + random_coefficient() * 3*first*second,
        # Linear term scaled by 9.8 (standard gravity), e.g. weight = m * g
        random_coefficient() * (sympy.Mul(first, 9.8)),
    ]
    return random.choice(candidates)

# Generate a mixed list of physics-inspired and random symbolic expressions
def nguyen_dataset_dynamic(n=2, num_functions=10):
    """Build ``num_functions`` expressions over the variables x0 .. x(n-1).

    For every expression a number of relevant variables between 1 and ``n``
    is drawn first; then, with a strict ``> 0.5`` test on a uniform draw, a
    physics-inspired template is used, otherwise a random expression over
    that many variables is generated.
    """
    variables = generate_variables(n)
    generated = []
    for _ in range(num_functions):
        relevant_count = random.randint(1, n)
        use_physics_template = random.random() > 0.5
        if use_physics_template:
            expression = generate_physics_function(variables)
        else:
            expression = generate_random_function_with_relevance(variables, relevant_count)
        generated.append(expression)
    return generated

# Evaluate the generated functions on random inputs (no noise added)
def evaluate_nguyen_dataset_with_relevant_variables(functions, num_samples=100, range_vals=(-1, 1)):
    """Sample every expression at random input points.

    For each expression the variables x0 .. xK are created, where K is the
    highest index among the expression's free symbols. Every variable gets a
    uniformly drawn value per sample, but only the free symbols are
    substituted into the expression. Samples whose result is not real, or
    whose evaluation raises ValueError / ZeroDivisionError / OverflowError,
    are logged and skipped, so fewer than ``num_samples`` points may be kept.

    Args:
        functions: Iterable of sympy expressions whose symbols are named
            ``x<index>``.
        num_samples: Number of sampling attempts per expression.
        range_vals: ``(low, high)`` bounds passed to ``np.random.uniform``.

    Returns:
        A list with one dict per expression, holding the keys ``"function"``
        (string form), ``"skeleton"`` and ``"data"``; ``"data"`` maps ``"x"``
        to per-variable value lists and ``"y"`` to the outputs.

    Raises:
        ValueError: If a free symbol's name is not ``x`` followed by digits.
    """
    dataset = []
    for func in functions:
        skeleton = create_skeleton(func)

        # Invariant for all samples of this expression, so compute it once.
        free_symbols = func.free_symbols

        # default=-1 keeps constant expressions (no free symbols) from
        # crashing max(); they simply get zero input variables.
        highest_index = max((int(str(sym)[1:]) for sym in free_symbols), default=-1)
        all_variables = generate_variables(highest_index + 1)
        names = [str(var) for var in all_variables]

        data = {"x": {name: [] for name in names}, "y": []}

        for _ in range(num_samples):
            # Draw a value for every variable x0..xK, relevant or not.
            inputs = {name: np.random.uniform(*range_vals) for name in names}

            # Key the substitution by the Symbol objects themselves, which is
            # the documented form for evalf(subs=...).
            relevant_inputs = {sym: inputs[str(sym)] for sym in free_symbols}

            try:
                result = func.evalf(subs=relevant_inputs)
                if not result.is_real:
                    logger.warning(f"Complex result for {str(func)}: {result}. Skipping this data point.")
                    continue
                output = float(result)
            except (ValueError, ZeroDivisionError, OverflowError) as e:
                logger.warning(f"Invalid expression for {str(func)}: {e}")
                continue

            # Record the sample only after a successful evaluation so the
            # x and y lists always stay aligned.
            for name in names:
                data["x"][name].append(inputs[name])
            data["y"].append(output)

        dataset.append({"function": str(func), "skeleton": skeleton, "data": data})

    return dataset

# Write the dataset as JSON Lines: one JSON document per line
def save_to_json_line_by_line(data, filename):
    """Write each item of ``data`` to ``filename`` as one JSON line.

    The file is opened in write mode, so existing content is replaced.
    Every record is followed by a newline character.
    """
    with open(filename, "w") as sink:
        for record in data:
            sink.write(json.dumps(record))
            sink.write("\n")

# Skeleton creation: replace numeric constants in an expression string by 'C'

# Printed form of the golden ratio constant, i.e. str((1 + sympy.sqrt(5)) / 2).
_SKELETON_GOLDEN_RATIO = "1/2 + sqrt(5)/2"

# An unsigned numeric literal: "3", "2.5", ".5", "1.8e-15".
_SKELETON_NUMBER = r"(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?"

# One pass over the string, trying the alternatives in this order:
#   exponent - "**" or "^" followed by a number, optionally signed, or by a
#              parenthesised signed number / fraction; kept verbatim.
#   ident    - a name such as x0, sin or C; matching whole names keeps the
#              digits inside identifiers from being treated as numbers.
#   number   - any other numeric literal. A sign is absorbed only at the very
#              start of the string or directly after "(", "[", ",", "*", "/"
#              or "^", i.e. where it can only be a unary sign.
_SKELETON_TOKEN_RE = re.compile(
    rf"(?P<exponent>(?:\*\*|\^)\s*(?:\(\s*[+-]?{_SKELETON_NUMBER}(?:\s*/\s*{_SKELETON_NUMBER})?\s*\)|[+-]?{_SKELETON_NUMBER}))"
    rf"|(?P<ident>[^\W\d]\w*)"
    rf"|(?P<number>(?:(?<![^(\[,*/^])[+-])?{_SKELETON_NUMBER})"
)


def create_skeleton(input_function):
    """Convert an expression into its skeleton string.

    The expression is converted with ``str()`` and then:

    - the golden-ratio form ``1/2 + sqrt(5)/2`` and the constant names ``E``
      and ``pi`` (whole names only) become ``C``;
    - numeric literals, including scientific notation such as ``1.8e-15``,
      become ``C``;
    - numeric exponents after ``**`` or ``^`` are preserved, including
      non-integer, negative and fractional ones (``**0.5``, ``**(-2)``,
      ``**(1/3)``);
    - digits that are part of an identifier (``x0``) are left alone.

    Args:
        input_function: A sympy expression, or any object whose ``str()`` is
            an expression in sympy's string syntax.

    Returns:
        The skeleton as a string.
    """
    text = str(input_function)

    # The golden ratio prints as a multi-token sum, so fold it first.
    text = text.replace(_SKELETON_GOLDEN_RATIO, "C")

    def _to_placeholder(match):
        """Map one token to its skeleton form."""
        if match.group("number") is not None:
            return "C"
        if match.group("ident") in ("E", "pi"):
            return "C"
        # Exponents and all other identifiers are kept unchanged.
        return match.group(0)

    return _SKELETON_TOKEN_RE.sub(_to_placeholder, text)

# Seed both random number generators used in this file so a run is
# reproducible: `random` drives expression generation and `np.random`
# drives the sampling of input values.
seed = 20777980  # You can change this seed to any integer
random.seed(seed)
np.random.seed(seed)

# Choose the number of variables and the number of functions for the dataset
n_variables = 5  # Variables x0..x4 are available to the generators
num_functions = 10  # Number of expressions to generate

# Generate the symbolic expressions (physics-inspired and random ones mixed)
nguyen_functions = nguyen_dataset_dynamic(n_variables, num_functions)

# Evaluate every expression at 100 random points in [-1, 1], without noise
nguyen_dataset_evaluated = evaluate_nguyen_dataset_with_relevant_variables(nguyen_functions, num_samples=100, range_vals=(-1, 1))

# Save the dataset as one JSON document per line. The path is opened directly
# for writing, so the `Data/` directory has to exist beforehand.
file_path_combined = f"Data/combined_dataset_{n_variables}_variables_dynamic_seed{seed}.json"
save_to_json_line_by_line(nguyen_dataset_evaluated, file_path_combined)

# NOTE(review): the message says "combined and shuffled", but nothing in this
# section combines or shuffles datasets - confirm the wording is intended.
print(f"Combined and shuffled dataset saved to {file_path_combined}")


Combined and shuffled dataset saved to Data/combined_dataset_5_variables_dynamic_seed20777980.json
