In [41]:

from itertools import combinations, product
import numpy as np
from sklearn.linear_model import LassoCV, ElasticNetCV
from scipy.interpolate import UnivariateSpline
from scipy.optimize import curve_fit
from scipy.signal import savgol_filter
import scipy.special as sp_special
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import warnings


In [42]:

def create_polynomial_feature_descriptions(candidate_function_descriptions, degree):
    """Name every term of a polynomial expansion of the given base features.

    Args:
        candidate_function_descriptions: Sequence of base feature names, one
            per input column of the expansion.
        degree: Degree passed to PolynomialFeatures.

    Returns:
        The array returned by PolynomialFeatures.get_feature_names_out for a
        PolynomialFeatures(degree, include_bias=False) fitted on that many
        input columns.
    """
    poly = PolynomialFeatures(degree, include_bias=False)
    # Only the number of input columns matters for naming the terms, so fit on
    # one dummy row; the transformed matrix itself was never used.
    poly.fit(np.zeros((1, len(candidate_function_descriptions))))
    return poly.get_feature_names_out(candidate_function_descriptions)
    
def robust_differential_equation_discovery(
    data,
    candidate_functions,
    candidate_function_descriptions,
    derivative_order=3,
    alpha_range=(1e-7, 1e-4),
    l1_ratio_range=(0.1, 0.9),
    cv_folds=10,
    n_bootstrap=100
):
    """Smooth the data, build a derivative feature library and fit sparse models.

    Pipeline: denoise_data -> interpolate_data -> estimate_derivatives ->
    create_feature_matrix -> degree-3 PolynomialFeatures -> StandardScaler ->
    ElasticNetCV, followed by a cross-validation report, bootstrap refits and
    selection of the bootstrap model with the lowest MSE.

    Args:
        data: Mapping with 'x' and 'y' entries, passed to denoise_data.
        candidate_functions: Callables applied to the signal and to each of
            its estimated derivatives.
        candidate_function_descriptions: One label per candidate function.
        derivative_order: Highest derivative order to estimate.
        alpha_range: (low, high) bounds of the log-spaced alpha grid
            (100 values).
        l1_ratio_range: (low, high) bounds of the linear l1_ratio grid
            (10 values).
        cv_folds: Number of cross-validation folds.
        n_bootstrap: Number of bootstrap refits.

    Returns:
        (final_model, coefficients, feature_descriptions) where coefficients
        is elastic_net.coef_ (fitted on standardized features) and
        feature_descriptions[i] names the column that coefficients[i]
        multiplies.

    NOTE(review): the regression target is the smoothed y, and the feature
    library also applies every candidate to the 0th derivative, which is that
    same y. With an identity candidate the target is one of the regressors, so
    a near-zero MSE does not by itself show that an equation was recovered.
    Confirm this is intended.
    """
    # Step 1: Denoising
    denoised_data = denoise_data(data)

    # Step 2: Interpolation
    interpolated_data = interpolate_data(denoised_data)

    # Step 3: Derivative Estimation
    derivatives = estimate_derivatives(interpolated_data, order=derivative_order)

    # Step 4: Feature Engineering
    X = create_feature_matrix(interpolated_data, derivatives, candidate_functions)

    # One label per base column, in the derivative-major order in which the
    # feature matrix is built. "[x=dKy]" records that the candidate (written
    # in terms of x) was evaluated on the K-th derivative of y (d0y is y).
    expected_descriptions = [
        f"{description}[x=d{order}y]"
        for order in range(len(derivatives))
        for description in candidate_function_descriptions
    ]
    labels_match_columns = (
        len(candidate_function_descriptions) == len(candidate_functions)
        and len(expected_descriptions) == X.shape[1]
    )
    if labels_match_columns:
        base_descriptions = expected_descriptions
    else:
        # Some candidate was dropped while building X (or the label list does
        # not match the function list), so the mapping is unknown; fall back
        # to positional names rather than attach wrong labels.
        base_descriptions = [f"feature_{column}" for column in range(X.shape[1])]

    # Polynomial Features Expansion
    degree = 3
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly.fit_transform(X)

    # Names come from the same fitted transformer that produced X_poly, so
    # they line up one-to-one with the columns and with elastic_net.coef_.
    poly_feature_descriptions = poly.get_feature_names_out(base_descriptions)

    # Feature Scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_poly)

    # Step 5: Sparse Identification using ElasticNet
    # The grids honour alpha_range / l1_ratio_range; the defaults reproduce
    # the previously hard-coded logspace(-7, -4) and linspace(0.1, 0.9).
    elastic_net = ElasticNetCV(
        alphas=np.logspace(np.log10(alpha_range[0]), np.log10(alpha_range[1]), 100),
        l1_ratio=np.linspace(l1_ratio_range[0], l1_ratio_range[1], 10),
        cv=cv_folds,
        max_iter=10000,
        random_state=0
    )
    elastic_net.fit(X_scaled, interpolated_data['y'])

    # Step 6: Model Validation
    scores = cross_val_score(
        elastic_net,
        X_scaled,
        interpolated_data['y'],
        cv=cv_folds,
        scoring='neg_mean_squared_error'
    )
    print("Cross-validation MSE:", -np.mean(scores))

    # Step 7: Uncertainty Quantification using Bootstrapping
    bootstrapped_models = bootstrap_model_fitting(
        X_scaled,
        interpolated_data['y'],
        elastic_net,
        n_bootstrap=n_bootstrap
    )

    # Step 8: Final Model Selection
    final_model = select_best_model(bootstrapped_models, X_scaled, interpolated_data['y'])

    return final_model, elastic_net.coef_, poly_feature_descriptions

def denoise_data(data, window_length=7, polyorder=3):
    """Smooth data['y'] with a Savitzky-Golay filter.

    Args:
        data: Mapping with 'x' and 'y' entries.
        window_length: Filter window length passed to savgol_filter
            (default 7, the value previously hard-coded).
        polyorder: Polynomial order passed to savgol_filter (default 3).

    Returns:
        A new dict holding the original 'x' object and the filtered 'y'.
    """
    denoised_y = savgol_filter(data['y'], window_length=window_length, polyorder=polyorder)
    return {'x': data['x'], 'y': denoised_y}

def interpolate_data(denoised_data):
    """Fit a UnivariateSpline with s=0 and evaluate it on the input grid.

    Args:
        denoised_data: Mapping with 'x' and 'y' entries.

    Returns:
        A new dict with the original 'x' object and the spline values at
        those same 'x' positions.
    """
    grid = denoised_data['x']
    fitted_spline = UnivariateSpline(grid, denoised_data['y'], s=0)
    return {'x': grid, 'y': fitted_spline(grid)}

def estimate_derivatives(data, order=3):
    """Estimate successive derivatives of data['y'] with np.gradient.

    Each order is obtained by differentiating the previous order with respect
    to data['x'].

    Args:
        data: Mapping with 'x' and 'y' entries.
        order: Highest derivative order to compute.

    Returns:
        Dict keyed '0th', '1th', ..., f'{order}th' in increasing order, where
        '0th' is data['y'] itself.
    """
    grid = data['x']
    current = data['y']
    derivatives = {'0th': current}
    for level in range(1, order + 1):
        current = np.gradient(current, grid)
        derivatives[f'{level}th'] = current
    return derivatives

def create_feature_matrix(data, derivatives, candidate_functions):
    """Apply every candidate function to every derivative series.

    Columns are produced derivative by derivative (in the iteration order of
    `derivatives`) and, within each derivative, in candidate order. A
    candidate is skipped for a given derivative when it raises, or when its
    output is complex or contains NaN/inf. Surviving outputs are clipped to
    [-1e6, 1e6].

    Args:
        data: Not used by this function; kept so existing callers still work.
        derivatives: Mapping whose values are the series to transform.
        candidate_functions: Callables taking one series.

    Returns:
        2-D array with one column per surviving (derivative, candidate) pair.

    Raises:
        ValueError: If no candidate produced a usable column.
    """
    columns = []
    for derivative in derivatives.values():
        for func in candidate_functions:
            # Candidates may emit numeric warnings (log of negatives, ...);
            # those cases are filtered below, so keep the output quiet.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                try:
                    feature = func(derivative)
                    if np.iscomplexobj(feature) or not np.all(np.isfinite(feature)):
                        continue
                    # Clip extreme values to prevent overflow downstream
                    columns.append(np.clip(feature, -1e6, 1e6).real)
                except Exception:
                    # Best effort: a failing candidate is dropped, not fatal.
                    continue
    if not columns:
        raise ValueError("No valid features generated. Check candidate functions and data.")
    return np.column_stack(columns)

def bootstrap_model_fitting(X, y, base_model, n_bootstrap=100):
    """Refit a LassoCV on bootstrap resamples of (X, y).

    Every replicate draws X.shape[0] row indices with replacement from NumPy's
    global random generator and fits a fresh LassoCV on those rows. Only
    `alphas_`, `cv` and `max_iter` are read from `base_model`; the replicates
    are LassoCV instances whatever the type of `base_model`.

    Args:
        X: 2-D feature array.
        y: Target array, indexed with the same integer row indices as X.
        base_model: Fitted estimator providing `alphas_`, `cv`, `max_iter`.
        n_bootstrap: Number of replicates to fit.

    Returns:
        List of the fitted LassoCV models, in replicate order.
    """
    row_count = X.shape[0]
    fitted_replicates = []
    for seed in range(n_bootstrap):
        rows = np.random.choice(row_count, size=row_count, replace=True)
        replicate = LassoCV(
            alphas=base_model.alphas_,
            cv=base_model.cv,
            max_iter=base_model.max_iter,
            random_state=seed,
        )
        replicate.fit(X[rows], y[rows])
        fitted_replicates.append(replicate)
    return fitted_replicates

def select_best_model(models, X, y):
    """Return the model with the lowest mean squared error on (X, y).

    Args:
        models: Sequence of fitted estimators exposing `predict`.
        X: Feature matrix every model is scored on.
        y: True targets for X.

    Returns:
        The element of `models` with the smallest MSE (first one on ties).

    Raises:
        ValueError: If `models` is empty.
    """
    if len(models) == 0:
        raise ValueError("No models to select from.")
    # mean_squared_error takes (y_true, y_pred); the value is symmetric, but
    # the documented order keeps the call correct if the metric is swapped.
    mse_scores = [mean_squared_error(y, m.predict(X)) for m in models]
    best_model_index = np.argmin(mse_scores)
    print(f"Best model MSE: {mse_scores[best_model_index]}")
    return models[best_model_index]

# Candidate library. Every term is one (callable, description) pair and the
# two module-level lists below are derived from this single table, so
# candidate_functions[i] and candidate_function_descriptions[i] cannot drift
# out of step when a term is enabled or disabled.
# Disabled terms are kept as commented-out pairs; uncomment a line to enable it.
_CANDIDATE_TERMS = [
    # Basic functions
    (lambda x: x, "x"),                                          # Linear term
    (lambda x: x**2, "x^2"),                                     # Quadratic term
    # (lambda x: x**3, "x^3"),                                   # Cubic term
    # (lambda x: x**4, "x^4"),                                   # Fourth power term
    # (lambda x: x**5, "x^5"),                                   # Fifth power term

    # Trigonometric functions
    (lambda x: np.sin(x), "sin(x)"),                             # Sine function
    # (lambda x: np.cos(x), "cos(x)"),                           # Cosine function
    # (lambda x: np.tan(x), "tan(x)"),                           # Tangent function (no domain restriction applied)
    # (lambda x: np.sin(2 * x), "sin(2x)"),                      # Harmonic sine term
    # (lambda x: np.cos(2 * x), "cos(2x)"),                      # Harmonic cosine term

    # Exponential and logarithmic functions
    # (lambda x: np.exp(np.clip(x, -100, 100)), "exp(clip(x, -100, 100))"),        # Exponential with clipped argument
    # (lambda x: np.exp(-np.clip(x, -100, 100)), "exp(-clip(x, -100, 100))"),      # Decaying exponential with clipped argument
    # (lambda x: np.log(np.abs(x) + 1e-6), "log(abs(x) + 1e-6)"),                  # Logarithm of |x| plus a small offset
    # (lambda x: np.exp(np.clip(x**2, -100, 100)), "exp(clip(x^2, -100, 100))"),   # Exponential of a clipped quadratic
    # (lambda x: np.log(x**2 + 1e-6), "log(x^2 + 1e-6)"),                          # Logarithm of a quadratic term

    # Hyperbolic functions
    # (lambda x: np.tanh(x), "tanh(x)"),                         # Hyperbolic tangent
    # (lambda x: np.sinh(x), "sinh(x)"),                         # Hyperbolic sine
    # (lambda x: np.cosh(x), "cosh(x)"),                         # Hyperbolic cosine (no clipping applied)

    # Special functions (arguments clipped where the code says so)
    # (lambda x: sp_special.gamma(np.clip(x, 1e-6, 100)), "gamma(clip(x, 1e-6, 100))"),   # Gamma function
    # (lambda x: sp_special.psi(np.clip(x, 1e-6, 100)), "psi(clip(x, 1e-6, 100))"),       # Digamma function
    # (lambda x: sp_special.erf(x), "erf(x)"),                                            # Error function
    # (lambda x: sp_special.erfc(x), "erfc(x)"),                                          # Complementary error function
    # (lambda x: sp_special.jv(0, x), "jv(0, x)"),                                        # Bessel function of the first kind (order 0)
    # (lambda x: sp_special.yv(0, np.clip(x, 1e-6, 100)), "yv(0, clip(x, 1e-6, 100))"),   # Bessel function of the second kind (order 0)
    # (lambda x: sp_special.beta(np.clip(x, 1e-6, 100), np.clip(x+1, 1e-6, 100)), "beta(clip(x, 1e-6, 100), clip(x+1, 1e-6, 100))"),  # Beta function
    # (lambda x: sp_special.lambertw(x).real, "lambertw(x).real"),                        # Lambert W function (principal branch)
    # (lambda x: sp_special.zeta(np.clip(x, 1.1, 100)), "zeta(clip(x, 1.1, 100))"),       # Riemann zeta function

    # Inverse and root functions
    # (lambda x: 1 / (np.abs(x) + 1e-6), "1 / (abs(x) + 1e-6)"),                          # Inverse function
    # (lambda x: np.sqrt(np.abs(x) + 1e-6), "sqrt(abs(x) + 1e-6)"),                       # Square root function
]

# Callables of the enabled terms, in table order
candidate_functions = [term[0] for term in _CANDIDATE_TERMS]

# Descriptions of the enabled terms, index-aligned with candidate_functions
candidate_function_descriptions = [term[1] for term in _CANDIDATE_TERMS]

def create_feature_matrix_simple(data, candidate_functions, descriptions=None):
    """Apply every candidate function to data['x'] and name the columns.

    A candidate is skipped when it raises, or when its output is complex or
    contains NaN/inf. Surviving outputs are clipped to [-1e6, 1e6].

    Args:
        data: Mapping with an 'x' entry.
        candidate_functions: Callables taking one series.
        descriptions: Optional labels, index-aligned with
            `candidate_functions`. When omitted, the module-level
            `candidate_function_descriptions` list is used, as before.

    Returns:
        (X, feature_names): 2-D array with one column per surviving candidate
        and the list of matching labels. A candidate with no label at its
        index is named f"func_{index}".

    Raises:
        ValueError: If no candidate produced a usable column.
    """
    if descriptions is None:
        # Backward-compatible default: the notebook-level description list.
        descriptions = candidate_function_descriptions

    columns = []
    feature_names = []
    for func_index, func in enumerate(candidate_functions):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            try:
                feature = func(data['x'])
                # Check for invalid values
                if np.any(np.isnan(feature)) or np.any(np.isinf(feature)) or np.iscomplexobj(feature):
                    continue
                # Clip extreme values to prevent overflow
                column = np.clip(feature, -1e6, 1e6).real
            except Exception:
                # Best effort: a failing candidate is dropped, not fatal.
                continue
        # The label is resolved outside the try block and before anything is
        # appended, so a too-short description list can no longer leave X with
        # a column that has no matching name.
        if func_index < len(descriptions):
            name = descriptions[func_index]
        else:
            name = f"func_{func_index}"
        columns.append(column)
        feature_names.append(name)
    if not columns:
        raise ValueError("No valid features generated. Check candidate functions and data.")
    return np.column_stack(columns), feature_names

def robust_differential_equation_discovery_simple(
    data,
    candidate_functions,
    candidate_function_descriptions,
    alpha_range=(1e-8, 1e-5),
    l1_ratio_range=(0.1, 0.9),
    cv_folds=10,
    n_bootstrap=100
):
    """Fit sparse models of data['y'] on candidate functions of data['x'].

    Unlike robust_differential_equation_discovery, no denoising, interpolation
    or derivative estimation is performed: the raw samples are used directly.

    Args:
        data: Mapping with 'x' and 'y' entries.
        candidate_functions: Callables applied to data['x'].
        candidate_function_descriptions: Accepted for signature parity; the
            labels actually returned are the ones produced by
            create_feature_matrix_simple.
        alpha_range: (low, high) bounds of the log-spaced alpha grid
            (100 values).
        l1_ratio_range: (low, high) bounds of the linear l1_ratio grid
            (10 values).
        cv_folds: Number of cross-validation folds.
        n_bootstrap: Number of bootstrap refits.

    Returns:
        (final_model, coefficients, feature_descriptions) where coefficients
        is elastic_net.coef_, fitted on standardized features.
    """
    # Skip Denoising and Interpolation
    interpolated_data = data  # Use raw data

    # Step 1: Feature Engineering
    X, feature_descriptions = create_feature_matrix_simple(interpolated_data, candidate_functions)

    # StandardScaler standardises each column independently, so one scaler
    # over all columns gives the same result as separate scalers for columns
    # [:2] and [2:], without assuming there are more than two features (the
    # split failed on an empty second group).
    X_scaled = StandardScaler().fit_transform(X)

    # Step 2: Sparse Identification using ElasticNet
    elastic_net = ElasticNetCV(
        alphas=np.logspace(np.log10(alpha_range[0]), np.log10(alpha_range[1]), 100),
        l1_ratio=np.linspace(l1_ratio_range[0], l1_ratio_range[1], 10),
        cv=cv_folds,
        max_iter=10000,
        random_state=0
    )

    # Fit the model
    elastic_net.fit(X_scaled, interpolated_data['y'])

    # Step 3: Model Validation
    scores = cross_val_score(
        elastic_net,
        X_scaled,
        interpolated_data['y'],
        cv=cv_folds,
        scoring='neg_mean_squared_error'
    )
    print("Cross-validation MSE:", -np.mean(scores))

    # Step 4: Uncertainty Quantification using Bootstrapping
    bootstrapped_models = bootstrap_model_fitting(
        X_scaled,
        interpolated_data['y'],
        elastic_net,
        n_bootstrap=n_bootstrap
    )

    # Step 5: Final Model Selection
    final_model = select_best_model(bootstrapped_models, X_scaled, interpolated_data['y'])

    return final_model, elastic_net.coef_, feature_descriptions

In [43]:

# Noise-free synthetic samples of y = 2x - 0.5x^2 + sin(x) on [0, 10].
# The seed fixes NumPy's global generator for the cells that run afterwards.
np.random.seed(0)
x = np.linspace(0.0, 10.0, num=100)
y = 2*x - 0.5*x**2 + np.sin(x)
data = dict(x=x, y=y)

In [None]:

# Run the full pipeline (smoothing, derivatives, polynomial library, elastic net)
model, coefficients, poly_feature_descriptions = robust_differential_equation_discovery(
    data,
    candidate_functions,
    candidate_function_descriptions,
    derivative_order=3,
)

# Positions of the coefficients that were not shrunk to exactly zero
non_zero_indices = np.nonzero(coefficients)[0]

# Keep only positions that have a description to print
description_count = len(poly_feature_descriptions)
valid_non_zero_indices = [
    position for position in non_zero_indices if position < description_count
]

# List each retained term, collecting the pieces of the final equation
print("Discovered model:")
terms = []
for position in valid_non_zero_indices:
    weight = coefficients[position]
    label = poly_feature_descriptions[position]
    terms.append(f"{weight:.6f} * {label}")
    print(f"Coefficient: {weight:.6f}, Function: {label}")

# Assemble and show the equation
equation = " + ".join(terms)
print(f"Final Model: y = {equation}")

Cross-validation MSE: 1.199804084936819e-08


In [None]:
# Threshold for filtering small coefficients
threshold = 1e-5

# Non-zero coefficients whose magnitude exceeds the threshold
above_threshold = [idx for idx in np.nonzero(coefficients)[0] if abs(coefficients[idx]) > threshold]

# Same guard as the unthresholded report: only positions that have a
# description can be printed. Without it a coefficient beyond the end of the
# description list raises IndexError in the loop below.
significant_indices = [idx for idx in above_threshold if idx < len(poly_feature_descriptions)]
unlabelled_count = len(above_threshold) - len(significant_indices)

# Print the significant coefficients and corresponding functions
print("Discovered model:")
terms = []
for idx in significant_indices:
    description = poly_feature_descriptions[idx]  # Get the description of the function
    coefficient = coefficients[idx]
    terms.append(f"{coefficient:.6f} * {description}")
    print(f"Coefficient: {coefficient:.6f}, Function: {description}")

# Make omitted terms visible instead of dropping them silently
if unlabelled_count:
    print(f"Note: {unlabelled_count} significant coefficient(s) have no description and were omitted.")

# Print the final equation
equation = " + ".join(terms)
print(f"Final Model: y = {equation}")

In [None]:
# Same experiment on the raw samples: no denoising, interpolation or derivatives
model, coefficients, feature_descriptions = robust_differential_equation_discovery_simple(
    data,
    candidate_functions,
    candidate_function_descriptions,
)

# Coefficients at or below this magnitude are treated as zero
threshold = 1e-5

# Positions of the non-zero coefficients that clear the threshold
significant_indices = [
    position
    for position in np.nonzero(coefficients)[0]
    if abs(coefficients[position]) > threshold
]

# List each retained term, collecting the pieces of the final equation
print("Discovered model:")
terms = []
for position in significant_indices:
    weight = coefficients[position]
    label = feature_descriptions[position]
    terms.append(f"{weight:.6f} * {label}")
    print(f"Coefficient: {weight:.6f}, Function: {label}")

# Assemble and show the equation
equation = " + ".join(terms)
print(f"Final Model: y = {equation}")

In [None]:
def true_function(x, a, b, c):
    """Evaluate the model a*x + b*x**2 + c*sin(x).

    Args:
        x: Point(s) at which to evaluate the model.
        a: Weight of the linear term.
        b: Weight of the quadratic term.
        c: Weight of the sine term.

    Returns:
        The model value(s) at x.
    """
    linear_part = a * x
    quadratic_part = b * x**2
    periodic_part = c * np.sin(x)
    return linear_part + quadratic_part + periodic_part

# Least-squares fit of the known functional form to the current x, y.
# The covariance is bound to a real name: assigning to `_` at notebook top
# level overrides IPython's "last output" variable for the rest of the session.
params, params_covariance = curve_fit(true_function, x, y)
print(f"Fitted parameters: a = {params[0]}, b = {params[1]}, c = {params[2]}")

In [None]:
# Step 1: Define Candidate Functions with Descriptive Names
def generate_candidate_functions():
    """Return the candidate library as a list of (callable, name) pairs.

    The seven terms are, in order: x, x^2, x^3, sin(x), cos(x), exp(x) and
    log(|x| + 1e-6).
    """
    def linear(x):
        return x

    def quadratic(x):
        return x**2

    def cubic(x):
        return x**3

    def offset_log(x):
        # The small offset keeps the logarithm finite at x == 0.
        return np.log(np.abs(x) + 1e-6)

    return [
        (linear, "x"),
        (quadratic, "x^2"),
        (cubic, "x^3"),
        (np.sin, "sin(x)"),
        (np.cos, "cos(x)"),
        (np.exp, "exp(x)"),
        (offset_log, "log(|x| + 1e-6)"),
    ]

# Step 2: Generate Features with Descriptive Names
def create_feature_matrix(x, candidate_functions):
    """Evaluate each (callable, name) candidate on x and stack the results.

    NOTE(review): an earlier cell of this notebook defines
    create_feature_matrix(data, derivatives, candidate_functions) with a
    different signature and return value. Running this cell replaces it, so
    robust_differential_equation_discovery fails if it is re-run afterwards.
    Consider giving one of the two a distinct name.

    Args:
        x: Series passed to every candidate callable.
        candidate_functions: Iterable of (callable, name) pairs.

    Returns:
        (X, feature_names): 2-D array with one column per candidate that
        evaluated without raising, and the matching list of names.

    Raises:
        ValueError: If no candidate could be evaluated.
    """
    columns = []
    feature_names = []
    for func, name in candidate_functions:
        try:
            column = func(x)
        except Exception:
            # Best effort: drop a failing candidate. Catching Exception rather
            # than using a bare except lets KeyboardInterrupt and SystemExit
            # still stop the run.
            continue
        columns.append(column)
        feature_names.append(name)
    if not columns:
        raise ValueError("No valid features generated. Check candidate functions and data.")
    return np.column_stack(columns), feature_names

# Step 3: Use Regularized Regression
def discover_equation(x, y):
    """Select candidate terms with LassoCV and report the fitted equation.

    The candidate library is evaluated on x, standardized, and fitted with
    LassoCV(cv=10, random_state=0). Terms whose standardized coefficient
    exceeds 1e-5 in magnitude are reported.

    Args:
        x: Input samples.
        y: Target samples.

    Returns:
        (equation, lasso): the printed equation string and the fitted LassoCV.
        The equation is expressed in the original (unscaled) features and
        includes the intercept; `lasso` itself still operates on the
        standardized features.
    """
    candidate_functions = generate_candidate_functions()
    X, feature_names = create_feature_matrix(x, candidate_functions)

    # Feature Scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Regularized Regression with LassoCV to select the best features
    lasso = LassoCV(cv=10, random_state=0).fit(X_scaled, y)

    # Significance is judged on the standardized coefficients, which are
    # comparable across features of different magnitude.
    significant_indices = np.where(np.abs(lasso.coef_) > 1e-5)[0]

    # The model was fitted on z = (f - mean) / scale. Undo the scaling so the
    # printed coefficients multiply the raw features named next to them:
    #   y = intercept + sum(c * z) = (intercept - sum(c * mean / scale)) + sum((c / scale) * f)
    raw_coefficients = lasso.coef_ / scaler.scale_
    raw_intercept = lasso.intercept_ - np.sum(raw_coefficients * scaler.mean_)

    significant_features = [(raw_coefficients[i], feature_names[i]) for i in significant_indices]

    # Step 4: Evaluate and Simplify the Model
    equation_terms = [f"{raw_intercept:.6f}"]
    equation_terms.extend(f"{coef:.6f}*{name}" for coef, name in significant_features)
    equation = "y = " + " + ".join(equation_terms)
    print("Discovered model:")
    print(equation)

    # Calculate and print model performance
    y_pred = lasso.predict(X_scaled)
    mse = mean_squared_error(y, y_pred)
    print(f"Mean Squared Error: {mse:.6f}")

    return equation, lasso

# Example: noisy samples of y = 2x - 0.5x^2 + sin(x) on [0, 10]
np.random.seed(0)
x = np.linspace(0.0, 10.0, num=100)
measurement_noise = np.random.normal(scale=0.1, size=x.shape)
y = 2*x - 0.5*x**2 + np.sin(x) + measurement_noise

# Recover an equation from the noisy samples
equation, model = discover_equation(x, y)

In [None]:
# Step 1: Define Candidate Functions
def generate_candidate_functions():
    """Return the candidate library as a list of (callable, name) pairs.

    The six terms are, in order: x, x^2, sin(x), cos(x), exp(x) and
    log(|x| + 1e-6). Running this cell replaces the earlier definition of the
    same name, which also contained an x^3 term.
    """
    def linear(x):
        return x

    def quadratic(x):
        return x**2

    def offset_log(x):
        # The small offset keeps the logarithm finite at x == 0.
        return np.log(np.abs(x) + 1e-6)

    return [
        (linear, "x"),
        (quadratic, "x^2"),
        (np.sin, "sin(x)"),
        (np.cos, "cos(x)"),
        (np.exp, "exp(x)"),
        (offset_log, "log(|x| + 1e-6)"),
    ]

# Step 2: Build Candidate Equations
def build_candidate_equations(candidate_functions, max_terms=3):
    """Enumerate every combination of 1 to max_terms candidate terms.

    Args:
        candidate_functions: Sequence of (callable, name) pairs.
        max_terms: Largest number of terms in one equation.

    Returns:
        List of (functions, names) tuples, ordered by number of terms and,
        within a size, in itertools.combinations order. `functions` and
        `names` are tuples of equal length.
    """
    equations = []
    for size in range(1, max_terms + 1):
        for subset in combinations(candidate_functions, size):
            functions = tuple(func for func, _name in subset)
            names = tuple(name for _func, name in subset)
            equations.append((functions, names))
    return equations

# Step 3: Evaluate Each Equation
def evaluate_equations(x, y, candidate_equations):
    """Fit every candidate equation with curve_fit and keep the lowest MSE.

    Each candidate is a linear combination of its functions with one free
    weight per function. Candidates whose fit or evaluation raises are
    skipped.

    Args:
        x: Input samples; converted to a float array before fitting.
        y: Target samples.
        candidate_equations: Iterable of (functions, names) tuples as produced
            by build_candidate_equations.

    Returns:
        (best_description, best_mse): the text of the best equation and its
        mean squared error, or ("", inf) when no candidate could be fitted.
    """
    # Work in floating point. With integer x, zeros_like(x) below is an
    # integer buffer, the in-place float accumulation raises, and the blanket
    # except then skips every candidate without a trace.
    x = np.asarray(x, dtype=float)

    best_mse = float('inf')
    best_description = ""

    for functions, names in candidate_equations:
        # Weighted sum of this candidate's functions; params[i] scales functions[i]
        def composite_function(x, *params):
            result = np.zeros_like(x, dtype=float)
            for weight, func in zip(params, functions):
                result += weight * func(x)
            return result

        # Initial guess for parameters
        initial_guess = np.ones(len(functions))

        try:
            # Fit the model using curve_fit
            params, _covariance = curve_fit(composite_function, x, y, p0=initial_guess)

            # Calculate MSE
            y_pred = composite_function(x, *params)
            mse = mean_squared_error(y, y_pred)
        except Exception:
            # Best effort: a candidate that cannot be fitted is skipped.
            continue

        # Selection is by MSE alone; the number of terms is not penalised.
        if mse < best_mse:
            best_mse = mse
            best_description = " + ".join(
                f"{param:.6f}*{name}" for param, name in zip(params, names)
            )

    return best_description, best_mse

# Step 4: Implement the Process
def discover_best_equation(x, y):
    """Search all equations of up to three candidate terms and report the best.

    Args:
        x: Input samples.
        y: Target samples.

    Returns:
        (best_description, best_mse) as returned by evaluate_equations.
    """
    library = generate_candidate_functions()
    equations = build_candidate_equations(library, max_terms=3)
    best_description, best_mse = evaluate_equations(x, y, equations)

    report_lines = (
        "Best Discovered Model:",
        f"y = {best_description}",
        f"Mean Squared Error: {best_mse:.6f}",
    )
    for line in report_lines:
        print(line)

    return best_description, best_mse

# Example: noisy samples of y = 2x - 0.5x^2 + sin(x) on [0, 10]
np.random.seed(0)
x = np.linspace(0.0, 10.0, num=100)
measurement_noise = np.random.normal(scale=0.1, size=x.shape)
y = 2*x - 0.5*x**2 + np.sin(x) + measurement_noise

# Search the candidate equations for the best fit to the noisy samples
best_description, best_mse = discover_best_equation(x, y)