In [24]:
import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, Matern, WhiteKernel
from skopt import gp_minimize
from skopt.space import Real, Categorical
from sklearn.model_selection import KFold
from skopt.utils import use_named_args
from sklearn.exceptions import ConvergenceWarning
import warnings

In [25]:
# Suppress convergence warnings
warnings.filterwarnings('ignore', category=ConvergenceWarning)
warnings.filterwarnings('ignore')

In [26]:
# Function to convert string representation of arrays to numpy arrays
def str_to_array(s):
    return np.fromstring(s[1:-1], sep=' ')

In [27]:
# Evaluation function with k-fold cross validation
def evaluate_model_with_kfold(kernel, X, y, k=5):
    kf = KFold(n_splits=k)
    scores = []
    model = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=15)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))
    return np.mean(scores)

In [28]:
# Optimization function to predict the next query point
def bayesian_optimization(function_name, all_inputs, all_outputs, num_inputs):
    # Define the search space for the model hyperparameters
    space = [
        Real(1e-6, 1e2, name='constant'),
        Real(1e-3, 1e4, name='length_scale'),
        Categorical(['RBF', 'Matern', 'RBF + WhiteKernel'], name='kernel_type')
    ]

    # Function to be maximized
    @use_named_args(space)
    def objective(constant, length_scale, kernel_type):
        if kernel_type == 'RBF':
            kernel = C(constant, (1e-6, 1e2)) * RBF(length_scale, length_scale_bounds=(1e-3, 1e4))
        elif kernel_type == 'Matern':
            kernel = C(constant, (1e-6, 1e2)) * Matern(length_scale, length_scale_bounds=(1e-3, 1e4))
        elif kernel_type == 'RBF + WhiteKernel':
            kernel = C(constant, (1e-6, 1e2)) * RBF(length_scale, length_scale_bounds=(1e-3, 1e4)) + WhiteKernel()
        
        X = all_inputs[function_name]
        y = all_outputs[function_name].ravel()
        
        return -evaluate_model_with_kfold(kernel, X, y, k=5)  # Evaluation for maximization (negative for minimize)

    # Bayesian optimization
    res = gp_minimize(objective, space, n_calls=30, random_state=42)

    # Best kernel found
    best_kernel_type = res.x[2]
    if best_kernel_type == 'RBF':
        best_kernel = C(res.x[0], (1e-6, 1e2)) * RBF(res.x[1], length_scale_bounds=(1e-3, 1e4))
    elif best_kernel_type == 'Matern':
        best_kernel = C(res.x[0], (1e-6, 1e2)) * Matern(res.x[1], length_scale_bounds=(1e-3, 1e4))
    elif best_kernel_type == 'RBF + WhiteKernel':
        best_kernel = C(res.x[0], (1e-6, 1e2)) * RBF(res.x[1], length_scale_bounds=(1e-3, 1e4)) + WhiteKernel()
    
    model = GaussianProcessRegressor(kernel=best_kernel, n_restarts_optimizer=15)
    X = all_inputs[function_name]
    y = all_outputs[function_name].ravel()
    model.fit(X, y)
    
    # Next query point
    x_new = np.random.rand(1, num_inputs) * 0.999999 + 0.000001  # Ensuring that inputs are in the correct range
    y_new = model.predict(x_new)
    
    return x_new, y_new

In [29]:
# Main function to perform Bayesian optimization for all functions
def main_optimization():
    # Load new data from CSV
    new_data_df = pd.read_csv('./data/603_data.csv')

    # Dictionary to define the number of entries for each function
    num_inputs = {
        'f1': 2,
        'f2': 2,
        'f3': 3,
        'f4': 4,
        'f5': 4,
        'f6': 5,
        'f7': 6,
        'f8': 8
    }

    # Initialize dictionaries to store data for all functions
    initial_inputs = {}
    initial_outputs = {}
    new_inputs = {}
    new_outputs = {}
    all_inputs = {}
    all_outputs = {}

    # Dictionary to store the next query points
    next_queries = {}

    # Load data for all functions F1 to F8
    for i in range(1, 9):
        func_name = f'f{i}'
        # Load initial data
        initial_inputs[func_name] = np.load(f'./data/{func_name}/initial_inputs_{func_name}_part1.npy')
        initial_inputs[f'{func_name}_X2'] = np.load(f'./data/{func_name}/initial_inputs_{func_name}_part2.npy')
        initial_inputs[func_name] = np.concatenate((initial_inputs[func_name], initial_inputs[f'{func_name}_X2']), axis=0)
        
        initial_outputs[func_name] = np.load(f'./data/{func_name}/initial_outputs_{func_name}_part1.npy')
        initial_outputs[f'{func_name}_y2'] = np.load(f'./data/{func_name}/initial_outputs_{func_name}_part2.npy')
        initial_outputs[func_name] = np.concatenate((initial_outputs[func_name], initial_outputs[f'{func_name}_y2']), axis=0)

        # Extract and concatenate new inputs
        new_inputs[func_name] = np.array([str_to_array(x) for x in new_data_df[func_name]])
        new_inputs[func_name] = new_inputs[func_name].reshape(-1, num_inputs[func_name])
        all_inputs[func_name] = np.concatenate((initial_inputs[func_name], new_inputs[func_name]), axis=0)

        # Extract and concatenate new outputs
        new_outputs[func_name] = new_data_df[f'{func_name}_output'].values.reshape(-1, 1)
        all_outputs[func_name] = np.concatenate((initial_outputs[func_name].reshape(-1, 1), new_outputs[func_name]), axis=0)

        # Perform Bayesian optimization for the current function
        next_query, predicted_output = bayesian_optimization(func_name, all_inputs, all_outputs, num_inputs[func_name])
        next_queries[func_name] = next_query
        print(f'Next query point for {func_name}: {next_query}')
        print(f'Predicted output for {func_name}: {predicted_output}')

    # Print all next query points at the end
    print("\nNext query points for all functions:")
    for func_name, query in next_queries.items():
        print(f'{func_name}: {query}')

In [30]:
if __name__ == "__main__":
    main_optimization()

Next query point for f1: [[0.96924416 0.05381045]]
Predicted output for f1: [-1.2613171e-107]
Next query point for f2: [[0.27018398 0.01487373]]
Predicted output for f2: [0.06449717]
Next query point for f3: [[0.0524531  0.89853147 0.76727213]]
Predicted output for f3: [-0.05694074]
Next query point for f4: [[0.15577833 0.31252674 0.78923247 0.13429116]]
Predicted output for f4: [-13.8589831]
Next query point for f5: [[0.20806936 0.91323665 0.66136432 0.12284574]]
Predicted output for f5: [66.67435055]
Next query point for f6: [[0.4700897  0.78753621 0.57902848 0.41476095 0.76243346]]
Predicted output for f6: [-1.66769588]
Next query point for f7: [[0.7770324  0.00095601 0.08473906 0.86365527 0.02997492 0.92068437]]
Predicted output for f7: [0.05759545]
Next query point for f8: [[0.15705823 0.95477754 0.40902787 0.23322083 0.84270815 0.81703667
  0.40651175 0.64879555]]
Predicted output for f8: [8.91402779]

Next query points for all functions:
f1: [[0.96924416 0.05381045]]
f2: [[0.270