In [None]:
import numpy as np
from sklearn.preprocessing import RobustScaler
from matplotlib import pyplot as plt
from bo_methods_lib.bo_methods_lib.GPBO_Class_fxns import * #Fix this later
from bo_methods_lib.bo_methods_lib.GPBO_Classes_New import * #Fix this later
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 200
import pygad

In [None]:
def open_file_helper(file_path):
    """
    Opens a pickled results file (.pickle, .pkl, or gzip-compressed .gz)

    Parameters
    ----------
    file_path: str
        The file path of the data

    Returns
    -------
    results: pickled object
        The results stored in the .pickle, .pkl or .gz file

    Raises
    ------
    AssertionError
        If the file path is not a string
    ValueError
        If the file type is not .gz, .pickle or .pkl

    Notes
    -----
    Unpickling executes arbitrary code stored in the file. Only open files
    that come from a trusted source.
    """
    # Imported locally so the helper does not depend on names that happen to
    # leak in through the wildcard imports at the top of the notebook.
    import gzip
    import pickle

    assert isinstance(file_path, str), "file_path must be a string"

    if file_path.endswith((".pickle", ".pkl")):
        opener = open
    elif file_path.endswith(".gz"):
        opener = gzip.open
    else:
        raise ValueError("File type must be .gz, .pickle or .pkl!")

    with opener(file_path, "rb") as fileObj:
        # NOTE(security): pickle.load on an untrusted file can run arbitrary code.
        results = pickle.load(fileObj)

    return results

In [None]:
import pygad
import numpy
import json
import signac
# Case study number used when running the classes in this cell interactively
cs_num = 1
# Case studies 2 and 3 get 10 restarts; every other case study gets 5
num_restarts = 5 if cs_num not in (2, 3) else 10

class GA_run:
    """
    Fits the parameters of a case-study simulator with a pygad genetic algorithm.

    The simulator and experimental data are taken from the stored results of a
    signac job of the "GPBO_Fix" project when one exists, otherwise they are
    generated from scratch.

    Parameters
    ----------
    num_generations: int
        Number of generations passed to pygad
    num_parents_mating: int
        Number of solutions selected as parents in the mating pool
    sol_per_pop: int
        Number of solutions in the population
    cs_num: int
        The case study number (used as "cs_name_val" in the job search criteria)
    seed: int, default 1
        Seed used only when the simulator and experimental data must be
        generated because no stored job results were found
    """

    # Offset added to the SSE in the fitness so that a perfect fit does not divide by zero
    _FITNESS_EPS = 1e-6

    def __init__(self, num_generations, num_parents_mating, sol_per_pop, cs_num, seed=1):
        self.num_generations = num_generations
        self.num_parents_mating = num_parents_mating
        self.sol_per_pop = sol_per_pop
        self.cs_num = cs_num
        self.seed = seed
        # Number of fitness (SSE) evaluations performed so far
        self.count = 0
        self.iter_param_data = []
        self.iter_sse_data = []
        self.sse_list = []
        self.project = signac.get_project("GPBO_Fix")
        # Statepoint filter used to find jobs belonging to this case study
        self.criteria_dict = {
            "cs_name_val": self.cs_num,
            "meth_name_val": 1,
        }
        self.cs_genmeth_dict = {
            "Simple Linear": 1,
            "Muller x0": 2,
            "Muller y0": 2,
            "Yield-Loss": 1,
            "Large Linear": 2,
            "BOD Curve": 1,
            "Log Logistic": 1,
            "2D Log Logistic": 2,
        }

        # Number of x values used to generate experimental data, per case study name
        self.cs_xval_dict = {
            "Simple Linear": 5,
            "Muller x0": 5,
            "Muller y0": 5,
            "Yield-Loss": 10,
            "Large Linear": 5,
            "BOD Curve": 10,
            "Log Logistic": 10,
            "2D Log Logistic": 5,
        }
        simulator, exp_data, tot_runs_cs, ftol = self.__get_simulator_exp_data(cs_num)
        self.simulator = simulator
        self.exp_data = exp_data
        self.tot_runs_cs = tot_runs_cs
        self.ftol = ftol

    def __get_simulator_exp_data(self, cs_num):
        """
        Gets the simulator and experimental data from the job

        Parameters
        ----------
        cs_num: int
            The case study number. Unused here; the search uses self.criteria_dict

        Returns
        -------
        simulator: Simulator
            The simulator object to evaluate
        exp_data: Data
            The experimental data to evaluate
        tot_runs_cs: int
            The total number of runs in the case study
        ftol: float
            The tolerance for the objective function

        Notes
        -----
        The simulator and experimental data is consistent between all methods of a given case study
        """
        # Local import: the notebook never imports os explicitly
        import os

        jobs = sorted(
            self.project.find_jobs(self.criteria_dict), key=lambda job: job._id
        )
        # Keep the job objects (not just their file names) so that the job chosen
        # below is guaranteed to be the one that owns the chosen file.
        jobs_with_results = [
            job for job in jobs if os.path.exists(job.fn("BO_Results.gz"))
        ]
        if len(jobs_with_results) > 0:
            # Use the job whose result file is smallest
            job = min(
                jobs_with_results,
                key=lambda j: os.path.getsize(j.fn("BO_Results.gz")),
            )

            # Open the statepoint of the job
            with open(job.fn("signac_statepoint.json"), "r") as json_file:
                sp_data = json.load(json_file)
            # get number of total runs from statepoint
            tot_runs_cs = sp_data["bo_run_tot"]
            ftol = sp_data["obj_tol"]
            if tot_runs_cs == 1:
                if sp_data["cs_name_val"] in [2, 3] and sp_data["bo_iter_tot"] == 75:
                    tot_runs_cs = 10
                else:
                    tot_runs_cs = 5

            # Open smallest job file
            results = open_file_helper(job.fn("BO_Results.gz"))
            # Get Experimental data and Simulator objects used in problem
            exp_data = results[0].exp_data_class
            simulator = results[0].simulator_class
            # For backwards compatibility with results that still use the old
            # (misspelled) attribute name; skipped when it is absent.
            if hasattr(simulator, "indeces_to_consider"):
                simulator.indices_to_consider = simulator.indeces_to_consider

        else:
            # Set tot_runs cs as 5 as a default
            tot_runs_cs = 5
            # Create simulator and exp Data class objects
            simulator = simulator_helper_test_fxns(
                self.criteria_dict["cs_name_val"], 0, None, self.seed
            )

            # Get case study name from cs number
            cs_name_dict = get_cs_class_from_val(self.criteria_dict["cs_name_val"]).name
            # Set num_x based off cs number (cs_x_dict kept as an alias of cs_xval_dict)
            self.cs_x_dict = self.cs_xval_dict
            self.num_x = self.cs_x_dict[cs_name_dict]
            if self.criteria_dict["cs_name_val"] in [2, 3, 10, 14]:
                gen_meth = Gen_meth_enum(2)
            else:
                gen_meth = Gen_meth_enum(1)
            exp_data = simulator.gen_exp_data(self.num_x, gen_meth, self.seed)
            # Noise level set to 5% of the absolute mean of the experimental y values
            simulator.noise_std = np.abs(np.mean(exp_data.y_vals)) * 0.05
            ftol = 1e-7

        self.simulator = simulator
        self.exp_data = exp_data
        print(self.simulator.seed, self.simulator.noise_std)

        return simulator, exp_data, tot_runs_cs, ftol

    def fitness_func(self, ga_instance, solution, solution_idx):
        """
        Fitness function handed to pygad: larger is better

        Parameters
        ----------
        ga_instance: pygad.GA
            The running GA instance (unused)
        solution: np.ndarray
            The candidate parameter set to evaluate
        solution_idx: int
            Index of the candidate in the population (unused)

        Returns
        -------
        fitness: float
            1/(sse + 1e-6), where sse is the sum of squared differences between
            the experimental y values and the simulator output for the candidate
        """
        self.count += 1
        # Repeat the candidate once per x value so it is evaluated at every experimental x
        soln_repeat = np.vstack([solution] * len(self.exp_data.x_vals))
        solution_data = Data(
            soln_repeat,
            self.exp_data.x_vals,
            None,
            None,
            None,
            None,
            None,
            None,
            self.simulator.bounds_theta_reg,
            self.simulator.bounds_x,
            1,
            1,
        )
        # Second/third arguments are presumably the noise mean and std - TODO confirm
        output = self.simulator.gen_y_data(solution_data, 0, self.simulator.noise_std)
        # Flatten both sides so a (n,) vs (n, 1) mismatch cannot broadcast to (n, n)
        residuals = np.ravel(self.exp_data.y_vals) - np.ravel(output)
        sse = np.sum(residuals**2)

        fitness = 1 / (sse + self._FITNESS_EPS)
        return float(fitness)

    def run(self):
        """
        Runs the genetic algorithm and stores the pygad instance in self.ga_instance

        Returns
        -------
        sse: float
            The SSE of the best solution of the last generation, recovered from
            its fitness value
        """
        # Stop early when the best fitness has not changed for a third of the generations
        saturate = max(1, int(self.num_generations / 3))
        sat_str = "saturate_" + str(saturate)
        # bounds_theta_reg is transposed so that each row is one (low, high) pair
        gene_space = [
            {'low': row[0], 'high': row[1]}
            for row in self.simulator.bounds_theta_reg.T
        ]
        # Use the settings given to this instance rather than module-level globals
        ga_instance = pygad.GA(num_generations=self.num_generations,
                       num_parents_mating=self.num_parents_mating,
                       sol_per_pop=self.sol_per_pop,
                       gene_space=gene_space,
                       num_genes=self.exp_data.get_dim_theta(),
                       fitness_func=self.fitness_func,
                       random_seed=self.simulator.seed,
                       stop_criteria=[sat_str])
        ga_instance.run()
        solution, solution_fitness, solution_idx = ga_instance.best_solution(
            ga_instance.last_generation_fitness
        )
        # Invert fitness = 1/(sse + eps); clamp tiny negative round-off to zero
        sse = max(1 / solution_fitness - self._FITNESS_EPS, 0.0)
        self.ga_instance = ga_instance

        return float(sse)
    

# Default genetic-algorithm settings
# Number of generations.
num_generations = 50
# Number of solutions to be selected as parents in the mating pool.
num_parents_mating = 5
# Number of solutions in the population.
sol_per_pop = 25

# ga_classes = []
# solns = []
# for i in range(num_restarts):
#     garun = GA_run(num_generations, num_parents_mating, sol_per_pop, cs_num)
#     sse = garun.run()
#     ga_classes.append(garun)
#     solns.append(sse)

# soln_best, solution_fitness, solution_idx = best_class.ga_instance.best_solution(best_class.ga_instance.last_generation_fitness)
# sse_method = min(solns)

# print("Min is: ", min(solns), "at index: ", solns.index(min(solns)))
# use_ga = ga_classes[solns.index(min(solns))]
# # use_ga.ga_instance.plot_fitness()
# solution, solution_fitness, solution_idx = use_ga.ga_instance.best_solution(use_ga.ga_instance.last_generation_fitness)
# print(f"Parameters of the best solution : {solution}")
# # print(f"Fitness value of the best solution = {solution_fitness}")
# sse = 1/solution_fitness + 1e-6
# print(f"SSE of the best solution = {sse}")
# # print(f"Index of the best solution : {solution_idx}")
# # if use_ga.ga_instance.best_solution_generation != -1:
# #     print(f"Best fitness value reached after {use_ga.ga_instance.best_solution_generation} generations.")
# print("SSE Evaluations", use_ga.count)

# Saving the GA instance.
# filename = 'genetic' # The filename to which the instance is saved. The name is without extension.
# ga_instance.save(filename=filename)

# Loading the saved GA instance.
# loaded_ga_instance = pygad.load(filename=filename)
# loaded_ga_instance.plot_fitness()

In [None]:
from scipy.optimize import shgo

class SHGO_run:
    """
    Fits the parameters of a case-study simulator with scipy's SHGO global optimizer.

    Parameters
    ----------
    cs_num: int
        The case study number
    """

    def __init__(self, cs_num):
        self.cs_num = cs_num
        self.count = 0
        # Value passed to Gen_meth_enum when generating experimental data, per case study name
        self.cs_genmeth_dict = {
            "Simple Linear": 1,
            "Muller x0": 2,
            "Muller y0": 2,
            "Yield-Loss": 1,
            "Large Linear": 2,
            "BOD Curve": 1,
            "Log Logistic": 1,
            "2D Log Logistic": 2,
        }

        # Number of x values used to generate experimental data, per case study name
        self.cs_xval_dict = {
            "Simple Linear": 5,
            "Muller x0": 5,
            "Muller y0": 5,
            "Yield-Loss": 10,
            "Large Linear": 5,
            "BOD Curve": 10,
            "Log Logistic": 10,
            "2D Log Logistic": 5,
        }
        self.get_cs_class_data(cs_num)

    def get_cs_class_data(self, cs_num):
        """
        Creates the simulator and experimental data for a case study

        Parameters
        ----------
        cs_num: int
            The case study number

        Notes
        -----
        Sets self.simulator, self.exp_data and self.num_genes. The simulator noise
        std is set to 5% of the absolute mean of the experimental y values.
        """
        simulator = simulator_helper_test_fxns(cs_num, 0, None, 1)
        cs_class = get_cs_class_from_val(cs_num)
        gen_meth = Gen_meth_enum(self.cs_genmeth_dict[cs_class.name])
        exp_data = simulator.gen_exp_data(self.cs_xval_dict[cs_class.name],
                                          gen_meth,
                                          1)
        simulator.noise_std = np.abs(np.mean(exp_data.y_vals))*0.05
        self.num_genes = exp_data.get_dim_theta()
        self.simulator = simulator
        self.exp_data = exp_data

    def shgo_scipy_func(self, theta_guess, exp_data, simulator):
        """
        Objective function minimized by SHGO: sum of squared errors

        Parameters
        ----------
        theta_guess: np.ndarray
            The parameter set values to evaluate
        exp_data: Data
            The experimental data to evaluate
        simulator: Simulator
            The simulator object to evaluate

        Returns
        -------
        sse: float
            The sum of squared differences between the experimental data and the simulated data
        """
        # Repeat the theta guess once for each x value so that it can be
        # evaluated at every x value in exp_data using simulator.gen_y_data
        t_guess_repeat = np.repeat(
            theta_guess.reshape(1, -1), exp_data.get_num_x_vals(), axis=0
        )
        # Wrap the repeated guess in an instance of the Data class
        theta_guess_data = Data(
            t_guess_repeat,
            exp_data.x_vals,
            None,
            None,
            None,
            None,
            None,
            None,
            simulator.bounds_theta_reg,
            simulator.bounds_x,
            1,
            simulator.seed,
        )
        # Calculate y values for the guess using the simulator's noise settings
        theta_guess_data.y_vals = simulator.gen_y_data(
            theta_guess_data, simulator.noise_mean, simulator.noise_std
        )

        error = exp_data.y_vals.flatten() - theta_guess_data.y_vals.flatten()

        return np.sum(error**2)

    def run(self):
        """
        Runs SHGO with Sobol sampling and stores the result in self.solution

        Returns
        -------
        sse: float
            The objective value (SSE) at the solution found by SHGO
        """
        # bounds_theta_reg is transposed so that each row is one (low, high) pair
        solution = shgo(
            self.shgo_scipy_func,
            bounds=self.simulator.bounds_theta_reg.T,
            args=(self.exp_data, self.simulator),
            sampling_method="sobol",
        )
        self.solution = solution

        return float(solution.fun)

# shgo_classes = []
# solns = []
# for i in range(num_restarts):
#     SHGO = SHGO_run(cs_num)
#     sse = SHGO.run()
#     shgo_classes.append(SHGO)
#     solns.append(sse)

# use_shgo = shgo_classes[solns.index(min(solns))]
# print(f"Parameters of the best solution : {use_shgo.solution.x}")
# print(f"SSE of the best solution = {use_shgo.solution.fun}")
# print("SSE Evaluations", use_shgo.solution.nfev)
    

In [None]:
from scipy.optimize import shgo
from scipy import optimize

class NM_run:
    """
    Fits the parameters of a case-study simulator with scipy's Nelder-Mead minimizer.

    Parameters
    ----------
    cs_num: int
        The case study number
    """

    def __init__(self, cs_num):
        self.cs_num = cs_num
        self.count = 0
        # Value passed to Gen_meth_enum when generating experimental data, per case study name
        self.cs_genmeth_dict = {
            "Simple Linear": 1,
            "Muller x0": 2,
            "Muller y0": 2,
            "Yield-Loss": 1,
            "Large Linear": 2,
            "BOD Curve": 1,
            "Log Logistic": 1,
            "2D Log Logistic": 2,
        }

        # Number of x values used to generate experimental data, per case study name
        self.cs_xval_dict = {
            "Simple Linear": 5,
            "Muller x0": 5,
            "Muller y0": 5,
            "Yield-Loss": 10,
            "Large Linear": 5,
            "BOD Curve": 10,
            "Log Logistic": 10,
            "2D Log Logistic": 5,
        }
        self.get_cs_class_data(cs_num)

    def get_cs_class_data(self, cs_num):
        """
        Creates the simulator and experimental data for a case study

        Parameters
        ----------
        cs_num: int
            The case study number

        Notes
        -----
        Sets self.simulator, self.exp_data and self.num_genes. The simulator noise
        std is set to 5% of the absolute mean of the experimental y values.
        """
        simulator = simulator_helper_test_fxns(cs_num, 0, None, 1)
        cs_class = get_cs_class_from_val(cs_num)
        gen_meth = Gen_meth_enum(self.cs_genmeth_dict[cs_class.name])
        exp_data = simulator.gen_exp_data(self.cs_xval_dict[cs_class.name],
                                          gen_meth,
                                          1)
        simulator.noise_std = np.abs(np.mean(exp_data.y_vals))*0.05
        self.simulator = simulator
        self.num_genes = exp_data.get_dim_theta()
        self.exp_data = exp_data

    def NM_scipy_func(self, theta_guess, exp_data, simulator):
        """
        Objective function minimized by Nelder-Mead: sum of squared errors

        Parameters
        ----------
        theta_guess: np.ndarray
            The parameter set values to evaluate
        exp_data: Data
            The experimental data to evaluate
        simulator: Simulator
            The simulator object to evaluate

        Returns
        -------
        sse: float
            The sum of squared differences between the experimental data and the simulated data
        """
        # Repeat the theta guess once for each x value so that it can be
        # evaluated at every x value in exp_data using simulator.gen_y_data
        t_guess_repeat = np.repeat(
            theta_guess.reshape(1, -1), exp_data.get_num_x_vals(), axis=0
        )
        # Wrap the repeated guess in an instance of the Data class
        theta_guess_data = Data(
            t_guess_repeat,
            exp_data.x_vals,
            None,
            None,
            None,
            None,
            None,
            None,
            simulator.bounds_theta_reg,
            simulator.bounds_x,
            1,
            simulator.seed,
        )
        # Calculate y values for the guess using the simulator's noise settings
        theta_guess_data.y_vals = simulator.gen_y_data(
            theta_guess_data, simulator.noise_mean, simulator.noise_std
        )

        error = exp_data.y_vals.flatten() - theta_guess_data.y_vals.flatten()

        return np.sum(error**2)

    def run(self):
        """
        Runs Nelder-Mead from a random starting point and stores the result in self.solution

        Returns
        -------
        sse: float
            The objective value (SSE) at the solution found by Nelder-Mead

        Notes
        -----
        The starting point is drawn from numpy's global random state, so results
        differ between calls unless that state is seeded by the caller.
        """
        # Row 0 of bounds_theta_reg holds the lower bounds, row 1 the upper bounds
        lower_bounds = self.simulator.bounds_theta_reg[0]
        upper_bounds = self.simulator.bounds_theta_reg[1]
        theta_start = np.random.uniform(lower_bounds, upper_bounds)

        # Find Nelder-Mead solution; bounds are transposed into (low, high) pairs
        solution = optimize.minimize(
            self.NM_scipy_func,
            theta_start,
            method='Nelder-Mead',
            bounds=self.simulator.bounds_theta_reg.T,
            args=(self.exp_data, self.simulator),
        )
        self.solution = solution

        return float(solution.fun)

# nm_classes = []
# solns = []
# for i in range(num_restarts):
#     neldmead = NM_run(cs_num)
#     sse = neldmead.run()
#     nm_classes.append(neldmead)
#     solns.append(sse)

# use_nm = nm_classes[solns.index(min(solns))]
# print(f"Parameters of the best solution : {use_nm.solution.x}")
# print(f"SSE of the best solution = {use_nm.solution.fun}")
# print("SSE Evaluations", use_nm.solution.nfev)
    

In [None]:
import pandas as pd

# Initialize an empty list to store the results (one dict per case study and method)
results = []

# Define the methods to loop over
methods = ['GA']

# Location of the stored GPBO summary for a case study; {cs_num} is filled in below
gpbo_results_template = (
    '/scratch365/mcarlozo/Toy_Problem/Results_act/cs_name_val_{cs_num}'
    '/ep_enum_val_1/gp_package_gpflow/meth_name_val_in_1_2_3_4_5_6_7/best_results.csv'
)

# Loop over CS numbers and methods
for cs_num in [11, 14, 2, 1, 12, 13, 3, 10]:
    num_restarts = 10 if cs_num in [2, 3] else 5  # Determine the number of restarts for each cs_num
    max_runs = 75 if cs_num in [2, 3] else 50  # Determine the number of runs for each cs_num
    cs_class = get_cs_class_from_val(cs_num)  # Get the CS class
    cs_name = cs_class.name  # Get the name of the CS class

    for method in methods:
        if method == 'GPBO':
            gpbo_df = pd.read_csv(gpbo_results_template.format(cs_num=cs_num), header=0)
            # idxmin returns an index label, so select the row with .loc
            best_row = gpbo_df.loc[gpbo_df['Min Obj Act Cum'].idxmin()]
            sse_method = best_row['Min Obj Act Cum']
            count_method = best_row['Max Evals'] + len(cs_class.idcs_to_consider)*10
            soln_best = best_row['Theta Min Obj']
            method_report = best_row['BO Method']
            # Methods with "Log" in their name store the log of the objective
            if "Log" in method_report:
                sse_method = np.exp(sse_method)
        else:
            # Fail loudly on a typo instead of silently reusing a stale optimizer object
            if method not in ('GA', 'NM', 'SHGO'):
                raise ValueError("Unknown method: " + str(method))

            method_report = method
            classes = []
            solns = []

            for i in range(num_restarts):
                if method == 'GA':
                    classobj = GA_run(max_runs, 5, 25, cs_num)
                elif method == 'NM':
                    classobj = NM_run(cs_num)
                else:
                    classobj = SHGO_run(cs_num)

                sse = classobj.run()
                classes.append(classobj)
                solns.append(sse)

            # Report the restart whose SSE is closest to the median over all restarts
            closest_index = np.argmin(np.abs(np.array(solns) - np.median(solns)))
            best_class = classes[closest_index]

            if method == 'GA':
                soln_best, solution_fitness, solution_idx = best_class.ga_instance.best_solution(
                    best_class.ga_instance.last_generation_fitness
                )
                # NOTE(review): previously min(solns) was reported here while the
                # parameters and evaluation count came from the median run; the SSE
                # now belongs to the same run as the other columns - confirm intent.
                sse_method = solns[closest_index]
                count_method = best_class.count  # Number of evaluations for GA
            else:
                soln_best = best_class.solution.x  # Parameters of the scipy solution
                sse_method = best_class.solution.fun  # SSE of the scipy solution
                count_method = best_class.solution.nfev  # Number of evaluations used by scipy

        # Append the results to the list
        results.append({
            'CS_Name': cs_name,
            'Method': method_report,
            'SSE': sse_method,
            'Loss_Evals': count_method,
            'Best_Solution': soln_best
        })

# Convert the list of results into a pandas DataFrame
df = pd.DataFrame(results)

# Display the DataFrame
print(df)
# df.to_csv('stochastic_res_sob_test.csv', index=False)
