In [1]:
# BO Baseline
# Define an acquisition function
import os
import time
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, RBF

def acquisition(X, model, scaler, epsilon=0.01):
    """Return the negated optimistic estimate ``-(mean + epsilon * std)`` for X.

    Parameters:
        X: candidate inputs; they are passed through ``scaler.transform``
            before being handed to the model.
        model: object whose ``predict(..., return_std=True)`` yields a
            (prediction, standard deviation) pair.
        scaler: fitted transformer applied to X prior to prediction.
        epsilon: weight given to the standard deviation term.

    Returns:
        The element-wise value ``-(prediction + epsilon * std_dev)``.
    """
    X_scaled = scaler.transform(X)
    mean, spread = model.predict(X_scaled, return_std=True)
    optimistic = mean + epsilon * spread
    return -optimistic

# Load the data
file_path = os.path.join('Data', 'numeric_data.csv')
data = pd.read_csv(file_path)

# Extract features and target
features = ["Powderkg", "Liquidkg", "WC", "Fly_Ash_ratio", "GGBFS_ratio", "temperature"]
target = "fc_28dGroundTruth"

# Initialize kernel and GP model
#kernel = ConstantKernel(1.0, (1e-3, 1e3)) * Matern(length_scale=10, nu=1.5)
kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
gpr = GaussianProcessRegressor(kernel=kernel)

timestamp = str(int(time.time()))

# NOTE(review): desired_strength, targ_quant and budget are not defined in this
# cell; they are presumably set by an earlier cell -- confirm before running on
# a fresh kernel.

initial_sample_size = 4

# Rows that may seed an experiment: every row whose target does not already
# reach desired_strength. Sampling from this subset yields the same
# distribution as redrawing from the full table until no seed reaches the
# target, but it cannot loop forever; it raises ValueError when fewer than
# initial_sample_size such rows exist.
eligible_initial = data[~(data[target] >= desired_strength)]

for experiment in range(30):
    print(f"Experiment: {experiment+1}")

    # Randomly sample the initial training set
    train_data = eligible_initial.sample(n=initial_sample_size)

    # All other data is the candidate pool
    test_data = data.drop(train_data.index)

    # Fresh scalers for every experiment; they are refitted on each iteration
    X_scaler = StandardScaler()
    y_scaler = MinMaxScaler()

    for _ in range(10):
        # Nothing left to propose
        if test_data.empty:
            break

        # Fit the scalers on the current training set
        X_train = X_scaler.fit_transform(train_data[features])
        # Flatten the scaled target to 1-D so the predicted mean and standard
        # deviation have the same shape (a 2-D target can give a column-shaped
        # mean depending on the scikit-learn version, which would broadcast
        # against the 1-D std).
        y_train = y_scaler.fit_transform(train_data[target].values.reshape(-1, 1)).ravel()

        # Train GP model
        gpr.fit(X_train, y_train)

        # acquisition() applies X_scaler itself, so it must receive the raw
        # feature columns; passing already-scaled values would standardise
        # them twice.
        acq_values = -acquisition(test_data[features], gpr, X_scaler)  # negated back: we maximise mean + epsilon * std

        # Select the candidate that maximises the acquisition value
        max_acq_index = np.argmax(acq_values)

        # Map the positional winner back to its DataFrame index label
        original_index = test_data.index[max_acq_index]

        # Move the chosen row from the candidate pool to the training data
        train_data = pd.concat([train_data, test_data.loc[[original_index]]])
        test_data = test_data.drop(original_index)

        # If we have found a material with the desired strength, we stop sampling
        if train_data.loc[original_index, target] >= desired_strength:
            break

    filename = f"results/BO/experiment_{experiment+1}_BO_initialsample_{initial_sample_size}_target_{int(targ_quant.value)}_%_Dev_Budget_{budget}_{timestamp}.csv"
    # to_csv does not create missing directories
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    train_data.to_csv(filename, index=False)


NameError: name 'os' is not defined

In [None]:
#RF Baseline

from sklearn.preprocessing import StandardScaler
from lolopy.learners import RandomForestRegressor
import numpy as np
import pandas as pd


def acquisition(X, model, scaler):
    '''Score each row of X as the model prediction plus its reported uncertainty.

    X is handed to ``model.predict`` exactly as given; ``scaler`` is accepted
    but not used inside this function.
    '''
    mean, uncertainty = model.predict(X, return_std=True)
    optimistic = mean + uncertainty
    return optimistic

def run_experiment(data, features, target, desired_strength, experiment_num=30, initial_sample_size=4, iteration_num=10):
    """Run repeated random-forest-guided sampling experiments and save each one as CSV.

    For every experiment a random initial training set is drawn from ``data``.
    Then, for up to ``iteration_num`` rounds, the forest is refitted and the
    remaining row with the highest acquisition value is moved into the
    training set. Sampling stops early once a newly added row reaches
    ``desired_strength``.

    Parameters:
        data: DataFrame holding the feature columns and the target column.
        features: list of feature column names.
        target: name of the target column.
        desired_strength: target value at which an experiment stops.
        experiment_num: number of independent experiments.
        initial_sample_size: number of rows drawn for the initial training set.
        iteration_num: maximum number of rows added per experiment.

    Side effects:
        Writes one CSV per experiment under ``results/RF/``. The file name is
        built from the module-level names ``targ_quant``, ``budget`` and
        ``timestamp``, which must exist when this function is called.
    """
    # Initialize Random Forest model
    dtr = RandomForestRegressor()

    # One scaler object, refitted on the training set at every iteration
    X_scaler = StandardScaler()

    for experiment in range(experiment_num):
        print(f"Experiment: {experiment+1}")
        # Randomly sample initial training set
        # NOTE(review): the BO baseline cell excludes initial points that
        # already reach desired_strength; this function does not -- confirm
        # that the difference is intended.
        train_data = data.sample(n=initial_sample_size)

        # All other data is the candidate pool
        test_data = data.drop(train_data.index)

        for _ in range(iteration_num):
            # Nothing left to propose
            if test_data.empty:
                break

            # Fit the scaler on the current training set
            X_train = X_scaler.fit_transform(train_data[features])
            y_train = train_data[target].values
            # With fewer than 8 rows, repeat the training set so the model sees
            # at least 8 (presumably a minimum required by the forest
            # implementation -- TODO confirm)
            if X_train.shape[0] < 8:
                repeat_times = 8 // X_train.shape[0] + 1
                X_train = np.tile(X_train, (repeat_times, 1))
                y_train = np.tile(y_train, repeat_times)
            # Train RF model
            dtr.fit(X_train, y_train)

            # Score the candidates; acquisition() here does not rescale, so the
            # features are scaled before the call
            X_test = X_scaler.transform(test_data[features])
            acq_values = acquisition(X_test, dtr, X_scaler)

            # Select the candidate that maximises the acquisition value and map
            # its position back to the DataFrame index label
            max_acq_index = np.argmax(acq_values)
            original_index = test_data.index[max_acq_index]

            # Move the chosen row from the candidate pool to the training data
            train_data = pd.concat([train_data, test_data.loc[[original_index]]])
            test_data = test_data.drop(original_index)

            # If we have found a material with the desired strength, we stop sampling
            if train_data.loc[original_index, target] >= desired_strength:
                break
        filename = f"results/RF/experiment_{experiment+1}_RF_initialsample_{initial_sample_size}_target_{int(targ_quant.value)}_%_Dev_Budget_{budget}_{timestamp}.csv"
        # to_csv does not create missing directories
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        train_data.to_csv(filename, index=False)
        
        
# Read the numeric dataset from the Data directory
file_path = os.path.join('Data', 'numeric_data.csv')
data = pd.read_csv(file_path)

# Target column and the feature columns used by the model
target = "fc_28dGroundTruth"
features = [
    "Powderkg",
    "Liquidkg",
    "WC",
    "Fly_Ash_ratio",
    "GGBFS_ratio",
    "temperature",
]

# Epoch-second tag that run_experiment reads when naming its result files
timestamp = f"{int(time.time())}"
run_experiment(
    data=data,
    features=features,
    target=target,
    desired_strength=desired_strength,
    experiment_num=30,
    initial_sample_size=4,
    iteration_num=10,
)
                

In [3]:
import os
import time
import pandas as pd
import numpy as np
# Random Draw 
def run_experiment(data, features, target, desired_strength, experiment_num=30, initial_sample_size=0, iteration_num=10):
    """Run repeated random-draw sampling experiments and save each one as CSV.

    For every experiment, ``initial_sample_size`` rows are drawn as the
    starting set. Then, for up to ``iteration_num`` rounds, one remaining row
    is picked uniformly at random and added. Sampling stops early once a newly
    added row reaches ``desired_strength``.

    Parameters:
        data: DataFrame holding at least the target column.
        features: accepted so the signature matches the other baselines; not
            used by the random draw.
        target: name of the target column.
        desired_strength: target value at which an experiment stops.
        experiment_num: number of independent experiments.
        initial_sample_size: number of rows drawn for the starting set.
        iteration_num: maximum number of rows added per experiment.

    Side effects:
        Writes one CSV per experiment under ``results/RP/``. The file name is
        built from the module-level names ``targ_quant``, ``budget`` and
        ``timestamp``, which must exist when this function is called.
    """
    # No model or scaler is built here: the random draw never fits anything,
    # and constructing them made this cell depend on imports from other cells.
    for experiment in range(experiment_num):
        print(f"Experiment: {experiment+1}")
        # Randomly sample the starting set
        train_data = data.sample(n=initial_sample_size)

        # All other data is the candidate pool
        test_data = data.drop(train_data.index)

        for _ in range(iteration_num):
            # Nothing left to draw from
            if test_data.empty:
                break

            # Randomly select a row from the candidate pool
            original_index = np.random.choice(test_data.index)

            # Move the chosen row from the candidate pool to the training data
            train_data = pd.concat([train_data, test_data.loc[[original_index]]])
            test_data = test_data.drop(original_index)

            # If we have found a material with the desired strength, we stop sampling
            if train_data.loc[original_index, target] >= desired_strength:
                break
        filename = f"results/RP/experiment_{experiment+1}_RP_target_{int(targ_quant.value)}_%_Dev_Budget_{budget}_{timestamp}.csv"
        # to_csv does not create missing directories
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        train_data.to_csv(filename, index=False)
        
        
# Read the numeric dataset from the Data directory
file_path = os.path.join('Data', 'numeric_data.csv')
data = pd.read_csv(file_path)

# Target column and the feature columns passed to run_experiment
target = "fc_28dGroundTruth"
features = [
    "Powderkg",
    "Liquidkg",
    "WC",
    "Fly_Ash_ratio",
    "GGBFS_ratio",
    "temperature",
]

# Epoch-second tag that run_experiment reads when naming its result files
timestamp = f"{int(time.time())}"
run_experiment(
    data=data,
    features=features,
    target=target,
    desired_strength=desired_strength,
    experiment_num=30,
    initial_sample_size=0,
    iteration_num=10,
)
                

FileNotFoundError: [Errno 2] No such file or directory: 'Data/numeric_data.csv'