In [15]:
import torch
import numpy as np
import pandas as pd
from typing import List, Optional, Tuple
import logging
import os
import time
from itertools import product
import toml

from botorch.models import MixedSingleTaskGP
from botorch.fit import fit_gpytorch_model
from gpytorch.mlls import ExactMarginalLogLikelihood
from botorch.acquisition import ExpectedImprovement, UpperConfidenceBound
from botorch.optim import optimize_acqf_mixed
from botorch.models.transforms import Normalize, Standardize

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler


In [16]:
from endure.lsm.cost import EndureCost
from endure.lsm.types import LSMDesign, System, Policy, Workload, LSMBounds
from endure.lcm.data.generator import KHybridGenerator

In [30]:
def initialize_feature_list(bounds: "LSMBounds", num_k: Optional[int] = None) -> List[dict]:
    """Enumerate fixed-feature dictionaries for the integer design columns.

    Every returned dictionary maps a design-vector column index to a value:
    column 1 receives a size ratio ``t`` and columns ``2 .. num_k + 1`` each
    receive one value from ``1 .. upper - 1``. One dictionary is produced per
    combination, in the order ``itertools.product`` yields them.

    Args:
        bounds: Object exposing ``size_ratio_range`` as an integer
            ``(lower, upper)`` pair.
        num_k: Number of K columns to enumerate. When omitted, the
            module-level ``num_k_values`` is used.

    Returns:
        A list of ``{column_index: value}`` dictionaries.

    Note:
        The list has ``(upper - lower + 1) * (upper - 1) ** num_k`` entries.
        This grows exponentially with ``num_k``; large values of ``num_k``
        cannot be enumerated in practice.
    """
    if num_k is None:
        num_k = num_k_values
    lower_t_bound = bounds.size_ratio_range[0]
    upper_t_bound = bounds.size_ratio_range[1]
    k_choices = range(1, upper_t_bound)
    # Start at the configured lower bound rather than a hard-coded 2 so the
    # enumeration covers exactly the configured size-ratio range.
    return [
        {1: t, **{position + 2: k for position, k in enumerate(combination)}}
        for t in range(lower_t_bound, upper_t_bound + 1)
        for combination in product(k_choices, repeat=num_k)
    ]
        

In [31]:
# Directory holding the parquet tables that are loaded below.
directory_path = '/data_comp/train-data/lcm/std/kcost'

# os.listdir() returns entries in arbitrary order. Sorting makes the
# concatenated frame, and therefore the 1000-row subset taken below,
# identical on every run and every machine.
parquet_files = sorted(
    os.path.join(directory_path, f)
    for f in os.listdir(directory_path)
    if f.endswith('0000.parquet')
)
if not parquet_files:
    raise FileNotFoundError(
        f"No files ending in '0000.parquet' found in {directory_path}"
    )

df = pd.concat([pd.read_parquet(file) for file in parquet_files], ignore_index=True)
# Total cost is the sum of the four per-query-type cost columns.
df['total_cost'] = df['z0_cost'] + df['z1_cost'] + df['q_cost'] + df['w_cost']
# Keep only the first 1000 rows for the rest of the notebook.
df = df.head(1000)

In [32]:
# List the column names of the loaded frame so the workload and design
# columns selected in the next cell can be checked against what is available.
print(df.columns)

Index(['z0_cost', 'z1_cost', 'q_cost', 'w_cost', 'z0', 'z1', 'q', 'w',
       'entry_p_page', 'selec', 'entry_size', 'max_h', 'num_elem', 'h', 'T',
       'K_0', 'K_1', 'K_2', 'K_3', 'K_4', 'K_5', 'K_6', 'K_7', 'K_8', 'K_9',
       'K_10', 'K_11', 'K_12', 'K_13', 'K_14', 'K_15', 'K_16', 'K_17', 'K_18',
       'K_19', 'total_cost'],
      dtype='object')


In [33]:
# Number of K columns (K_0 .. K_19) in the design vector.
# num_k_values = config["job"]["BayesianOptimization"]["num_k_values"]
num_k_values = 20

# Column groups: workload mix on one side, design knobs on the other.
workload_columns = ["z0", "z1", "q", "w"]
design_columns = ["h", "T", *(f"K_{level}" for level in range(num_k_values))]

# Raw numpy views of the frame.
X_workload = df[workload_columns].values
X = df[design_columns].values

# Min-max scale the design matrix column-wise and move it to float64 tensors.
scaler = MinMaxScaler()
X_scaled = torch.tensor(scaler.fit_transform(X), dtype=torch.float64)
# The target gets a trailing singleton dimension (n -> n x 1).
y = torch.tensor(df["total_cost"].values, dtype=torch.float64).unsqueeze(-1)

# Project configuration: design-space bounds and the cost model.
with open("endure.toml") as fid:
    config = toml.load(fid)
def_bounds = LSMBounds(**config["lsm"]["bounds"])
cf: EndureCost = EndureCost(def_bounds.max_considered_levels)

In [34]:
# Group the workload vectors into clusters; a separate surrogate model is
# fitted for each cluster further down.
num_clusters = 5
# n_init is pinned to 10, the value the implicit default resolved to when
# this cell emitted its FutureWarning. Pinning it keeps the clustering stable
# once scikit-learn changes the default to 'auto', and silences the warning.
kmeans = KMeans(n_clusters=num_clusters, random_state=0, n_init=10).fit(X_workload)
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

  super()._check_params_vs_input(X, default_n_init=10)


In [35]:
def def_mixed_gp_model(train_X, train_Y, cat_dims, bounds):
    """Build and fit a ``MixedSingleTaskGP`` on one set of observations.

    Args:
        train_X: Tensor of design points; its second dimension is the number
            of design columns. Expected to be in the same units as ``bounds``.
        train_Y: Tensor of observed costs with a single outcome column.
        cat_dims: Column indices of ``train_X`` treated as categorical.
        bounds: Lower/upper limits handed to the ``Normalize`` input
            transform (presumably ``2 x d`` — confirm against the caller).

    Returns:
        The fitted model.
    """
    # Outcome scaling is left to the Standardize transform alone. Z-scoring
    # train_Y by hand as well would put the posterior in z-score units, which
    # no longer match a best_f taken from the raw costs, and would yield NaNs
    # for a cluster containing a single observation.
    gp = MixedSingleTaskGP(
        train_X,
        train_Y,
        cat_dims=cat_dims,
        input_transform=Normalize(d=train_X.shape[1], bounds=bounds),
        outcome_transform=Standardize(m=1),
    )
    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    fit_gpytorch_model(mll)
    return gp

In [36]:
def gen_bounds() -> torch.Tensor:
    """Return the design-space bounds as a ``2 x d`` float64 tensor.

    Row 0 holds the lower limits and row 1 the upper limits. Columns are
    ordered as bits-per-element, size ratio, then ``num_k_values`` K entries
    ranging from 1 to the size-ratio upper limit minus one. All values come
    from the module-level ``def_bounds`` and ``num_k_values``.
    """
    h_low, h_high = def_bounds.bits_per_elem_range[0], def_bounds.bits_per_elem_range[1]
    t_low, t_high = def_bounds.size_ratio_range[0], def_bounds.size_ratio_range[1]

    lower_limits = [h_low, t_low] + [1] * num_k_values
    upper_limits = [h_high, t_high] + [t_high - 1] * num_k_values
    return torch.tensor([lower_limits, upper_limits], dtype=torch.float64)

In [37]:
# Fit one surrogate model per workload cluster.
gp_models = []

# Bounds and categorical columns are identical for every cluster, so they are
# computed once instead of once per iteration.
bounds = gen_bounds()

# Train on the raw design values. The model's Normalize transform is built
# from raw-unit bounds and the acquisition search proposes raw integer values
# for the categorical columns; inputs that were already min-max scaled would
# never match those values.
X_design = torch.tensor(X, dtype=torch.float64)

# Every design column except column 0 (h) is treated as categorical.
cat_dims = list(range(1, X_design.shape[1]))

for i in range(num_clusters):
    cluster_indices = np.where(labels == i)[0]
    X_cluster_design = X_design[cluster_indices]
    y_cluster = y[cluster_indices]
    # NOTE: best_y is overwritten on every iteration, so after the loop it
    # holds the minimum of the last cluster only.
    best_y = y_cluster.min().item()
    gp_model = def_mixed_gp_model(X_cluster_design, y_cluster, cat_dims, bounds)
    gp_models.append(gp_model)



In [38]:
def suggest_design_for_workload(
    workload,
    centroids,
    gp_models,
    bounds,
    best_f: Optional[float] = None,
    fixed_features_list: Optional[List[dict]] = None,
):
    """Propose one design for ``workload`` using its nearest cluster's model.

    Args:
        workload: Object exposing ``z0``, ``z1``, ``q`` and ``w`` attributes.
        centroids: Array of cluster centres, one row per cluster, in the same
            four-column workload space.
        gp_models: Fitted models, indexed by cluster.
        bounds: Search-space limits. BoTorch expects ``2 x d``; a ``d x 2``
            tensor is transposed and the dtype/device is matched to the model.
        best_f: Incumbent value for Expected Improvement. When omitted it is
            taken from the selected model's own training targets, mapped back
            through its outcome transform so it is on the posterior's scale.
        fixed_features_list: Fixed-feature dictionaries to search over. When
            omitted, ``initialize_feature_list(def_bounds)`` is used.
            NOTE(review): that default enumerates every combination of the K
            columns and is only tractable for small ``num_k_values``.

    Returns:
        The suggested design as a numpy array.

    Raises:
        ValueError: If the fixed-feature list is empty.
    """
    workload_array = np.array([workload.z0, workload.z1, workload.q, workload.w])
    distances = np.linalg.norm(centroids - workload_array, axis=1)
    closest_cluster = int(np.argmin(distances))
    gp_model = gp_models[closest_cluster]
    gp_model.eval()

    # Match the bounds to the model's dtype/device and to the 2 x d layout.
    train_X = gp_model.train_inputs[0]
    bounds = torch.as_tensor(bounds).to(train_X)
    if bounds.ndim == 2 and bounds.shape[0] != 2 and bounds.shape[1] == 2:
        bounds = bounds.T

    if best_f is None:
        # Use the incumbent of the selected cluster instead of a notebook
        # global, which would only describe whichever cluster was fitted last.
        with torch.no_grad():
            observed = gp_model.train_targets.unsqueeze(-1)
            outcome_transform = getattr(gp_model, "outcome_transform", None)
            if outcome_transform is not None:
                observed, _ = outcome_transform.untransform(observed)
            best_f = observed.min().item()

    if fixed_features_list is None:
        fixed_features_list = initialize_feature_list(def_bounds)
    if not fixed_features_list:
        raise ValueError("fixed_features_list must contain at least one entry")

    acqf = ExpectedImprovement(model=gp_model, best_f=best_f, maximize=False)
    # No torch.no_grad() here: the acquisition optimizer differentiates the
    # acquisition value with respect to the candidate.
    candidate, acq_value = optimize_acqf_mixed(
        acq_function=acqf,
        bounds=bounds,
        q=1,
        num_restarts=5,
        raw_samples=20,
        fixed_features_list=fixed_features_list,
    )

    design = candidate.detach().squeeze().cpu().numpy()
    return design

In [None]:
def generate_designs(generator, bounds, centroids, gp_models, num_designs=5):
    """Sample workloads and collect one suggested design for each.

    Args:
        generator: Object providing ``_sample_workload(4)``, whose result is
            unpacked into the four workload components.
        bounds: Search-space limits forwarded to
            ``suggest_design_for_workload``.
        centroids: Cluster centres forwarded to
            ``suggest_design_for_workload``.
        gp_models: Per-cluster models forwarded to
            ``suggest_design_for_workload``.
        num_designs: How many workloads to sample (defaults to 5).

    Returns:
        List with one suggested design per sampled workload. Each design is
        also printed as it is produced.
    """
    designs = []
    for _ in range(num_designs):
        z0, z1, q, w = generator._sample_workload(4)
        workload = Workload(z0=z0, z1=z1, q=q, w=w)
        new_design = suggest_design_for_workload(workload, centroids, gp_models, bounds)
        designs.append(new_design)
        print(new_design)
    return designs

generator = KHybridGenerator(def_bounds)
# The acquisition optimizer expects bounds laid out as 2 x d (row 0 lower,
# row 1 upper) in the same dtype as the float64 models. gen_bounds() builds
# exactly that tensor from the same limits; a transposed d x 2 or float32
# tensor does not match.
bounds = gen_bounds()
designs = generate_designs(generator, bounds, centroids, gp_models)

# NOTE(review): neither check_identical_designs nor idxs is defined in the
# cells shown in this notebook; both must exist in the kernel for this call.
check_identical_designs(designs, idxs)


In [None]:
generator = KHybridGenerator(def_bounds)
# Build the 2 x d float64 bounds with gen_bounds() rather than by hand; the
# hand-written tensor literal in this cell was never closed, so the cell
# could not be parsed.
bounds = gen_bounds()
designs = generate_designs(generator, bounds, centroids, gp_models)

# Check for identical designs (assuming the function check_identical_designs is defined elsewhere)
# NOTE(review): idxs is not defined in the cells shown in this notebook either.
check_identical_designs(designs, idxs)