In [1]:
import os
import warnings

import hdbscan
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import umap
from ax.plot.pareto_frontier import plot_pareto_frontier
from ax.plot.pareto_utils import compute_posterior_pareto_frontier
from ax.service.ax_client import AxClient
from ax.service.managed_loop import optimize
from ax.service.utils.instantiation import ObjectiveProperties
from sklearn.metrics import (
    calinski_harabasz_score,
    davies_bouldin_score,
    silhouette_score,
)

from ssl_wafermap.utilities.plotting import create_subplots, mpn65_palette

# Silence every warning for the rest of the session so cell output stays readable.
# NOTE(review): a blanket "ignore" also hides deprecation and numerical
# warnings raised by the libraries imported above — consider narrowing it.
warnings.filterwarnings("ignore")

In [2]:
# Load the pickled prediction table. Everything except the label / wafer-map
# metadata columns is used as the feature matrix for dimensionality reduction.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df = pd.read_pickle("../data/interim/model_preds/SwaV_preds_full.pkl.xz")
cols = df.columns.difference(["failureType", "failureCode", "waferMap"])
# `Index.difference` hands back the remaining column labels in sorted order
# (its default), so the feature columns follow that order, not the file order.
data = df.loc[:, cols].to_numpy()

In [3]:
# Reduce the feature matrix to 50 dimensions before clustering.
# densmap=True enables the density-preserving variant of UMAP and
# dens_lambda weights that density term; random_state fixes the seed.
umap_settings = dict(
    n_components=50,
    n_neighbors=30,
    min_dist=0,
    densmap=True,
    dens_lambda=0.1,
    random_state=0,
)
reducer = umap.UMAP(**umap_settings)
reduced_data = reducer.fit_transform(data)

In [4]:
def hdbscan_evaluation_function(parameterization):
    """Cluster ``reduced_data`` with HDBSCAN and score the resulting partition.

    Parameters
    ----------
    parameterization : dict
        Hyperparameters for one trial. The keys ``min_samples``,
        ``min_cluster_size``, ``cluster_selection_epsilon`` and ``metric``
        are read and forwarded to ``hdbscan.HDBSCAN``.

    Returns
    -------
    dict
        Maps ``n_noise``, ``n_clusters``, ``silhouette``,
        ``calinski_harabasz`` and ``davies_bouldin`` to ``(value, sem)``
        tuples. The SEM is always 0 because the evaluation is deterministic
        for a fixed ``reduced_data``.

    Raises
    ------
    ValueError
        If HDBSCAN finds fewer than two clusters; the three cluster-validity
        scores are undefined in that case.
    """
    clusterer = hdbscan.HDBSCAN(
        min_samples=parameterization.get("min_samples"),
        min_cluster_size=parameterization.get("min_cluster_size"),
        cluster_selection_epsilon=parameterization.get("cluster_selection_epsilon"),
        metric=parameterization.get("metric"),
    )
    clusterer.fit(reduced_data)

    # HDBSCAN labels noise as -1 and clusters as 0..k-1, so max + 1 is the
    # cluster count (0 when every point is noise).
    labels = clusterer.labels_
    is_clustered = labels != -1
    n_clusters = int(labels.max()) + 1
    n_noise = int(np.count_nonzero(~is_clustered))

    # The sklearn scores below need at least two distinct labels. Fail early
    # with an explicit message instead of an opaque error from sklearn.
    if n_clusters < 2:
        raise ValueError(
            f"HDBSCAN produced {n_clusters} cluster(s) for {parameterization}; "
            "at least 2 are required to compute cluster-validity scores"
        )

    # Score only the points that were assigned to a cluster (noise excluded).
    # Note the scores use sklearn's default distance on ``reduced_data``,
    # independent of the HDBSCAN ``metric`` being tuned.
    subset_data, subset_labels = reduced_data[is_clustered], labels[is_clustered]
    silhouette = silhouette_score(subset_data, subset_labels)
    calinski_harabasz = calinski_harabasz_score(subset_data, subset_labels)
    davies_bouldin = davies_bouldin_score(subset_data, subset_labels)

    # (value, SEM) tuples; plain Python numbers keep the payload independent
    # of numpy scalar types.
    return {
        "n_noise": (n_noise, 0),
        "n_clusters": (n_clusters, 0),
        "silhouette": (float(silhouette), 0),
        "calinski_harabasz": (float(calinski_harabasz), 0),
        "davies_bouldin": (float(davies_bouldin), 0),
    }


# Search space for the HDBSCAN hyperparameters explored by the optimizer.
# NOTE(review): the saved log output for this cell reports a min_cluster_size
# range of [5, 100] and the results table contains trials with
# min_cluster_size=5, so the stored outputs were produced with a lower bound
# of 5 rather than the 10 configured here. Re-run to bring them back in sync.
parameters = [
    dict(name="min_samples", type="range", bounds=[1, 60], value_type="int"),
    dict(name="min_cluster_size", type="range", bounds=[10, 100], value_type="int"),
    dict(
        name="cluster_selection_epsilon",
        type="range",
        bounds=[0.1, 1.5],
        value_type="float",
    ),
    # Unordered categorical choice of distance metric
    dict(
        name="metric",
        type="choice",
        values=["euclidean", "manhattan", "canberra", "braycurtis"],
        value_type="str",
        is_ordered=False,
    ),
]


# Multi-objective setup: maximize silhouette and Calinski-Harabasz, minimize
# Davies-Bouldin and the number of points labeled as noise.
moo_objectives = {
    "silhouette": ObjectiveProperties(minimize=False),
    "calinski_harabasz": ObjectiveProperties(minimize=False),
    "davies_bouldin": ObjectiveProperties(minimize=True),
    "n_noise": ObjectiveProperties(minimize=True),
}

# Only the cluster count is constrained (15 to 30 clusters); the noise count
# is handled as an objective above, not as a constraint.
cluster_count_constraints = ["n_clusters <= 30", "n_clusters >= 15"]

# Seeded client so candidate generation is repeatable; any experiment already
# attached to the client is replaced.
ax_client = AxClient(random_seed=0)
ax_client.create_experiment(
    parameters=parameters,
    objectives=moo_objectives,
    outcome_constraints=cluster_count_constraints,
    overwrite_existing_experiment=True,
)

# Ask/tell loop: request a candidate, evaluate it, and report the outcome.
N_TRIALS = 30
for _attempt in range(N_TRIALS):
    parameterization, trial_index = ax_client.get_next_trial()
    try:
        raw_data = hdbscan_evaluation_function(parameterization)
    except ValueError as err:
        # A degenerate clustering (fewer than two clusters, or only noise)
        # makes the cluster-validity scores raise ValueError. Record the trial
        # as failed so one bad candidate does not abort the whole run or leave
        # the trial stuck in a running state.
        print(f"Trial {trial_index} failed and was logged as such: {err}")
        ax_client.log_trial_failure(trial_index=trial_index)
        continue
    ax_client.complete_trial(trial_index=trial_index, raw_data=raw_data)

[INFO 05-19 10:37:22] ax.service.ax_client: Starting optimization with verbose logging. To disable logging, set the `verbose_logging` argument to `False`. Note that float values in the logs are rounded to 6 decimal points.
[INFO 05-19 10:37:22] ax.service.utils.instantiation: Due to non-specification, we will use the heuristic for selecting objective thresholds.
[INFO 05-19 10:37:22] ax.service.utils.instantiation: Created search space: SearchSpace(parameters=[RangeParameter(name='min_samples', parameter_type=INT, range=[1, 60]), RangeParameter(name='min_cluster_size', parameter_type=INT, range=[5, 100]), RangeParameter(name='cluster_selection_epsilon', parameter_type=FLOAT, range=[0.1, 1.5]), ChoiceParameter(name='metric', parameter_type=STRING, values=['euclidean', 'manhattan', 'canberra', 'braycurtis'], is_ordered=False, sort_values=False)], parameter_constraints=[]).
[INFO 05-19 10:37:22] ax.modelbridge.dispatch_utils: Using Bayesian optimization with a categorical kernel for impro

In [112]:
# Fetch the Pareto-optimal trials: {trial index: (parameters, (metric values, _))}
best_param_dict = ax_client.get_pareto_optimal_parameters()

# Flatten each trial into a single record holding its hyperparameters and metrics
summary_table = pd.DataFrame(
    [
        {"Experiment": trial_idx, **trial_params, **metric_values}
        for trial_idx, (trial_params, (metric_values, _)) in best_param_dict.items()
    ]
)

# Cluster and noise counts are whole numbers; show them as integers
int_cols = ["n_clusters", "n_noise"]
summary_table[int_cols] = summary_table[int_cols].astype(int)

param_cols = ["min_samples", "min_cluster_size", "cluster_selection_epsilon", "metric"]
obj_cols = ["n_clusters", "n_noise", "silhouette", "calinski_harabasz", "davies_bouldin"]

param_df = summary_table.loc[:, param_cols]
obj_df = summary_table.loc[:, obj_cols]

# Put the two column groups side by side under a two-level header,
# indexed by the trial ("Experiment") number
df_summary = pd.concat(
    [param_df, obj_df], axis=1, keys=["Hyperparameters", "Objectives"]
)
df_summary.index = summary_table["Experiment"]
df_summary

[INFO 05-19 16:20:00] ax.modelbridge.torch: The observations are identical to the last set of observations used to fit the model. Skipping model fitting.
[INFO 05-19 16:20:00] ax.service.utils.best_point: Using inferred objective thresholds: [ObjectiveThreshold(calinski_harabasz >= 18515.715795196178), ObjectiveThreshold(davies_bouldin <= 0.5655129241684163), ObjectiveThreshold(n_noise <= 2819.0235794422542), ObjectiveThreshold(silhouette >= 0.44177356250043925)], as objective thresholds were not specified as part of the optimization configuration on the experiment.


Unnamed: 0_level_0,Hyperparameters,Hyperparameters,Hyperparameters,Hyperparameters,Objectives,Objectives,Objectives,Objectives,Objectives
Unnamed: 0_level_1,min_samples,min_cluster_size,cluster_selection_epsilon,metric,n_clusters,n_noise,silhouette,calinski_harabasz,davies_bouldin
Experiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0,29,61,0.792271,canberra,15,330,0.464322,30807.981322,0.547937
11,60,60,0.480578,euclidean,16,2239,0.671799,96756.24993,0.416208
15,60,29,0.442924,euclidean,20,2592,0.652893,93385.555137,0.409964
16,60,77,0.447398,euclidean,17,2512,0.683756,107706.481912,0.418814
17,13,41,0.674771,euclidean,15,363,0.570803,27143.14208,0.52684
19,49,41,0.585896,euclidean,15,1355,0.613641,58374.99824,0.398364
21,60,5,0.572264,euclidean,16,1688,0.592933,61005.738192,0.395183
22,48,5,0.68579,euclidean,15,997,0.551328,39512.495621,0.492939
27,60,100,0.448317,euclidean,16,2490,0.689801,113417.404928,0.424819
28,60,46,0.55822,euclidean,15,1722,0.650908,76575.607392,0.372175


In [None]:
# Separate 2-D UMAP projection of the original features, used only for
# plotting (clustering itself runs on the 50-D `reduced_data`).
# NOTE(review): this cell has no execution count in the saved notebook although
# later cells depend on `embedding_2d` — run it before the plotting cells.
reducer_2d = umap.UMAP(n_components=2, random_state=0)
embedding_2d = reducer_2d.fit_transform(data)

In [147]:
# Trial picked by hand from the Pareto table (in the saved run, trial 15 gave
# 20 clusters with 2592 noise points). The index is specific to that run, so
# check it is still Pareto-optimal before using it.
best_idx = 15
if best_idx not in best_param_dict:
    raise KeyError(
        f"Trial {best_idx} is not among the Pareto-optimal trials; "
        f"choose one of {sorted(best_param_dict)}"
    )
best_params = best_param_dict[best_idx][0]

# Refit HDBSCAN on the reduced data with the selected hyperparameters
clusterer = hdbscan.HDBSCAN(**best_params)
clusterer.fit(reduced_data)

In [148]:
# Plotting frame: 2-D coordinates plus labels, wafer maps and cluster ids.
# Cluster ids are stored as a categorical so they are coloured discretely;
# rows are ordered by cluster (noise, labeled -1, sorts first).
emb_df = (
    pd.DataFrame(embedding_2d, columns=["x", "y"])
    .assign(
        failureType=df["failureType"].values,
        waferMap=df["waferMap"].values,
        cluster=pd.Series(clusterer.labels_).astype("category").values,
    )
    .sort_values("cluster")
)

In [168]:
# Scatter plot of the 2-D embedding with one colour per HDBSCAN cluster;
# the noise label (-1) is pinned to light gray, the rest come from the palette.
scatter_options = dict(
    x="x",
    y="y",
    color="cluster",
    color_discrete_map={-1: "lightgray"},
    color_discrete_sequence=mpn65_palette(),
    template="simple_white",
    width=800,
    height=600,
)
fig = px.scatter(emb_df, **scatter_options)
fig.show()