In [34]:
import pandas as pd
import numpy as np
import os
from IPython.display import Image, display, clear_output
import ipywidgets as widgets

import umap
import hdbscan

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

import plotly.express as px
import plotly.graph_objects as go

# Directory holding the boiling-plot PNGs. The notebook may be launched either from
# a sub-folder (images live one level up) or from the project root, so prefer
# ../data/boiling_plots when it exists and otherwise fall back to data/boiling_plots.
# The chosen path is stored in absolute form.
BOILING_PLOTS_DIR = os.path.abspath(
    os.path.join("..", "data", "boiling_plots")
    if os.path.isdir(os.path.join("..", "data", "boiling_plots"))
    else os.path.join("data", "boiling_plots")
)

def image_path_for_file_name(file_name, plots_dir=None):
    """Return the path of the boiling-plot PNG that belongs to a CSV ``file_name``.

    Lookup order:
      1. ``<plots_dir>/<base>.png`` where ``base`` is ``file_name`` stripped of
         surrounding whitespace and of a trailing ``.csv`` extension.
      2. A PNG in ``plots_dir`` whose stem equals ``base`` (covers ``.PNG`` etc.).
      3. A PNG whose name starts with ``base``.
      4. A PNG whose name contains ``base``.
    Within each fuzzy tier the alphabetically first candidate wins, so the result
    does not depend on directory listing order.

    Parameters
    ----------
    file_name : str
        CSV file name as stored in the feature table.
    plots_dir : str, optional
        Directory to search. Defaults to the module-level ``BOILING_PLOTS_DIR``.

    Returns
    -------
    str or None
        Path to the matching image, or ``None`` when ``file_name`` is not a
        non-empty string, the directory does not exist, or nothing matches.
    """
    if not isinstance(file_name, str):
        return None

    base = file_name.strip()
    # Remove only a trailing extension; ".csv" elsewhere in the name is kept.
    if base.lower().endswith(".csv"):
        base = base[: -len(".csv")]
    # An empty base would be a substring of every file name and match anything.
    if not base:
        return None

    if plots_dir is None:
        plots_dir = BOILING_PLOTS_DIR

    exact = os.path.join(plots_dir, base + ".png")
    if os.path.isfile(exact):
        return exact
    if not os.path.isdir(plots_dir):
        return None

    # Sorted so that the fuzzy fallback is reproducible across runs and machines.
    png_names = sorted(
        name for name in os.listdir(plots_dir) if name.lower().endswith(".png")
    )
    # Tiers from most to least specific; a substring hit must not shadow a better one.
    tiers = (
        lambda name: os.path.splitext(name)[0] == base,
        lambda name: name.startswith(base),
        lambda name: base in name,
    )
    for is_match in tiers:
        for name in png_names:
            if is_match(name):
                return os.path.join(plots_dir, name)
    return None

In [35]:
# Read both feature tables: the current extraction and the earlier ("before") one
df = pd.read_csv("../data/features.csv")
df2 = pd.read_csv("../data/features_before.csv")

# Keep the identifier column aside; every remaining column is passed on as a feature
file_names = df["file_name"]
X, X2 = (frame.drop(columns=["file_name"]) for frame in (df, df2))

In [36]:
# Scale features.
# Each dataset gets its own scaler: calling fit_transform a second time on a shared
# instance refits it, which would leave `scaler` describing X2 only.
scaler = StandardScaler()
scaler2 = StandardScaler()
X_scaled = scaler.fit_transform(X)
X2_scaled = scaler2.fit_transform(X2)

# # Optional: remove near-constant features
# X_scaled = VarianceThreshold(threshold=1e-3).fit_transform(X_scaled)

# Fit UMAP.
# Identical settings for both datasets, but one model per dataset so that
# `umap_model` stays paired with X_umap and `umap_model2` with X2_umap.
UMAP_PARAMS = dict(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    metric="euclidean",
    random_state=42,  # fixed seed for a repeatable embedding
)
umap_model = umap.UMAP(**UMAP_PARAMS)
umap_model2 = umap.UMAP(**UMAP_PARAMS)

X_umap = umap_model.fit_transform(X_scaled)
X2_umap = umap_model2.fit_transform(X2_scaled)



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [37]:
# Fit HDBSCAN on each 2-D embedding.
# Best result so far: min_cluster_size=17, min_samples=6
HDBSCAN_PARAMS = dict(
    min_cluster_size=17,  # larger -> fewer, more conservative clusters
    min_samples=6,        # larger -> more points labeled as noise
    metric="euclidean",
)
# One clusterer per dataset: refitting a shared instance would overwrite its
# fitted state, leaving `clusterer` describing X2_umap only.
clusterer = hdbscan.HDBSCAN(**HDBSCAN_PARAMS)
clusterer2 = hdbscan.HDBSCAN(**HDBSCAN_PARAMS)

labels = clusterer.fit_predict(X_umap)
labels2 = clusterer2.fit_predict(X2_umap)

# Attach cluster labels
df["cluster"] = labels
df2["cluster"] = labels2

In [None]:
# Interactive view of the current features: hovering shows file_name, clicking a
# point opens the corresponding boiling plot image in the output pane below.
plot_df = pd.DataFrame(
    {"UMAP-1": X_umap[:, 0], "UMAP-2": X_umap[:, 1]}
).assign(cluster=labels.astype(str), file_name=file_names.values)

# Cluster labels are strings so that each cluster is drawn as a discrete colour.
fig_px = px.scatter(
    plot_df,
    x="UMAP-1",
    y="UMAP-2",
    color="cluster",
    hover_data=["file_name"],
    title="UMAP + HDBSCAN on Accelerometer Feature Vectors",
).update_layout(width=800, height=600)

# FigureWidget is needed for Python-side click callbacks; the Output widget hosts the image.
fig = go.FigureWidget(fig_px)
img_output = widgets.Output()

def on_click(trace, points, selector):
    """Show the boiling plot image for the clicked point inside ``img_output``.

    Parameters
    ----------
    trace : plotly trace
        The trace this callback is registered on (one trace per cluster).
    points : plotly Points
        Click payload; ``points.point_inds`` are positions within ``trace``.
    selector : unused
        Required by the plotly callback signature.
    """
    # The callback can arrive with no points for this trace; in that case leave
    # whatever is currently shown untouched instead of scheduling a clear.
    if not points.point_inds:
        return
    idx = points.point_inds[0]

    with img_output:
        clear_output(wait=True)
        # customdata is list of [file_name] per point (from hover_data)
        cd = getattr(trace, "customdata", None)
        if cd is not None and idx < len(cd):
            fn = cd[idx][0]
        else:
            # idx is relative to this trace, not to plot_df as a whole, so restrict
            # the lookup to the rows of this trace's cluster (trace.name is the label).
            in_trace = plot_df.loc[plot_df["cluster"] == str(trace.name), "file_name"]
            if idx >= len(in_trace):
                print("No file_name available for the clicked point")
                return
            fn = in_trace.iloc[idx]
        path = image_path_for_file_name(fn)
        print(f"file_name: {fn}")
        if path and os.path.isfile(path):
            display(Image(filename=path))
        else:
            print(f"No image found for: {fn}")

# px creates one trace per cluster, so the handler has to be attached to each of them;
# afterwards show the figure with the image pane stacked underneath.
for trace in fig.data:
    trace.on_click(on_click)
display(widgets.VBox(children=[fig, img_output]))


VBox(children=(FigureWidget({
    'data': [{'customdata': array([['Boiling_at_Heater_T85_RefVideo1.csv'],
    …

In [39]:
# Interactive view of the previous feature set: hovering shows file_name, clicking a
# point opens the corresponding boiling plot image in the output pane below.
plot_df2 = pd.DataFrame(
    {"UMAP-1": X2_umap[:, 0], "UMAP-2": X2_umap[:, 1]}
).assign(cluster=labels2.astype(str), file_name=df2["file_name"].values)

# Cluster labels are strings so that each cluster is drawn as a discrete colour.
fig2_px = px.scatter(
    plot_df2,
    x="UMAP-1",
    y="UMAP-2",
    color="cluster",
    hover_data=["file_name"],
    title="UMAP + HDBSCAN on Previous Accelerometer Feature Vectors",
).update_layout(width=800, height=600)

# FigureWidget is needed for Python-side click callbacks; the Output widget hosts the image.
fig2 = go.FigureWidget(fig2_px)
img_output2 = widgets.Output()

def on_click2(trace, points, selector):
    """Show the boiling plot image for the clicked point inside ``img_output2``.

    Parameters
    ----------
    trace : plotly trace
        The trace this callback is registered on (one trace per cluster).
    points : plotly Points
        Click payload; ``points.point_inds`` are positions within ``trace``.
    selector : unused
        Required by the plotly callback signature.
    """
    # The callback can arrive with no points for this trace; in that case leave
    # whatever is currently shown untouched instead of scheduling a clear.
    if not points.point_inds:
        return
    idx = points.point_inds[0]

    with img_output2:
        clear_output(wait=True)
        # customdata holds one [file_name] entry per point (from hover_data)
        cd = getattr(trace, "customdata", None)
        if cd is not None and idx < len(cd):
            fn = cd[idx][0]
        else:
            # idx is relative to this trace, not to plot_df2 as a whole, so restrict
            # the lookup to the rows of this trace's cluster (trace.name is the label).
            in_trace = plot_df2.loc[plot_df2["cluster"] == str(trace.name), "file_name"]
            if idx >= len(in_trace):
                print("No file_name available for the clicked point")
                return
            fn = in_trace.iloc[idx]
        path = image_path_for_file_name(fn)
        print(f"file_name: {fn}")
        if path and os.path.isfile(path):
            display(Image(filename=path))
        else:
            print(f"No image found for: {fn}")

# px creates one trace per cluster, so the handler has to be attached to each of them;
# afterwards show the figure with the image pane stacked underneath.
for trace in fig2.data:
    trace.on_click(on_click2)
display(widgets.VBox(children=[fig2, img_output2]))


VBox(children=(FigureWidget({
    'data': [{'customdata': array([['MATLAB 3-53 PM Mon, Aug 26, 2024 Run4 .csv'…