In [1]:
import pandas as pd
import numpy as np
import os
import sys
from io import BytesIO
from IPython.display import Image, display, clear_output
import ipywidgets as widgets
import matplotlib.pyplot as plt

import umap
import hdbscan

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

import plotly.express as px
import plotly.graph_objects as go

# Allow importing from visuals (project root)
sys.path.insert(0, os.path.abspath(".."))
from visuals.visualization_fixed import visualize_csv_data as visualize_csv_data_fixed

# Boiling-plot image folder. The first candidate that exists wins; when none
# exists the last candidate ("data/boiling_plots") is used as-is. The result is
# always stored as an absolute path.
BOILING_PLOTS_DIR = os.path.abspath(
    next(
        (
            candidate
            for candidate in (
                os.path.join("..", "visuals", "boiling_plots"),
                os.path.join("visuals", "boiling_plots"),
                os.path.join("..", "data", "boiling_plots"),
            )
            if os.path.isdir(candidate)
        ),
        os.path.join("data", "boiling_plots"),
    )
)

# CSV data folder (used to generate the fixed y-axis plot on click).
# Absolute path of the first existing candidate, or None when neither exists.
CSV_DIR = next(
    (
        os.path.abspath(candidate)
        for candidate in (
            os.path.join("..", "data", "CSV"),
            os.path.join("data", "CSV"),
        )
        if os.path.isdir(candidate)
    ),
    None,
)

def image_path_for_file_name(file_name, plots_dir=None):
    """Return the path of the boiling-plot PNG that belongs to a CSV ``file_name``.

    Lookup order (the first hit wins, directory entries are scanned in sorted
    order so the result is deterministic):
      1. ``<stem>.png`` exactly,
      2. a PNG whose stem equals the stem (any extension case, e.g. ``.PNG``),
      3. a PNG whose stem starts with the stem,
      4. a PNG whose stem contains the stem.

    Args:
        file_name: CSV file name. Surrounding whitespace is ignored and only a
            trailing ``.csv`` (any case) is removed to obtain the stem.
        plots_dir: Directory to search. Defaults to ``BOILING_PLOTS_DIR``.

    Returns:
        The image path as a string, or ``None`` when ``file_name`` is not a
        non-blank string, the stem is empty, the directory does not exist, or
        nothing matches.
    """
    if not isinstance(file_name, str) or not file_name.strip():
        return None
    directory = BOILING_PLOTS_DIR if plots_dir is None else plots_dir

    name = file_name.strip()
    # Strip the extension only at the end; str.replace would also remove a
    # ".csv" that happens to occur in the middle of the name.
    base = name[: -len(".csv")] if name.lower().endswith(".csv") else name
    if not base:
        # An empty stem is a substring of every file name and would otherwise
        # "match" an arbitrary image.
        return None

    exact = os.path.join(directory, base + ".png")
    if os.path.isfile(exact):
        return exact
    if not os.path.isdir(directory):
        return None

    pngs = sorted(f for f in os.listdir(directory) if f.lower().endswith(".png"))
    # Close matches, from most to least specific.
    rules = (
        lambda stem: stem == base,
        lambda stem: stem.startswith(base),
        lambda stem: base in stem,
    )
    for rule in rules:
        for f in pngs:
            if rule(f[: -len(".png")]):
                return os.path.join(directory, f)
    return None

def csv_path_for_file_name(file_name):
    """Resolve ``file_name`` to a CSV inside ``CSV_DIR`` (source of the fixed y-axis plot).

    Returns the joined path when ``CSV_DIR/<file_name>`` is a file, otherwise
    the first ``.csv`` directory entry whose name (as listed, or with
    surrounding whitespace removed) equals the stripped ``file_name``.
    Returns ``None`` for non-string or blank input, when ``CSV_DIR`` is unset,
    or when nothing matches.
    """
    if not isinstance(file_name, str):
        return None
    wanted = file_name.strip()
    if not wanted or not CSV_DIR:
        return None

    direct = os.path.join(CSV_DIR, wanted)
    if os.path.isfile(direct):
        return direct

    # Fallback: scan the directory for an entry that differs only by padding.
    matches = (
        entry
        for entry in os.listdir(CSV_DIR)
        if entry.lower().endswith(".csv") and wanted in (entry, entry.strip())
    )
    hit = next(matches, None)
    return None if hit is None else os.path.join(CSV_DIR, hit)

In [2]:
# Read both feature tables: the current feature set and the earlier ("before") one.
df = pd.read_csv("../data/features.csv")
df2 = pd.read_csv("../data/features_before.csv")

# "file_name" identifies each recording and is metadata, not a feature:
# keep it aside for labelling and drop it from both feature matrices.
file_names = df["file_name"]
X = df.drop("file_name", axis="columns")
X2 = df2.drop("file_name", axis="columns")

In [3]:
# Scale features.
# Each feature table gets its own scaler: refitting a single StandardScaler on
# X2 would leave `scaler` describing the "before" features instead of X.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
scaler2 = StandardScaler()
X2_scaled = scaler2.fit_transform(X2)

# Optional: remove near-constant features
# X_scaled = VarianceThreshold(threshold=1e-3).fit_transform(X_scaled)
# NOTE: after standardisation every non-constant column has unit variance, so
# applied at this point the threshold can only remove exactly-constant columns.

# Fit UMAP.
# Same hyper-parameters for both tables, but one model per table so that
# `umap_model` remains the model fitted on the current features (usable for
# `umap_model.transform(...)` later) and `umap_model2` the one for the
# previous features. The fixed integer seed keeps both embeddings reproducible.
UMAP_PARAMS = dict(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    metric="euclidean",
    random_state=42,
)
umap_model = umap.UMAP(**UMAP_PARAMS)
umap_model2 = umap.UMAP(**UMAP_PARAMS)

X_umap = umap_model.fit_transform(X_scaled)
X2_umap = umap_model2.fit_transform(X2_scaled)


  warn(


In [4]:
# Fit HDBSCAN
# Best example so far: min_cluster_size=17, min_samples=6
HDBSCAN_PARAMS = dict(
    min_cluster_size=17,  # larger -> fewer, more conservative clusters
    min_samples=6,  # larger -> more points labeled as noise
    metric="euclidean",
)

# One clusterer per embedding: refitting a single object on X2_umap would leave
# `clusterer` (and its fitted attributes) describing the previous features
# rather than the current ones.
clusterer = hdbscan.HDBSCAN(**HDBSCAN_PARAMS)
clusterer2 = hdbscan.HDBSCAN(**HDBSCAN_PARAMS)

labels = clusterer.fit_predict(X_umap)
labels2 = clusterer2.fit_predict(X2_umap)

# Attach cluster labels (-1 marks noise points)
df["cluster"] = labels
df2["cluster"] = labels2

In [11]:
# Interactive view of the current-feature embedding: hovering a point shows its
# file_name, clicking it renders the matching plots in the output widget.
plot_df = pd.DataFrame(X_umap[:, :2], columns=["UMAP-1", "UMAP-2"]).assign(
    cluster=labels.astype(str),  # strings -> discrete colour per cluster
    file_name=file_names.values,
)
fig_px = px.scatter(
    plot_df,
    x="UMAP-1",
    y="UMAP-2",
    color="cluster",
    hover_data=["file_name"],
    title="UMAP + HDBSCAN on Accelerometer Feature Vectors",
)
fig_px.update_layout(width=800, height=600)

# FigureWidget is required for click callbacks; clicked plots go to img_output.
fig = go.FigureWidget(fig_px)
img_output = widgets.Output()

def on_click(trace, points, selector):
    """Show the plots for the clicked scatter point inside ``img_output``.

    Plotly click callback, registered on the traces of ``fig``. For the first
    clicked point it resolves the file name, then displays
      * left: the plot produced by ``visualize_csv_data_fixed`` from the CSV
        (the fixed y-axis variant; the original note gives the range as
        (-0.3, 0.3)),
      * right: the pre-rendered boiling-plot image (normal auto-scale),
    side by side when both are available, otherwise whichever exists.

    Args:
        trace: The trace that was clicked.
        points: Click payload; ``points.point_inds`` holds the clicked point
            positions within ``trace``.
        selector: Unused; part of the callback signature.
    """
    with img_output:
        clear_output(wait=True)
        if not points.point_inds:
            return
        idx = points.point_inds[0]
        # customdata is list of [file_name] per point (from hover_data)
        cd = getattr(trace, "customdata", None)
        if cd is not None and idx < len(cd):
            fn = cd[idx][0]
        else:
            # point_inds are positions within *this trace* (one trace per
            # cluster colour), not rows of plot_df, so restrict plot_df to the
            # trace's cluster before indexing.
            # Assumes px names each trace after its "cluster" value and keeps
            # row order within a trace -- TODO confirm.
            in_trace = plot_df.loc[plot_df["cluster"] == str(trace.name), "file_name"]
            if idx >= len(in_trace):
                print("Could not resolve file_name for the clicked point")
                return
            fn = in_trace.iloc[idx]
        img_path = image_path_for_file_name(fn)
        csv_path = csv_path_for_file_name(fn)
        print(f"file_name: {fn}")
        # Left: fixed y-axis (-0.3, 0.3); Right: normal auto-scale
        left_img = right_img = None
        if csv_path and os.path.isfile(csv_path):
            fig_fixed = None
            try:
                fig_fixed = visualize_csv_data_fixed(csv_path)
                buf = BytesIO()
                fig_fixed.savefig(buf, format="png", bbox_inches="tight")
                left_img = widgets.Image(value=buf.getvalue(), format="png")
            except Exception as e:
                # Best effort: a failing fixed-axis plot must not hide the image.
                print(f"Fixed-axis plot failed: {e}")
            finally:
                # Always release the figure, even when rendering failed.
                if fig_fixed is not None:
                    plt.close(fig_fixed)
        if img_path and os.path.isfile(img_path):
            with open(img_path, "rb") as f:
                right_img = widgets.Image(value=f.read(), format="png")
        if left_img is not None and right_img is not None:
            display(widgets.HBox([left_img, right_img]))
        elif left_img is not None:
            display(left_img)
        elif right_img is not None:
            display(right_img)
        else:
            print(f"No CSV or image found for: {fn}")

# Hook the click handler onto every trace of the figure, then show the figure
# with the image panel stacked underneath it.
for cluster_trace in fig.data:
    cluster_trace.on_click(on_click)
display(widgets.VBox(children=[fig, img_output]))

# Cluster output: file name + cluster number, written to
# umap_hdbscan_cluster_output.csv and shown as the cell result.
df_out = pd.DataFrame({"name": file_names, "cluster": labels})
df_out.to_csv("umap_hdbscan_cluster_output.csv", index=False)
df_out

VBox(children=(FigureWidget({
    'data': [{'customdata': array([['Boiling_at_Heater_T85_RefVideo1.csv'],
    …

Unnamed: 0,name,cluster
0,Boiling_at_Heater_T85_RefVideo1.csv,6
1,Boiling_at_Heater_T93_RefVideo2.csv,6
2,Boiling_at_Heater_T98_RefVideo3.csv,6
3,Calm_to_Boiling_50V_on_StartT65.csv,2
4,Calm_to_Boiling_50V_on_StartT80.csv,6
...,...,...
441,"MATLAB 4-58 PM Thu, Oct 10, 2024 Run12.csv",2
442,"MATLAB 4-59 PM Tue, Oct 1, 2024 Run12 .csv",0
443,"MATLAB 5-01 PM Thu, Oct 10, 2024 Run13.csv",2
444,"MATLAB 5-01 PM Tue, Oct 1, 2024 Run13 .csv",5


In [6]:
from pathlib import Path
import os
import shutil

# Load features once (outside loop)
features_csv = Path("../data/features.csv")  # relative to notebook location
features_df = pd.read_csv(features_csv)

N_REPRESENTATIVES = 5  # rows sampled / plots copied per cluster
plots_dir = Path(BOILING_PLOTS_DIR)  # same image folder the click handlers use

# For every cluster (including noise, -1): record representative rows, write
# the cluster's rows to CSV and copy a few of its boiling plots next to it.
representative_samples = []
for cluster in np.unique(labels):
    cluster_samples = df[df["cluster"] == cluster]

    # A cluster (typically the noise cluster) may hold fewer rows than
    # requested; DataFrame.sample raises in that case unless n is capped.
    n_rep = min(N_REPRESENTATIVES, len(cluster_samples))
    representative_samples.append(cluster_samples.sample(n=n_rep, random_state=42))

    # NOTE(review): the files printed and copied below are the *first* rows of
    # the cluster, not the random sample stored in representative_samples.
    # Kept as-is to preserve the existing output -- confirm which is intended.
    shown_files = cluster_samples["file_name"].values[:N_REPRESENTATIVES]
    print(f"Cluster {cluster}:")
    print(shown_files)

    # df was read from the same features CSV, so merging all of features_df
    # would duplicate every feature as <col>_x / <col>_y. Only bring in columns
    # that cluster_samples does not already have.
    extra_cols = [c for c in features_df.columns if c not in cluster_samples.columns]
    merged_cluster_samples = pd.merge(
        cluster_samples,
        features_df[["file_name", *extra_cols]],
        on="file_name",
        how="left",
    )

    # Save merged data to CSV (the cluster folder is recreated from scratch)
    output_folder = Path(f"../visuals/rep_samples/UMAP-HDBSCAN/{cluster}")
    if output_folder.exists():
        shutil.rmtree(output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)
    output_file = output_folder / f"cluster_{cluster}.csv"
    merged_cluster_samples.to_csv(output_file, index=False)

    # Copy representative sample plots
    for file in shown_files:
        # Swap only a trailing ".csv" for ".png"; str.replace would also touch
        # a ".csv" occurring in the middle of the name.
        stem = file[: -len(".csv")] if file.lower().endswith(".csv") else file
        png_file = stem + ".png"
        src = plots_dir / png_file
        dst = output_folder / png_file

        # Only copy if source exists
        if src.exists():
            shutil.copy(src, dst)
        else:
            print(f"Warning: {src} not found, skipping")

Cluster -1:
['MATLAB 1-43 PM Thu, Oct 31, 2024 Run2 .csv'
 'MATLAB 1-52 PM Fri, Jun 14, 2024 Run4 .csv'
 'MATLAB 12-37 PM Fri, May 31, 2024 Run1 .csv'
 'MATLAB 12-40 PM Fri, Jun 28, 2024 Run3 .csv'
 'MATLAB 2-55 PM Wed, Nov 6, 2024 Run0 .csv']
Cluster 0:
['MATLAB 1-00 PM Fri, Jun 28, 2024 Run8 .csv'
 'MATLAB 1-02 PM Thu, Nov 7, 2024 Run8 .csv'
 'MATLAB 1-07 PM Fri, Jun 14, 2024 Run1 .csv'
 'MATLAB 1-07 PM Fri, May 31, 2024 Run3 .csv'
 'MATLAB 1-12 PM Thu, Nov 7, 2024 Run9 .csv']
Cluster 1:
['MATLAB 1-05 PM Mon, Mar 11, 2024 Run5 .csv'
 'MATLAB 1-08 PM Mon, Mar 11, 2024 Run6 .csv'
 'MATLAB 1-16 PM Mon, Mar 11, 2024 Run7 .csv'
 'MATLAB 12-20 PM Fri, Mar 8, 2024 Run3 .csv'
 'MATLAB 12-54 PM Tue, Apr 2, 2024 Run5 .csv']
Cluster 2:
['Calm_to_Boiling_50V_on_StartT65.csv'
 'MATLAB 1-14 PM Tue, Sep 10, 2024 Run10 .csv'
 'MATLAB 1-21 PM Thu, Nov 7, 2024 Run10 .csv'
 'MATLAB 1-25 PM Thu, Nov 7, 2024 Run11 .csv'
 'MATLAB 1-29 PM Thu, Nov 7, 2024 Run12 .csv']
Cluster 3:
['MATLAB 1-02 PM Mon, Mar 1

In [7]:
# Same interactive view for the previous ("before") feature set: hover shows
# the file_name, a click renders the matching boiling-plot image.
plot_df2 = pd.DataFrame(X2_umap[:, :2], columns=["UMAP-1", "UMAP-2"]).assign(
    cluster=labels2.astype(str),  # strings -> discrete colour per cluster
    file_name=df2["file_name"].values,
)
fig2_px = px.scatter(
    plot_df2,
    x="UMAP-1",
    y="UMAP-2",
    color="cluster",
    hover_data=["file_name"],
    title="UMAP + HDBSCAN on Previous Accelerometer Feature Vectors",
)
fig2_px.update_layout(width=800, height=600)

# FigureWidget is required for click callbacks; clicked images go to img_output2.
fig2 = go.FigureWidget(fig2_px)
img_output2 = widgets.Output()

def on_click2(trace, points, selector):
    """Show the boiling-plot image of the clicked point inside ``img_output2``.

    Plotly click callback, registered on the traces of ``fig2``. Resolves the
    file name of the first clicked point and displays the image found by
    ``image_path_for_file_name``; prints a message when there is none.

    Args:
        trace: The trace that was clicked.
        points: Click payload; ``points.point_inds`` holds the clicked point
            positions within ``trace``.
        selector: Unused; part of the callback signature.
    """
    with img_output2:
        clear_output(wait=True)
        if not points.point_inds:
            return
        idx = points.point_inds[0]
        # customdata is list of [file_name] per point (from hover_data)
        cd = getattr(trace, "customdata", None)
        if cd is not None and idx < len(cd):
            fn = cd[idx][0]
        else:
            # point_inds are positions within *this trace* (one trace per
            # cluster colour), not rows of plot_df2, so restrict plot_df2 to
            # the trace's cluster before indexing.
            # Assumes px names each trace after its "cluster" value and keeps
            # row order within a trace -- TODO confirm.
            in_trace = plot_df2.loc[plot_df2["cluster"] == str(trace.name), "file_name"]
            if idx >= len(in_trace):
                print("Could not resolve file_name for the clicked point")
                return
            fn = in_trace.iloc[idx]
        path = image_path_for_file_name(fn)
        print(f"file_name: {fn}")
        if path and os.path.isfile(path):
            display(Image(filename=path))
        else:
            print(f"No image found for: {fn}")

# Attach the click handler to each trace, then render the figure with the
# image panel stacked underneath it.
for cluster_trace in fig2.data:
    cluster_trace.on_click(on_click2)
display(widgets.VBox(children=[fig2, img_output2]))

VBox(children=(FigureWidget({
    'data': [{'customdata': array([['MATLAB 3-53 PM Mon, Aug 26, 2024 Run4 .csv'…

In [8]:
import pandas as pd
import umap
import hdbscan
from sklearn.metrics import adjusted_rand_score

# 1) Sweep n_neighbors 10..30 → UMAP → HDBSCAN → one row per n_neighbors, columns = file_name
n_neighbors_range = range(10, 31)
# HDBSCAN settings used for the sweep.
# NOTE(review): these differ from the (17, 6) used for the main clustering — confirm that is intended.
SWEEP_MIN_CLUSTER_SIZE, SWEEP_MIN_SAMPLES = 16, 7
rows = []
for k in n_neighbors_range:
    # n_jobs=1 states explicitly what UMAP enforces anyway when random_state
    # is set, which avoids one "n_jobs ... overridden" warning per iteration.
    umap_k = umap.UMAP(n_neighbors=k, min_dist=0.1, n_components=2, metric="euclidean",
                       random_state=42, n_jobs=1)
    X_umap_k = umap_k.fit_transform(X_scaled)
    # Loop-local name so the notebook-level `clusterer` is not overwritten.
    clusterer_k = hdbscan.HDBSCAN(min_cluster_size=SWEEP_MIN_CLUSTER_SIZE,
                                  min_samples=SWEEP_MIN_SAMPLES, metric="euclidean")
    labels_k = clusterer_k.fit_predict(X_umap_k)
    rows.append({"n_neighbors": k, **{fn: int(labels_k[i]) for i, fn in enumerate(file_names)}})
n_neighbors_sweep_df = pd.DataFrame(rows).set_index("n_neighbors")

# 2) Agreement of each run with all other runs.
# Cluster ids are arbitrary per run (id 2 for one n_neighbors is unrelated to
# id 2 for another), so counting equal raw ids across runs is not meaningful.
# The adjusted Rand index compares partitions independent of how the clusters
# are numbered; each run is scored by its mean ARI against every other run
# (noise, -1, is treated as one more group).
run_labels = n_neighbors_sweep_df.to_numpy()
n_runs = len(run_labels)
agreement = pd.Series(
    [
        np.mean([adjusted_rand_score(run_labels[i], run_labels[j]) for j in range(n_runs) if j != i])
        for i in range(n_runs)
    ],
    index=n_neighbors_sweep_df.index,
    name="mean_ari",
)
# Per-sample mode of the raw ids, kept for inspection/backward compatibility
# only; because ids are not aligned across runs it is not used for selection.
consensus_cluster = n_neighbors_sweep_df.mode(axis=0).iloc[0]

# 3) n_clusters per run (excluding noise -1), constrain to 3–6
n_clusters_per_run = n_neighbors_sweep_df.apply(lambda row: row[row >= 0].nunique(), axis=1)
MIN_CLUSTERS, MAX_CLUSTERS = 3, 6
valid = (n_clusters_per_run >= MIN_CLUSTERS) & (n_clusters_per_run <= MAX_CLUSTERS)

# 4) Best n_neighbors among valid (max agreement)
if valid.any():
    best_n_neighbors = int(agreement.loc[valid].idxmax())
    best_row = n_neighbors_sweep_df.loc[best_n_neighbors]
else:
    best_n_neighbors = None
    best_row = None

# Optional: inspect
print("n_clusters per n_neighbors:\n", n_clusters_per_run)
print("Valid (3–6 clusters):", sorted(agreement.loc[valid].index.tolist()) if valid.any() else "none")
print("best_n_neighbors =", best_n_neighbors)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting 

n_clusters per n_neighbors:
 n_neighbors
10    2
11    2
12    2
13    5
14    2
15    8
16    2
17    4
18    2
19    2
20    2
21    5
22    7
23    5
24    2
25    2
26    6
27    6
28    2
29    2
30    5
dtype: int64
Valid (3–6 clusters): [13, 17, 21, 23, 26, 27, 30]
best_n_neighbors = 17
