In [1]:
# Rand Index clustering analysis (UMAP + HDBSCAN vs. hand-labelled ground truth)
# Imports
import sys
print(sys.executable)
import pandas as pd
import numpy as np
import umap
import hdbscan
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score, rand_score
import plotly.express as px
print("all good")

/Users/hailey/anaconda3/envs/nasa-capstone/bin/python


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Read the feature table and scale the feature columns.
df = pd.read_csv("data/features_updated.csv")

# "file_name" identifies each row; every other column is handed to the scaler.
file_names = df["file_name"]
X = df.drop(columns="file_name")

# Fit the scaler, then transform the same matrix with it
# (the two-step form of fit_transform).
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# UMAP: embed the scaled features in two dimensions.
# random_state is fixed so the embedding is repeatable between runs.
umap_params = {
    "n_neighbors": 15,
    "min_dist": 0.1,
    "n_components": 2,
    "metric": "euclidean",
    "random_state": 42,
}
umap_model = umap.UMAP(**umap_params)
X_umap = umap_model.fit_transform(X_scaled)

In [None]:
# HDBSCAN on the UMAP embedding, aiming for exactly 2 clusters.
# Tune min_cluster_size / min_samples until the printed cluster count is 2;
# raising min_cluster_size merges smaller clusters together.
hdbscan_params = {
    "min_cluster_size": 50,
    "min_samples": 10,
    "metric": "euclidean",
}
clusterer = hdbscan.HDBSCAN(**hdbscan_params)
predicted_labels = clusterer.fit_predict(X_umap)

# Label -1 marks noise points, so it is counted separately and is not
# included in the cluster count.
noise_mask = predicted_labels == -1
n_noise = noise_mask.sum()
n_clusters = len(set(predicted_labels[~noise_mask]))

print(f"Number of clusters: {n_clusters}")
print(f"Noise points: {n_noise} / {len(predicted_labels)}")

In [None]:
# Scatter plot of the 2-D embedding, coloured by HDBSCAN label.
# Labels are cast to strings before being used as the colour column.
plot_df = pd.DataFrame(X_umap[:, :2], columns=["UMAP-1", "UMAP-2"]).assign(
    cluster=predicted_labels.astype(str),
    file_name=file_names.to_numpy(),
)

fig = px.scatter(
    plot_df,
    x="UMAP-1",
    y="UMAP-2",
    color="cluster",
    hover_data=["file_name"],  # show the source file on hover
    title="UMAP + HDBSCAN (2-cluster target)",
).update_layout(width=800, height=600)
fig.show()

In [None]:
# Load ground truth from Excel and map the 8 hand-labelled categories to a
# binary rhythmic / non-rhythmic target.

# Expects a sheet with a "File" column (file name without the "MATLAB " prefix)
# and a "Label" column (category number 1-8; blank = not yet labelled).
gt_raw = pd.read_excel("data/Labeling.xlsx")

# Fail early with a readable message instead of an opaque KeyError further down.
missing_cols = {"File", "Label"} - set(gt_raw.columns)
if missing_cols:
    raise KeyError(
        f"data/Labeling.xlsx is missing required column(s): {sorted(missing_cols)}"
    )

# Category mapping:
# 1 = Single Rhythmic        → rhythmic
# 2 = Double Rhythmic         → rhythmic
# 3 = Random                  → non-rhythmic
# 4 = Rhythmic with Climax    → rhythmic
# 5 = Noise                   → non-rhythmic
# 6 = 1 Rhythmic with Random  → rhythmic
# 7 = Triple Rhythmic         → rhythmic
# 8 = Transition              → EXCLUDE (ambiguous)

category_names = {
    1: "Single Rhythmic",
    2: "Double Rhythmic",
    3: "Random",
    4: "Rhythmic with Climax",
    5: "Noise",
    6: "1 Rhythmic with Random",
    7: "Triple Rhythmic",
    8: "Transition"
}

rhythmic_ids = [1, 2, 4, 6, 7]
non_rhythmic_ids = [3, 5]

# Lookup table category -> binary target. Categories absent from it
# (Transition = 8, or any unexpected value) map to NaN and are dropped below.
binary_by_category = {
    **{cat: 1 for cat in rhythmic_ids},
    **{cat: 0 for cat in non_rhythmic_ids},
}

# One chained expression producing a fresh frame: no column is assigned onto a
# filtered frame, which can trigger SettingWithCopyWarning on some pandas versions.
gt_df = (
    gt_raw
    .dropna(subset=["Label"])  # drop unlabeled rows
    .assign(
        Label=lambda d: d["Label"].astype(int),
        # Add "MATLAB " prefix to match the features CSV file_name format
        file_name=lambda d: "MATLAB " + d["File"],
        ground_truth_binary=lambda d: d["Label"].map(binary_by_category),
    )
    .dropna(subset=["ground_truth_binary"])  # drop Transition (8) samples
    .astype({"ground_truth_binary": int})
)

n_rhythmic = (gt_df["ground_truth_binary"] == 1).sum()
n_non_rhythmic = (gt_df["ground_truth_binary"] == 0).sum()

print(f"Labeled samples: {len(gt_df)}")
print(f"  Rhythmic (1):     {n_rhythmic}")
print(f"  Non-rhythmic (0): {n_non_rhythmic}")

In [None]:
# Rand index calculation: compare HDBSCAN labels with the binary ground truth.

# HDBSCAN marks noise points with label -1. With EXCLUDE_NOISE = False (the
# default, matching the original analysis) noise is scored as if it were its
# own cluster; set it to True to leave those points out of the score.
EXCLUDE_NOISE = False

pred_df = pd.DataFrame({
    "file_name": file_names,
    "predicted_cluster": predicted_labels
})

# validate="many_to_one" makes pandas raise a MergeError if a file name occurs
# more than once in the ground truth; such duplicates would otherwise silently
# multiply rows and skew both the matched count and the score.
merged = pd.merge(
    pred_df,
    gt_df[["file_name", "ground_truth_binary"]],
    on="file_name",
    how="inner",
    validate="many_to_one",
)

# Matching statistics are taken before any noise filtering so that "unmatched"
# really means "no usable ground-truth label".
n_matched = len(merged)
print(f"\nSamples matched: {n_matched} / {len(file_names)}")
print(f"Unmatched (no ground truth or Transition): {len(file_names) - n_matched}")

if EXCLUDE_NOISE:
    merged = merged[merged["predicted_cluster"] != -1]
    print(f"Noise points excluded from scoring: {n_matched - len(merged)}")

# With nothing to score the indices carry no information (and scikit-learn may
# fail or return a degenerate value), so stop with an actionable message.
if merged.empty:
    raise ValueError(
        "No samples available for scoring: check that the ground-truth file "
        "names (with the 'MATLAB ' prefix) match the file_name column of the "
        "features table."
    )

ri = rand_score(merged["ground_truth_binary"], merged["predicted_cluster"])
ari = adjusted_rand_score(merged["ground_truth_binary"], merged["predicted_cluster"])

print(f"\nRAND Index:          {ri:.4f}")
print(f"Adjusted RAND Index: {ari:.4f}")