In [None]:
# Rand Score Clustering Analysis
# Imports
import pandas as pd
import numpy as np
import umap
import hdbscan
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score, rand_score
import plotly.express as px

In [None]:
# Load the feature table and standardize the feature columns.
# "file_name" identifies each row rather than describing it, so it is kept
# aside (it is used later to label points) and excluded from the feature matrix.
df = pd.read_csv("data/features_updated.csv")
X = df.drop(columns="file_name")
file_names = df["file_name"]

# Fit the scaler on the features, then apply it to the same features.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# UMAP: project the standardized features onto a 2-component embedding.
# random_state is pinned so the embedding is seeded the same way on every run.
umap_params = dict(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    metric="euclidean",
    random_state=42,
)
umap_model = umap.UMAP(**umap_params)
X_umap = umap_model.fit_transform(X_scaled)

In [None]:
# HDBSCAN on the UMAP embedding, tuned towards a 2-cluster solution.
# MIN_CLUSTER_SIZE and MIN_SAMPLES need to be tuned until exactly
# TARGET_N_CLUSTERS clusters come out; the check at the end of this cell
# reports when the current settings miss that target.
TARGET_N_CLUSTERS = 2
MIN_CLUSTER_SIZE = 50  # increase to merge smaller clusters into 1
MIN_SAMPLES = 10

clusterer = hdbscan.HDBSCAN(
    min_cluster_size=MIN_CLUSTER_SIZE,
    min_samples=MIN_SAMPLES,
    metric="euclidean"
)
predicted_labels = clusterer.fit_predict(X_umap)

# Label -1 is HDBSCAN's noise label, so it is not counted as a cluster.
n_clusters = len(set(predicted_labels) - {-1})
n_noise = (predicted_labels == -1).sum()
print(f"Number of clusters: {n_clusters}")
print(f"Noise points: {n_noise} / {len(predicted_labels)}")

# The plot title and the binary ground-truth comparison further down assume
# the target cluster count, so make a mismatch visible instead of silent.
if n_clusters != TARGET_N_CLUSTERS:
    print(
        f"WARNING: expected {TARGET_N_CLUSTERS} clusters but found {n_clusters}; "
        "adjust MIN_CLUSTER_SIZE / MIN_SAMPLES before comparing against ground truth."
    )

In [None]:
# Visualization of clusters: scatter the 2-D embedding, one colour per label.
# Labels are cast to strings so plotly treats them as categories rather than
# as a continuous colour scale.
plot_df = pd.DataFrame(X_umap[:, :2], columns=["UMAP-1", "UMAP-2"]).assign(
    cluster=predicted_labels.astype(str),
    file_name=file_names.values,
)

scatter_kwargs = dict(
    x="UMAP-1",
    y="UMAP-2",
    color="cluster",
    hover_data=["file_name"],
    title="UMAP + HDBSCAN (2-cluster target)",
)
fig = px.scatter(plot_df, **scatter_kwargs).update_layout(width=800, height=600)
fig.show()

In [None]:
# Ground truth labels (TODO: organize the ground truth labels - import from Google Drive)
#gt_df = pd.read_csv("../data/ground_truth_labels.csv")
#
# rhythmic_groups = [
#     "single rhythm",
#     # add other rhythmic group names here
# ]
# non_rhythmic_groups = [
#     "random noise",
#     # add other non-rhythmic group names here
# ]
#
# gt_df["ground_truth_binary"] = gt_df["ground_truth_cluster"].apply(
#     lambda x: 1 if x in rhythmic_groups else 0
# )

In [None]:
# Rand index calculation (uncomment once the ground truth is organized into 2 clusters)
# pred_df = pd.DataFrame({
#     "file_name": file_names,
#     "predicted_cluster": predicted_labels
# })
#
# merged = pd.merge(pred_df, gt_df[["file_name", "ground_truth_binary"]], on="file_name", how="inner")
#
# # Optional: drop noise points
# # merged = merged[merged["predicted_cluster"] != -1]
#
# ri = rand_score(merged["ground_truth_binary"], merged["predicted_cluster"])
# ari = adjusted_rand_score(merged["ground_truth_binary"], merged["predicted_cluster"])
#
# print(f"RAND Index:          {ri:.4f}")
# print(f"Adjusted RAND Index: {ari:.4f}")
# print(f"Samples compared:    {len(merged)}")
# print(f"  (noise points:     {(predicted_labels == -1).sum()})")  # only excluded if the optional filter above is uncommented