Possible TODOs:

- use train / cross-validation (xval) splits and evaluate on the held-out fold, maybe 5-fold cross-validation
- run a longer Optuna study with a couple hundred trials
- add params:
  - PCA vs. UMAP
- run once with DBSCAN and once with HDBSCAN
- give it 200 test inputs and compare against a canonical clustering

In [13]:
import dotenv

from typing import List
from collections import Counter

import os
import sqlite3

import numpy as np
import pandas as pd

import openai

from sklearn.metrics import pairwise_distances, silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.decomposition import TruncatedSVD

import umap

import hdbscan
from hdbscan.validity import validity_index as dbcv

import optuna

import warnings
warnings.filterwarnings('ignore')

dotenv.load_dotenv() 


True

In [27]:
# Lower bound of the `n_components` range explored during tuning.
MIN_COMPONENTS = 50
# Number of Optuna trials requested for the tuning run.
N_TRIALS = 50
# Seed handed to seeded estimators (e.g. TruncatedSVD).
RANDOM_STATE = 42
# Default OpenAI model used when embedding the bullets.
EMBEDDING_MODEL = "text-embedding-3-large"


In [2]:
# Connect to SQLite and load the distinct, non-null bullets into a DataFrame
db_path = "articles.db"
query = """
SELECT DISTINCT
    bullet
FROM daily_summaries
WHERE bullet IS NOT NULL
"""

# `with sqlite3.connect(...)` only commits/rolls back the transaction; it does
# not close the connection, so close it explicitly once the query has run.
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

print(f"Loaded {len(df)} unique rows with non-null bullets")


Loaded 9360 unique rows with non-null bullets


In [3]:
# Initialize the OpenAI client. No key is passed explicitly, so it is taken
# from the environment (presumably populated by the dotenv.load_dotenv() call
# in the imports cell).
client = openai.OpenAI()  # Uses OPENAI_API_KEY environment variable


def get_embeddings(texts: List[str],
                   model: str = EMBEDDING_MODEL,
                   batch_size: int = 200) -> np.ndarray:
    """
    Embed `texts` with the OpenAI embeddings endpoint.

    Requests are sent in batches of at most `batch_size` texts to avoid
    hitting rate or length limits.

    Args:
        texts: Strings to embed.
        model: Name of the embedding model to request.
        batch_size: Maximum number of texts per API call; must be positive.

    Returns:
        Array with one embedding row per input text, in input order.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        resp = client.embeddings.create(model=model, input=batch)
        # Every returned item carries the index of the input it belongs to;
        # sort on it so row order never depends on the payload order.
        ordered = sorted(resp.data, key=lambda item: item.index)
        all_embeddings.extend(item.embedding for item in ordered)
    return np.array(all_embeddings)

# Fetch embeddings for the bullet column (one API request per batch)
embeddings_array = get_embeddings(df["bullet"].tolist())

# Notebook display of the matrix shape: (number of bullets, embedding width)
embeddings_array.shape


(9360, 3072)

In [11]:
def _non_negative_finite(value):
    """Return `value` as a float, or 0.0 if it is negative, NaN or infinite."""
    value = float(value)
    return value if np.isfinite(value) and value > 0.0 else 0.0


def calculate_clustering_metrics(embeddings_array, labels, clusterer=None):
    """
    Calculate various clustering quality metrics for HDBSCAN results.

    Args:
        embeddings_array: Points that were clustered, one row per label.
        labels: Cluster labels from HDBSCAN (array-like); -1 marks noise.
        clusterer: Optional fitted HDBSCAN clusterer. When given, the HDBSCAN
            validity index and the cluster persistence are added to the result.

    Returns:
        Dictionary of clustering metrics. 'n_clusters', 'n_noise_points' and
        'noise_ratio' are always present; the cluster-size statistics need at
        least one cluster; every score needs at least two clusters.
        'composite_score' is only present when the silhouette score could be
        computed; a missing or non-finite validity index contributes 0 to it.
    """
    embeddings_array = np.asarray(embeddings_array)
    labels = np.asarray(labels)

    # Noise points (-1 labels) are left out of the size statistics and of the
    # scikit-learn scores.
    non_noise_mask = labels != -1
    non_noise_embeddings = embeddings_array[non_noise_mask]
    non_noise_labels = labels[non_noise_mask]

    metrics = {}

    # Basic cluster statistics
    cluster_sizes = Counter(non_noise_labels)
    n_clusters = len(cluster_sizes)
    n_noise = np.sum(labels == -1)

    metrics['n_clusters'] = n_clusters
    metrics['n_noise_points'] = n_noise
    metrics['noise_ratio'] = n_noise / len(labels) if len(labels) else 0.0

    # Cluster size distribution
    if cluster_sizes:
        sizes = list(cluster_sizes.values())
        metrics['avg_cluster_size'] = np.mean(sizes)
        metrics['std_cluster_size'] = np.std(sizes)
        metrics['min_cluster_size'] = min(sizes)
        metrics['max_cluster_size'] = max(sizes)

    # Skip other metrics if we have too few clusters or too much noise
    if n_clusters < 2 or len(non_noise_labels) < 2:
        print("Warning: Too few clusters or too much noise for some metrics")
        return metrics

    # HDBSCAN-specific metrics
    if clusterer is not None:
        try:
            # Validity index (HDBSCAN's internal metric). It is known to hit
            # divide-by-zero problems, so a failure is reported, not raised.
            validity_idx = hdbscan.validity.validity_index(
                embeddings_array, labels, metric='euclidean'
            )
            metrics['hdbscan_validity_index'] = validity_idx
        except Exception as e:
            print(f"Could not compute HDBSCAN validity index: {e}")

        # Cluster persistence (stability)
        if hasattr(clusterer, 'cluster_persistence_'):
            metrics['cluster_persistence'] = clusterer.cluster_persistence_

    # Scikit-learn clustering metrics (excluding noise points)
    try:
        # Silhouette Score (higher is better, range [-1, 1])
        sil_score = silhouette_score(non_noise_embeddings, non_noise_labels, metric='euclidean')
        metrics['silhouette_score'] = sil_score

        # Calinski-Harabasz Index (higher is better)
        ch_score = calinski_harabasz_score(non_noise_embeddings, non_noise_labels)
        metrics['calinski_harabasz_score'] = ch_score

        # Davies-Bouldin Index (lower is better)
        db_score = davies_bouldin_score(non_noise_embeddings, non_noise_labels)
        metrics['davies_bouldin_score'] = db_score

    except Exception as e:
        print(f"Could not compute sklearn metrics: {e}")

    # Composite score: equal-weight mix of silhouette and validity index, each
    # clipped at 0. The validity index may be absent (no clusterer passed, or
    # its computation failed) and then counts as 0 instead of raising KeyError.
    # The cluster-balance, size-consistency and noise-penalty terms are
    # currently disabled.
    if 'silhouette_score' in metrics:
        composite_score = (
            0.5 * _non_negative_finite(metrics['silhouette_score']) +
            0.5 * _non_negative_finite(metrics.get('hdbscan_validity_index', 0.0))
        )
        metrics['composite_score'] = composite_score

    return metrics

def print_clustering_summary(metrics):
    """Write a human-readable report of a clustering-metrics dict to stdout.

    Cluster and noise counts fall back to 'N/A' when missing; the size
    statistics and each quality score are printed only if present.
    """
    print("=== Clustering Quality Metrics ===")
    print(f"Number of clusters: {metrics.get('n_clusters', 'N/A')}")
    print(f"Noise points: {metrics.get('n_noise_points', 'N/A')} ({metrics.get('noise_ratio', 0):.1%})")

    # Size statistics exist only when at least one cluster was found.
    has_size_stats = 'avg_cluster_size' in metrics
    if has_size_stats:
        print(f"Average cluster size: {metrics['avg_cluster_size']:.1f} ± {metrics.get('std_cluster_size', 0):.1f}")
        print(f"Cluster size range: {metrics.get('min_cluster_size', 'N/A')} - {metrics.get('max_cluster_size', 'N/A')}")

    print("=== Quality Scores ===")
    # (metric key, lazily rendered report line), in display order. Rendering
    # is deferred so a line is only formatted when its metric is present.
    score_lines = (
        ('silhouette_score',
         lambda: f"Silhouette Score: {metrics['silhouette_score']:.3f} (higher is better)"),
        ('calinski_harabasz_score',
         lambda: f"Calinski-Harabasz Score: {metrics['calinski_harabasz_score']:.1f} (higher is better)"),
        ('davies_bouldin_score',
         lambda: f"Davies-Bouldin Score: {metrics['davies_bouldin_score']:.3f} (lower is better)"),
        ('hdbscan_validity_index',
         lambda: f"HDBSCAN Validity Index: {metrics['hdbscan_validity_index']:.3f}"),
        ('composite_score',
         lambda: f"Composite Score: {metrics['composite_score']:.3f} (higher is better)"),
    )
    for key, render in score_lines:
        if key in metrics:
            print(render())
    print()



=== Clustering Quality Metrics ===
Number of clusters: 405
Noise points: 5693 (60.8%)
Average cluster size: 9.1 ± 11.6
Cluster size range: 3 - 119
=== Quality Scores ===
Silhouette Score: 0.251 (higher is better)
Calinski-Harabasz Score: 17.0 (higher is better)
Davies-Bouldin Score: 1.376 (lower is better)
HDBSCAN Validity Index: 0.088
Composite Score: 0.169 (higher is better)



In [42]:
# Baseline run: reduce the embeddings with TruncatedSVD, then cluster with HDBSCAN.
# The globals set here (`n_components`, `svd`, `reduced_embeddings`, ...) stay
# defined for later cells.
n_components = 200
svd = TruncatedSVD(n_components=n_components, random_state=RANDOM_STATE)
reduced_embeddings = svd.fit_transform(embeddings_array)
# Re-normalize after SVD: divide each row by its L2 norm, in place
# (a row of all zeros would divide by zero here).
reduced_embeddings /= np.linalg.norm(reduced_embeddings, axis=1, keepdims=True)

# HDBSCAN hyperparameters for this run
min_cluster_size = 3
min_samples = 3
        
# Fit HDBSCAN
print("=== HDBSCAN Parameters ===")
print(f"min_cluster_size:   {min_cluster_size}")
print(f"min_samples:        {min_samples}")
print(f"n_components:       {n_components}")
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=min_cluster_size,
    min_samples=min_samples,
    metric="euclidean",
    cluster_selection_method="eom",
)

# One label per row; -1 marks points HDBSCAN treats as noise
labels = clusterer.fit_predict(reduced_embeddings)

# Calculate metrics on the reduced (clustered) space, not the raw embeddings
metrics = calculate_clustering_metrics(reduced_embeddings, labels, clusterer)
print_clustering_summary(metrics)




=== HDBSCAN Parameters ===
min_cluster_size:   3
min_samples:        3
n_components:       200
=== Clustering Quality Metrics ===
Number of clusters: 423
Noise points: 5669 (60.6%)
Average cluster size: 8.7 ± 9.8
Cluster size range: 3 - 107
=== Quality Scores ===
Silhouette Score: 0.298 (higher is better)
Calinski-Harabasz Score: 25.3 (higher is better)
Davies-Bouldin Score: 1.193 (lower is better)
HDBSCAN Validity Index: 0.104
Composite Score: 0.201 (higher is better)



In [41]:
# Same experiment as the SVD run, but with UMAP as the reducer.
# `n_components` is not set in this cell: it reuses the value left by an
# earlier cell (200 in the recorded output).
# NOTE(review): UMAP is built without random_state, so results presumably vary
# from run to run — consider random_state=RANDOM_STATE if repeatability matters.
reducer = umap.UMAP(n_components=n_components)
# Fit on the full embedding matrix, then project that same matrix
reducer.fit(embeddings_array)
# force np64 or hdbscan pukes
reduced_embeddings = reducer.transform(embeddings_array).astype(np.float64)
# Divide each row by its L2 norm, in place
reduced_embeddings /= np.linalg.norm(reduced_embeddings, axis=1, keepdims=True)


# HDBSCAN hyperparameters for this run
min_cluster_size = 3
min_samples = 3
        
# Fit HDBSCAN
print("=== HDBSCAN Parameters ===")
print(f"min_cluster_size:   {min_cluster_size}")
print(f"min_samples:        {min_samples}")
print(f"n_components:       {n_components}")
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=min_cluster_size,
    min_samples=min_samples,
    metric="euclidean",
    cluster_selection_method="eom",
)

# One label per row; -1 marks points HDBSCAN treats as noise
labels = clusterer.fit_predict(reduced_embeddings)

# Calculate metrics on the reduced (clustered) space, not the raw embeddings
metrics = calculate_clustering_metrics(reduced_embeddings, labels, clusterer)
print_clustering_summary(metrics)



=== HDBSCAN Parameters ===
min_cluster_size:   3
min_samples:        3
n_components:       200
=== Clustering Quality Metrics ===
Number of clusters: 612
Noise points: 2301 (24.6%)
Average cluster size: 11.5 ± 13.3
Cluster size range: 3 - 169
=== Quality Scores ===
Silhouette Score: 0.599 (higher is better)
Calinski-Harabasz Score: 4880.3 (higher is better)
Davies-Bouldin Score: 0.503 (lower is better)
HDBSCAN Validity Index: 0.443
Composite Score: 0.521 (higher is better)



In [44]:
def objective(trial, embeddings_array):
    """
    Score one Optuna trial: reduce with UMAP, cluster with HDBSCAN, rate the result.

    Args:
        trial: Optuna trial used to sample `min_cluster_size`, `min_samples`
            and `n_components`.
        embeddings_array: Original normalized embeddings

    Returns:
        The negated composite score (the study minimizes, we want to
        maximize). 1.0 is returned when the clustering is rejected (fewer than
        2 clusters or a noise ratio above 0.8), when no composite score is
        available, or when clustering/scoring raised.
    """
    # Sample the search space. `min_samples` is capped by the sampled
    # `min_cluster_size`; `n_components` runs from MIN_COMPONENTS up to a
    # quarter of the input width.
    min_cluster_size = trial.suggest_int('min_cluster_size', 2, 10)
    min_samples = trial.suggest_int('min_samples', 2, min_cluster_size)
    input_dim = embeddings_array.shape[1]
    n_components = trial.suggest_int('n_components', MIN_COMPONENTS, input_dim // 4)

    print("=== HDBSCAN Parameters ===")
    print(f"min_cluster_size:   {min_cluster_size}")
    print(f"min_samples:        {min_samples}")
    print(f"n_components:       {n_components}")

    # Dimensionality reduction with UMAP (a TruncatedSVD variant of this step
    # was tried earlier and is no longer active).
    # NOTE(review): UMAP is built without random_state, so a trial presumably
    # cannot be reproduced exactly — confirm whether that matters here.
    if n_components >= input_dim:
        projected = embeddings_array
    else:
        umap_model = umap.UMAP(n_components=n_components)
        umap_model.fit(embeddings_array)
        # HDBSCAN fails on non-float64 input, so cast explicitly
        projected = umap_model.transform(embeddings_array).astype(np.float64)
        # Re-normalize after UMAP: scale each row by its L2 norm, in place
        row_norms = np.linalg.norm(projected, axis=1, keepdims=True)
        projected /= row_norms

    try:
        model = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric="euclidean",
            cluster_selection_method="eom",
        )
        assignments = model.fit_predict(projected)

        stats = calculate_clustering_metrics(projected, assignments, model)
        print_clustering_summary(stats)

        # Reject degenerate results outright instead of trusting their score.
        too_few_clusters = stats.get('n_clusters', 0) < 2
        too_noisy = stats.get('noise_ratio', 1.0) > 0.8
        if too_few_clusters or too_noisy:
            score = -1.0
        else:
            score = stats.get('composite_score', -1.0)

        # Negate: Optuna minimizes
        return -score

    except Exception as e:
        print(f"Error in trial: {e}")
        return 1.0  # Same bad value as a rejected clustering

def optimize_hdbscan(embeddings_array, n_trials=100, timeout=None):
    """
    Optimize HDBSCAN hyperparameters using Optuna, then refit with the best ones.

    Args:
        embeddings_array: Normalized embeddings array
        n_trials: Number of optimization trials
        timeout: Maximum time in seconds (None for no limit)

    Returns:
        Dictionary with keys 'study', 'best_params', 'best_score',
        'best_clusterer', 'best_labels', 'best_embeddings', 'best_metrics',
        'reducer' (the UMAP model fitted for the final run, or None when no
        reduction was applied) and 'svd_transformer' (the same object as
        'reducer', kept under its old name for existing callers).
    """
    print(f"Starting optimization with {n_trials} trials...")
    print(f"Original embedding shape: {embeddings_array.shape}")

    # The objective returns the negated composite score, hence 'minimize'.
    # NOTE: the objective never reports intermediate values, so the pruner
    # has nothing to act on.
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
        pruner=optuna.pruners.MedianPruner(n_startup_trials=10)
    )

    # Optimize
    study.optimize(
        lambda trial: objective(trial, embeddings_array),
        n_trials=n_trials,
        timeout=timeout,
        show_progress_bar=True
    )

    # Get best parameters
    best_params = study.best_params
    best_score = -study.best_value  # Convert back to positive

    print("\nOptimization completed!")
    print(f"Best composite score: {best_score:.4f}")
    print(f"Best parameters: {best_params}")

    # Test best parameters
    print("\n=== Results with Best Parameters ===")

    # Apply best dimensionality reduction.
    # NOTE(review): UMAP is not seeded, so this refit presumably does not
    # reproduce the best trial's embedding (or its score) exactly.
    best_dim = best_params['n_components']
    if best_dim < embeddings_array.shape[1]:
        reducer = umap.UMAP(n_components=best_dim)
        reducer.fit(embeddings_array)
        # force np64 or hdbscan pukes
        best_embeddings = reducer.transform(embeddings_array).astype(np.float64)
        # Re-normalize after UMAP
        best_embeddings /= np.linalg.norm(best_embeddings, axis=1, keepdims=True)
        print(f"Reduced dimensions from {embeddings_array.shape[1]} to {best_dim}")
    else:
        best_embeddings = embeddings_array
        reducer = None
        print("No dimensionality reduction applied")

    # Fit with best parameters
    best_clusterer = hdbscan.HDBSCAN(
        min_cluster_size=best_params['min_cluster_size'],
        min_samples=best_params['min_samples'],
        metric="euclidean",
        cluster_selection_method="eom",
    )

    best_labels = best_clusterer.fit_predict(best_embeddings)
    best_metrics = calculate_clustering_metrics(best_embeddings, best_labels, best_clusterer)

    print_clustering_summary(best_metrics)
    print()

    # Return the reducer fitted above. The previous code returned the global
    # `svd`, i.e. whatever TruncatedSVD another cell happened to leave behind.
    return {
        'study': study,
        'best_params': best_params,
        'best_score': best_score,
        'best_clusterer': best_clusterer,
        'best_labels': best_labels,
        'best_embeddings': best_embeddings,
        'best_metrics': best_metrics,
        'reducer': reducer,
        'svd_transformer': reducer,
    }

def plot_optimization_history(study):
    """Plot the composite score of every trial that has a value (requires plotly).

    Args:
        study: Optuna study whose trial values are negated composite scores.

    Prints a message and returns without plotting when no trial has a value
    yet, or when plotly is not installed.
    """
    # Trial values are negated composite scores; flip the sign back.
    values = [-t.value for t in study.trials if t.value is not None]
    if not values:
        # max() further down would raise ValueError on an empty sequence.
        print("No completed trials to plot")
        return

    # Only the import is guarded, so unrelated ImportErrors raised while
    # plotting are not misreported as "plotly is missing".
    try:
        import plotly.graph_objects as go
    except ImportError:
        print("Install plotly to visualize optimization history: pip install plotly")
        return

    # 1-based position of each plotted value (trials without a value are skipped)
    trial_numbers = list(range(1, len(values) + 1))

    # Create plot
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=trial_numbers,
        y=values,
        mode='lines+markers',
        name='Composite Score',
        line=dict(color='blue', width=2),
        marker=dict(size=4)
    ))

    # Add best score line
    best_score = max(values)
    fig.add_hline(y=best_score, line_dash="dash", line_color="red",
                  annotation_text=f"Best: {best_score:.4f}")

    fig.update_layout(
        title='HDBSCAN Optimization History',
        xaxis_title='Trial',
        yaxis_title='Composite Score',
        showlegend=True
    )

    fig.show()

# Run the full search; `results` holds the study, the best parameters and the
# clustering refit with them.
results = optimize_hdbscan(embeddings_array, n_trials=N_TRIALS)


[I 2025-06-15 19:34:17,414] A new study created in memory with name: no-name-36d36f84-3670-4ec1-8c04-385d164ed4c1


Starting optimization with 50 trials...
Original embedding shape: (9360, 3072)


  0%|          | 0/50 [00:00<?, ?it/s]

=== HDBSCAN Parameters ===
min_cluster_size:   5
min_samples:        5
n_components:       576
=== Clustering Quality Metrics ===
Number of clusters: 360
Noise points: 2234 (23.9%)
Average cluster size: 19.8 ± 24.1
Cluster size range: 5 - 267
=== Quality Scores ===
Silhouette Score: 0.618 (higher is better)
Calinski-Harabasz Score: 4954.9 (higher is better)
Davies-Bouldin Score: 0.462 (lower is better)
HDBSCAN Validity Index: 0.463
Composite Score: 0.541 (higher is better)

[I 2025-06-15 19:37:28,750] Trial 0 finished with value: -0.5406824478860744 and parameters: {'min_cluster_size': 5, 'min_samples': 5, 'n_components': 576}. Best is trial 0 with value: -0.5406824478860744.
=== HDBSCAN Parameters ===
min_cluster_size:   7
min_samples:        2
n_components:       162
=== Clustering Quality Metrics ===
Number of clusters: 389
Noise points: 2009 (21.5%)
Average cluster size: 18.9 ± 15.5
Cluster size range: 7 - 144
=== Quality Scores ===
Silhouette Score: 0.573 (higher is better)
Calins

=== Clustering Quality Metrics ===
Number of clusters: 194
Noise points: 2803 (29.9%)
Average cluster size: 33.8 ± 35.8
Cluster size range: 11 - 268
=== Quality Scores ===
Silhouette Score: 0.646 (higher is better)
Calinski-Harabasz Score: 4822.8 (higher is better)
Davies-Bouldin Score: 0.423 (lower is better)
HDBSCAN Validity Index: 0.421
Composite Score: 0.533 (higher is better)

[I 2025-06-15 20:06:26,542] Trial 12 finished with value: -0.5333793286548366 and parameters: {'min_cluster_size': 10, 'min_samples': 10, 'n_components': 405}. Best is trial 4 with value: -0.5508775927238486.
=== HDBSCAN Parameters ===
min_cluster_size:   9
min_samples:        8
n_components:       57
=== Clustering Quality Metrics ===
Number of clusters: 232
Noise points: 2582 (27.6%)
Average cluster size: 29.2 ± 29.0
Cluster size range: 9 - 250
=== Quality Scores ===
Silhouette Score: 0.650 (higher is better)
Calinski-Harabasz Score: 5489.7 (higher is better)
Davies-Bouldin Score: 0.435 (lower is better)
H

=== Clustering Quality Metrics ===
Number of clusters: 244
Noise points: 2628 (28.1%)
Average cluster size: 27.6 ± 28.5
Cluster size range: 9 - 257
=== Quality Scores ===
Silhouette Score: 0.650 (higher is better)
Calinski-Harabasz Score: 5441.0 (higher is better)
Davies-Bouldin Score: 0.428 (lower is better)
HDBSCAN Validity Index: 0.448
Composite Score: 0.549 (higher is better)

[I 2025-06-15 20:41:39,118] Trial 24 finished with value: -0.5491605870200702 and parameters: {'min_cluster_size': 9, 'min_samples': 7, 'n_components': 351}. Best is trial 19 with value: -0.5544568178327329.
=== HDBSCAN Parameters ===
min_cluster_size:   6
min_samples:        5
n_components:       456
=== Clustering Quality Metrics ===
Number of clusters: 324
Noise points: 2199 (23.5%)
Average cluster size: 22.1 ± 26.6
Cluster size range: 6 - 259
=== Quality Scores ===
Silhouette Score: 0.620 (higher is better)
Calinski-Harabasz Score: 4650.2 (higher is better)
Davies-Bouldin Score: 0.448 (lower is better)
HD

=== Clustering Quality Metrics ===
Number of clusters: 235
Noise points: 2492 (26.6%)
Average cluster size: 29.2 ± 30.9
Cluster size range: 9 - 259
=== Quality Scores ===
Silhouette Score: 0.642 (higher is better)
Calinski-Harabasz Score: 5114.7 (higher is better)
Davies-Bouldin Score: 0.440 (lower is better)
HDBSCAN Validity Index: 0.441
Composite Score: 0.542 (higher is better)

[I 2025-06-15 21:24:17,930] Trial 36 finished with value: -0.541840261635683 and parameters: {'min_cluster_size': 9, 'min_samples': 8, 'n_components': 226}. Best is trial 19 with value: -0.5544568178327329.
=== HDBSCAN Parameters ===
min_cluster_size:   10
min_samples:        9
n_components:       641
=== Clustering Quality Metrics ===
Number of clusters: 212
Noise points: 2688 (28.7%)
Average cluster size: 31.5 ± 30.4
Cluster size range: 10 - 249
=== Quality Scores ===
Silhouette Score: 0.646 (higher is better)
Calinski-Harabasz Score: 5633.6 (higher is better)
Davies-Bouldin Score: 0.431 (lower is better)
H

=== Clustering Quality Metrics ===
Number of clusters: 233
Noise points: 2608 (27.9%)
Average cluster size: 29.0 ± 29.6
Cluster size range: 9 - 260
=== Quality Scores ===
Silhouette Score: 0.650 (higher is better)
Calinski-Harabasz Score: 5867.0 (higher is better)
Davies-Bouldin Score: 0.419 (lower is better)
HDBSCAN Validity Index: 0.450
Composite Score: 0.550 (higher is better)

[I 2025-06-15 22:05:21,748] Trial 48 finished with value: -0.5496349496618768 and parameters: {'min_cluster_size': 9, 'min_samples': 8, 'n_components': 384}. Best is trial 42 with value: -0.555188975544405.
=== HDBSCAN Parameters ===
min_cluster_size:   10
min_samples:        9
n_components:       318
=== Clustering Quality Metrics ===
Number of clusters: 210
Noise points: 2514 (26.9%)
Average cluster size: 32.6 ± 36.6
Cluster size range: 10 - 262
=== Quality Scores ===
Silhouette Score: 0.630 (higher is better)
Calinski-Harabasz Score: 4458.6 (higher is better)
Davies-Bouldin Score: 0.433 (lower is better)
H

KeyError: 'n_neighbors'

In [45]:
# Notebook display of the global `best_score`.
# NOTE(review): optimize_hdbscan only binds `best_score` locally, so this shows
# whatever another cell left in the global of that name — confirm it is the
# value you expect (the tuned score is in results['best_score']).
best_score


0.22566429917751107

In [19]:
import plotly.graph_objects as go

# Inline version of plot_optimization_history: plots the composite score of
# each trial stored in `results` and leaves `values`, `trial_numbers`,
# `best_score` and `fig` defined as globals.
trials = results['study'].trials
values = [-t.value for t in trials if t.value is not None]  # Convert back to positive
# 1-based position of each plotted value (trials without a value are skipped)
trial_numbers = list(range(1, len(values) + 1))

# Create plot
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=trial_numbers,
    y=values,
    mode='lines+markers',
    name='Composite Score',
    line=dict(color='blue', width=2),
    marker=dict(size=4)
))

# Add best score line (max() raises ValueError if no trial has a value)
best_score = max(values)
fig.add_hline(y=best_score, line_dash="dash", line_color="red",
             annotation_text=f"Best: {best_score:.4f}")

fig.update_layout(
    title='HDBSCAN Optimization History',
    xaxis_title='Trial',
    yaxis_title='Composite Score',
    showlegend=True
)

fig.show()


In [22]:
# Notebook display: 1-based positions of the plotted trials
trial_numbers


[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50]

In [51]:
# Fit a standalone 192-component UMAP reducer on the full embedding matrix.
reducer = umap.UMAP(n_components=192)
# Fit only; nothing is transformed in this cell
reducer.fit(embeddings_array)


In [53]:
import pickle 

# Persist the fitted reducer to disk.
# NOTE: only unpickle this file from a trusted location — loading a pickle can
# execute arbitrary code.
with open('umap_reducer.pkl', 'wb') as f:
    pickle.dump(reducer, f)

In [None]:
# Pasted parameter set (the opening of the dict was cut off):
# {'min_cluster_size': 10, 'min_samples': 9, 'n_components': 179}

In [None]:
# hdbscan, truncatedsvd

# === Results with Best Parameters ===
# Reduced dimensions from 3072 to 64
# === Clustering Quality Metrics ===
# Number of clusters: 736
# Noise points: 4859 (51.9%)
# Average cluster size: 6.1 ± 5.6
# Cluster size range: 3 - 84
# === Quality Scores ===
# Silhouette Score: 0.319 (higher is better)
# Calinski-Harabasz Score: 31.7 (higher is better)
# Davies-Bouldin Score: 1.086 (lower is better)
# HDBSCAN Validity Index: 0.133
# Composite Score: 0.226 (higher is better)
    
