In [6]:
import json
import numpy as np
import hdbscan
from hdbscan import HDBSCAN
from hdbscan.validity import validity_index
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
import plotly.express as px
import colorcet as cc
from collections import defaultdict
import warnings
from sklearn.preprocessing import StandardScaler

def optimize_hdbscan_clustering(embeddings, min_cluster_size_range=range(5, 50, 5)):
    """Grid-search HDBSCAN's ``min_cluster_size`` and keep the best-scoring run.

    The embeddings are standardized with ``StandardScaler`` and clustered once
    per candidate ``min_cluster_size`` (always with ``min_samples=1``). Every
    clustering that yields at least two non-noise clusters is scored with
    ``hdbscan.validity.validity_index``; the highest score wins.

    Args:
        embeddings: 2-D array-like of feature vectors, one row per sample.
        min_cluster_size_range: Iterable of ``min_cluster_size`` values to try.

    Returns:
        A ``(best_score, best_params, best_labels)`` tuple. When no candidate
        could be scored the result is ``(-inf, {}, None)``, so callers must be
        prepared for ``best_labels`` being ``None``.
    """
    best_score = -np.inf
    best_labels = None
    best_params = {}

    # Scale the embeddings
    embeddings_scaled = StandardScaler().fit_transform(embeddings)

    for min_cluster_size in min_cluster_size_range:
        clusterer = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=1, gen_min_span_tree=True)
        clusterer.fit(embeddings_scaled)
        labels = clusterer.labels_

        # -1 is HDBSCAN's noise label, not a cluster. Counting it would let a
        # "one cluster + noise" result through, which has no second cluster to
        # measure separation against.
        n_clusters = np.unique(labels[labels != -1]).size
        if n_clusters < 2:
            continue

        try:
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore', category=RuntimeWarning)
                score = hdbscan.validity.validity_index(embeddings_scaled, labels)
        except Exception as e:
            # Best effort: a failing candidate should not abort the whole search.
            print(f"Error calculating validity index for min_cluster_size={min_cluster_size}: {str(e)}")
            continue

        # A NaN score compares False here and is therefore never selected.
        if score > best_score:
            best_score = score
            best_labels = labels
            best_params = {'min_cluster_size': min_cluster_size}

    return best_score, best_params, best_labels

def build_folder_structure(labels, titles):
    """Arrange bookmark titles into a folder tree keyed by cluster label.

    Titles labelled ``-1`` (HDBSCAN noise) become bookmarks directly under the
    root. Every other label gets a ``"Cluster <label>"`` folder holding its
    bookmarks. Root children are ordered as: all noise bookmarks first, then
    cluster folders in order of each label's first appearance.

    Args:
        labels: Iterable of cluster labels, paired positionally with titles.
        titles: Iterable of bookmark titles.

    Returns:
        A nested dict with ``name``, ``type`` and ``children`` keys.
    """
    unclustered = []
    members_by_label = {}

    for label, title in zip(labels, titles):
        entry = {"name": title, "type": "bookmark"}
        if label == -1:  # noise stays at the top level
            unclustered.append(entry)
            continue
        members_by_label.setdefault(label, []).append(entry)

    cluster_folders = [
        {"name": f"Cluster {label}", "type": "folder", "children": members}
        for label, members in members_by_label.items()
    ]

    return {"name": "Root", "type": "folder", "children": unclustered + cluster_folders}

def print_folder_structure(folder, indent=0):
    """Print a folder tree to stdout, two spaces of indentation per level.

    Folders are printed as ``+ name (N bookmarks, M subfolders)``, counting
    direct children only, followed by their children one level deeper.
    Any other node is printed as ``- name``.

    Args:
        folder: Node dict with ``name`` and ``type`` keys; folders also carry
            a ``children`` list of nodes.
        indent: Current nesting depth.
    """
    pad = "  " * indent

    if folder["type"] != "folder":
        print(f"{pad}- {folder['name']}")
        return

    children = folder["children"]
    bookmark_count = sum(1 for child in children if child["type"] == "bookmark")
    subfolder_count = sum(1 for child in children if child["type"] == "folder")
    print(f"{pad}+ {folder['name']} ({bookmark_count} bookmarks, {subfolder_count} subfolders)")

    for child in children:
        print_folder_structure(child, indent + 1)

def visualize_embeddings_plotly(embeddings_2d, labels, titles,
                                output_path="bookmark_clusters_interactive.html"):
    """Show and save an interactive scatter plot of 2-D embeddings by cluster.

    Args:
        embeddings_2d: Array indexable as ``[:, 0]`` and ``[:, 1]`` holding the
            x and y coordinates of each point.
        labels: Cluster label per point (``-1`` is HDBSCAN's noise label).
        titles: Title per point, shown on hover.
        output_path: Where the standalone HTML copy of the figure is written.
    """
    # Plotly Express maps numeric colour columns to a continuous colour scale
    # and ignores color_discrete_sequence; string labels make each cluster a
    # discrete category so the palette below is actually applied.
    cluster_names = [str(label) for label in labels]
    # Keep the legend in numeric order ("2" before "10"), not string order.
    legend_order = [str(label) for label in sorted(set(labels))]

    df = pd.DataFrame({
        't-SNE1': embeddings_2d[:, 0],
        't-SNE2': embeddings_2d[:, 1],
        'Cluster': cluster_names,
        'Title': titles
    })

    color_palette = cc.glasbey_dark

    fig = px.scatter(df, x='t-SNE1', y='t-SNE2', color='Cluster',
                     hover_data=['Title'], color_discrete_sequence=color_palette,
                     category_orders={'Cluster': legend_order})

    fig.update_layout(
        title="Interactive t-SNE Visualization of Bookmark Clusters",
        plot_bgcolor='rgb(250,250,250)',
        width=1000,
        height=800,
    )

    fig.show()
    fig.write_html(output_path)
    print(f"Interactive plot saved as '{output_path}'")

# Load the embedded bookmarks.
# Read as UTF-8 explicitly: the titles contain non-ASCII text, and the
# platform default encoding is not UTF-8 everywhere.
with open('embedded_bookmarks.json', encoding='utf-8') as f:
    data = json.load(f)

embeddings = np.array([bookmark["embedding"] for bookmark in data])
titles = [bookmark["title"] for bookmark in data]

print("\nHDBSCAN Clustering:")
hdbscan_score, hdbscan_params, hdbscan_labels = optimize_hdbscan_clustering(embeddings)
if hdbscan_labels is None:
    # optimize_hdbscan_clustering returns None labels when no candidate could
    # be scored; fall back to "everything is noise" so the steps below run.
    print("No valid clustering found; treating every bookmark as noise.")
    hdbscan_labels = np.full(len(titles), -1)
print(f"Best HDBSCAN Score: {hdbscan_score:.4f}")
print(f"Best Parameters: {hdbscan_params}")
# The noise label (-1) is not a cluster, so it is excluded from the count.
print(f"Number of Clusters: {len(set(hdbscan_labels)) - (1 if -1 in hdbscan_labels else 0)}")

# Build and print folder structure
folder_structure = build_folder_structure(hdbscan_labels, titles)
print("\nFolder Structure:")
print_folder_structure(folder_structure)

# Reduce dimensionality first so t-SNE runs on 50 components instead of the
# raw embedding width.
embeddings_reduced = TruncatedSVD(n_components=50, random_state=42).fit_transform(embeddings)

# Perform t-SNE for visualization
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings_reduced)

# Visualize with Plotly
visualize_embeddings_plotly(embeddings_2d, hdbscan_labels, titles)


HDBSCAN Clustering:
Best HDBSCAN Score: 0.0128
Best Parameters: {'min_cluster_size': 5}
Number of Clusters: 4

Folder Structure:
+ Root (51 bookmarks, 4 subfolders)
  - About Us
  - januff/spotify-liked-songs-export: A Spotify Authorization Code flow using Remix and StepZen.
  - macOS support · Issue #22 · ufal/whisper_streaming
  - Windows and Mac support · Issue #5 · mingruimingrui/fast-mosestokenizer
  - Operation Sea-Spray - Wikipedia
  - Phoenix Program - Wikipedia
  - World Bank - Wikipedia
  - Announcing WIT: A Wikipedia-Based Image-Text Dataset
  - YouTube
  - https://www.whereareyoutube.com/
  - Imagine Dragons - Dream (Jorgen Odegard Remix) - YouTube
  - GLUTEN FREE MUSEUM
  - RunPod - The Cloud Built for AI
  - Gemini
  - Freedom of Information Act Electronic Reading Room | CIA FOIA (foia.cia.gov)
  - MR Online
  - The CIA & the Frankfurt school’s anti-communism | MR Online
  - Известия – новости политики, экономики, спорта, культуры | IZ.RU
  - Subscribe | The New Yorker
 

Interactive plot saved as 'bookmark_clusters_interactive.html'
