In [2]:
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import json

def lsa_cosine_clustering(embeddings, n_components=50, min_clusters=2, max_clusters=20):
    print("Performing LSA...")
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    embeddings_lsa = svd.fit_transform(embeddings)
    print(f"Explained variance ratio: {sum(svd.explained_variance_ratio_):.4f}")

    print("Computing cosine similarity...")
    cosine_sim = cosine_similarity(embeddings_lsa)
    distances = 1 - cosine_sim

    print("Preparing condensed distance matrix...")
    condensed_distances = distances[np.triu_indices(distances.shape[0], k=1)]

    print("Performing hierarchical clustering...")
    Z = linkage(condensed_distances, method='average', metric='euclidean')

    best_score = -1
    best_labels = None
    best_n_clusters = None

    print("Evaluating different numbers of clusters...")
    for n_clusters in tqdm(range(min_clusters, max_clusters + 1)):
        labels = fcluster(Z, t=n_clusters, criterion='maxclust')
        score = silhouette_score(embeddings_lsa, labels, metric='cosine')
        print(f"  Clusters: {n_clusters}, Silhouette Score: {score:.4f}")
        
        if score > best_score:
            best_score = score
            best_labels = labels
            best_n_clusters = n_clusters

    print(f"\nBest number of clusters: {best_n_clusters}")
    print(f"Best silhouette score: {best_score:.4f}")

    # Plotting the dendrogram
    plt.figure(figsize=(10, 7))
    dendrogram(Z, truncate_mode='lastp', p=30, leaf_rotation=90., leaf_font_size=12.)
    plt.title('Hierarchical Clustering Dendrogram (truncated)')
    plt.xlabel('Sample index or (cluster size)')
    plt.ylabel('Distance')
    plt.show()

    return best_labels, best_n_clusters, best_score, Z

def build_folder_structure(labels, bookmark_data):
    clusters = {}
    for i, label in enumerate(labels):
        if label not in clusters:
            clusters[label] = []
        clusters[label].append(bookmark_data[i])
    
    root_folder = {
        "name": "Root",
        "type": "folder",
        "children": []
    }
    
    for label, bookmarks in clusters.items():
        folder = {
            "name": f"Cluster {label}",
            "type": "folder",
            "children": [{"name": b["title"], "type": "bookmark", "url": b["url"]} for b in bookmarks]
        }
        root_folder["children"].append(folder)
    
    return root_folder

def print_folder_structure(folder, indent=0):
    if folder["type"] == "folder":
        print("  " * indent + f"+ {folder['name']} ({len(folder['children'])} items)")
        for child in folder['children']:
            print_folder_structure(child, indent + 1)
    else:
        print("  " * indent + f"- {folder['name']} (Bookmark)")

# Main execution
with open('embedded_bookmarks.json') as f:
    data = json.load(f)

embeddings = np.array([bookmark["embedding"] for bookmark in data])

best_labels, best_n_clusters, best_score, Z = lsa_cosine_clustering(embeddings)

print("\nBuilding folder structure...")
folder_structure = build_folder_structure(best_labels, data)

print("\nFolder Structure:")
print_folder_structure(folder_structure)

# Visualization of Cluster Sizes
cluster_sizes = [len(cluster["children"]) for cluster in folder_structure["children"]]

plt.figure(figsize=(12, 6))
plt.hist(cluster_sizes, bins=range(1, max(cluster_sizes) + 2, 1), align='left')
plt.title("Distribution of Cluster Sizes")
plt.xlabel("Cluster Size")
plt.ylabel("Frequency")
plt.xticks(range(1, max(cluster_sizes) + 1, 1))
plt.show()

print(f"\nAverage cluster size: {np.mean(cluster_sizes):.2f}")
print(f"Median cluster size: {np.median(cluster_sizes):.2f}")
print(f"Largest cluster size: {max(cluster_sizes)}")
print(f"Number of singleton clusters: {cluster_sizes.count(1)}")

Performing LSA...
Explained variance ratio: 0.6606
Computing cosine similarity...
Performing hierarchical clustering...


ValueError: Unknown Distance Metric: precomputed