# In [12]:
import json
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.metrics import silhouette_score
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import pandas as pd
import colorcet as cc  

def optimize_hierarchical_clustering(embeddings, max_clusters=50):
    """Pick the Ward-linkage cluster count with the best silhouette score.

    One Ward/Euclidean linkage tree is built, then cut into
    2..max_clusters flat clusters. Each cut is scored with the cosine
    silhouette coefficient and the best-scoring cut is kept; on ties the
    smallest cluster count wins because only strictly better scores replace
    the current best.

    Args:
        embeddings: 2-D array-like of shape (n_samples, n_features).
        max_clusters: Largest number of flat clusters to try. The sweep is
            capped at n_samples - 1, since the silhouette coefficient is
            only defined for 2..n_samples - 1 distinct labels.

    Returns:
        Tuple ``(best_score, best_params, best_labels)``. ``best_params`` is
        ``{'n_clusters': k}`` with ``k`` the requested cut size, and
        ``best_labels`` is the label array returned by ``fcluster`` for that
        cut. When no cut can be scored (fewer than three samples,
        ``max_clusters < 2``, or every cut collapses into a single cluster)
        the result is ``(-1, {}, None)``.
    """
    best_score = -1
    best_params = {}
    best_labels = None

    data = np.asarray(embeddings)
    n_samples = len(data) if data.ndim else 0

    # silhouette_score rejects label sets with fewer than 2 or more than
    # n_samples - 1 clusters, so never ask for more than that.
    upper = min(max_clusters, n_samples - 1)
    if upper < 2:
        return best_score, best_params, best_labels

    # NOTE(review): the tree is built with Euclidean/Ward distances while the
    # cuts are scored with cosine distance — confirm the mismatch is intended.
    Z = linkage(data, method='ward', metric='euclidean')

    for n_clusters in range(2, upper + 1):
        labels = fcluster(Z, t=n_clusters, criterion='maxclust')

        # 'maxclust' only guarantees *at most* n_clusters; tied merge heights
        # (e.g. duplicate points) can leave a single cluster, which cannot be
        # scored.
        if len(np.unique(labels)) < 2:
            continue

        score = silhouette_score(data, labels, metric='cosine')
        if score > best_score:
            best_score = score
            best_params = {'n_clusters': n_clusters}
            best_labels = labels

    return best_score, best_params, best_labels

def visualize_embeddings_plotly(embeddings_2d, labels, titles,
                                output_path="bookmark_clusters_interactive.html"):
    """Build an interactive 2-D scatter plot of clustered bookmarks.

    Args:
        embeddings_2d: Array indexable as ``[:, 0]`` / ``[:, 1]`` holding the
            2-D coordinates of every point.
        labels: Cluster label for every point, in the same order as the
            coordinates.
        titles: Text shown on hover for every point, same order.
        output_path: Where to write the standalone HTML file. Pass ``None``
            to skip writing a file.

    Returns:
        The Plotly figure, so callers can display or customise it further.
    """
    # Plotly Express treats a numeric colour column as continuous (colour bar,
    # color_discrete_sequence ignored). Stringify the labels so every cluster
    # gets its own palette colour and legend entry, and keep the legend in
    # the natural order of the original labels.
    legend_order = [str(cluster) for cluster in sorted(set(labels))]

    # Create DataFrame
    df = pd.DataFrame({
        't-SNE1': embeddings_2d[:, 0],
        't-SNE2': embeddings_2d[:, 1],
        'Cluster': [str(label) for label in labels],
        'Title': titles
    })

    # Glasbey palette: a long list of categorical colours, one per cluster
    color_palette = cc.glasbey_dark

    # Create the scatter plot
    fig = px.scatter(df, x='t-SNE1', y='t-SNE2', color='Cluster',
                     hover_data=['Title'],
                     color_discrete_sequence=color_palette,
                     category_orders={'Cluster': legend_order})

    # Improve the layout
    fig.update_layout(
        title={
            'text': "Interactive t-SNE Visualization of Bookmark Clusters",
            'y': 0.95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(size=24)
        },
        plot_bgcolor='rgb(250,250,250)',  # Light grey background
        legend_title_text='Cluster',
        legend=dict(
            itemsizing='constant',
            title_font=dict(size=14),
            font=dict(size=12),
        ),
        width=1000,
        height=800,
    )

    # Update marker properties
    fig.update_traces(
        marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey')),
        selector=dict(mode='markers')
    )

    # Make lasso selection the default drag mode; newly drawn shapes are cyan
    fig.update_layout(
        dragmode='lasso',
        newshape=dict(line_color='cyan')
    )

    # Improve axis labels
    fig.update_xaxes(title_text="t-SNE Dimension 1", title_font=dict(size=14), tickfont=dict(size=12))
    fig.update_yaxes(title_text="t-SNE Dimension 2", title_font=dict(size=14), tickfont=dict(size=12))

    # Add helpful annotations
    fig.add_annotation(
        text="Use lasso tool to select points",
        xref="paper", yref="paper",
        x=0.01, y=0.99,
        showarrow=False,
        font=dict(size=12, color="darkgrey")
    )

    # Save the plot as a standalone HTML file unless the caller opted out
    if output_path is not None:
        fig.write_html(output_path)
        print(f"Interactive plot saved as '{output_path}'")

    return fig  # Return the figure object for further customization if needed

# Load the embedded bookmarks. The encoding is pinned to UTF-8 so that
# non-ASCII titles decode the same way regardless of the platform's locale.
with open('embedded_bookmarks.json', encoding='utf-8') as f:
    data = json.load(f)

# Each record is expected to carry an "embedding" vector and a "title" string
embeddings = np.array([bookmark["embedding"] for bookmark in data])
titles = [bookmark["title"] for bookmark in data]

# Reduce dimensions (50 SVD components) before clustering and t-SNE
svd = TruncatedSVD(n_components=50, random_state=42)
embeddings_reduced = svd.fit_transform(embeddings)

# Perform t-SNE on the reduced vectors to get 2-D plotting coordinates
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings_reduced)

print("\nHierarchical Clustering:")
hier_score, hier_params, hier_labels = optimize_hierarchical_clustering(embeddings_reduced, max_clusters=50)

print(f"Best Silhouette Score: {hier_score:.4f}")
print(f"Best Parameters: {hier_params}")
print(f"Number of Clusters: {len(set(hier_labels))}")

# Visualize with Plotly (as the last expression, the returned figure is
# rendered inline when this runs as a notebook cell)
visualize_embeddings_plotly(embeddings_2d, hier_labels, titles)