In [7]:
# Cell 1: Importing required libraries
import json
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sentence_transformers import SentenceTransformer
import os
from transformers import BertTokenizer, TFBertModel, BertModel
import plotly.graph_objects as go

# Switch off parallel tokenization for this kernel process to avoid
# multiprocessing issues.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [8]:
# Cell 2: Defining helper functions

def load_model(model_name):
    """
    Load a pre-trained SentenceTransformer model.

    Args:
        model_name (str): Name of the pre-trained model to load. It is passed
            unchanged to the ``SentenceTransformer`` constructor.

    Returns:
        SentenceTransformer: Loaded model.

    Raises:
        RuntimeError: If constructing the model fails for any reason. The
            underlying exception is attached as ``__cause__``.
    """
    try:
        return SentenceTransformer(model_name)
    except Exception as e:
        # Chain explicitly so the traceback reports the underlying failure as
        # the direct cause instead of an incidental error during handling.
        raise RuntimeError(f"Error loading model: {e}") from e

def validate_bookmarks(bookmarks):
    """
    Check that the bookmarks payload is a non-empty list of dictionaries.

    Args:
        bookmarks (list): List of bookmark dictionaries.

    Raises:
        ValueError: If ``bookmarks`` is empty/falsy, is not a list, or contains
            any element that is not a dictionary.
    """
    # Falsy input (None, [], {}, "") is reported as "nothing provided".
    if not bookmarks:
        raise ValueError("No bookmarks provided.")

    # Only inspect the elements once we know the container itself is a list.
    well_formed = isinstance(bookmarks, list)
    if well_formed:
        for entry in bookmarks:
            if not isinstance(entry, dict):
                well_formed = False
                break

    if not well_formed:
        raise ValueError(f"Incorrect bookmarks format: {bookmarks}")

def encode_texts(model, texts):
    """
    Encode a list of texts using the provided SentenceTransformer model.

    Args:
        model (SentenceTransformer): Pre-trained model for encoding texts; only
            its ``encode`` method is used.
        texts (list): List of texts to encode.

    Returns:
        numpy.ndarray: Encoded text embeddings, exactly as returned by
        ``model.encode(texts)``.

    Raises:
        RuntimeError: If ``model.encode`` fails for any reason. The underlying
            exception is attached as ``__cause__``.
    """
    try:
        return model.encode(texts)
    except Exception as e:
        # Chain explicitly so the original failure is kept as the direct cause.
        raise RuntimeError(f"Error encoding texts: {e}") from e

def perform_clustering(embeddings, num_clusters):
    """
    Perform hierarchical clustering on the provided embeddings.

    Uses ``AgglomerativeClustering`` with ``linkage='ward'``.

    Args:
        embeddings (numpy.ndarray): Encoded text embeddings.
        num_clusters (int): Number of clusters to create.

    Returns:
        numpy.ndarray: Cluster labels for each embedding, as returned by
        ``fit_predict``.

    Raises:
        RuntimeError: If building or fitting the clustering estimator fails for
            any reason. The underlying exception is attached as ``__cause__``.
    """
    try:
        clustering = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
        return clustering.fit_predict(embeddings)
    except Exception as e:
        # Chain explicitly so the original failure is kept as the direct cause.
        raise RuntimeError(f"Error during clustering: {e}") from e


def generate_folder_structure_data(bookmarks, clusters, level=0, model=None):
    """
    Recursively generate data for the folder structure visualization.

    Bookmarks are grouped by cluster label into "Folder N" nodes (numbered in
    ascending label order). A folder holding more than one bookmark is
    re-encoded and re-clustered into at most 5 groups, and the resulting
    "Level <level + 1>" sub-tree is nested under it. A folder holding a single
    bookmark gets that bookmark as a leaf.

    Args:
        bookmarks (list): List of bookmark dictionaries; each must provide
            'name' and 'url' keys.
        clusters (numpy.ndarray): Cluster label for each bookmark, aligned by
            position with ``bookmarks``. Labels do not need to be contiguous.
        level (int): Current depth level in the folder structure (default is 0).
        model (SentenceTransformer, optional): Model used to embed bookmarks
            when a folder has to be split further. When omitted, the
            module-level ``model`` variable is used, so existing callers that
            rely on that global keep working.

    Returns:
        dict: ``{"name": "Level <level>", "children": [...]}``. Every
        "children" value in the tree is a list: folder nodes under a level,
        a single nested level node under a folder that was split, or leaf
        dicts ``{"name": ..., "url": ...}`` under a folder that was not.

    Raises:
        NameError: If a folder must be split, no ``model`` argument was given
            and no module-level ``model`` exists.
        RuntimeError: Propagated from ``encode_texts`` / ``perform_clustering``.
    """
    max_subfolders = 5  # upper bound on groups when a folder is split further

    # Group bookmarks by their actual label in one pass. Iterating over the
    # labels that really occur (instead of range(number_of_labels)) keeps every
    # bookmark even when the labels are not exactly 0..k-1.
    grouped = {}
    for position, bookmark in enumerate(bookmarks):
        grouped.setdefault(clusters[position], []).append(bookmark)

    folder_data = {"name": f"Level {level}", "children": []}

    for index, label in enumerate(sorted(grouped)):
        members = grouped[label]
        subfolder_data = {"name": f"Folder {index + 1}", "children": []}

        if len(members) > 1:
            if model is None:
                # Backward compatibility: fall back to the notebook-level
                # `model` only when it is actually needed.
                if "model" not in globals():
                    raise NameError("name 'model' is not defined")
                model = globals()["model"]

            texts = [f"{bookmark['name']} {bookmark['url']}" for bookmark in members]
            subfolder_embeddings = encode_texts(model, texts)
            subfolder_clusters = perform_clustering(
                subfolder_embeddings, min(len(members), max_subfolders)
            )
            nested_level = generate_folder_structure_data(
                members, subfolder_clusters, level + 1, model
            )
            # Wrap the nested level in a list so "children" has the same type
            # everywhere and tree walkers can iterate it uniformly.
            subfolder_data["children"] = [nested_level]
        else:
            subfolder_data["children"] = [
                {"name": f"{bookmark['name']}", "url": bookmark['url']}
                for bookmark in members
            ]

        folder_data["children"].append(subfolder_data)

    return folder_data

In [9]:
# Cell 3: Loading and clustering bookmarks, and visualizing the folder structure

# Read the bookmark records from the JSON file. The encoding is given
# explicitly so the result does not depend on the platform's default.
with open('tagged_data.json', encoding='utf-8') as file:
    data = json.load(file)

# Fail fast with a clear message if the file is empty or not a list of dicts.
validate_bookmarks(data)

# Load the pre-trained SentenceTransformer model
model = load_model('all-MiniLM-L6-v2')

# Perform initial clustering: embed "<name> <url>" per bookmark and split the
# collection into at most 10 top-level groups.
embeddings = encode_texts(model, [f"{bookmark['name']} {bookmark['url']}" for bookmark in data])
clusters = perform_clustering(embeddings, min(len(data), 10))

# Generate the folder structure data
folder_structure_data = generate_folder_structure_data(data, clusters)

# Flatten the whole tree (not just the first level) into the parallel lists a
# Treemap expects. Folder names such as "Folder 1" repeat at every level, so
# each node gets a unique id and parents are referenced by id, with "" marking
# the root.
treemap_ids, treemap_labels, treemap_parents, treemap_urls = [], [], [], []
pending_nodes = [(folder_structure_data, "")]
while pending_nodes:
    node, parent_id = pending_nodes.pop()
    node_id = str(len(treemap_ids))
    treemap_ids.append(node_id)
    treemap_labels.append(node['name'])
    treemap_parents.append(parent_id)
    treemap_urls.append(node.get('url', ''))

    child_nodes = node.get('children', [])
    if isinstance(child_nodes, dict):
        # Tolerate a nested level stored directly as a dict rather than a list.
        child_nodes = [child_nodes]
    pending_nodes.extend((child, node_id) for child in reversed(child_nodes))

# Visualize the folder structure; only bookmark leaves carry a URL in the hover text.
fig = go.Figure(data=[go.Treemap(
    ids=treemap_ids,
    labels=treemap_labels,
    parents=treemap_parents,
    customdata=treemap_urls,
    hovertemplate='<b>%{label}</b><br>URL: %{customdata}<extra></extra>'
)])

fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))
# NOTE: notebook rendering of the figure requires nbformat>=4.2.0 to be
# installed in the kernel environment.
fig.show()


`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed