In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA


with open('tagged_data.json') as file:
    bookmarks = json.load(file)

# Preprocess the bookmarks data
bookmarks_data = [{"name": bookmark["name"], "url": bookmark["url"], "tags": " ".join(bookmark["tags"])} for bookmark in bookmarks]

# Extract features using TF-IDF vectorization
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform([bookmark["tags"] for bookmark in bookmarks_data])

# Reduce dimensionality using PCA
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(features.toarray())

# Apply HDBSCAN clustering
clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
clusterer.fit(reduced_features)

# Visualize the clustering results
plt.figure(figsize=(8, 6))
sns.scatterplot(x=reduced_features[:, 0], y=reduced_features[:, 1], hue=clusterer.labels_, palette='viridis', s=80, linewidths=0)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('HDBSCAN Clustering of Bookmarks')
plt.tight_layout()
plt.show()

NameError: name 'bookmarks' is not defined