In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
import umap.umap_ as umap
from sklearn.preprocessing import LabelEncoder
from disentangler.embedder import Embedder
from disentangler.clusterer import Clusterer

# Load the sample chat log. The encoding is passed explicitly because open()
# otherwise falls back to the platform's locale encoding, which can mis-decode
# (or fail on) non-ASCII chat text on systems whose locale is not UTF-8.
with open('../sample_data/sample_chat.json', encoding='utf-8') as f:
    data = json.load(f)

# Keep only the text of each record; a record lacking a 'message' key raises
# KeyError here.
utterances = [d['message'] for d in data]

# Encode the utterances with a default-constructed Embedder.
# NOTE(review): presumably encode() yields one vector per utterance -- confirm
# against disentangler.embedder.
embedder = Embedder()
embeddings = embedder.encode(utterances)

# Cluster the embeddings; they are handed to the Clusterer as a NumPy array.
# NOTE(review): the meaning/units of distance_threshold=1.2 are defined by
# disentangler.clusterer -- confirm before tuning.
clusterer = Clusterer(distance_threshold=1.2)
embedding_array = np.array(embeddings)
labels = clusterer.cluster(embedding_array)

# Fit UMAP on the embeddings and keep the resulting projection; the scatter
# plot further down reads columns 0 and 1 of `umap_proj`.
umap_settings = {
    'n_neighbors': 5,
    'min_dist': 0.3,
    'metric': 'cosine',
}
umap_model = umap.UMAP(**umap_settings)
umap_proj = umap_model.fit_transform(embeddings)

# Scatter the projected points, coloured by cluster label.
plt.figure(figsize=(10, 7))
xs, ys = umap_proj[:, 0], umap_proj[:, 1]
plt.scatter(xs, ys, c=labels, cmap='tab10', s=60, edgecolors='k')

# Annotate each point with its message, cut to 30 characters; an ellipsis
# marks messages that were actually shortened.
for idx, message in enumerate(utterances):
    caption = message[:30]
    if len(message) > 30:
        caption += '...'
    plt.annotate(caption, (xs[idx], ys[idx]), fontsize=8)

# Titles, axis labels and grid, then render the figure.
plt.title('UMAP Projection of Chat Topics')
plt.xlabel('UMAP-1')
plt.ylabel('UMAP-2')
plt.grid(True)
plt.show()