# Online clustering

This notebook contains the code for online clustering of the embeddings.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm
import warnings

from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import umap.umap_ as umap
from sklearn.decomposition import PCA

from OnlineKMeans import OnlineKMeans

# Suppress all warnings for the rest of the session: no category or module
# filter is given, so every warning raised after this line is hidden.
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


# Workflow

In [3]:
# Load data
# X_semantic_train: array read from the .npy file (the embeddings to cluster).
# df_semantic_train: table read from the Excel file with the same base name.
# NOTE(review): later cells write cluster labels into df_semantic_train using
# row labels 0..N-1 derived from X_semantic_train, so the table presumably has
# one row per embedding, in the same order — confirm.
X_semantic_train = np.load("../data/tensors/squad_train_v2_semantic_chunking.npy")
df_semantic_train = pd.read_excel("../data/prepared/squad_train_v2_semantic_chunking.xlsx")

In [4]:
# Split the embeddings in two: the first half seeds the clusterer, the second
# half is fed to it incrementally in a later cell.
num_docs = len(X_semantic_train)
split_idx = num_docs // 2
embeddings_init, embeddings_online = (
    X_semantic_train[:split_idx],
    X_semantic_train[split_idx:],
)

In [5]:
# Configure the online clusterer (OnlineKMeans is imported from the local
# OnlineKMeans module).
online_kmeans = OnlineKMeans(
    n_clusters=500,               # start with 500 clusters (tune as needed)
    max_clusters=2000,            # allow at most 2000 clusters
    metric="cosine",             # cosine similarity is common for embeddings
    new_cluster_threshold=0.8,   # create new clusters for far away points
    merge_threshold=0.08,        # merge clusters that are very close
    decay=1.0,                   # NOTE(review): decay semantics are not visible here — confirm
    random_state=42
)

In [6]:
# Step 1: Initialize with first half of the data
# (embeddings_init holds the rows of X_semantic_train before split_idx).
online_kmeans.initialize_centroids(embeddings_init)

In [7]:
# Keep a snapshot of the centroids as they are right after initialization,
# before any partial_fit call.
# NOTE(review): assumes `centroids` is a NumPy array, so .copy() yields an
# independent array that later updates do not touch — confirm.
initial_centroids = online_kmeans.centroids.copy()

In [8]:
# Label the initialization half with the freshly initialized model and store
# the labels in a new 'cluster' column.
# .loc slices include the end label, hence split_idx-1; this relies on the
# table's row labels being 0..N-1.
# Rows from split_idx onward hold NaN until the online loop fills them in.
initial_labels = online_kmeans.predict(embeddings_init)
df_semantic_train.loc[:split_idx-1, 'cluster'] = initial_labels

In [9]:
# Step 2: stream the second half through the model one mini-batch at a time.
# NOTE(review): each batch is labeled right after its own update and never
# revisited; if later batches create or merge clusters, earlier labels (and the
# initial ones) may no longer match the final model — confirm this is intended.
batch_size = 256
n_online = embeddings_online.shape[0]
for start in tqdm(range(0, n_online, batch_size)):
    # Slicing clamps at the end of the array, so the last batch may be shorter.
    batch = embeddings_online[start:start + batch_size]
    end = start + batch.shape[0]

    # Update the model with this batch first, then label the batch with it.
    online_kmeans.partial_fit(batch)
    labels = online_kmeans.predict(batch)

    # Offset by split_idx to address rows of the full table; .loc slices
    # include the end label, hence the -1.
    first_row = split_idx + start
    last_row = split_idx + end - 1
    df_semantic_train.loc[first_row:last_row, 'cluster'] = labels

100%|██████████| 165/165 [00:03<00:00, 49.79it/s]


In [10]:
# Count the distinct labels written to the 'cluster' column
# (Series.nunique skips NaN by default, so unlabeled rows are not counted).
df_semantic_train['cluster'].nunique()

533