In [43]:
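# Optional one-time setup: uncomment to install the dependencies into this kernel's environment.
# (The PyPI package for `sklearn` is named `scikit-learn`.)
# import sys
# !{sys.executable} -m pip install sentence-transformers scikit-learn tqdm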

In [44]:
from sentence_transformers import SentenceTransformer
import numpy as np
import json

In [45]:
# Sentence-embedding model used to encode the tags further down.
model = SentenceTransformer(
    model_name_or_path="sentence-transformers/all-MiniLM-L12-v2",
)

In [46]:
# Load the tags from user_tags_set.json into a NumPy array so they can be
# indexed by position alongside their embeddings.
# JSON text is UTF-8; decode it explicitly instead of relying on the
# platform's default encoding, which may mangle non-ASCII tags.
with open('../user_tags_set.json', encoding='utf-8') as f:
    user_tags_set = np.array(json.load(f))

In [48]:
# Encode every tag on the CPU; entry i of the result corresponds to
# user_tags_set[i], which the clustering step below relies on.
embeddings = model.encode(
    user_tags_set,
    device="cpu",
)

In [49]:
# Candidate cluster counts: five evenly spaced values from 500 to 1200
# inclusive (cast to int where they are consumed).
STEPS = np.linspace(start=500, stop=1200, num=5)

In [50]:
from collections import defaultdict
from sklearn.cluster import KMeans


def predict_with_k(k):
    """Cluster the tag embeddings into ``k`` groups, tightest clusters first.

    Fits KMeans on the module-level ``embeddings`` and groups the entries of
    the module-level ``user_tags_set`` by the cluster their embedding was
    assigned to (entry ``i`` of one corresponds to entry ``i`` of the other).

    Args:
        k: Number of clusters to fit.

    Returns:
        A list of clusters, each a list of entries from ``user_tags_set``,
        ordered by ascending mean squared distance between the members'
        embeddings and their cluster centroid. Clusters with fewer than two
        members are left out.
    """
    kmeans = KMeans(n_clusters=k)
    pred_clusters = kmeans.fit_predict(embeddings)
    centroids = kmeans.cluster_centers_
    cluster_items = defaultdict(list)
    squared_error = np.zeros(k)

    for i, cluster_index in enumerate(pred_clusters):
        cluster_index = int(cluster_index)
        cluster_items[cluster_index].append(user_tags_set[i])
        offset = embeddings[i] - centroids[cluster_index]
        squared_error[cluster_index] += np.dot(offset, offset)

    # Normalise each cluster's summed error by that same cluster's size.
    # The dict is ordered by first appearance of a cluster, not by cluster
    # index, so the division must be keyed on the index rather than on the
    # enumeration position. Clusters with no members keep an infinite error
    # and therefore sort last (they are filtered out below anyway).
    mse = np.full(k, np.inf)
    for cluster_index, items in cluster_items.items():
        mse[cluster_index] = squared_error[cluster_index] / len(items)

    return [
        cluster_items[int(i)]
        for i in np.argsort(mse)
        if len(cluster_items[int(i)]) > 1
    ]


In [51]:
from tqdm.notebook import tqdm
# Run the clustering once per candidate cluster count, with a progress bar;
# the resulting dict maps each integer k to its list of clusters.
k_steps = dict(
    (k, predict_with_k(k))
    for k in tqdm([int(step) for step in STEPS])
)
k_steps.keys()

  0%|          | 0/5 [00:00<?, ?it/s]

dict_keys([500, 675, 850, 1025, 1200])

In [52]:
# Persist the clusterings for every k. The integer dict keys are written as
# JSON strings, since JSON object keys are always strings.
with open("k-steps.json", mode="w") as f:
    f.write(json.dumps(k_steps))