### **Elbow Method**

In [None]:
!pip install sentence_transformers funcy

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting funcy
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transformer

In [None]:
from sentence_transformers import SentenceTransformer
from funcy import print_durations

# Load the pretrained 'paraphrase-MiniLM-L6-v2' sentence-embedding model once so
# every later cell reuses it; per the recorded output below, constructing it
# fetches the model files from the Hugging Face Hub.
encoder_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

@print_durations
def get_embeddings(labels, sent_tran_model):
    """Encode `labels` with the given sentence-transformer model.

    The `print_durations` decorator prints how long each call took.

    Args:
        labels: The texts to embed; passed straight to `encode`.
        sent_tran_model: Object exposing `encode(labels, show_progress_bar=...)`.

    Returns:
        Whatever `sent_tran_model.encode` returns for `labels`.
    """
    # The progress bar is switched off to keep the cell output compact.
    return sent_tran_model.encode(labels, show_progress_bar=False)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Short phrases that get embedded and then clustered in the cells below.
words = [
    "is larger than",
    "with his fat",
    "is wider than",
    "is going to vote for",
    "supports",
    "very bigger than",
    "endorses",
    "is very likely going to support",
    "removes",
    "withdraws from",
    "extracted with",
    "draws out",
]

In [None]:
# Embed every phrase in `words` with the shared encoder; the decorator on
# get_embeddings prints the elapsed time as this cell's output.
embeddings = get_embeddings(words, encoder_model)

   76.83 ms in get_embeddings(['larger than', 'more ..., SentenceTransformer( (...)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

In [None]:
# Elbow method: fit K-Means for every cluster count from 1 up to the number of
# embeddings and record two fit-quality measures for each k.
distortions = []  # mean Euclidean distance from each embedding to its nearest centroid
inertias = []     # the fitted model's `inertia_` value
mapping1 = {}     # k -> distortion
mapping2 = {}     # k -> inertia
K = range(1, len(embeddings) + 1)

for k in K:
    # n_init is set explicitly so the result does not depend on the installed
    # scikit-learn version's default, and so it matches the silhouette section.
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeanModel.fit(embeddings)

    # Compute the distortion once and reuse it for both the list and the dict.
    nearest_centroid_dist = np.min(
        cdist(embeddings, kmeanModel.cluster_centers_, 'euclidean'), axis=1)
    distortion = sum(nearest_centroid_dist) / embeddings.shape[0]

    distortions.append(distortion)
    inertias.append(kmeanModel.inertia_)

    mapping1[k] = distortion
    mapping2[k] = kmeanModel.inertia_

In [None]:
# NOTE(review): `kmeanModel` is whichever model the elbow loop fitted last, i.e.
# the run with k == len(embeddings), so these labels are not an elbow-selected
# clustering.
print(kmeanModel.labels_)

[4 2 0 6 5 7 3 1]


### **Silhouette Score**

---



In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
def cluster_embeddings(embeddings, random_state=42):
    """Cluster embeddings with K-Means, choosing k by the best silhouette score.

    K-Means is fitted for every k from 2 to len(embeddings) - 1 and the fit with
    the highest silhouette score is kept. Ties go to the smallest k.

    Args:
        embeddings: Sized collection of vectors accepted by `KMeans.fit`, one
            entry per item to cluster.
        random_state: Seed forwarded to every `KMeans` instance. The default
            keeps the previously hard-coded value of 42.

    Returns:
        Tuple `(cluster_centers, cluster_labels)` taken from the best fit's
        `cluster_centers_` and `labels_` attributes.

    Raises:
        ValueError: If fewer than 3 embeddings are given, because there is then
            no candidate k in range(2, len(embeddings)) to score.
    """
    n_samples = len(embeddings)
    if n_samples < 3:
        # Without this guard the candidate range is empty and the failure
        # surfaces later as an unhelpful "argmax of an empty sequence" error.
        raise ValueError(
            "cluster_embeddings needs at least 3 embeddings to compare "
            f"silhouette scores, got {n_samples}"
        )

    best_score = None
    best_model = None
    for k in range(2, n_samples):
        kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10,
                        random_state=random_state)
        kmeans.fit(embeddings)
        score = silhouette_score(embeddings, kmeans.labels_)
        # Strict '>' keeps the first (smallest-k) maximum, like np.argmax did.
        if best_score is None or score > best_score:
            best_score = score
            best_model = kmeans

    # Reuse the model that produced the best score instead of refitting it.
    return best_model.cluster_centers_, best_model.labels_

In [None]:
# Cluster the phrase embeddings with the silhouette-selected k and keep both the
# centroids and the per-item cluster labels.
cluster_centers, cluster_labels = cluster_embeddings(embeddings)

In [None]:
# Show the cluster labels; the grouping cell below pairs them positionally with `words`.
print(cluster_labels)

[2 2 2 0 0 2 0 0 1 1 1 1]


In [None]:
# Bucket each phrase under the cluster label assigned to it, keeping the order
# in which labels are first seen.
clusters = {}
for label, phrase in zip(cluster_labels, words):
    clusters.setdefault(label, []).append(phrase)

# Print one header per cluster followed by its phrases, indented two spaces.
for cluster_id, members in clusters.items():
    print(f"Cluster {cluster_id}:")
    print("\n".join(f"  {member}" for member in members))

Cluster 2:
  is larger than
  more fat
  is wider than
  very bigger to
Cluster 0:
  is going to vote for
  supports
  endorses
  is very likely going to support
Cluster 1:
  removes
  withdraw from
  extracts
  draw out
