In [None]:
# %pip install -q pandas numpy==1.26.4 scikit-learn hdbscan sentence-transformers langchain langchain-huggingface

In [350]:
import pandas as pd

In [351]:
import numpy as np

In [352]:
from sklearn.metrics import silhouette_score

In [353]:
import hdbscan

In [354]:
from langchain_huggingface import HuggingFaceEmbeddings

In [355]:
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2", # sentence-transformers/all-MiniLM-L6-v2 is a more cheaper option
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': False}
)

In [356]:
data = pd.read_csv("input_data/mdc-1 (1) (1) (2) (1) (1).csv")

In [357]:
data.head()

Unnamed: 0,id,activity_id,name
0,3191380,MDC1-UP#13,MDC1 Master Update #13- DD 8.31.2021Submitted
1,3191381,MDC1-UP#13.1,Executive Summary / Milestones
2,3192817,MDC1-UP#13.5,Construction
3,3191388,MDC1-UP#13.2,Construction Summary
4,3191673,MDC1-UP#13.4,Preconstruction


In [358]:
missing_values_count = int(data["name"].isnull().sum().sum())
if missing_values_count > 0:
    data["name"].dropna(inplace=True)
    print("rows with missing values removed.")
else:
    print("no missing values found.")
        
print(f"total rows: {data.shape[0]}, total columns: {data.shape[1]}\n")

no missing values found.
total rows: 3436, total columns: 3



In [360]:
embeddings = embedding_model.embed_documents(data["name"].tolist())
numpy_embeddings = np.array(embeddings)
numpy_embeddings

array([[ 0.0346262 , -0.00400697, -0.02562224, ..., -0.00772224,
        -0.09697308, -0.0855745 ],
       [ 0.0210641 ,  0.0076285 , -0.01034808, ..., -0.03108582,
        -0.02503936, -0.04262969],
       [-0.01147243,  0.09289231, -0.00782322, ...,  0.00320298,
        -0.03512751, -0.01276768],
       ...,
       [ 0.03782113, -0.07565072,  0.03591477, ...,  0.00927046,
        -0.075997  ,  0.00016669],
       [ 0.02486056, -0.06569406,  0.03583554, ...,  0.00699133,
        -0.07059027, -0.00478684],
       [-0.01100751, -0.03183262, -0.00564734, ...,  0.02782308,
        -0.03704263, -0.02425504]])

In [361]:
cluster_function = hdbscan.HDBSCAN(min_cluster_size=2, metric='euclidean', alpha=0.85, cluster_selection_epsilon=0.85)
cluster_function

In [362]:
labels = cluster_function.fit_predict(numpy_embeddings)
labels

array([ -1,  85, 144, ..., 237, 237,  -1])

In [363]:
unique_labels = set(labels)

# removing the noise labels (-1)
if -1 in unique_labels:
    unique_labels.remove(-1)

num_clusters = len(unique_labels)
num_clusters


277

In [364]:
clustered_points = np.sum(labels != -1)
clustered_points

3332

In [365]:
non_clustered_points = np.sum(labels == -1)
non_clustered_points

104

In [366]:
# a high score (close to 1) means clusters are well-defined and separated.
# a low or negative score means clusters are overlapping or poorly defined.
# calculating for more than one cluster (num_clusters > 1), because with just one cluster, there's nothing to compare it to.
silhouette = silhouette_score(numpy_embeddings, labels) if num_clusters > 1 else None
silhouette

0.6119768028551619

In [None]:
def save_clusters_no_noise(data, labels, output_file):
    data["Cluster ID"] = labels
    clustered_data = data[data["Cluster ID"] != -1]  # excludes points that werent clustered (label -1)
    clustered_data.to_csv(output_file, index=False)
    print(f"clusters saved to {output_file}")

output_file = "output_data/output_clusters_no_noise.csv" # specify output path to save results
save_clusters_no_noise(data, labels, output_file)

In [None]:
def save_clusters_with_noise(data, labels, output_file):
    data["Cluster ID"] = labels
    clustered_data = data[data["Cluster ID"] > -2]  # includes all points (addition of label -1)
    clustered_data.to_csv(output_file, index=False)
    print(f"clusters saved to {output_file}")

output_file = "output_data/output_clusters_with_noise.csv"  # specify output path to save results
save_clusters_with_noise(data, labels, output_file)