In [None]:
import os
import pandas as pd
import json
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_distances
from sentence_transformers import SentenceTransformer

In [None]:
# Read the raw JSON records from disk.
with open("../../data/shuffled_output.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Flatten nested objects into dotted column names (e.g. "info.company_name").
# Note: `data` is rebound here from the parsed JSON to a DataFrame.
data = pd.json_normalize(data)

# Number of rows loaded.
print(len(data))

In [None]:
# Inspect the "completed_notes" column of the loaded frame.
data["completed_notes"]

In [None]:
# In-memory cache of embedded DataFrames, keyed by short model name
# (the part of the model identifier after the "/").
embedding_cache = {}

# Reload embeddings written by an earlier run: one "<short name>.parquet" file
# per model in the working directory. Missing files are simply skipped.
for cached_model in (
    "gte-multilingual-base",
    "paraphrase-multilingual-MiniLM-L12-v2",
    "LaBSE",
    "bge-m3",
):
    parquet_path = f"{cached_model}.parquet"
    if os.path.isfile(parquet_path):
        embedding_cache[cached_model] = pd.read_parquet(parquet_path, engine='pyarrow')

In [None]:
def embed_data(model_name, dataframe, columns_to_embed):
    """Return ``dataframe`` with a ``metadata.embedding`` column for ``model_name``.

    Results are memoised in the module-level ``embedding_cache`` under the short
    model name (the segment after the first "/" of ``model_name``; a name with no
    "/" is used unchanged).

    Caveat: the cache key is the model name only. A cache hit returns the stored
    frame as-is (the same object, not a copy), regardless of the ``dataframe``
    and ``columns_to_embed`` passed to this call.

    Args:
        model_name: Identifier handed to ``SentenceTransformer``, e.g.
            "sentence-transformers/LaBSE".
        dataframe: Frame whose rows are embedded. It is not modified.
        columns_to_embed: Columns whose values are cast to ``str`` and joined
            with single spaces to form the text embedded for each row.

    Returns:
        A DataFrame with the input columns plus ``metadata.embedding`` holding
        one vector per row.
    """
    # "org/name" -> "name". Falling back to the whole string avoids the
    # IndexError that indexing [1] raised for identifiers without a "/".
    name_parts = model_name.split("/")
    cache_key = name_parts[1] if len(name_parts) > 1 else name_parts[0]

    if cache_key in embedding_cache:
        print(f"Using cached embeddings for model: {cache_key}")
        return embedding_cache[cache_key]

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # NOTE(review): trust_remote_code=True allows the model repository to run
    # its own Python code on load; only use it with repositories you trust.
    model = SentenceTransformer(model_name, device=device, trust_remote_code=True)

    # One text per row: the selected columns, stringified and space-joined.
    combined_texts = dataframe[columns_to_embed].astype(str).agg(' '.join, axis=1).tolist()

    # Encode on the selected device, requesting normalized embeddings.
    embeddings = model.encode(combined_texts, show_progress_bar=True, normalize_embeddings=True, device=device)

    # Attach the vectors to a copy so the caller's frame stays untouched.
    embedded_df = dataframe.copy()
    embedded_df["metadata.embedding"] = list(embeddings)

    embedding_cache[cache_key] = embedded_df

    # Drop the model and, when a GPU was used, release its cached memory.
    del model
    if device == "cuda":
        torch.cuda.empty_cache()

    return embedded_df

In [None]:
# Models to embed with; each identifier is passed to embed_data below.
model_names = ["sentence-transformers/LaBSE", "Alibaba-NLP/gte-multilingual-base", 
               "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", "BAAI/bge-m3"]

# Column(s) whose text is embedded.
columns_to_embed = ["info.product_description"]

# Embed the data with every model (or reuse the cached result) and write each
# result to "<short model name>.parquet" so later runs can load it from disk.
for model_name in model_names:
    embedded_df = embed_data(model_name, data, columns_to_embed)
    short_name = model_name.split("/")[1]
    embedded_df.to_parquet(f'{short_name}.parquet', engine='pyarrow')

In [None]:
def cluster_DBSCAN(dataframe, eps, metric):
    """Cluster the row embeddings with DBSCAN and label each cluster by company.

    Args:
        dataframe: Frame with ``metadata.embedding`` and ``info.company_name``
            columns. It is copied, never modified.
        eps: DBSCAN neighbourhood radius.
        metric: Distance metric forwarded to DBSCAN.

    Returns:
        A copy of ``dataframe`` with a ``metadata.cluster`` column whose value is
        the ``info.company_name`` of the first row (in frame order) belonging to
        the same cluster.
    """
    result = dataframe.copy()

    # Stack the per-row vectors into one 2-D array and cluster it.
    # min_samples=1 is fixed here.
    vectors = np.vstack(result['metadata.embedding'].values)
    result['metadata.cluster'] = DBSCAN(eps=eps, min_samples=1, metric=metric).fit_predict(vectors)

    # The first row seen for each numeric label supplies that cluster's name.
    first_rows = result.drop_duplicates(subset='metadata.cluster')
    label_to_company = dict(zip(first_rows['metadata.cluster'], first_rows['info.company_name']))

    # Swap the numeric labels for the company names.
    result['metadata.cluster'] = result['metadata.cluster'].map(label_to_company)
    return result

In [None]:
import hdbscan

def _assign_outliers_to_nearest_centroid(embeddings, labels):
    """Fold noise points (label -1) into the cluster with the closest centroid.

    Outliers are processed in row order. After each assignment the receiving
    cluster's centroid is updated as a running mean, so later outliers see the
    updated centroid. If no real cluster exists, the labels are returned
    unchanged (all -1) because there is no centroid to attach to.

    Args:
        embeddings: 2-D float array, one row per point.
        labels: 1-D integer cluster labels, -1 marking outliers.

    Returns:
        A new label array; the input is not modified.
    """
    labels = np.array(labels, copy=True)
    cluster_ids = np.unique(labels[labels != -1])
    if cluster_ids.size == 0:
        return labels

    # Member counts and centroids, row i belonging to cluster_ids[i].
    counts = np.array([np.sum(labels == cid) for cid in cluster_ids], dtype=np.float64)
    centroids = np.vstack([embeddings[labels == cid].mean(axis=0) for cid in cluster_ids])

    for pos in np.flatnonzero(labels == -1):
        point = embeddings[pos]
        # argmin returns the first minimum, i.e. ties go to the lowest cluster id.
        nearest = int(np.argmin(cosine_distances(point.reshape(1, -1), centroids)[0]))
        labels[pos] = cluster_ids[nearest]

        # Running mean: new = old + (x - old) / n, with n the updated size.
        counts[nearest] += 1
        centroids[nearest] += (point - centroids[nearest]) / counts[nearest]

    return labels


def cluster_HDBSCAN(dataframe, min_cluster_size, min_samples, metric, eps):
    """Cluster the row embeddings with HDBSCAN and label each cluster by company.

    For ``metric == 'cosine'`` a precomputed cosine distance matrix is clustered;
    any other metric is passed to HDBSCAN together with the raw embeddings.
    Points HDBSCAN marks as noise are then attached to the cluster whose centroid
    is nearest in cosine distance. If HDBSCAN finds no cluster at all, every row
    ends up in one cluster named after the first row.

    Args:
        dataframe: Frame with ``metadata.embedding`` and ``info.company_name``
            columns. It is copied, never modified.
        min_cluster_size: Forwarded to ``hdbscan.HDBSCAN``.
        min_samples: Forwarded to ``hdbscan.HDBSCAN``.
        metric: ``'cosine'`` or a metric name forwarded to ``hdbscan.HDBSCAN``.
        eps: Forwarded as ``cluster_selection_epsilon``.

    Returns:
        A copy of ``dataframe`` with a ``metadata.cluster`` column holding the
        ``info.company_name`` of the first row (in frame order) of the cluster.
    """
    df_copy = dataframe.copy()
    embeddings = np.vstack(df_copy['metadata.embedding'].values).astype(np.float64)

    # Cosine is clustered via a precomputed distance matrix; everything else
    # lets HDBSCAN compute distances itself.
    if metric == 'cosine':
        fit_metric = 'precomputed'
        fit_input = cosine_distances(embeddings).astype(np.float64)
    else:
        fit_metric = metric
        fit_input = embeddings

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric=fit_metric,
        gen_min_span_tree=True,
        cluster_selection_method='eom',
        cluster_selection_epsilon=eps,
    )
    labels = _assign_outliers_to_nearest_centroid(embeddings, clusterer.fit_predict(fit_input))

    # Name every cluster after the company in its first row (frame order).
    company_names = df_copy['info.company_name'].to_numpy()
    unique_ids, first_positions = np.unique(labels, return_index=True)
    cluster_names = dict(zip(unique_ids.tolist(), company_names[first_positions]))

    df_copy['metadata.cluster'] = [cluster_names[label] for label in labels.tolist()]
    return df_copy

In [None]:
def convert_to_cluster_members(df):
    """List the companies belonging to each cluster.

    Args:
        df: Frame with ``metadata.cluster`` and ``info.company_name`` columns.

    Returns:
        A DataFrame with one row per cluster and two columns:
        ``metadata.cluster`` (the cluster name) and ``members`` (list of the
        company names in that cluster).
    """
    members_by_cluster = df.groupby('metadata.cluster')['info.company_name'].apply(list)

    # Name the list column, then turn the group key back into a regular column.
    return members_by_cluster.rename('members').reset_index()

In [None]:
def calculate_silhouette_score(df, metric):
    """Silhouette score of the clustering stored in ``df``.

    Args:
        df: Frame with ``metadata.embedding`` and ``metadata.cluster`` columns.
        metric: Distance metric forwarded to ``silhouette_score``.

    Returns:
        The silhouette score, or -1 as a sentinel when the score is undefined:
        fewer than two clusters, or as many clusters as rows (every row is its
        own cluster). ``silhouette_score`` raises in both situations.
    """
    labels = df['metadata.cluster'].to_numpy()
    n_clusters = len(np.unique(labels))

    # The score is only defined for 2 <= n_clusters <= n_samples - 1.
    if n_clusters < 2 or n_clusters >= len(labels):
        return -1

    embeddings = np.vstack(df['metadata.embedding'].values)
    return silhouette_score(embeddings, labels, metric=metric)

In [None]:
def calculate_gap_statistic(df, n_refs=5, random_state=42):
    """Gap statistic of the clustering in ``df``, using cosine dispersion.

    The dispersion of the actual clustering is compared with that of ``n_refs``
    reference datasets drawn uniformly from the per-dimension bounding box of
    the embeddings and clustered by KMeans into the same number of clusters.

    The statistic is ``mean(log(W_ref)) - log(W)``, i.e. the mean of the logs of
    the reference dispersions, as in Tibshirani, Walther & Hastie (2001), not
    the log of their mean.

    Args:
        df: Frame with ``metadata.embedding`` and ``metadata.cluster`` columns.
        n_refs: Number of reference datasets.
        random_state: Base seed. Reference dataset ``i`` is drawn with seed
            ``random_state + i``; KMeans is seeded with ``random_state``.

    Returns:
        The gap statistic as a float.
    """
    embeddings = np.vstack(df['metadata.embedding'].values)
    labels = df['metadata.cluster'].to_numpy()
    n_clusters = len(np.unique(labels))

    def calculate_dispersion_cosine(points, point_labels):
        """Sum of squared cosine distances from points to their cluster centroid."""
        dispersion = 0
        for label in np.unique(point_labels):
            cluster_points = points[point_labels == label]
            # A single-member cluster sits on its centroid and adds nothing.
            if len(cluster_points) > 1:
                centroid = np.mean(cluster_points, axis=0)
                distances = cosine_distances(cluster_points, centroid.reshape(1, -1))
                dispersion += np.sum(distances ** 2)
        return dispersion

    actual_dispersion = calculate_dispersion_cosine(embeddings, labels)

    # Bounding box of the data; identical for every reference draw.
    lower = np.min(embeddings, axis=0)
    upper = np.max(embeddings, axis=0)

    ref_disps = np.zeros(n_refs)
    for i in range(n_refs):
        # A private generator yields the same numbers as np.random.seed(seed)
        # followed by np.random.uniform, without reseeding the global RNG.
        rng = np.random.RandomState(random_state + i)
        random_ref = rng.uniform(low=lower, high=upper, size=embeddings.shape)
        ref_kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(random_ref)
        ref_disps[i] = calculate_dispersion_cosine(random_ref, ref_kmeans.labels_)

    gap_stat = np.mean(np.log(ref_disps)) - np.log(actual_dispersion)
    return gap_stat

In [None]:
def calculate_error_sum_of_squares_cosine(df):
    """Within-cluster error sum of squares, measured with cosine distance.

    For every cluster with more than one member, the squared cosine distances
    between the members and the cluster's mean vector are summed. Clusters with
    a single member are skipped.

    Args:
        df: Frame with ``metadata.embedding`` and ``metadata.cluster`` columns.

    Returns:
        The total over all clusters (0 when no cluster has two or more members).
    """
    vectors = np.vstack(df['metadata.embedding'].values)
    cluster_of = df['metadata.cluster']

    def squared_distances_to_centroid(members):
        center = np.mean(members, axis=0).reshape(1, -1)
        return np.sum(cosine_distances(members, center) ** 2)

    # One array of member vectors per cluster, in sorted label order.
    groups = (vectors[cluster_of == name] for name in np.unique(cluster_of))
    return sum(squared_distances_to_centroid(members) for members in groups if len(members) > 1)

In [None]:
def run_tests(df, metric='cosine'):
    """Compute the three clustering quality scores for ``df``.

    Args:
        df: Frame with ``metadata.embedding`` and ``metadata.cluster`` columns.
        metric: Distance metric for the silhouette score. It defaults to
            ``'cosine'`` so the function can be called with the frame alone.
            The other two scores always use cosine distance.

    Returns:
        A tuple ``(silhouette, gap_stat, ess)``.
    """
    silhouette = calculate_silhouette_score(df, metric)
    gap_stat = calculate_gap_statistic(df)
    ess = calculate_error_sum_of_squares_cosine(df)

    return silhouette, gap_stat, ess

In [None]:
def cluster_and_evaluate(model_name, dataframe, columns_to_embed, eps=0.1, metric='cosine'):
    """Embed, cluster with HDBSCAN, and score the clustering in one call.

    Args:
        model_name: Model identifier forwarded to ``embed_data``.
        dataframe: Input frame forwarded to ``embed_data``.
        columns_to_embed: Columns forwarded to ``embed_data``.
        eps: Forwarded to ``cluster_HDBSCAN`` as ``eps``.
        metric: Distance metric used for clustering and for the scores.

    Returns:
        A tuple ``(clustered_df, silhouette, gap_stat, ess)``.
    """
    with_embeddings = embed_data(model_name, dataframe, columns_to_embed)

    # HDBSCAN with fixed min_cluster_size=2 and min_samples=1.
    clustered = cluster_HDBSCAN(with_embeddings, min_cluster_size=2, min_samples=1, metric=metric, eps=eps)

    sil_score, gap_score, ess_score = run_tests(clustered, metric)
    return clustered, sil_score, gap_score, ess_score

In [None]:
# Column(s) whose text is embedded.
columns_to_embed = ["info.product_description"]

# Model identifiers to compare; each is passed through to embed_data.
model_names = ["BAAI/bge-m3", "Alibaba-NLP/gte-multilingual-base", 
               "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", "sentence-transformers/LaBSE"]

# Epsilon values swept for every model.
# NOTE(review): 0.00675 breaks the halving pattern of its neighbours
# (0.05, 0.025, 0.0125); 0.00625 may have been intended - confirm.
# NOTE(review): cosine distance never exceeds 2, so the entries above 2
# presumably behave alike under the default cosine metric - confirm.
eps_values = [0, 0.00675, 0.0125, 0.025, 0.05, 0.1, 0.15, 0.2, 0.5, 1, 2, 4, 8, 16]

In [None]:
def evaluate_models_and_eps(dataframe, columns_to_embed, model_names, eps_values):
    """Score every (model, eps) combination and collect the results in a table.

    Args:
        dataframe: Input frame forwarded to ``cluster_and_evaluate``.
        columns_to_embed: Columns forwarded to ``cluster_and_evaluate``.
        model_names: Model identifiers to try (outer loop).
        eps_values: Epsilon values to try for each model (inner loop).

    Returns:
        A DataFrame with one row per combination and the columns ``model_name``,
        ``eps``, ``silhouette_score``, ``gap_statistic`` and
        ``error_sum_of_squares``.
    """
    def evaluate_one(model_name, eps):
        # Only the scores are kept; the clustered frame is discarded.
        _, silhouette, gap_stat, ess = cluster_and_evaluate(
            model_name, dataframe, columns_to_embed, eps=eps
        )
        return {
            'model_name': model_name,
            'eps': eps,
            'silhouette_score': silhouette,
            'gap_statistic': gap_stat,
            'error_sum_of_squares': ess,
        }

    rows = [evaluate_one(name, eps) for name in model_names for eps in eps_values]
    return pd.DataFrame(rows)

# Run the full sweep: every model in model_names with every value in eps_values.
results_df = evaluate_models_and_eps(data, columns_to_embed, model_names, eps_values)

In [None]:
# Show up to the first 60 rows of the sweep results.
results_df.head(60)

In [None]:
# Single run with one model at eps=0; clustered_df is used by the cells below.
clustered_df, silhouette, gap_stat, ess = cluster_and_evaluate("Alibaba-NLP/gte-multilingual-base", data, columns_to_embed, eps=0, metric='cosine')

In [None]:
# List the columns of the clustered frame.
clustered_df.columns

In [None]:
# Recompute the three scores for the clustered frame. run_tests needs the
# metric; pass the same one that was used to build clustered_df.
run_tests(clustered_df, metric='cosine')

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns


# Pull the cluster names and embedding vectors out of the clustered frame.
clusters = clustered_df['metadata.cluster'].tolist()
embeddings = clustered_df['metadata.embedding'].tolist()

# Turn the list of per-row vectors into one array for scikit-learn.
embeddings_array = np.array(embeddings)

# Project onto the first two principal components for plotting.
pca = PCA(n_components=2)
pca_embeddings = pca.fit_transform(embeddings_array)

# One row per point: the 2-D coordinates plus the cluster used for colouring.
df_pca_embeddings = pd.DataFrame(pca_embeddings, columns=['PCA1', 'PCA2'])
df_pca_embeddings['cluster'] = clusters

# Scatter plot of the projection, coloured by cluster, with the legend hidden.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=df_pca_embeddings,
    x='PCA1',
    y='PCA2',
    hue='cluster',
    palette='viridis',
    s=100,
    edgecolor='k',
    ax=ax,
)
ax.set_title('PCA of Embeddings')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.legend().set_visible(False)
plt.show()

In [None]:
from sklearn.manifold import TSNE

# Extract the embeddings and cluster names from the clustered frame.
embeddings = clustered_df['metadata.embedding'].tolist()
clusters = clustered_df['metadata.cluster'].tolist()

# Turn the list of per-row vectors into one array for scikit-learn.
embeddings_array = np.array(embeddings)

# Initialize t-SNE. The iteration count is left at the library default (1000)
# instead of being passed as `n_iter`: scikit-learn 1.5 deprecated that keyword
# in favour of `max_iter`, so omitting it behaves the same on old and new releases.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)

# Fit and transform the embeddings using t-SNE.
tsne_embeddings = tsne.fit_transform(embeddings_array)

# One row per point: the 2-D coordinates plus the cluster used for colouring.
df_tsne_embeddings = pd.DataFrame(tsne_embeddings, columns=['TSNE1', 'TSNE2'])
df_tsne_embeddings['cluster'] = clusters

# Scatter plot of the t-SNE layout, coloured by cluster, with the legend hidden.
plt.figure(figsize=(10, 8))
sns.scatterplot(x='TSNE1', y='TSNE2', hue='cluster', palette='viridis', data=df_tsne_embeddings, s=100, edgecolor='k')
plt.title('t-SNE of Embeddings')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend().set_visible(False)
plt.show()


In [None]:
# Number of distinct clusters.
len(clustered_df["metadata.cluster"].unique())

In [None]:
# Number of rows per cluster, largest first.
clustered_df["metadata.cluster"].value_counts()

In [None]:
# Show every row assigned to the cluster carrying this company's name.
clustered_df[clustered_df["metadata.cluster"] == "Sanofi Sağlık Ürünleri Limited Şirketi"]

In [None]:
# Cluster names and embedding vectors of the current clustered frame.
clusters = clustered_df['metadata.cluster'].tolist()
embeddings = clustered_df['metadata.embedding'].tolist()

# Array form of the vectors, as expected by scikit-learn.
embeddings_array = np.array(embeddings)

# Reduce to two principal components for a 2-D view.
pca = PCA(n_components=2)
pca_embeddings = pca.fit_transform(embeddings_array)

# Coordinates and cluster name of every point.
df_pca_embeddings = pd.DataFrame(pca_embeddings, columns=['PCA1', 'PCA2'])
df_pca_embeddings['cluster'] = clusters

# Draw the points coloured by cluster; the legend is created and then hidden.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=df_pca_embeddings,
    x='PCA1',
    y='PCA2',
    hue='cluster',
    palette='viridis',
    s=100,
    edgecolor='k',
    ax=ax,
)
ax.set_title('PCA of Embeddings')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.legend().set_visible(False)
plt.show()



In [None]:
# Extract the embeddings and cluster names from the clustered frame.
embeddings = clustered_df['metadata.embedding'].tolist()
clusters = clustered_df['metadata.cluster'].tolist()

# Turn the list of per-row vectors into one array for scikit-learn.
embeddings_array = np.array(embeddings)

# Initialize t-SNE. `n_iter` is not passed: scikit-learn 1.5 deprecated that
# keyword in favour of `max_iter`, and the default iteration count is the same
# 1000, so omitting it behaves identically on old and new releases.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)

# Fit and transform the embeddings using t-SNE.
tsne_embeddings = tsne.fit_transform(embeddings_array)

# One row per point: the 2-D coordinates plus the cluster used for colouring.
df_tsne_embeddings = pd.DataFrame(tsne_embeddings, columns=['TSNE1', 'TSNE2'])
df_tsne_embeddings['cluster'] = clusters

# Scatter plot of the t-SNE layout, coloured by cluster, with the legend hidden.
plt.figure(figsize=(10, 8))
sns.scatterplot(x='TSNE1', y='TSNE2', hue='cluster', palette='viridis', data=df_tsne_embeddings, s=100, edgecolor='k')
plt.title('t-SNE of Embeddings')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend().set_visible(False)
plt.show()
