In [1]:
import os
import random
import pandas as pd
import json
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import torch
import hdbscan
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_distances
from sentence_transformers import SentenceTransformer
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the raw parent-company records and flatten them into a table
# (nested keys become dotted column names such as "info.company_name").
with open("../../data/shuffled_output.json", mode="r", encoding="utf-8") as parent_file:
    parent_data = pd.json_normalize(json.load(parent_file))

print(len(parent_data))

400


In [3]:
# Read the subsidiary records and flatten them into a table.
with open("../../data/subsidiary_companies_with_prod_descs.json", mode="r", encoding="utf-8") as sub_file:
    sub_data = pd.json_normalize(json.load(sub_file))

print(len(sub_data))

# Only subsidiaries that carry a product description can be embedded later on.
sub_data = sub_data[sub_data["info.product_description"].notna()]

print(len(sub_data))

2581
2581


In [4]:
# A company is treated as a holding/group when its name contains any of the
# markers below (case-insensitive); everything else is a standalone company.
company_name_col = parent_data["company_name"]
is_group_name = (
    company_name_col.str.contains("holding", case=False)
    | company_name_col.str.contains("grup", case=False)
    | company_name_col.str.contains("grubu", case=False)
)

singular_data = parent_data[~is_group_name]
only_parent_data = parent_data[is_group_name]

len(parent_data), len(singular_data), len(only_parent_data)

(400, 355, 45)

In [None]:
# Working frame for the analysis: starts as an independent copy of the standalone
# companies so later additions do not touch `singular_data`.
holdings_out_subsidiries_in_df = singular_data.copy()

In [None]:
# Replace every holding/group company by its subsidiaries: for each parent, collect
# the subsidiaries whose `parent_name` equals the parent's company name.
# Records are gathered in a list and concatenated once, instead of growing the
# frame with one `pd.concat` per matching row.
subsidiary_records = []
for parent_name in only_parent_data["info.company_name"]:
    matching_subs = sub_data[sub_data["parent_name"] == parent_name]
    for sub_name, sub_description in zip(matching_subs["info.company_name"],
                                         matching_subs["info.product_description"]):
        subsidiary_records.append({
            "info.company_name": sub_name,
            "info.affiliations.parents": [parent_name],
            "info.product_description": sub_description,
        })

# Rebuild from `singular_data` (not from the previously accumulated frame) so that
# re-running this cell does not append the same subsidiaries a second time.
if subsidiary_records:
    holdings_out_subsidiries_in_df = pd.concat(
        [singular_data, pd.DataFrame(subsidiary_records)],
        ignore_index=True,
    )
else:
    holdings_out_subsidiries_in_df = singular_data.copy()

In [None]:
# Inspect the last rows of the combined frame (where concatenated rows end up)
holdings_out_subsidiries_in_df.tail()

In [None]:
# In-memory cache of embedded frames, keyed by the short model name
embedding_cache = {}

# Load any embedding files that already exist in the working directory
# (one "<short model name>.parquet" file per model).
for cached_model_name in (
    "gte-multilingual-base",
    "paraphrase-multilingual-MiniLM-L12-v2",
    "LaBSE",
    "bge-m3",
):
    cached_parquet_path = f"{cached_model_name}.parquet"
    if os.path.isfile(cached_parquet_path):
        embedding_cache[cached_model_name] = pd.read_parquet(cached_parquet_path, engine='pyarrow')

In [None]:
def embed_data(model_name, dataframe, columns_to_embed):
    """
    Embed the text of `columns_to_embed` for every row of `dataframe`.

    Parameters:
    -----------
    model_name : str
        Model identifier passed to SentenceTransformer. For "<org>/<name>" the part
        after the first slash is used as the cache key; a name without a slash is
        used as the key unchanged.
    dataframe : pandas DataFrame
        Rows to embed
    columns_to_embed : list of str
        Columns whose string values are joined with a space into one text per row

    Returns:
    --------
    DataFrame: a copy of `dataframe` with an added 'metadata.embedding' column
    (or the cached frame for this model, when one with the same row count exists).
    """
    name_parts = model_name.split("/")
    # Fall back to the whole name when there is no "<org>/" prefix instead of raising IndexError
    short_name = name_parts[1] if len(name_parts) > 1 else name_parts[0]

    # Check if we already have embeddings for this model cached
    cached_df = embedding_cache.get(short_name)
    if cached_df is not None:
        # Minimal sanity check: a cache built from a differently sized frame cannot
        # belong to `dataframe`, so it is recomputed rather than silently reused.
        if len(cached_df) == len(dataframe):
            print(f"Using cached embeddings for model: {short_name}")
            return cached_df
        print(f"Cached embeddings for model {short_name} have {len(cached_df)} rows but the "
              f"dataframe has {len(dataframe)}; recomputing.")

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the sentence transformer model on the GPU if available
    model = SentenceTransformer(model_name, device=device, trust_remote_code=True)

    # Combine the text from the specified columns into a single string per row
    combined_texts = dataframe[columns_to_embed].astype(str).agg(' '.join, axis=1).tolist()

    # Generate L2-normalised embeddings for the combined texts
    embeddings = model.encode(combined_texts, show_progress_bar=True, normalize_embeddings=True, device=device)

    # Attach one embedding vector per row to a copy of the input
    embedded_df = dataframe.copy()
    embedded_df["metadata.embedding"] = list(embeddings)

    # Cache the embeddings for future use
    embedding_cache[short_name] = embedded_df

    # Release the GPU memory by deleting the model and clearing the CUDA cache
    del model
    torch.cuda.empty_cache()

    return embedded_df

In [None]:
model_names = [
    "sentence-transformers/LaBSE",
    "Alibaba-NLP/gte-multilingual-base",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "BAAI/bge-m3",
]

columns_to_embed = ["info.product_description"]

# Embed the product descriptions with every model and persist each result as
# "<short model name>.parquet" next to the notebook.
for model_name in model_names:
    short_name = model_name.split("/")[1]
    embedded_df = embed_data(model_name, holdings_out_subsidiries_in_df, columns_to_embed)
    embedded_df.to_parquet(f'{short_name}.parquet', engine='pyarrow')

In [None]:
def cluster_DBSCAN_PCA(dataframe, eps, metric, reduced_dims=None):
    """
    Perform DBSCAN clustering on embeddings with optional PCA dimensionality reduction.
    Parameters:
    -----------
    dataframe : pandas DataFrame
        DataFrame containing embeddings in 'metadata.embedding' column and company
        names in 'info.company_name'
    eps : float
        The maximum distance between two samples for one to be considered in the neighborhood of the other
    metric : str
        Distance metric to use ('cosine', 'euclidean', etc.)
    reduced_dims : int, optional
        Number of dimensions to reduce to using PCA before clustering
        If None, clustering is performed on the original embeddings
    Returns:
    --------
    Copy of the input DataFrame with a 'metadata.cluster' column holding, for each
    row, the name of the first company (in row order) of its cluster
    """
    # Work on a copy so the caller's dataframe is not modified
    df_copy = dataframe.copy()

    # Stack the per-row embeddings into a 2D array (rows x dimensions)
    features = np.vstack(df_copy['metadata.embedding'].values)

    # Apply PCA dimensionality reduction if requested
    if reduced_dims is not None:
        print(f"Applying PCA dimensionality reduction to {reduced_dims} dimensions...")
        pca = PCA(n_components=reduced_dims, random_state=42)
        features = pca.fit_transform(features)

        # Report how much of the variance the kept components explain
        explained_variance = sum(pca.explained_variance_ratio_) * 100
        print(f"PCA explained variance: {explained_variance:.2f}%")

    # Cluster whichever representation was selected above (single code path)
    dbscan = DBSCAN(eps=eps, min_samples=1, metric=metric)
    df_copy['metadata.cluster'] = dbscan.fit_predict(features)

    # Name each cluster after its first company. drop_duplicates keeps the first row
    # per cluster id in a single pass, instead of filtering the frame once per cluster.
    first_rows = df_copy.drop_duplicates(subset='metadata.cluster')
    cluster_names = dict(zip(first_rows['metadata.cluster'], first_rows['info.company_name']))

    # Replace cluster IDs with the first company name
    df_copy['metadata.cluster'] = df_copy['metadata.cluster'].map(cluster_names)

    return df_copy

In [None]:
def cluster_DBSCAN(dataframe, eps, metric, reduced_dims=None):
    """
    Perform DBSCAN clustering on embeddings with optional t-SNE dimensionality reduction.

    Parameters:
    -----------
    dataframe : pandas DataFrame
        DataFrame containing embeddings in 'metadata.embedding' column and company
        names in 'info.company_name'
    eps : float
        The maximum distance between two samples for one to be considered in the neighborhood of the other
    metric : str
        Distance metric to use ('cosine', 'euclidean', etc.)
    reduced_dims : int, optional
        Number of dimensions to reduce to using t-SNE before clustering
        If None, clustering is performed on the original embeddings

    Returns:
    --------
    Copy of the input DataFrame with a 'metadata.cluster' column holding, for each
    row, the name of the first company (in row order) of its cluster
    """
    # Work on a copy so the caller's dataframe is not modified
    df_copy = dataframe.copy()

    # Stack the per-row embeddings into a 2D array (rows x dimensions)
    features = np.vstack(df_copy['metadata.embedding'].values)

    # Apply t-SNE dimensionality reduction if requested
    if reduced_dims is not None:
        print(f"Applying t-SNE dimensionality reduction to {reduced_dims} dimensions...")

        # Barnes-Hut is only used for up to 3 output dimensions; larger targets use the exact method
        tsne_method = 'barnes_hut' if reduced_dims <= 3 else 'exact'
        tsne = TSNE(n_components=reduced_dims, random_state=42, n_jobs=-1, method=tsne_method)
        features = tsne.fit_transform(features)

        # Print the KL divergence (information loss)
        print(f"t-SNE KL divergence (information loss): {tsne.kl_divergence_:.4f}")

    # Cluster whichever representation was selected above (single code path)
    dbscan = DBSCAN(eps=eps, min_samples=1, metric=metric)
    df_copy['metadata.cluster'] = dbscan.fit_predict(features)

    # Name each cluster after its first company. drop_duplicates keeps the first row
    # per cluster id in a single pass, instead of filtering the frame once per cluster.
    first_rows = df_copy.drop_duplicates(subset='metadata.cluster')
    cluster_names = dict(zip(first_rows['metadata.cluster'], first_rows['info.company_name']))

    # Replace cluster IDs with the first company name
    df_copy['metadata.cluster'] = df_copy['metadata.cluster'].map(cluster_names)

    return df_copy

In [None]:
def cluster_HDBSCAN(dataframe, min_cluster_size, min_samples, metric, eps, reduced_dims=None):
    """
    Perform HDBSCAN clustering on embeddings with optional t-SNE dimensionality reduction.

    Points that HDBSCAN labels as noise (-1) are afterwards assigned, one by one in
    row order, to the cluster whose centroid (in the original embedding space) is
    closest by cosine distance; that centroid is recomputed after each assignment.
    If HDBSCAN finds no cluster at all, every row stays in one common group.

    Parameters:
    -----------
    dataframe : pandas DataFrame
        DataFrame containing embeddings in 'metadata.embedding' column and company
        names in 'info.company_name'
    min_cluster_size : int
        Minimum size of clusters
    min_samples : int
        Minimum number of samples in the neighborhood to be considered a core point
    metric : str
        Distance metric to use ('cosine', 'euclidean', etc.)
    eps : float
        Cluster selection epsilon
    reduced_dims : int, optional
        Number of dimensions to reduce to using t-SNE before clustering
        If None, clustering is performed on the original embeddings

    Returns:
    --------
    Copy of the input DataFrame with a 'metadata.cluster' column holding, for each
    row, the name of the first company (in row order) of its cluster
    """
    df_copy = dataframe.copy()

    # Embeddings as stored (used for centroids) and a float64 view (used for clustering)
    raw_embeddings = np.vstack(df_copy['metadata.embedding'].values)
    embeddings = raw_embeddings.astype(np.float64)

    # Apply t-SNE dimensionality reduction if requested
    features = embeddings
    if reduced_dims is not None:
        print(f"Applying t-SNE dimensionality reduction to {reduced_dims} dimensions...")

        # Barnes-Hut is only used for up to 3 output dimensions; larger targets use the exact method
        tsne_method = 'barnes_hut' if reduced_dims <= 3 else 'exact'
        tsne = TSNE(n_components=reduced_dims, random_state=42, n_jobs=-1, method=tsne_method)
        features = tsne.fit_transform(embeddings)

    # Cosine distance is handed to HDBSCAN as a precomputed float64 distance matrix;
    # every other metric is passed through by name.
    if metric == 'cosine':
        clustering_input = cosine_distances(features).astype(np.float64)
        hdbscan_metric = 'precomputed'
    else:
        clustering_input = features
        hdbscan_metric = metric

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric=hdbscan_metric,
        gen_min_span_tree=True,
        cluster_selection_method='eom',
        cluster_selection_epsilon=eps
    )
    labels = np.asarray(clusterer.fit_predict(clustering_input)).copy()

    # Step 1: Centroid of every real cluster, computed on the original embeddings
    centroids = {
        cluster_id: raw_embeddings[labels == cluster_id].mean(axis=0)
        for cluster_id in np.unique(labels[labels != -1])
    }

    # Step 2: Assign each outlier to the closest centroid. Skipped when there is no
    # cluster to attach to (min() over an empty dict would raise ValueError).
    # Rows are addressed by position, so a non-unique dataframe index is harmless.
    if centroids:
        for pos in np.flatnonzero(labels == -1):
            point = raw_embeddings[pos].reshape(1, -1)
            closest_cluster = min(
                centroids,
                key=lambda cid: cosine_distances(point, centroids[cid].reshape(1, -1))[0][0]
            )
            labels[pos] = closest_cluster

            # Recompute the centroid of the cluster that just gained a member
            centroids[closest_cluster] = raw_embeddings[labels == closest_cluster].mean(axis=0)

    # Step 3: Rename clusters by the first company (in row order) of each cluster
    company_names = df_copy['info.company_name'].to_numpy()
    _, first_positions = np.unique(labels, return_index=True)
    cluster_names = {labels[pos]: company_names[pos] for pos in first_positions}

    df_copy['metadata.cluster'] = [cluster_names[label] for label in labels]

    return df_copy

In [None]:
def convert_to_cluster_members(df):
    """
    Collapse a clustered frame into one row per cluster.

    Returns a DataFrame with the columns 'metadata.cluster' (the cluster label) and
    'members' (list of the 'info.company_name' values in that cluster, in row order).
    """
    members_per_cluster = df.groupby('metadata.cluster')['info.company_name'].apply(list)

    # Naming the series before reset_index yields the two-column layout directly
    return members_per_cluster.rename('members').reset_index()

In [None]:
def calculate_silhouette_score(df, metric, reduced_dims=None):
    """
    Silhouette score of the clustering stored in 'metadata.cluster'.

    Parameters:
    -----------
    df : pandas DataFrame
        Frame with 'metadata.embedding' and 'metadata.cluster' columns
    metric : str
        Distance metric used when scoring the original embeddings
    reduced_dims : int, optional
        If given, the embeddings are first projected with t-SNE to this many
        dimensions and the score is computed with the euclidean metric instead

    Returns:
    --------
    The silhouette score, or -1 when it is undefined (fewer than two clusters, or
    as many clusters as samples)
    """
    embeddings = np.vstack(df['metadata.embedding'].values)
    labels = df['metadata.cluster']

    # scikit-learn only accepts 2 <= n_clusters <= n_samples - 1 and raises
    # ValueError otherwise, so both degenerate ends map to the -1 sentinel.
    n_clusters = len(np.unique(labels))
    if n_clusters < 2 or n_clusters >= len(labels):
        return -1

    if reduced_dims is not None:
        print(f"Applying t-SNE dimensionality reduction to {reduced_dims} dimensions for silhouette score...")
        # Barnes-Hut is only used for up to 3 output dimensions; larger targets use the exact method
        tsne_method = 'barnes_hut' if reduced_dims <= 3 else 'exact'
        tsne = TSNE(n_components=reduced_dims, random_state=42, n_jobs=-1, method=tsne_method)
        embeddings = tsne.fit_transform(embeddings)
        print(f"t-SNE KL divergence (information loss): {tsne.kl_divergence_:.4f}")

    score = silhouette_score(embeddings, labels, metric=metric if reduced_dims is None else 'euclidean')
    return score

In [None]:
def calculate_gap_statistic(df, n_refs=5):
    """
    Gap statistic of the clustering stored in 'metadata.cluster'.

    The within-cluster dispersion of the actual clustering is compared with the mean
    dispersion of `n_refs` reference datasets. Each reference dataset is drawn
    uniformly inside the per-dimension bounding box of the embeddings and clustered
    with KMeans into the same number of clusters.

    Parameters:
    -----------
    df : pandas DataFrame
        Frame with 'metadata.embedding' and 'metadata.cluster' columns
    n_refs : int
        Number of random reference datasets

    Returns:
    --------
    log(mean reference dispersion) - log(actual dispersion). The value is infinite
    or NaN when a dispersion is zero (e.g. every cluster is a single point).
    """
    random_state = 42

    embeddings = np.vstack(df['metadata.embedding'].values)
    # Plain array so boolean masks below are purely positional
    labels = df['metadata.cluster'].to_numpy()

    def calculate_dispersion_cosine(embeddings, labels):
        """Calculate the sum of squared cosine distances from points to their cluster centroids."""
        unique_labels = np.unique(labels)
        dispersion = 0
        for label in unique_labels:
            cluster_points = embeddings[labels == label]
            # Single-point clusters contribute nothing
            if len(cluster_points) > 1:
                centroid = np.mean(cluster_points, axis=0)
                # Compute cosine distances between points and their cluster centroid
                distances = cosine_distances(cluster_points, centroid.reshape(1, -1))
                dispersion += np.sum(distances ** 2)
        return dispersion

    # Calculate dispersion for actual data using cosine distances
    actual_dispersion = calculate_dispersion_cosine(embeddings, labels)

    n_clusters = len(np.unique(labels))
    lower_bounds = np.min(embeddings, axis=0)
    upper_bounds = np.max(embeddings, axis=0)

    # Generate reference datasets and calculate their dispersion
    ref_disps = np.zeros(n_refs)
    for i in range(n_refs):
        # A private generator per reference set (different seed each run) draws the
        # same numbers as seeding the global generator would, without reseeding
        # NumPy's global random state as a side effect.
        ref_rng = np.random.RandomState(random_state + i)
        random_ref = ref_rng.uniform(low=lower_bounds, high=upper_bounds, size=embeddings.shape)
        ref_kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(random_ref)
        ref_disps[i] = calculate_dispersion_cosine(random_ref, ref_kmeans.labels_)

    # Calculate the gap statistic
    gap_stat = np.log(np.mean(ref_disps)) - np.log(actual_dispersion)
    return gap_stat

In [None]:
def calculate_error_sum_of_squares_cosine(df):
    """
    Sum, over all clusters with more than one member, of the squared cosine
    distances between each member's embedding and the cluster's mean embedding.

    Reads 'metadata.embedding' and 'metadata.cluster' from `df`.
    """
    vectors = np.vstack(df['metadata.embedding'].values)
    labels = df['metadata.cluster']

    total_ess = 0
    for cluster_label in np.unique(labels):
        members = vectors[labels == cluster_label]
        # A single-member cluster has zero spread around its own centroid
        if len(members) <= 1:
            continue
        centroid_row = np.mean(members, axis=0).reshape(1, -1)
        total_ess += np.sum(cosine_distances(members, centroid_row) ** 2)

    return total_ess

In [None]:
def calculate_davies_bouldin_index(df, reduced_dims=None):
    """
    Davies-Bouldin index of the clustering stored in 'metadata.cluster'.

    Parameters:
    -----------
    df : pandas DataFrame
        Frame with 'metadata.embedding' and 'metadata.cluster' columns
    reduced_dims : int, optional
        If given, the embeddings are first projected with t-SNE to this many dimensions

    Returns:
    --------
    The Davies-Bouldin index, or -1 when it is undefined (fewer than two clusters,
    or as many clusters as samples)
    """
    embeddings = np.vstack(df['metadata.embedding'].values)
    labels = df['metadata.cluster']

    # scikit-learn only accepts 2 <= n_clusters <= n_samples - 1 and raises
    # ValueError otherwise, so both degenerate ends map to the -1 sentinel.
    n_clusters = len(np.unique(labels))
    if n_clusters < 2 or n_clusters >= len(labels):
        return -1

    if reduced_dims is not None:
        print(f"Applying t-SNE dimensionality reduction to {reduced_dims} dimensions for Davies-Bouldin index...")
        # Barnes-Hut is only used for up to 3 output dimensions; larger targets use the exact method
        tsne_method = 'barnes_hut' if reduced_dims <= 3 else 'exact'
        tsne = TSNE(n_components=reduced_dims, random_state=42, n_jobs=-1, method=tsne_method)
        embeddings = tsne.fit_transform(embeddings)
        print(f"t-SNE KL divergence (information loss): {tsne.kl_divergence_:.4f}")

    score = davies_bouldin_score(embeddings, labels)
    return score

In [None]:
def calculate_calinski_harabasz_index(df, reduced_dims=None):
    """
    Calinski-Harabasz index of the clustering stored in 'metadata.cluster'.

    Parameters:
    -----------
    df : pandas DataFrame
        Frame with 'metadata.embedding' and 'metadata.cluster' columns
    reduced_dims : int, optional
        If given, the embeddings are first projected with t-SNE to this many dimensions

    Returns:
    --------
    The Calinski-Harabasz index, or -1 when it is undefined (fewer than two
    clusters, or as many clusters as samples)
    """
    embeddings = np.vstack(df['metadata.embedding'].values)
    labels = df['metadata.cluster']

    # scikit-learn only accepts 2 <= n_clusters <= n_samples - 1 and raises
    # ValueError otherwise, so both degenerate ends map to the -1 sentinel.
    n_clusters = len(np.unique(labels))
    if n_clusters < 2 or n_clusters >= len(labels):
        return -1

    if reduced_dims is not None:
        print(f"Applying t-SNE dimensionality reduction to {reduced_dims} dimensions for Calinski-Harabasz index...")
        # Barnes-Hut is only used for up to 3 output dimensions; larger targets use the exact method
        tsne_method = 'barnes_hut' if reduced_dims <= 3 else 'exact'
        tsne = TSNE(n_components=reduced_dims, random_state=42, n_jobs=-1, method=tsne_method)
        embeddings = tsne.fit_transform(embeddings)
        print(f"t-SNE KL divergence (information loss): {tsne.kl_divergence_:.4f}")

    score = calinski_harabasz_score(embeddings, labels)
    return score

In [None]:
# Main function to compute all five clustering quality measures
def run_tests(df, metric, reduced_dims=None):
    """
    Evaluate a clustered frame.

    Returns the tuple (silhouette, gap statistic, error sum of squares,
    Davies-Bouldin index, Calinski-Harabasz index). `metric` is only forwarded to
    the silhouette score; `reduced_dims` is forwarded to the silhouette,
    Davies-Bouldin and Calinski-Harabasz calculations.
    """
    return (
        calculate_silhouette_score(df, metric, reduced_dims=reduced_dims),
        calculate_gap_statistic(df),
        calculate_error_sum_of_squares_cosine(df),
        calculate_davies_bouldin_index(df, reduced_dims=reduced_dims),
        calculate_calinski_harabasz_index(df, reduced_dims=reduced_dims),
    )

In [None]:
def cluster_and_evaluate(model_name, dataframe, columns_to_embed, eps=0.1, metric='cosine', reduced_dims=None):
    """
    Embed `dataframe`, cluster it with DBSCAN and score the clustering.

    Returns (clustered_df, silhouette, gap_stat, ess, dbi, chi).
    """
    # Embed, then cluster the embedded rows
    clustered_df = cluster_DBSCAN(
        embed_data(model_name, dataframe, columns_to_embed),
        eps=eps,
        metric=metric,
        reduced_dims=reduced_dims,
    )
    # HDBSCAN alternative kept for reference:
    # clustered_df = cluster_HDBSCAN(embedded_df, min_cluster_size=2, min_samples=1, metric=metric, eps=eps, reduced_dims=reduced_dims)

    # Silhouette, gap statistic, error sum of squares, Davies-Bouldin, Calinski-Harabasz
    scores = run_tests(clustered_df, metric, reduced_dims=reduced_dims)

    return (clustered_df, *scores)

In [None]:
# Text column(s) joined into the string that gets embedded for every company
columns_to_embed = ["info.product_description"]

# Embedding models to compare, in "<organisation>/<model>" form
model_names = ["BAAI/bge-m3", "Alibaba-NLP/gte-multilingual-base", 
               "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", "sentence-transformers/LaBSE"]

# DBSCAN eps values to sweep; the wider grid is parked in the trailing comment.
# NOTE(review): scikit-learn documents DBSCAN's eps as strictly positive - confirm
# that the installed version accepts eps=0 before running the sweep with this value.
eps_values = [0] # 0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1 ]

In [None]:
def evaluate_models_and_eps(dataframe, columns_to_embed, model_names, eps_values, reduced_dims, metric="euclidean"):
    """
    Run `cluster_and_evaluate` for every (model, eps) combination.

    Parameters:
    -----------
    dataframe : pandas DataFrame
        Rows to embed and cluster
    columns_to_embed : list of str
        Columns whose text is embedded
    model_names : list of str
        Embedding models to try (outer loop)
    eps_values : list of float
        DBSCAN eps values to try for each model (inner loop)
    reduced_dims : int or None
        Forwarded to `cluster_and_evaluate`
    metric : str
        Distance metric forwarded to `cluster_and_evaluate`; defaults to
        "euclidean", the value that was previously fixed inside this function

    Returns:
    --------
    DataFrame with one row per combination and the columns 'model_name', 'eps',
    'silhouette_score', 'gap_statistic', 'error_sum_of_squares', 'dbi', 'chi'
    """
    results = []

    for model_name in model_names:
        for eps in eps_values:
            # The clustered frame itself is not needed here, only its scores
            _, silhouette, gap_stat, ess, dbi, chi = cluster_and_evaluate(
                model_name, dataframe, columns_to_embed, eps=eps, metric=metric, reduced_dims=reduced_dims
            )

            results.append({
                'model_name': model_name,
                'eps': eps,
                'silhouette_score': silhouette,
                'gap_statistic': gap_stat,
                'error_sum_of_squares': ess,
                'dbi': dbi,
                'chi': chi
            })

    # Convert the results list to a DataFrame and return it
    return pd.DataFrame(results)

In [None]:
# Sweep every model / eps combination, reducing the embeddings to 2 dimensions first
results_df = evaluate_models_and_eps(holdings_out_subsidiries_in_df, columns_to_embed, model_names, eps_values, reduced_dims=2)

In [None]:
# Show up to the first 60 rows of the sweep results
results_df.head(60)

In [None]:
# Clustering used for the figures and inspection cells below
clustered_df, silhouette, gap_stat, ess, dbi, chi = cluster_and_evaluate(
    "Alibaba-NLP/gte-multilingual-base",
    holdings_out_subsidiries_in_df,
    columns_to_embed,
    eps=2.0,
    metric='euclidean',
    reduced_dims=2,
)

In [None]:
# Scores in order: silhouette, gap statistic, error sum of squares, Davies-Bouldin, Calinski-Harabasz
print(f"{silhouette:.4f} {gap_stat:.4f} {ess:.4f} {dbi:.4f} {chi:.4f}")

In [None]:
# List the columns available on the clustered frame
clustered_df.columns

In [None]:
# Pull embeddings, cluster labels and company names out of the clustered frame
embeddings = clustered_df['metadata.embedding'].tolist()
clusters = clustered_df['metadata.cluster'].tolist()
company_names = clustered_df['info.company_name'].tolist()

# One row per company, one column per embedding dimension
embeddings_array = np.array(embeddings)

# Project the embeddings to 2D with t-SNE for plotting
tsne = TSNE(n_components=2, perplexity=30, random_state=42, max_iter=1000)
tsne_embeddings = tsne.fit_transform(embeddings_array)

# Plot table: t-SNE coordinates plus the cluster (for coloring) and the
# company name (for hover text)
df_tsne_embeddings = pd.DataFrame(tsne_embeddings, columns=['TSNE1', 'TSNE2']).assign(
    cluster=clusters,
    company_name=company_names,
)

# Sorted list of distinct cluster labels
unique_clusters = sorted(df_tsne_embeddings['cluster'].unique())
num_clusters = len(unique_clusters)

print(f"Total number of clusters: {num_clusters}")

# Function to generate distinct colors
def generate_distinct_colors(n):
    """
    Return `n` RGBA colors spread evenly over matplotlib's 'hsv' colormap.

    The colors are shuffled with a fixed seed so that neighbouring entries are not
    neighbouring hues while the result stays reproducible. Saturation and value are
    whatever the colormap provides; they are not varied.
    """
    # Evenly spaced hues in [0, 1)
    colors = [plt.cm.hsv(i / n) for i in range(n)]

    # Shuffle with a private generator: same order as seeding the global `random`
    # module with 42 would give, without reseeding it as a side effect.
    random.Random(42).shuffle(colors)

    return colors

# Generate distinct colors for all clusters
distinct_colors = generate_distinct_colors(num_clusters)

# Create a custom color map (cluster label -> RGBA color)
cluster_color_map = {cluster: distinct_colors[i] for i, cluster in enumerate(unique_clusters)}

# Create the interactive plotly visualization.
# plotly is treated as optional here: without it only a hint is printed.
try:
    import plotly.express as px
    import plotly.graph_objects as go
    import plotly.io as pio

    # Set renderer to 'browser' to avoid notebook rendering issues
    pio.renderers.default = 'browser'

    # Hex color per cluster, resolved once and reused for the column and the traces
    cluster_hex_colors = {cluster: mcolors.to_hex(cluster_color_map[cluster]) for cluster in unique_clusters}
    df_tsne_embeddings['color'] = df_tsne_embeddings['cluster'].map(cluster_hex_colors)

    # OPTIMIZATION: Create one trace per cluster instead of one trace per point
    fig = go.Figure()

    for cluster in sorted(unique_clusters):
        cluster_data = df_tsne_embeddings[df_tsne_embeddings['cluster'] == cluster]

        fig.add_trace(go.Scatter(
            x=cluster_data['TSNE1'],
            y=cluster_data['TSNE2'],
            mode='markers',
            marker=dict(
                color=cluster_hex_colors[cluster],
                size=8,
                line=dict(width=1, color='DarkSlateGrey')
            ),
            name=f'Cluster {cluster}',
            text=cluster_data['company_name'],  # Use text for basic hover
            hoverinfo='text',
            hovertemplate='<b>%{text}</b><br>Cluster: ' + str(cluster) + '<extra></extra>',
        ))

    # Update layout
    fig.update_layout(
        title='t-SNE Visualization of Embeddings',
        legend_title_text='Cluster',
        width=1000,
        height=800
    )

    # If there are too many clusters, hide the legend
    if num_clusters > 30:
        fig.update_layout(showlegend=False)

    # Save a lightweight HTML fragment first, then a title-less SVG
    try:
        fig.write_html(
            "tsne_clusters_visualization_optimized.html",
            include_plotlyjs='cdn',  # Use CDN for plotly.js
            full_html=False,  # Don't include the full HTML wrapper
            include_mathjax=False,  # No MathJax
            post_script=None,  # No additional scripts
            validate=False  # Skip validation for speed
        )
        print("Optimized interactive visualization saved to 'tsne_clusters_visualization_optimized.html'")
        print("Open this file in your browser for the interactive plot")

        fig.update_layout(title=None)
        fig.update_layout(
            margin=dict(l=0, r=0, t=0, b=0),  # Remove all margins
            paper_bgcolor='rgba(0,0,0,0)',   # Transparent background
        )

        pio.write_image(fig, "tsne_clusters_visualization_optimized.svg")
        # Message now names the file that was actually written
        print("Visualization saved as 'tsne_clusters_visualization_optimized.svg'")

    except Exception as e:
        print(f"Could not save HTML or SVG file: {e}")
        # Fallback to showing the plot if available; still best-effort, but a
        # failure is reported instead of being swallowed silently.
        try:
            fig.show()
        except Exception as show_error:
            print(f"Could not display the figure either: {show_error}")

except ImportError:
    print("Plotly not available. Install with 'pip install plotly' for interactive visualization.")

In [None]:
# Create the plotly visualization with annotations for selected companies.
# Uses `go`, `pio`, `df_tsne_embeddings`, `unique_clusters`, `cluster_color_map`
# and `num_clusters` from the previous cell.
try:
    # Set renderer
    pio.renderers.default = 'browser'

    # Create a color column for plotly
    df_tsne_embeddings['color'] = df_tsne_embeddings['cluster'].map(
        {cluster: mcolors.to_hex(cluster_color_map[cluster]) for cluster in unique_clusters}
    )

    # Base figure: one scatter trace per cluster
    fig = go.Figure()

    for cluster in sorted(unique_clusters):
        cluster_data = df_tsne_embeddings[df_tsne_embeddings['cluster'] == cluster]
        marker_style = dict(
            color=mcolors.to_hex(cluster_color_map[cluster]),
            size=8,
            line=dict(width=1, color='DarkSlateGrey')
        )

        fig.add_trace(go.Scatter(
            x=cluster_data['TSNE1'],
            y=cluster_data['TSNE2'],
            mode='markers',
            marker=marker_style,
            name=f'Cluster {cluster}',
            text=cluster_data['company_name'],
            hoverinfo='text',
            hovertemplate='<b>%{text}</b><br>Cluster: ' + str(cluster) + '<extra></extra>',
        ))

    # Clusters (identified by their label) whose members get text annotations
    highlight_clusters = [
        "Türkiye İş Bankası A.Ş.", 
        "MLP Sağlık Hizmetleri A.Ş.", 
        "Turkuvaz Haberleşme ve Yayıncılık A.Ş.", 
        "Türkiye Sigorta A.Ş.", 
        "Doğuş Enerji",
        "MUTLU AKÜ MALZ. SAN. A.Ş.",
        "Eti Bakır A.Ş.",
        "GÜRİŞ İNŞAAT VE MÜHENDİSLİK A.Ş.",
        "Pınar Gıda Süt Ve Süt Ürünleri San. Tic. Ltd. Şti.",
        "DESA DERİ SAN VE TİC AŞ",
        "Yapı Kredi Bomonti Ada"
    ]

    def get_annotation_positions(companies, max_labels=3):
        """
        Pick at most `max_labels` rows of `companies` and give each a label offset.

        Offsets cycle through four fixed directions so that labels of the same
        cluster point away from each other. Returns a list of dicts with the keys
        'company' (the row), 'ax' and 'ay' (arrow offsets).
        """
        if len(companies) <= 0:
            return []

        # Fixed-seed sample when there are more rows than labels to place
        selected = (
            companies.sample(max_labels, random_state=10)  # 3, 4, 6, 7, "10"
            if len(companies) > max_labels
            else companies
        )

        # (ax, ay): right/up, left/up, right/down, left/down
        offsets = [(70, -40), (-70, -40), (70, 40), (-70, 40)]

        positions = []
        for i, (_, company) in enumerate(selected.iterrows()):
            ax, ay = offsets[i % 4]
            positions.append({'company': company, 'ax': ax, 'ay': ay})

        return positions

    # Annotate the sampled members of every highlighted cluster
    for cluster_name in highlight_clusters:
        cluster_companies = df_tsne_embeddings[df_tsne_embeddings['cluster'] == cluster_name]

        if len(cluster_companies) == 0:
            print(f"Warning: No companies found in cluster '{cluster_name}'")
            continue

        annotation_positions = get_annotation_positions(cluster_companies)

        for position in annotation_positions:
            company = position['company']
            ax = position['ax']
            ay = position['ay']

            # Names longer than 25 characters are cut to 22 plus an ellipsis
            display_name = company['company_name']
            if len(display_name) > 25:
                display_name = display_name[:22] + "..."

            fig.add_annotation(
                x=company['TSNE1'],
                y=company['TSNE2'],
                text=display_name,
                showarrow=True,
                arrowhead=2,
                arrowsize=1,
                arrowwidth=1,
                ax=ax,
                ay=ay,
                bgcolor="white",
                bordercolor=mcolors.to_hex(cluster_color_map[cluster_name]),
                borderwidth=2,
                font=dict(size=9),
                opacity=0.9
            )

    # Update layout
    fig.update_layout(
        width=1000,
        height=800,
        margin=dict(l=0, r=0, t=30, b=0),
        paper_bgcolor='rgba(0,0,0,0)',
        legend_title_text='Cluster'
    )

    # If there are too many clusters, hide the legend
    if num_clusters > 30:
        fig.update_layout(showlegend=False)

    # Copy of the figure without title or margins, used for the static export
    fig_svg = go.Figure(fig)
    fig_svg.update_layout(
        title=None,
        margin=dict(l=0, r=0, t=0, b=0)
    )

    # Save as PDF
    pio.write_image(fig_svg, "tsne_clusters_with_labels.pdf")
    print("Visualization saved as 'tsne_clusters_with_labels.pdf'")

except Exception as e:
    print(f"Error creating visualization: {e}")

In [None]:
# Initialize PCA (reduce to 2 dimensions)
pca = PCA(n_components=2, random_state=42)

# Fit and transform the embeddings
pca_embeddings = pca.fit_transform(embeddings_array)

# Create a DataFrame with PCA components
df_pca_embeddings = pd.DataFrame(pca_embeddings, columns=['PCA1', 'PCA2'])
df_pca_embeddings['company_name'] = company_names

# Set the same renderer
pio.renderers.default = 'browser'

# Reuse one color of the cluster color map for all points. The preferred cluster
# label only exists for some clustering results, so fall back to the first mapped
# color (or a fixed blue when the map is empty) instead of raising KeyError.
preferred_color_cluster = "Türkiye İş Bankası A.Ş."
if preferred_color_cluster in cluster_color_map:
    selected_color = mcolors.to_hex(cluster_color_map[preferred_color_cluster])
elif cluster_color_map:
    selected_color = mcolors.to_hex(next(iter(cluster_color_map.values())))
else:
    selected_color = '#1f77b4'

# Create a PCA plot with consistent styling
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_pca_embeddings['PCA1'],
    y=df_pca_embeddings['PCA2'],
    mode='markers',
    marker=dict(
        color=selected_color,
        size=8,
        line=dict(width=1, color='DarkSlateGrey')
    ),
    text=df_pca_embeddings['company_name'],  # Optional hover info
    hoverinfo='text',
    hovertemplate='<b>%{text}</b><extra></extra>',
    name='All Companies'
))

# Update layout to match style
fig.update_layout(
    width=1000,
    height=800,
    margin=dict(l=0, r=0, t=30, b=0),
    paper_bgcolor='rgba(0,0,0,0)',
    showlegend=False
)

# Copy of the figure without title or margins, used for the static export
fig_svg = go.Figure(fig)
fig_svg.update_layout(
    title=None,
    margin=dict(l=0, r=0, t=0, b=0)
)

# Save to PDF
pio.write_image(fig_svg, "pca_visualization.pdf")
print("PCA visualization saved as 'pca_visualization.pdf'")


In [None]:
# Number of distinct cluster labels in the final clustering
len(clustered_df["metadata.cluster"].unique())

In [None]:
# Member counts per cluster label, limited to the first 40 entries
clustered_df["metadata.cluster"].value_counts().head(40)

In [None]:
# Do not truncate columns when displaying DataFrames from here on
pd.set_option('display.max_columns', None)

In [None]:
# Company names of the rows whose cluster label equals this company name
clustered_df[clustered_df["metadata.cluster"] == "DEARSAN GEMİ İNŞAAT SANAYİ A.Ş."]["info.company_name"]