In [None]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm.notebook import tqdm

# 1. Embed the ideas
def embed_ideas(ideas, model_name='BAAI/bge-large-en-v1.5'):
    """Encode idea texts with a SentenceTransformer model.

    Args:
        ideas: Texts to embed; passed unchanged to ``SentenceTransformer.encode``.
        model_name: Name of the SentenceTransformer model to load.

    Returns:
        The result of ``encode`` called with ``normalize_embeddings=True``
        (a progress bar is shown while encoding).
    """
    encoder = SentenceTransformer(model_name)
    encode_options = {"show_progress_bar": True, "normalize_embeddings": True}
    return encoder.encode(ideas, **encode_options)

# 2. Metric Computation Functions
def compute_coherence(embeddings, labels):
    """Return the mean intra-cluster cosine similarity, averaged over clusters.

    For every cluster, the score is the mean of all pairwise cosine
    similarities between its members. Self-pairs are included, so a
    singleton cluster of a non-zero vector scores 1.0. The per-cluster
    scores are then averaged with equal weight per cluster.

    Args:
        embeddings: 2-D array-like with one row per sample.
        labels: 1-D array-like with one cluster label per row of ``embeddings``.

    Returns:
        The unweighted mean of the per-cluster scores.
    """
    embeddings = np.asarray(embeddings, dtype=float)
    # Accept plain lists too; `list == scalar` would not give a boolean mask.
    labels = np.asarray(labels)

    # Scale rows to unit length so dot products are cosine similarities.
    # All-zero rows are left as zeros (similarity 0 with everything).
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit_rows = embeddings / norms

    cluster_scores = []
    for label in np.unique(labels):
        members = unit_rows[labels == label]
        # mean_{i,j}(u_i . u_j) == ||sum_i u_i||^2 / n^2, which avoids
        # materialising the full n x n similarity matrix.
        summed = members.sum(axis=0)
        cluster_scores.append(float(summed @ summed) / members.shape[0] ** 2)
    return np.mean(cluster_scores)

def compute_exclusivity(embeddings, labels):
    """Score how far apart cluster centroids are from their closest neighbour.

    Each cluster's centroid is the mean of its member embeddings. For every
    centroid, the cosine similarity to its most similar *other* centroid is
    taken; the result is one minus the mean of those values.

    Args:
        embeddings: 2-D array with one row per sample.
        labels: 1-D array with one cluster label per row of ``embeddings``.

    Returns:
        ``1 - mean(max similarity to another centroid)``. With a single
        cluster there is no other centroid, so the value is not meaningful.
    """
    centroids = np.array([
        embeddings[labels == cluster_id].mean(axis=0)
        for cluster_id in np.unique(labels)
    ])
    centroid_sims = cosine_similarity(centroids)
    # Mask self-similarity so the row-wise max picks a different cluster.
    np.fill_diagonal(centroid_sims, -np.inf)
    nearest_other_sim = centroid_sims.max(axis=1)
    return 1 - np.mean(nearest_other_sim)

def compute_inertia(embeddings, labels):
    """Return the within-cluster sum of squared distances to cluster means.

    Each cluster's centre is the mean of its member embeddings; the result is
    the sum, over all samples, of the squared Euclidean distance between the
    sample and the centre of its own cluster.

    Args:
        embeddings: 2-D array-like with one row per sample.
        labels: 1-D array-like with one cluster label per row of ``embeddings``.

    Returns:
        The total inertia as a Python float.
    """
    embeddings = np.asarray(embeddings)
    labels = np.asarray(labels)

    # One pass: `cluster_index[i]` is the position of labels[i] among the
    # sorted unique labels, so no per-sample np.unique/np.where is needed.
    unique_labels, cluster_index = np.unique(labels, return_inverse=True)
    cluster_index = cluster_index.reshape(-1)

    centers = np.array([
        embeddings[cluster_index == position].mean(axis=0)
        for position in range(len(unique_labels))
    ])
    residuals = embeddings - centers[cluster_index]
    # Accumulate in float64 regardless of the embedding dtype.
    return float(np.sum(residuals ** 2, dtype=np.float64))

# 3. Find best number of clusters and plot metrics
def _elbow_index(inertia):
    """Return the index of the sharpest bend in an inertia-vs-k curve."""
    if len(inertia) < 3:
        # Too few points to estimate curvature; fall back to the first k.
        return 0
    # Inertia falls steeply and then flattens, so the elbow is where the
    # discrete second derivative is largest (not smallest).
    curvature = np.gradient(np.gradient(inertia))
    if np.all(np.isnan(curvature)):
        return 0
    return int(np.nanargmax(curvature))


def find_best_k(embeddings, method='kmeans', k_min=5, k_max=950, output_filepath_base=None):
    """Sweep the number of clusters and pick the best k under three criteria.

    For every k in ``[k_min, k_max]`` (capped at ``n_samples - 1``) the data is
    clustered and four metrics are computed: silhouette, coherence,
    exclusivity and inertia. The "semantic" score is the geometric mean of
    coherence and exclusivity.

    Args:
        embeddings: 2-D array with one row per sample.
        method: ``'kmeans'`` or ``'agglomerative'``.
        k_min: Smallest number of clusters to try.
        k_max: Largest number of clusters to try; values above
            ``n_samples - 1`` are skipped because the silhouette score is
            undefined there.
        output_filepath_base: If given, two PNG plots are written to
            ``{output_filepath_base}_{method}_metric_plots.png`` and
            ``{output_filepath_base}_{method}_inertia_plot.png``; the parent
            directory is created when missing.

    Returns:
        Dict with keys ``'silhouette'``, ``'semantic'`` and ``'elbow'``, each
        mapping to the selected k.

    Raises:
        ValueError: If ``method`` is unknown, if no k in the requested range
            is valid for the number of samples, or if every k produced a
            single cluster (no score to maximise).
    """
    if method not in ('kmeans', 'agglomerative'):
        raise ValueError("Method must be 'kmeans' or 'agglomerative'")

    n_samples = len(embeddings)
    k_upper = min(k_max, n_samples - 1)
    if k_upper < k_min:
        raise ValueError(
            f"No valid k in [{k_min}, {k_max}] for {n_samples} samples"
        )
    k_values = list(range(k_min, k_upper + 1))

    silhouette_scores = []
    coherence_scores = []
    exclusivity_scores = []
    inertia_scores = []

    for k in tqdm(k_values, desc=f"Finding best k for {method}"):
        if method == 'kmeans':
            model = KMeans(n_clusters=k, random_state=42, n_init='auto')
        else:
            model = AgglomerativeClustering(n_clusters=k)

        labels = model.fit_predict(embeddings)

        if len(np.unique(labels)) > 1:
            silhouette_scores.append(silhouette_score(embeddings, labels))
            coherence_scores.append(compute_coherence(embeddings, labels))
            exclusivity_scores.append(compute_exclusivity(embeddings, labels))
            inertia_scores.append(compute_inertia(embeddings, labels))
        else:
            # Degenerate single-cluster result: record NaN so it can never
            # win a selection (numeric sentinels such as -1 * -1 would yield
            # a perfect semantic score, and inf breaks the gradient).
            silhouette_scores.append(np.nan)
            coherence_scores.append(np.nan)
            exclusivity_scores.append(np.nan)
            inertia_scores.append(np.nan)

    silhouette_scores = np.asarray(silhouette_scores, dtype=float)
    inertia_scores = np.asarray(inertia_scores, dtype=float)

    # Clip each factor at zero so a negative value cannot produce NaN or,
    # paired with another negative, a spuriously high geometric mean.
    semantic_scores = np.sqrt(
        np.clip(np.asarray(coherence_scores, dtype=float), 0, None)
        * np.clip(np.asarray(exclusivity_scores, dtype=float), 0, None)
    )

    best_k = {
        'silhouette': k_values[int(np.nanargmax(silhouette_scores))],
        'semantic': k_values[int(np.nanargmax(semantic_scores))],
        'elbow': k_values[_elbow_index(inertia_scores)]
    }

    if output_filepath_base:
        # savefig does not create missing directories.
        output_dir = os.path.dirname(output_filepath_base)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        plt.figure(figsize=(14, 8))
        sns.lineplot(x=k_values, y=silhouette_scores, label='Silhouette')
        sns.lineplot(x=k_values, y=semantic_scores, label='Semantic Quality')
        plt.title(f"Metric Scores vs k ({method})")
        plt.xlabel("Number of Clusters (k)")
        plt.ylabel("Score")
        plt.legend()
        plt.tight_layout()
        plot_path = f"{output_filepath_base}_{method}_metric_plots.png"
        plt.savefig(plot_path)
        plt.close()
        print(f"Metric plots saved to {plot_path}")

        plt.figure(figsize=(10, 6))
        sns.lineplot(x=k_values, y=inertia_scores)
        plt.title(f"Inertia vs k ({method})")
        plt.xlabel("Number of Clusters (k)")
        plt.ylabel("Inertia")
        plt.tight_layout()
        inertia_plot_path = f"{output_filepath_base}_{method}_inertia_plot.png"
        plt.savefig(inertia_plot_path)
        plt.close()
        print(f"Inertia plot saved to {inertia_plot_path}")

    return best_k

# 4. Cluster with KMeans
def cluster_with_kmeans(embeddings, n_clusters):
    """Cluster ``embeddings`` with KMeans and return one label per sample.

    The estimator uses ``random_state=42`` and ``n_init='auto'``.
    """
    estimator = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    return estimator.fit_predict(embeddings)

# 5. Cluster with Agglomerative Clustering
def cluster_with_agglomerative(embeddings, n_clusters):
    """Cluster ``embeddings`` with AgglomerativeClustering and return one label per sample.

    Only ``n_clusters`` is set; every other estimator option keeps its default.
    """
    return AgglomerativeClustering(n_clusters=n_clusters).fit_predict(embeddings)

# 6. Full Pipeline
def cluster_alternative_uses(input_filepath, object_name, model_name, output_base_dir):
    """Embed the ideas in a CSV and attach cluster ids for every method/metric pair.

    Args:
        input_filepath: Path to a CSV file containing an ``idea_content`` column.
        object_name: Object the ideas refer to; used in the plot file prefix.
        model_name: SentenceTransformer model name handed to ``embed_ideas``.
        output_base_dir: Directory prefix for the plots written by ``find_best_k``.

    Returns:
        The loaded DataFrame with one added column per clustering method and
        selection metric, named ``{method}_cluster_id_{metric_name}``.

    Raises:
        ValueError: If the CSV has no ``idea_content`` column.
    """
    print(f"Loading ideas from {input_filepath}...")
    df = pd.read_csv(input_filepath)

    if 'idea_content' not in df.columns:
        raise ValueError("Input CSV must contain an 'idea_content' column.")

    idea_texts = df['idea_content'].tolist()

    print(f"Embedding with model {model_name}...")
    embeddings = embed_ideas(idea_texts, model_name=model_name)

    # Keep only the part after the last "/" so the model name is usable in a file name.
    model_tag = model_name.split("/")[-1]
    plot_prefix = f"{output_base_dir}/{object_name}_{model_tag}"

    algorithms = (
        ('kmeans', cluster_with_kmeans),
        ('agglomerative', cluster_with_agglomerative),
    )

    for method, run_clustering in algorithms:
        print(f"Finding best k for {method}...")
        chosen_ks = find_best_k(embeddings, method=method, output_filepath_base=plot_prefix)

        for metric_name, k in chosen_ks.items():
            print(f"Clustering with {method} optimized for {metric_name} (k={k})...")
            column_name = f'{method}_cluster_id_{metric_name}'
            df[column_name] = run_clustering(embeddings, n_clusters=k)

    return df

# Main execution
if __name__ == "__main__":
    # Objects to process. Other available objects:
    # "key", "wooden_pencil", "automobile_tire"
    object_names = ["shoe", "button"]
    # Embedding models to run. Other candidates:
    # "intfloat/e5-large-v2", "BAAI/bge-large-en-v1.5", "all-mpnet-base-v2"
    embedding_models = ["mixedbread-ai/mxbai-embed-large-v1"]
    output_base_dir = "exports/clustering_all_objects"

    # Create the output directory up front: the clustering step saves plots
    # into it long before the combined CSV is written.
    os.makedirs(output_base_dir, exist_ok=True)

    for model_name in embedding_models:
        all_dfs = []
        for object_name in object_names:
            input_filepath = f"input_files/ideas_{object_name}.csv"
            df = cluster_alternative_uses(input_filepath, object_name, model_name, output_base_dir)
            # Tag rows so the objects stay distinguishable after concatenation.
            df['object_name'] = object_name
            all_dfs.append(df)
        final_df = pd.concat(all_dfs, ignore_index=True)
        safe_model_name = model_name.split("/")[-1]
        output_filepath = f"{output_base_dir}/clustering_all_objects_{safe_model_name}.csv"
        final_df.to_csv(output_filepath, index=False)
        print(f"Final combined CSV saved to {output_filepath}")


Loading ideas from input_files/ideas_shoe.csv...
Embedding with model mixedbread-ai/mxbai-embed-large-v1...


Batches:   0%|          | 0/36 [00:00<?, ?it/s]

Finding best k for kmeans...


Finding best k for kmeans:   0%|          | 0/946 [00:00<?, ?it/s]

