# Text clustering: HDBSCAN is probably all you need

<a target="_blank" href="https://colab.research.google.com/github/daniel-furman/awesome-chatgpt-prompts-clustering/blob/main/notebooks/stable-diffusion-prompts-clustering.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

## Sections

1. Setup
2. Data I/O
3. Embed text
4. Clustering
5. Exemplar sub-clustering
6. Knowledge graph theming
7. Write final df results to disk
8. Create a JSON knowledge graph viz
9. Drift detection on the top 25 clusters

## Setup

In [None]:
# copying larger files to GDrive storage for this experiment

from google.colab import drive

drive.mount("/content/drive")

In [None]:
!git clone https://github.com/daniel-furman/awesome-chatgpt-prompts-clustering.git

In [None]:
# for local run, see below commands for setting up a new venv

#!python -m venv .venv_clust_demo
#!source .venv_clust_demo/bin/activate
#!pip install --upgrade pip
#!pip list

In [None]:
import os

os.chdir("/content/awesome-chatgpt-prompts-clustering")
!ls

In [None]:
!pip install -qUr requirements.txt

In [None]:
os.chdir("../..")
!ls

In [None]:
#!pip list

In [None]:
import argparse
import os
from tqdm.notebook import tqdm
import datetime
import json
import pickle
import numpy as np
from numpy import random
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

import umap
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import torch
import hdbscan
from sklearn.metrics.pairwise import euclidean_distances
import openai
import tiktoken
import cleantext

from UliPlot.XLSX import auto_adjust_xlsx_column_width

In [None]:
args = argparse.Namespace()
args.inference = True
args

In [None]:
now = datetime.datetime.now()
# identifier format: dd_mm_YYYY_HH_MM_SS
dt_string = now.strftime("%d_%m_%Y_%H_%M_%S")

# Inference runs reuse the artifacts cached by an earlier experiment, so the
# datetime identifier of that experiment is hardcoded here.
if args.inference:
    # dt_string identifiers from cached experiments:
    dt_string = "04_09_2023_03_02_25"

print("experiment's datetime identifier =", dt_string)

# Create the results folder if it doesn't exist. makedirs also creates any
# missing parent folders, which a plain mkdir would fail on.
os.makedirs(
    f"/content/drive/MyDrive/colab_files/text_clustering/experiments/{dt_string}",
    exist_ok=True,
)

In [None]:
args.cache_folder = (
    f"/content/drive/MyDrive/colab_files/text_clustering/experiments/{dt_string}"
)
args

## Data I/O

In [None]:
ds_hf = load_dataset("Gustavosta/Stable-Diffusion-Prompts")
ds = ds_hf["train"]

ds = ds.to_pandas()
ds["id"] = ds.index
ds = ds[["id", "Prompt"]]
ds

## Embed Text

* See [pretrained models](https://www.sbert.net/docs/pretrained_models.html) for supporting information

In [None]:
model = SentenceTransformer("all-mpnet-base-v2")

In [None]:
# Embeddings are only recomputed on training runs; the following cell saves
# them and reloads the cached tensor from disk.
if not args.inference:
    n_prompts = len(ds)
    embeddings = torch.zeros([n_prompts, 768])
    for row_idx in tqdm(range(n_prompts)):
        # one prompt at a time; each encoded vector fills one row of the tensor
        embeddings[row_idx, :] = model.encode(
            ds.loc[row_idx, "Prompt"], convert_to_tensor=True
        )

In [None]:
f_name = os.path.join(
    args.cache_folder, "stable_diffusion_prompts_embeddings_all_mpnet_base_v2.pt"
)
print(f_name, "\n")

if not args.inference:
    torch.save(embeddings, f_name)
loaded_embeddings = torch.load(f_name)
loaded_embeddings

In [None]:
loaded_embeddings.shape

In [None]:
# Sanity check for training runs: the tensor reloaded from disk must match the
# freshly computed embeddings. Assert on the comparison so that a mismatch is
# actually reported (a bare expression inside the `if` is silently discarded).
if not args.inference:
    assert torch.equal(loaded_embeddings.cpu(), embeddings.cpu())

In [None]:
test_itr = random.randint(low=0, high=len(ds))
test_itr

In [None]:
# test embeddings worked
ds.loc[test_itr, "Prompt"]

In [None]:
test_emb = model.encode(ds.loc[test_itr, "Prompt"], convert_to_tensor=True)
a = np.array(test_emb.cpu())
b = np.array(loaded_embeddings[test_itr, :].cpu())
np.allclose(a, b, rtol=1e-02)

## Clustering

* See [how hdbscan works](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html) for supporting information

In [None]:
# first, perform dimensionality reduction from 768 to 15
f_name = os.path.join(args.cache_folder, "reducer_umap_15.pkl")
print(f_name, "\n")

if not args.inference:
    reducer_15 = umap.UMAP(n_components=15)
    reducer_15.fit(loaded_embeddings)
    embeddings_umap_dim_15 = reducer_15.transform(loaded_embeddings)
    # Verify that the result of calling transform is
    # identical to accessing the embedding_ attribute
    assert np.all(embeddings_umap_dim_15 == reducer_15.embedding_)

    # cache fitted umap object (context manager closes the file handle)
    with open(f_name, "wb") as f_out:
        pickle.dump(reducer_15, f_out)

# NOTE: pickle.load can execute arbitrary code; only load reducers that were
# written by this notebook.
with open(f_name, "rb") as f_in:
    loaded_reducer_15 = pickle.load(f_in)

embeddings_umap_dim_15 = loaded_reducer_15.transform(loaded_embeddings)
# Verify that the result of calling transform is
# identical to accessing the embedding_ attribute
assert np.all(embeddings_umap_dim_15 == loaded_reducer_15.embedding_)

print(embeddings_umap_dim_15.shape)

In [None]:
args.inference = False
args

In [None]:
# second, perform clustering on the 15-dimensional UMAP embeddings
f_name = os.path.join(args.cache_folder, "clusterer_hdbscan.pkl")
print(f_name, "\n")

if not args.inference:
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=110, gen_min_span_tree=True, prediction_data=True
    )
    clusterer.fit(embeddings_umap_dim_15)
    # cache fitted clusterer (context manager closes the file handle)
    with open(f_name, "wb") as f_out:
        pickle.dump(clusterer, f_out)

# NOTE: pickle.load can execute arbitrary code; only load clusterers that were
# written by this notebook.
with open(f_name, "rb") as f_in:
    loaded_clusterer = pickle.load(f_in)

label_counts = pd.Series(loaded_clusterer.labels_).value_counts()

if not args.inference:
    # Round-trip check: the reloaded clusterer must report the same label and
    # membership-probability distributions as the freshly fitted one.
    print(pd.Series(clusterer.labels_).value_counts().equals(label_counts))
    print(
        pd.Series(clusterer.probabilities_)
        .value_counts()
        .equals(pd.Series(loaded_clusterer.probabilities_).value_counts())
    )

# Label -1 marks outliers; fall back to 0 when no point was labelled -1
# instead of raising a KeyError.
num_ouliers = label_counts.get(-1, 0)

print(label_counts)
print(f"\nCluster outliers : {num_ouliers}\n")

In [None]:
# sum of top 25 cluster counts
# The outlier label (-1) is dropped explicitly rather than skipped by
# position, so the result stays correct even when -1 is not the most
# frequent label (or is absent altogether).

cluster_counts = pd.Series(loaded_clusterer.labels_).value_counts()
cluster_counts.drop(-1, errors="ignore").head(25).sum()

In [None]:
ds["cluster"] = loaded_clusterer.labels_
ds["cluster membership prob"] = loaded_clusterer.probabilities_
ds

In [None]:
loaded_clusterer.condensed_tree_.plot()

In [None]:
loaded_clusterer.condensed_tree_.plot(
    select_clusters=True, selection_palette=sns.color_palette()
)

In [None]:
args.inference = True

In [None]:
# third, perform dimensionality reduction from 15 to 2

f_name = os.path.join(args.cache_folder, "reducer_umap_2.pkl")
print(f_name, "\n")

if not args.inference:
    reducer_2 = umap.UMAP(n_components=2)
    reducer_2.fit(embeddings_umap_dim_15)
    embeddings_umap_dim_2 = reducer_2.transform(embeddings_umap_dim_15)

    # Verify that the result of calling transform is
    # identical to accessing the embedding_ attribute
    assert np.all(embeddings_umap_dim_2 == reducer_2.embedding_)

    # cache fitted umap object (context manager closes the file handle)
    with open(f_name, "wb") as f_out:
        pickle.dump(reducer_2, f_out)

# NOTE: pickle.load can execute arbitrary code; only load reducers that were
# written by this notebook.
with open(f_name, "rb") as f_in:
    loaded_reducer_2 = pickle.load(f_in)

embeddings_umap_dim_2 = loaded_reducer_2.transform(embeddings_umap_dim_15)
# Verify that the result of calling transform is
# identical to accessing the embedding_ attribute
assert np.all(embeddings_umap_dim_2 == loaded_reducer_2.embedding_)
embeddings_umap_dim_2.shape

In [None]:
ds["x"] = embeddings_umap_dim_2[:, 0]
ds["y"] = embeddings_umap_dim_2[:, 1]

In [None]:
# Visualize clusters
fig, ax = plt.subplots(figsize=(20, 10))
outliers = ds[ds["cluster"] == -1]
clustered = ds[ds["cluster"] != -1]
plt.scatter(outliers.x, outliers.y, color="#BDBDBD", s=10, alpha=0.1)
plt.scatter(
    clustered.x, clustered.y, c=clustered.cluster, s=10, alpha=0.35, cmap="viridis"
)

In [None]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=ds["x"][ds["cluster"] != -1],
        y=ds["y"][ds["cluster"] != -1],
        mode="markers",
        marker_color=ds["cluster"][ds["cluster"] != -1],
        marker_colorscale="Viridis",
        text=ds["cluster"][ds["cluster"] != -1],
    )
)

fig.update_traces(marker={"size": 5, "opacity": 0.45}, showlegend=False)
fig.update_coloraxes(showscale=False)
fig.update_layout(width=550 * 2, height=400 * 2)
fig.show()

## Exemplar Sub-Clustering

* See [soft clustering explanation](https://hdbscan.readthedocs.io/en/latest/soft_clustering_explanation.html) for supporting information

In [None]:
# function copied from:
# https://hdbscan.readthedocs.io/en/latest/soft_clustering_explanation.html#distance-based-membership


def exemplars(cluster_id, condensed_tree):
    """Return the exemplar point ids of one cluster of a condensed tree.

    For every leaf cluster found under ``cluster_id``, the points that stay in
    the leaf the longest (those with the largest ``lambda_val``) are collected.

    Args:
        cluster_id: Node id of the cluster in the condensed tree.
        condensed_tree: Condensed tree of a fitted HDBSCAN clusterer; its
            private ``_raw_tree`` record array is read directly.

    Returns:
        1-D integer numpy array with the ``child`` ids of the exemplar points
        (empty when no leaves are found).
    """
    raw_tree = condensed_tree._raw_tree
    # Just the cluster elements of the tree, excluding singleton points
    cluster_tree = raw_tree[raw_tree["child_size"] > 1]
    # Get the leaf cluster nodes under the cluster we are considering
    leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, cluster_id)
    # Now collect up the last remaining points of each leaf cluster (the heart of the leaf)
    result = np.array([])
    for leaf in leaves:
        in_leaf = raw_tree["parent"] == leaf
        max_lambda = raw_tree["lambda_val"][in_leaf].max()
        points = raw_tree["child"][in_leaf & (raw_tree["lambda_val"] == max_lambda)]
        result = np.hstack((result, points))
    # np.int was only an alias of the builtin int and was removed in
    # NumPy 1.24; the builtin gives the same dtype.
    return result.astype(int)

In [None]:
tree = loaded_clusterer.condensed_tree_

# Gather the exemplar point ids of every selected cluster into one flat list,
# reporting how many exemplars each cluster contributed.
exemplar_ids = []
for cluster_pos, cluster_node in enumerate(tree._select_clusters()):
    node_exemplars = exemplars(cluster_node, tree)
    n_found = len(node_exemplars)
    print(f"Cluster {cluster_pos} has {n_found} exemplars")
    exemplar_ids.extend(node_exemplars)

In [None]:
ds["exemplars yes/no"] = np.zeros(len(ds))
ds.loc[exemplar_ids, "exemplars yes/no"] = 1

assert len(ds[ds["exemplars yes/no"] == 1]) == len(exemplar_ids)

In [None]:
print("\n")
fig = go.Figure()

custom_scale = [
    "#949494",  # Gray
    "#F65314",  # Google Red
    "#4285F4",  # Google Blue
]

fig.add_trace(
    go.Scatter(
        x=ds["x"][(ds["exemplars yes/no"] == 0) & (ds["cluster"] != -1)],
        y=ds["y"][(ds["exemplars yes/no"] == 0) & (ds["cluster"] != -1)],
        mode="markers",
        marker_color=custom_scale[0],
        text=ds["cluster"][(ds["exemplars yes/no"] == 0) & (ds["cluster"] != -1)],
    )
)

fig.add_trace(
    go.Scatter(
        x=ds["x"][ds["exemplars yes/no"] == 1],
        y=ds["y"][ds["exemplars yes/no"] == 1],
        mode="markers",
        marker_color=ds["cluster"][ds["exemplars yes/no"] == 1],
        marker_colorscale="Viridis",
        text=ds["cluster"][ds["exemplars yes/no"] == 1],
    )
)

fig.update_traces(marker={"size": 5, "opacity": 0.45}, showlegend=False)
fig.update_coloraxes(showscale=False)
fig.update_layout(width=550 * 2, height=400 * 2)
fig.show()

In [None]:
len(ds.loc[exemplar_ids])

In [None]:
embeddings_umap_dim_15[exemplar_ids].shape

In [None]:
# fourth, perform exemplar sub-clustering

f_name = os.path.join(args.cache_folder, "clusterer_subs_hdbscan.pkl")
print(f_name, "\n")

if not args.inference:
    sub_clusterer = hdbscan.HDBSCAN(
        min_cluster_size=4, gen_min_span_tree=True, prediction_data=True
    )
    # only the exemplar points are sub-clustered
    sub_clusterer.fit(embeddings_umap_dim_15[exemplar_ids])
    # cache fitted sub-clusterer (context manager closes the file handle)
    with open(f_name, "wb") as f_out:
        pickle.dump(sub_clusterer, f_out)

# NOTE: pickle.load can execute arbitrary code; only load clusterers that were
# written by this notebook.
with open(f_name, "rb") as f_in:
    loaded_sub_clusterer = pickle.load(f_in)

if not args.inference:
    # Round-trip check: the reloaded sub-clusterer must report the same label
    # and membership-probability distributions as the freshly fitted one.
    print(
        pd.Series(sub_clusterer.labels_)
        .value_counts()
        .equals(pd.Series(loaded_sub_clusterer.labels_).value_counts())
    )
    print(
        pd.Series(sub_clusterer.probabilities_)
        .value_counts()
        .equals(pd.Series(loaded_sub_clusterer.probabilities_).value_counts())
    )

print("\nCluster value counts:\n")
pd.Series(loaded_sub_clusterer.labels_).value_counts()

In [None]:
loaded_sub_clusterer.labels_

In [None]:
# Sub-cluster information only exists for exemplar rows; every other row
# keeps NaN in both columns.
ds["exemplar sub-cluster"] = np.nan
# object dtype so that string identifiers can be stored next to NaN
ds["cluster XX.YY"] = pd.Series(np.nan, index=ds.index, dtype=object)

# The sub-clusterer was fit on embeddings_umap_dim_15[exemplar_ids], so its
# labels_ are in the same order as exemplar_ids and can be assigned in one
# step instead of re-slicing the frame once per exemplar.
ds.loc[exemplar_ids, "exemplar sub-cluster"] = loaded_sub_clusterer.labels_

exemplar_rows = ds.loc[exemplar_ids]
ds.loc[exemplar_ids, "cluster XX.YY"] = (
    "Cluster "
    + exemplar_rows["cluster"].astype(str)
    + ", Sub-Cluster "
    + exemplar_rows["exemplar sub-cluster"].astype(int).astype(str)
).to_numpy()

# ds.loc[exemplar_ids]
# ds

In [None]:
ds_inner_exemplars = ds[ds["exemplars yes/no"] == 1]
ds_inner_exemplars = ds_inner_exemplars[
    ds_inner_exemplars["exemplar sub-cluster"] != -1
]
len(ds_inner_exemplars)

In [None]:
print("\n")

fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=ds["x"][(ds["exemplars yes/no"] == 0) & (ds["cluster"] != -1)],
        y=ds["y"][(ds["exemplars yes/no"] == 0) & (ds["cluster"] != -1)],
        mode="markers",
        marker_color=custom_scale[0],
        text=ds["cluster XX.YY"][(ds["exemplars yes/no"] == 0) & (ds["cluster"] != -1)],
    )
)

fig.add_trace(
    go.Scatter(
        x=ds_inner_exemplars["x"],
        y=ds_inner_exemplars["y"],
        mode="markers",
        marker_color=ds_inner_exemplars["exemplar sub-cluster"],
        marker_colorscale="Viridis",
        text=ds_inner_exemplars["cluster XX.YY"],
    )
)

fig.update_traces(marker={"size": 11, "opacity": 0.55}, showlegend=False)
fig.update_coloraxes(showscale=False)
fig.update_layout(width=550 * 1.5, height=400 * 1.5)
fig.show()

In [None]:
# "Prompt head": the first 12 words of each prompt joined by single spaces,
# used as a short label in the plots below. The whole column is built in one
# pass instead of assigning cell by cell through .loc.
ds["Prompt head"] = [
    " ".join(
        cleantext.clean_words(
            prompt,
            clean_all=False,  # do not run every cleaning operation
            extra_spaces=True,  # remove extra white spaces
            stemming=False,  # do not stem the words
            stopwords=False,  # do not remove stop words
            lowercase=False,  # do not convert to lowercase
            numbers=False,  # do not remove digits
            punct=False,  # do not remove punctuation
            stp_lang="english",  # language for stop words
        )[0:12]
    )
    for prompt in ds["Prompt"]
]

In [None]:
ds

In [None]:
ds["cluster + Prompt"] = (
    "Cluster: "
    + ds["cluster"].astype(str)
    + ", Prompt id "
    + ds["id"].astype(str)
    + ": "
    + '"'
    + ds["Prompt head"]
    + '"'
)
ds

In [None]:
# visualize top 25 clusters by count
# The outlier label (-1) is dropped explicitly rather than skipped by
# position, so the list is correct even when -1 is not the most frequent label.

clust_to_zoom_list = (
    pd.Series(loaded_clusterer.labels_)
    .value_counts()
    .drop(-1, errors="ignore")
    .index[:25]
)

for clust_to_zoom in clust_to_zoom_list:
    print(f"Cluster {clust_to_zoom}:")
    ds_inner_exemplars = ds[
        (ds["exemplars yes/no"] == 1) & (ds["cluster"] == clust_to_zoom)
    ]
    ds_inner_exemplars = ds_inner_exemplars[
        ds_inner_exemplars["exemplar sub-cluster"] != -1
    ]

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=ds["x"][(ds["exemplars yes/no"] == 0) & (ds["cluster"] == clust_to_zoom)],
            y=ds["y"][(ds["exemplars yes/no"] == 0) & (ds["cluster"] == clust_to_zoom)],
            mode="markers",
            marker_color=custom_scale[0],
            text=ds["cluster + Prompt"][
                (ds["exemplars yes/no"] == 0) & (ds["cluster"] == clust_to_zoom)
            ],
        )
    )

    fig.add_trace(
        go.Scatter(
            x=ds_inner_exemplars["x"],
            y=ds_inner_exemplars["y"],
            mode="markers",
            marker_color=ds_inner_exemplars["exemplar sub-cluster"],
            marker_colorscale="Viridis",
            text=ds_inner_exemplars["cluster + Prompt"],
        )
    )

    fig.update_traces(marker={"size": 11, "opacity": 0.55}, showlegend=False)
    fig.update_coloraxes(showscale=False)
    fig.update_layout(width=550 * 1.5, height=400 * 1.5)
    fig.show()

In [None]:
ds_inner_exemplars = ds[ds["exemplars yes/no"] == 1]
ds_inner_exemplars = ds_inner_exemplars[
    ds_inner_exemplars["exemplar sub-cluster"] != -1
]

In [None]:
print("\n")

fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=ds["x"][(ds["exemplars yes/no"] == 0) & (ds["cluster"] != -1)],
        y=ds["y"][(ds["exemplars yes/no"] == 0) & (ds["cluster"] != -1)],
        mode="markers",
        marker_color=custom_scale[0],
        text=ds["cluster + Prompt"][
            (ds["exemplars yes/no"] == 0) & (ds["cluster"] != -1)
        ],
    )
)

fig.add_trace(
    go.Scatter(
        x=ds_inner_exemplars["x"],
        y=ds_inner_exemplars["y"],
        mode="markers",
        marker_color=ds_inner_exemplars["exemplar sub-cluster"],
        marker_colorscale="Viridis",
        text=ds_inner_exemplars["cluster + Prompt"],
    )
)

fig.update_traces(marker={"size": 11, "opacity": 0.55}, showlegend=False)
fig.update_coloraxes(showscale=False)
fig.update_layout(width=550 * 1.5, height=400 * 1.5)
fig.show()

## Create summary themes knowledge graph

In [None]:
# Instruction header of the LLM prompt; one section per exemplar sub-cluster
# of the top clusters is appended below.
claude_prompt = "Please identify and summarize the core theme for each Sub-Cluster. Respond as succinctly as possible. Each summary cannot be longer than 1 sentence. Do not skip any of the Sub-Clusters. Do not list out the names of individuals in the prompts. Let's think step by step before responding."

num_subclusts = 0
subclusts_in_order = []  # sub-cluster ids in the order they appear in the prompt

exemplar_rows = ds[ds["exemplars yes/no"] == 1]
top_clusters = set(clust_to_zoom_list)

for clust in np.unique(np.array(exemplar_rows["cluster"])):
    # only clusters selected in clust_to_zoom_list are summarized
    if clust not in top_clusters:
        continue

    sub_df = exemplar_rows[exemplar_rows["cluster"] == clust]

    for sub_clust in sorted(np.unique(np.array(sub_df["cluster XX.YY"]))):
        # sub-cluster -1 holds the exemplars left unassigned by the sub-clusterer
        if int(sub_clust.split("Sub-Cluster ")[-1]) == -1:
            continue

        num_subclusts += 1
        subclusts_in_order.append(sub_clust)

        members = sub_df["cluster XX.YY"] == sub_clust
        sub_prompts = sub_df[members]["Prompt"].astype(str)

        claude_prompt += "\n" + str(sub_clust.split(", ")[1]) + ": "
        claude_prompt += "\n" + str(sub_prompts) + '"\n'

In [None]:
# print(claude_prompt)
# subclusts_in_order

In [None]:
num_subclusts

In [None]:
claude_prompt.count("Sub-Cluster ")

In [None]:
ds_exemps = ds[(ds["exemplars yes/no"] == 1) & (ds["exemplar sub-cluster"] != -1)]

mask = ds_exemps["cluster"].isin(clust_to_zoom_list)
ds_exemps_of_interest = ds_exemps[mask]
# ds_exemps_of_interest

In [None]:
len(np.unique(np.array(ds_exemps_of_interest["cluster XX.YY"])))

In [None]:
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo-16k")
len(tokenizer.encode(claude_prompt))

In [None]:
claude_prompt

In [None]:
# saved response from claude-2 conversation

text_generation = """Sub-Cluster 19: Portraits of characters in lofi style by various artists.

Sub-Cluster 17: Symmetry portraits of various people and characters.

Sub-Cluster 18: Symmetry sci-fi portraits of characters and people.

Sub-Cluster 128: Highly detailed illustrations of people, often describing hair and age.

Sub-Cluster 162: Highly detailed illustrations of sadistic or aggressive looking people.

Sub-Cluster 163: Highly detailed illustrations of attractive people, often with white hair.

Sub-Cluster 75: Highly detailed illustrations of beautiful, fierce, or smug women.

Sub-Cluster 76: Art of the League of Legends champion Vi.

Sub-Cluster 66: Greg Manchess portrait paintings of various characters as different roles.

Sub-Cluster 78: Art and portraits featuring Star Wars characters, especially Darth Vader.

Sub-Cluster 91: Portraits and art of female cyborg characters.

Sub-Cluster 93: Art and portraits of robots and humanoid AI characters.

Sub-Cluster 107: Art of Vladimir Putin being killed or defeated.

Sub-Cluster 168: Portraits of Putin and Biden as magical characters.

Sub-Cluster 235: Art depicting Vladimir Putin as various monsters, animals, or in humiliating situations.

Sub-Cluster 236: Art of Putin with Kim Jong Un's haircut.

Sub-Cluster 164: Art of characters like aliens eating hamburgers.

Sub-Cluster 191: Art of Final Fantasy 7 character Sephiroth.

Sub-Cluster 192: Beautiful, award winning pencil drawings and illustrations.

Sub-Cluster 240: Portraits of celebrities eating hamburgers.

Sub-Cluster 241: Portraits of various real people and characters eating hamburgers.

Sub-Cluster 85: Art and portraits of dragons in various settings.

Sub-Cluster 92: Art depicting Donald Trump in various roles and situations.

Sub-Cluster 89: Art and portraits of Batman characters.

Sub-Cluster 90: Art of Spider-Man and related Marvel characters.

Sub-Cluster 135: Award winning portrait commissions.

Sub-Cluster 136: Award winning portrait commissions of furry characters.

Sub-Cluster 143: Anthropomorphic furry fox characters.

Sub-Cluster 184: Trending furry fox character art.

Sub-Cluster 185: Beautiful portrait commissions of furry characters.

Sub-Cluster 65: Art and portraits of fox characters in various outfits and settings.

Sub-Cluster 121: Portraits and art of cats in various styles.

Sub-Cluster 146: Portraits of goddesses and divine figures.

Sub-Cluster 178: Portraits of Megan Fox as characters from video games.

Sub-Cluster 186: Psychedelic and Lovecraftian portraits of Megan Fox.

Sub-Cluster 187: Portraits of Megan Fox in various roles and outfits.

Sub-Cluster 54: Portraits of Emma Watson in various roles and settings.

Sub-Cluster 74: Alexandra Daddario and Megan Fox as Scarlet Witch.

Sub-Cluster 84: Futuristic and fantasy vehicle concept art.

Sub-Cluster 94: Highly detailed realistic portraits of men.

Sub-Cluster 113: Anime girl character portraits and concept art.

Sub-Cluster 117: Portraits of beautiful women in various settings.

Sub-Cluster 118: Portraits of young women in various outfits and styles.

Sub-Cluster 81: Cinematic concept art portraits by Jama Jurabaev.

Sub-Cluster 82: Futuristic sci-fi spaceship concept art.

Sub-Cluster 125: Concept art of knights and warriors.

Sub-Cluster 132: Surreal, cinematic, and futuristic digital art.

Sub-Cluster 167: Beautiful landscape paintings and matte art.

Sub-Cluster 151: Futuristic cityscape concept art."""

text_generation = text_generation.split("\n\n")
len(text_generation)

In [None]:
summaries_dict = {
    subclusts_in_order[i]: text_generation[i] for i in range(len(subclusts_in_order))
}
summaries_dict

In [None]:
# Each saved summary must start with the sub-cluster name of the key it was
# paired with, otherwise the response and the prompt order are out of sync.
for key, summary in summaries_dict.items():
    key_subclust = key.split(", ")[-1]
    summary_subclust = summary.split(": ")[0]
    assert key_subclust == summary_subclust

In [None]:
# for i in range(len(text_generation)):
# text_generation[i] = text_generation[i].split(": ")[-1]

In [None]:
summaries_dict_cleaned = {
    subclusts_in_order[i]: text_generation[i] for i in range(len(subclusts_in_order))
}
summaries_dict_cleaned

In [None]:
# Look up the summary of every row's sub-cluster in one vectorized step.
# Rows whose "cluster XX.YY" value has no entry in the dict (including rows
# with no sub-cluster at all) get NaN.
ds["theme"] = ds["cluster XX.YY"].map(summaries_dict_cleaned)

In [None]:
ds["theme"][ds["cluster XX.YY"] == "Cluster 77, Sub-Cluster 151"].iloc[0]

## Write final df results to disk

In [None]:
# write final ds to disk
f_name = os.path.join(
    args.cache_folder, "stable_diffusion_prompts_dataframe_cached_with_results.xlsx"
)
print(f_name, "\n")

# re-order cols
ds = ds[
    [
        "id",
        "cluster",
        "x",
        "y",
        "cluster membership prob",
        "exemplars yes/no",
        "exemplar sub-cluster",
        "cluster XX.YY",
        "theme",
        "Prompt",
    ]
]
ds

In [None]:
# write with adjusted col width
# if not args.inference:
if True:
    with pd.ExcelWriter(f_name) as writer:
        ds.to_excel(writer, sheet_name="All Prompts")
        auto_adjust_xlsx_column_width(ds, writer, sheet_name="All Prompts", margin=1)

## Format a JSON viz graph

In [None]:
args.cache_folder

In [None]:
# optional ds cached loading
ds_loaded = pd.read_excel(
    os.path.join(
        args.cache_folder, "stable_diffusion_prompts_dataframe_cached_with_results.xlsx"
    ),
    index_col="Unnamed: 0",
)
ds_loaded

In [None]:
ds_clust = ds_loaded[ds_loaded["theme"].notna()]
ds_clust

In [None]:
len(np.unique(np.array(ds_clust["cluster XX.YY"])))

In [None]:
# One entry per themed sub-cluster, in sorted order of the sub-cluster id.
# Only the theme and the id are emitted; frequency, count and the exemplar
# prompts are not part of this per-sub-cluster view.
knowledge_graphs = []

for sub_clust in np.unique(np.array(ds_clust["cluster XX.YY"])):
    ds_inner = ds_clust[ds_clust["cluster XX.YY"] == sub_clust]
    # every row of a sub-cluster has the same theme and id, so read the first
    first_row = ds_inner.iloc[0]

    viz = {
        "core theme": first_row["theme"],
        "cluster id": first_row["cluster XX.YY"],
    }

    knowledge_graphs.append(viz)

In [None]:
# Pretty-print one JSON object per sub-cluster entry built in the previous cell
for graph in knowledge_graphs:
    # Serializing json
    json_object = json.dumps(graph, indent=4)
    print(json_object)
    print("\n\n")

In [None]:
# summaries_dict_cleaned["Cluster 75, Sub-Cluster 167"]

In [None]:
summaries_dict_cluster_level = {}
for clust in np.unique(np.array(ds_clust["cluster"])):
    summaries_dict_cluster_level[clust] = list(
        np.unique(np.array(ds_clust[ds_clust["cluster"] == clust]["theme"]))
    )

In [None]:
summaries_dict_cluster_level

In [None]:
summaries_dict_cluster_level[10]

In [None]:
# One entry per themed cluster, ordered from the most to the least frequent
# cluster in the full dataframe.
knowledge_graphs = []

# Loop-invariant lookups, computed once
cluster_counts = ds_loaded["cluster"].value_counts()
themed_clusters = set(np.unique(np.array(ds_clust["cluster"])))

for clust in cluster_counts.index:
    # clusters without any themed sub-cluster are left out
    if clust not in themed_clusters:
        continue

    ds_inner = ds_clust[ds_clust["cluster"] == int(clust)]
    # float() makes the numpy count JSON-serializable
    count = float(cluster_counts.loc[int(clust)])

    viz = {
        "cluster id": "Cluster " + str(ds_inner.iloc[0]["cluster"]),
        "count": count,
        "frequency": str(np.round(100 * count / len(ds_loaded), 2)) + "%",
        "core theme": summaries_dict_cluster_level[clust],
    }

    knowledge_graphs.append(viz)

In [None]:
len(knowledge_graphs)

In [None]:
knowledge_graphs = {"knowledge graph": knowledge_graphs}

In [None]:
# Serializing json
json_object = json.dumps(knowledge_graphs, indent=4)
print(json_object)
print("\n\n")

## Drift detection on the top 25 clusters


In [None]:
ds_hf = load_dataset("Gustavosta/Stable-Diffusion-Prompts")
ds = ds_hf["test"]

ds = ds.to_pandas()
ds["id"] = ds.index
ds = ds[["id", "Prompt"]]
ds

In [None]:
args.inference = True
args

In [None]:
# Embed Text
# * See [pretrained models](https://www.sbert.net/docs/pretrained_models.html) for supporting information

model = SentenceTransformer("all-mpnet-base-v2")

# Test-split embeddings are only recomputed when not in inference mode; the
# following cell saves them and reloads the cached tensor from disk.
if not args.inference:
    n_prompts = len(ds)
    embeddings = torch.zeros([n_prompts, 768])
    for row_idx in tqdm(range(n_prompts)):
        # one prompt at a time; each encoded vector fills one row of the tensor
        embeddings[row_idx, :] = model.encode(
            ds.loc[row_idx, "Prompt"], convert_to_tensor=True
        )

In [None]:
f_name = os.path.join(
    args.cache_folder, "stable_diffusion_prompts_test_embeddings_all_mpnet_base_v2.pt"
)
print(f_name, "\n")

if not args.inference:
    torch.save(embeddings, f_name)
loaded_embeddings = torch.load(f_name)
loaded_embeddings

In [None]:
loaded_embeddings.shape

In [None]:
# Sanity check when embeddings were recomputed: the tensor reloaded from disk
# must match them. Assert on the comparison so that a mismatch is actually
# reported (a bare expression inside the `if` is silently discarded).
if not args.inference:
    assert torch.equal(loaded_embeddings.cpu(), embeddings.cpu())

In [None]:
test_itr = random.randint(low=0, high=len(ds))
test_itr

In [None]:
# test embeddings worked
ds.loc[test_itr, "Prompt"]

In [None]:
test_emb = model.encode(ds.loc[test_itr, "Prompt"], convert_to_tensor=True)
a = np.array(test_emb.cpu())
b = np.array(loaded_embeddings[test_itr, :].cpu())
np.allclose(a, b, rtol=1e-02)

In [None]:
# reduce the test embeddings from 768 to 15 dimensions with the cached,
# already fitted UMAP reducer (nothing is re-fit here)
f_name = os.path.join(args.cache_folder, "reducer_umap_15.pkl")
print(f_name, "\n")

# NOTE: pickle.load can execute arbitrary code; only load reducers that were
# written by this notebook.
with open(f_name, "rb") as f_in:
    loaded_reducer_15 = pickle.load(f_in)

embeddings_umap_dim_15 = loaded_reducer_15.transform(loaded_embeddings)

embeddings_umap_dim_15.shape

In [None]:
# load the cached, already fitted HDBSCAN clusterer
f_name = os.path.join(args.cache_folder, "clusterer_hdbscan.pkl")
print(f_name, "\n")

# NOTE: pickle.load can execute arbitrary code; only load clusterers that were
# written by this notebook.
with open(f_name, "rb") as f_in:
    loaded_clusterer = pickle.load(f_in)
loaded_clusterer

In [None]:
test_labels, strengths = hdbscan.approximate_predict(
    loaded_clusterer, embeddings_umap_dim_15
)
test_labels

In [None]:
# Counts of the 25 most frequent predicted clusters. The outlier label (-1)
# is dropped explicitly rather than skipped by position, so the result is
# correct even when -1 is not the most frequent predicted label.
pd.Series(test_labels).value_counts().drop(-1, errors="ignore").head(25)