In [1]:
# General modules
import os
import openai
import pandas as pd
from dotenv import load_dotenv
from helper.utils import *

# Setup API keys
# load_dotenv() runs first, then the OpenAI key is read from the environment
# and handed to the openai module before the client is created.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

# Specify models
chat_model_name = 'gpt-4o-mini'
openai_embedding_model = "text-embedding-3-small"
local_embedding_model = "all-MiniLM-L6-v2"

configure_api(client, chat_model_name)

# Specify paths for storing (backup) data
root_dir = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data/'
data_source = 'Cluster_tests'

# Every file of this run lives in <root_dir>/<data_source>/
_data_dir = os.path.join(root_dir, data_source)

# Intermediate results, one file per pipeline stage (backups)
path_db_prepared = os.path.join(_data_dir, "db_prepared.json")
path_db_translated = os.path.join(_data_dir, "db_translated.json")
path_db_analysed = os.path.join(_data_dir, "db_analysed.json")
path_db_embedded = os.path.join(_data_dir, "db_embedded.json")
path_db_clustered = os.path.join(_data_dir, "db_clustered.json")

# Final file
path_db_final = os.path.join(_data_dir, "db_final.json")

In [2]:
# Reduce dimensions a priori
# The clustering does not perform too well. Some data points that clearly belong in a cluster, judging by eye and by their topic name, are not assigned to it. Quite often, points inside a dense cluster are even categorized as noise.
# I will try to improve this by first performing a dimension reduction and then clustering. The reason is that in high dimensions the data might be too sparse for the clustering algorithm to work properly.


import umap
from sklearn.manifold import TSNE

data = read_json(path_db_embedded)

# Gather all usable embeddings. Only entries holding a non-empty list are kept
# (the same validity rule the clustering cell applies), so one malformed
# record cannot break np.array / t-SNE.
valid_entries = [
    entry for entry in data
    if isinstance(entry.get('embedding'), list) and len(entry['embedding']) > 0
]
embeddings = [entry['embedding'] for entry in valid_entries]

# Convert to numpy array
X = np.array(embeddings)

# Perform UMAP
# X_embedded = umap.UMAP(n_components=40).fit_transform(X) # 40 dimensions

# Perform t-SNE
# t-SNE is stochastic; a fixed random_state makes the projection reproducible
# across kernel restarts.
X_embedded = TSNE(n_components=2, random_state=42).fit_transform(X)

# Store the updated embeddings in the data (in memory only).
# NOTE(review): the clustering cell below re-reads path_db_embedded from disk,
# so it does not see these reduced embeddings unless `data` is passed on or saved.
for entry, reduced in zip(valid_entries, X_embedded):
    entry['embedding'] = reduced.tolist()



In [3]:
from helper.cluster_analysis import *
from helper.utils import *

# Adjustable parameters
dimensionality_methods = ['UMAP', 'PCA', 'tSNE']
# Alternative setting kept for reference: "min_samples": 2, "cluster_selection_epsilon": 0.15
hdbscan_params = {"min_cluster_size": 10, "min_samples": 1, "cluster_selection_epsilon": 0.15}

data = read_json(path_db_embedded)
df = pd.DataFrame(data)

# Keep only rows whose embedding is a non-empty list. The explicit .copy()
# makes `df` an independent frame, so the column assignments further down do
# not write into a filtered slice (SettingWithCopyWarning hazard).
has_embedding = df['embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
df = df[has_embedding].copy()
print(f"Loaded {len(df)} valid entries with embeddings.")

# Extract embeddings as a 2-D array (one row per entry)
mat = np.array(df['embedding'].tolist())

# Cluster on the full embedding matrix
hdbscan_clusterer = hdbscan.HDBSCAN(**hdbscan_params)
cluster_labels = hdbscan_clusterer.fit_predict(mat)

# Coordinates from each reduction method, keyed 'hdbscan_<method>_<2D|3D>'
reduction_results = {}

for method in dimensionality_methods:
    # 2D Reduction
    coords_2d = dimensionality_reduction(mat, method, n_components=2)
    reduction_results[f'hdbscan_{method}_2D'] = {
        'x': coords_2d[:, 0],
        'y': coords_2d[:, 1]
    }

    # 3D Reduction
    coords_3d = dimensionality_reduction(mat, method, n_components=3)
    reduction_results[f'hdbscan_{method}_3D'] = {
        'x': coords_3d[:, 0],
        'y': coords_3d[:, 1],
        'z': coords_3d[:, 2]
    }

# Add dimensional coordinates to DataFrame (one column per method/dimension/axis).
# The arrays are assigned positionally; they have one row per row of `mat`,
# which was built from `df` itself.
for method_dim, coords in reduction_results.items():
    for axis, values in coords.items():
        df[f'{method_dim}_{axis}'] = values

# Add the cluster labels to the DataFrame
df['hdbscan_id'] = cluster_labels


Loaded 991 valid entries with embeddings.


2025-03-05 15:10:56,805 - INFO - Applying UMAP with 2 components.
2025-03-05 15:11:03,882 - INFO - Applying UMAP with 3 components.
2025-03-05 15:11:04,464 - INFO - Applying PCA with 2 components.
2025-03-05 15:11:04,491 - INFO - Applying PCA with 3 components.
2025-03-05 15:11:04,519 - INFO - Applying tSNE with 2 components.
2025-03-05 15:11:04,521 - INFO - Perplexity not provided, setting to 30 based on sample size.
2025-03-05 15:11:05,716 - INFO - Applying tSNE with 3 components.
2025-03-05 15:11:05,718 - INFO - Perplexity not provided, setting to 30 based on sample size.


In [27]:
# Number of rows per HDBSCAN cluster label; -1 is the label this notebook
# treats as noise when naming clusters.
df['hdbscan_id'].value_counts()

hdbscan_id
 4    459
-1    443
 0     26
 1     19
 2     19
 5     15
 3     10
Name: count, dtype: int64

In [28]:
# Save the clustered data
# NOTE: each clustering algorithm is optional, and so is the dimension reduction. In general you want to run HDBSCAN on reduced dimensions, and use k-means only if you know the number of clusters up front.
save_df_as_json(df, path_db_clustered)

2025-03-05 12:56:52,472 - INFO - Saving data to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data/Cluster_tests\db_clustered.json


In [1]:
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

from helper.cluster_naming import *

# Settings passed to generate_cluster_name by name_clusters below.
# `client` and `chat_model_name` are not defined in this cell: they come from the
# setup cell at the top of the notebook, which must have been run in the same
# kernel first (otherwise this line raises the NameError recorded below this cell).
api_settings = {"client": client, "model": chat_model_name}

def name_clusters(
    df,
    cluster_columns,
    embedding_col="embedding",
    text_col="sentence",
    top_k=25,
    skip_noise_label=-1
):
    """
    Add a generated name for every cluster id found in the given columns.

    For each column in ``cluster_columns`` and each distinct id in it, the
    ``top_k`` texts whose embeddings have the smallest cosine distance to the
    cluster centroid are handed to ``generate_cluster_name`` together with the
    module-level ``api_settings``. The names are written to a new column
    ``"<col>_name"``; ids without a generated name (including the skipped
    noise label) are labelled "Noise".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the cluster id, embedding and text columns. It is
        modified in place.
    cluster_columns : iterable of str
        Names of the columns that contain cluster ids.
    embedding_col : str
        Column with one embedding (list of numbers) per row.
    text_col : str
        Column with the text used as naming input.
    top_k : int
        Maximum number of representative texts per cluster.
    skip_noise_label : optional
        Cluster id that is not named; pass None to name every id.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the name columns added.
    """

    def _texts_nearest_centroid(members):
        # Centroid of the member embeddings, kept 2-D for cosine_distances
        vectors = np.array(members[embedding_col].tolist())
        center = vectors.mean(axis=0, dtype=np.float32, keepdims=True)

        # Positions of the members, nearest to the centroid first
        nearest_first = np.argsort(cosine_distances(center, vectors).flatten())
        return members.iloc[nearest_first[:top_k]][text_col].tolist()

    for col in cluster_columns:
        names_by_id = {}
        logger.info((f'Preparing to name clusters in column "{col}"'))

        for cluster_id in df[col].unique():
            # The noise label (if one is configured) never gets a name
            if skip_noise_label is not None and cluster_id == skip_noise_label:
                continue

            members = df[df[col] == cluster_id]
            if members.empty:
                continue

            names_by_id[cluster_id] = generate_cluster_name(
                _texts_nearest_centroid(members), api_settings
            )

        # Map every row's id to its name; unnamed ids fall back to "Noise"
        df[f"{col}_name"] = df[col].apply(lambda cid: names_by_id.get(cid, "Noise"))

    return df


# Reload the clustered records saved earlier and wrap them in a DataFrame
data = read_json(path_db_clustered)
df = pd.DataFrame(data)

# Columns whose cluster ids should receive a generated name
cluster_columns = ['hdbscan_id']

# -1 is passed as the noise label so HDBSCAN noise points are not named
df_named = name_clusters(df, cluster_columns, embedding_col="embedding", text_col="sentence", top_k=10, skip_noise_label=-1)


NameError: name 'chat_model_name' is not defined

In [33]:
from datetime import datetime

# Rename the 'pp_timestamp' column to 'timestamp_updated'.
# inplace=True changes the frame that df_named refers to; no new DataFrame is created.
df_named.rename(columns={'pp_timestamp':'timestamp_updated'}, inplace=True)

# refactor timestamp
def convert_timestamp_string_to_unix_ms(date_str):
    """
    Convert a 'MM/DD/YYYY HH:MM:SS AM/PM' date string to a Unix timestamp
    in milliseconds.

    The string carries no timezone information, so it is parsed as a naive
    datetime, which datetime.timestamp() interprets in the local timezone of
    the machine running the notebook.

    Parameters
    ----------
    date_str : str
        Date string such as '03/05/2025 03:10:56 PM'.

    Returns
    -------
    int
        Milliseconds since the Unix epoch.

    Raises
    ------
    ValueError
        If ``date_str`` does not match the expected format.
    """
    dt = datetime.strptime(date_str, "%m/%d/%Y %I:%M:%S %p")
    # datetime.timestamp() yields seconds as a float; scale to milliseconds so
    # the result matches the unit promised by the function name.
    return int(round(dt.timestamp() * 1000))

# We assume 'timestamp_updated' is the column to convert
def parse_or_preserve(val):
    """
    Normalise one timestamp cell.

    Strings are converted with convert_timestamp_string_to_unix_ms; any other
    value (e.g. one that is already numeric in mixed data) is returned
    unchanged.
    """
    return convert_timestamp_string_to_unix_ms(val) if isinstance(val, str) else val

# Apply the function to each row in the 'timestamp_updated' column
# (string values are converted, all other values are kept as they are)
df_named["timestamp_updated"] = df_named["timestamp_updated"].apply(parse_or_preserve)


# Optionally get rid of the embeddings to save space
# NOTE: the drop is in place, so the 'embedding' column is gone from df_named afterwards;
# re-running this cell without re-creating df_named fails because the column no longer exists.
df_named.drop(columns=['embedding'], inplace=True)
save_data_for_streamlit(df_named, path_db_final)

2025-03-05 12:57:37,515 - INFO - Saving updated data to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data/Cluster_tests\db_final.json
2025-03-05 12:57:37,584 - INFO - Data saved successfully.
