In [5]:
import os
import openai
from numpy.ma.core import shape

from helper.utils import *
from dotenv import load_dotenv

# Load variables from a .env file into the process environment so the API key is not hard-coded here.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")  # None when the variable is not set
openai.api_key = openai_api_key
client = openai.Client()

# chat_model_name is later passed to the cluster-naming step; embed_model_name to the local embedding step.
chat_model_name = 'gpt-4o-mini'
embed_model_name = "all-MiniLM-L6-v2"
# configure_api comes from the helper.utils star import.
# NOTE(review): presumably registers the client/model for the helper functions — confirm in helper.utils.
configure_api(client, chat_model_name)

# NOTE(review): absolute, machine-specific network paths — consider making the base directory configurable.
data_source = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Steam'
data_storage = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Cluster_tests'

# Pipeline artefacts: analysed input is read from data_source, all later stages are written to data_storage.
path_db_analysed = os.path.join(data_source, "db_analysed.json")
path_db_embedded = os.path.join(data_storage, "db_embedded.json")
path_db_clustered = os.path.join(data_storage, "db_clustered.json")
path_db_final = os.path.join(data_storage, "db_final.json")

# Use OpenAI to generate embeddings

In [3]:
# Load the analysed reviews and display the topic annotations of the first entry as a sanity check.
data = read_json(path_db_analysed)
data[0]['topics']


[{'topic': 'Game Detail',
  'sentiment': 'Positive',
  'category': 'fact',
  'sentence': "Very detailed and it has lot's to do."}]

In [4]:

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector of ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced by single spaces before the request is sent.
    The request goes through the module-level ``client``.

    Args:
        text: string to embed.
        model: name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding



# Name of the topic field whose text is embedded.
embed_key = "sentence"  # "topic" or "sentence"
# Read the analysed data again so this cell starts from the stored input.
data = read_json(path_db_analysed)

def process_embedding(data, embed_key):
    """Attach an OpenAI embedding to every dict-shaped topic of every entry.

    Args:
        data: list of entry dicts; each entry may carry a "topics" list.
        embed_key: key of the topic field whose text is embedded
            ("topic" or "sentence").

    Returns:
        The same ``data`` list. Topics are modified in place: each dict topic
        gains an "embedding" key.

    Raises:
        KeyError: if a dict topic does not contain ``embed_key``.
    """
    for i, entry in enumerate(data):
        if i % 10 == 0:
            logger.info(f"Processing entry {i}")

        # An entry without a "topics" key is treated as having no topics, which
        # matches the pop("topics", []) default used by flatten_data.
        for d_topic in entry.get("topics", []):
            # Non-dict topics cannot hold an embedding and are left untouched.
            if isinstance(d_topic, dict):
                d_topic["embedding"] = get_embedding(d_topic[embed_key], model="text-embedding-3-small")
    return data


# Embed every topic. process_embedding mutates `data` in place and returns that same list.
data_embedded = process_embedding(data, embed_key)


# Flatten
def flatten_data(data):
    """Expand each entry into one flat record per topic.

    Every output record holds the entry's fields (without "topics") merged
    with the fields of one topic; on a key clash the topic's value wins.

    Args:
        data: list of entry dicts, each optionally holding a "topics" list.

    Returns:
        A new list of flat dicts. The input entries are not modified.
        Entries with a missing, ``None`` or empty "topics" value contribute
        no records. Topics that are not dicts are skipped, mirroring the
        ``isinstance(d_topic, dict)`` guard in process_embedding.
    """
    flattened = []
    for entry in data:
        base_copy = dict(entry)
        # `or []` also covers an explicit "topics": None.
        topics = base_copy.pop("topics", None) or []

        for topic in topics:
            # dict.update() would raise on a string or other non-mapping topic.
            if not isinstance(topic, dict):
                continue
            new_entry = dict(base_copy)
            new_entry.update(topic)
            flattened.append(new_entry)
    return flattened


# One flat record per (entry, topic) pair.
data_flattened = flatten_data(data_embedded)

# Save the embedded data
save_to_json(data_flattened, path_db_embedded)

2025-02-11 15:16:39,123 - INFO - Processing entry 0
2025-02-11 15:16:50,876 - INFO - Processing entry 10
2025-02-11 15:17:02,992 - INFO - Processing entry 20
2025-02-11 15:17:11,654 - INFO - Processing entry 30
2025-02-11 15:17:26,457 - INFO - Processing entry 40
2025-02-11 15:17:46,633 - INFO - Processing entry 50
2025-02-11 15:17:59,753 - INFO - Processing entry 60
2025-02-11 15:18:10,020 - INFO - Processing entry 70
2025-02-11 15:18:17,698 - INFO - Processing entry 80
2025-02-11 15:18:29,181 - INFO - Processing entry 90
2025-02-11 15:18:38,152 - INFO - Processing entry 100
2025-02-11 15:18:53,174 - INFO - Processing entry 110
2025-02-11 15:19:09,533 - INFO - Processing entry 120
2025-02-11 15:19:19,007 - INFO - Processing entry 130
2025-02-11 15:19:28,200 - INFO - Processing entry 140
2025-02-11 15:19:43,601 - INFO - Processing entry 150
2025-02-11 15:19:54,683 - INFO - Processing entry 160
2025-02-11 15:20:05,614 - INFO - Processing entry 170
2025-02-11 15:20:36,931 - INFO - Proces

In [6]:
# Write the flattened, embedded records to path_db_embedded (same call as at the end of the embedding cell).
save_to_json(data_flattened, path_db_embedded)

2025-02-11 16:07:13,982 - INFO - Data successfully saved to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Cluster_tests\db_embedded.json


# Local Embedding

In [None]:
from helper.embedding import *

# Name of the topic field whose text is embedded.
embed_key = "topic"  # "topic" or "sentence"

# Start again from the analysed data and initialise the embedding model named by embed_model_name.
# NOTE(review): initialize_embedding_model comes from the helper.embedding star import; presumably it loads a local model — confirm.
data = read_json(path_db_analysed)
embed_model = initialize_embedding_model(embed_model_name)


def process_embedding(data, embed_key):
    """Attach an embedding from ``embed_model`` to every dict-shaped topic.

    Args:
        data: list of entry dicts; each entry may carry a "topics" list.
        embed_key: key of the topic field whose text is embedded
            ("topic" or "sentence").

    Returns:
        The same ``data`` list. Topics are modified in place: each dict topic
        gains an "embedding" key produced by ``embed_text`` with the
        module-level ``embed_model``.

    Raises:
        KeyError: if a dict topic does not contain ``embed_key``.
    """
    for i, entry in enumerate(data):
        if i % 10 == 0:
            logger.info(f"Processing entry {i}")

        # An entry without a "topics" key is treated as having no topics, which
        # matches the pop("topics", []) default used by flatten_data.
        for d_topic in entry.get("topics", []):
            # Non-dict topics cannot hold an embedding and are left untouched.
            if isinstance(d_topic, dict):
                d_topic["embedding"] = embed_text(d_topic[embed_key], embed_model)
    return data


# Embed every topic. process_embedding mutates `data` in place and returns that same list.
data_embedded = process_embedding(data, embed_key)


# Flatten
def flatten_data(data):
    """Turn the nested entry/topics structure into one flat dict per topic.

    Each result combines the entry's own fields (minus "topics") with the
    fields of a single topic; topic fields override entry fields that share
    a name.

    Args:
        data: list of entry dicts, each optionally holding a "topics" list.

    Returns:
        A new list of flat dicts; the input entries are left unchanged.
        A missing, ``None`` or empty "topics" value yields no records, and
        topics that are not dicts are skipped (process_embedding never gives
        them an embedding either).
    """
    flattened = []
    for entry in data:
        base_copy = dict(entry)
        # `or []` also covers an explicit "topics": None.
        topics = base_copy.pop("topics", None) or []

        for topic in topics:
            # dict.update() would raise on a string or other non-mapping topic.
            if not isinstance(topic, dict):
                continue
            new_entry = dict(base_copy)
            new_entry.update(topic)
            flattened.append(new_entry)
    return flattened


# One flat record per (entry, topic) pair.
data_flattened = flatten_data(data_embedded)

# Save the embedded data
# NOTE: this writes to the same path_db_embedded file as the OpenAI embedding cell, replacing its output.
save_to_json(data_flattened, path_db_embedded)

# Cluster analysis

## Dimensionality Reduction

In [None]:
# Reduce dimensions a priori.
# Clustering on the stored embeddings does not perform too well: some data points that clearly belong to a
# cluster (judging by eye and by their topic name) are not assigned to it, and points inside a dense cluster
# are often labelled as noise.
# To improve this, reduce the dimensionality first and cluster afterwards; in high dimensions the data may
# be too sparse for the clustering algorithm to work properly.

import umap

data = read_json(path_db_embedded)
# Gather all embeddings
embeddings = [entry['embedding'] for entry in data]

# Convert to numpy array
X = np.array(embeddings)

# Perform UMAP. The fixed random_state makes reruns produce the same reduced embeddings,
# in line with the seeded KMeans (random_state=42) used for clustering.
X_embedded = umap.UMAP(n_components=40, random_state=42).fit_transform(X)  # 40 dimensions

# Store the updated embeddings in the data (replaces each entry's original embedding in memory only;
# nothing is written back to disk here).
for i, entry in enumerate(data):
    entry['embedding'] = X_embedded[i].tolist()


## HDBSCAN

In [9]:
from helper.cluster_analysis import *
from helper.utils import *

# Adjustable parameters
dimensionality_methods = ['UMAP', 'PCA', 'tSNE']
hdbscan_params = {"min_cluster_size": 50, "min_samples": 2, "cluster_selection_epsilon": 0.15}


# NOTE(review): `data` is whatever an earlier cell left in the kernel; whether its embeddings are the
# stored ones or the UMAP-reduced ones depends on which cells were run — confirm before relying on it.
df = pd.DataFrame(data)
# Keep only rows whose embedding is a non-empty list. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not trigger SettingWithCopyWarning
# when rows were actually filtered out.
has_embedding = df['embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
df = df[has_embedding].copy()
print(f"Loaded {len(df)} valid entries with embeddings.")

# Extract embeddings
mat = np.array(df['embedding'].tolist())

# Cluster on the embedding matrix itself; the 2D/3D projections below are only stored as coordinates.
hdbscan_clusterer = hdbscan.HDBSCAN(**hdbscan_params)
cluster_labels = hdbscan_clusterer.fit_predict(mat)

reduction_results = {}

for method in dimensionality_methods:
    # 2D Reduction
    coords_2d = dimensionality_reduction(mat, method, n_components=2)
    reduction_results[f'hdbscan_{method}_2D'] = {
        'x': coords_2d[:, 0],
        'y': coords_2d[:, 1]
    }

    # 3D Reduction
    coords_3d = dimensionality_reduction(mat, method, n_components=3)
    reduction_results[f'hdbscan_{method}_3D'] = {
        'x': coords_3d[:, 0],
        'y': coords_3d[:, 1],
        'z': coords_3d[:, 2]
    }

# Add dimensional coordinates to DataFrame (arrays are assigned positionally, one value per row of `mat`)
for method_dim, coords in reduction_results.items():
    for axis, values in coords.items():
        df[f'{method_dim}_{axis}'] = values

# Add the cluster labels to the DataFrame
df['hdbscan_id'] = cluster_labels


2025-02-11 16:11:05,358 - INFO - Applying UMAP with 2 components.


Loaded 5797 valid entries with embeddings.


2025-02-11 16:11:06,428 - INFO - Applying UMAP with 3 components.
2025-02-11 16:11:07,259 - INFO - Applying PCA with 2 components.
2025-02-11 16:11:07,271 - INFO - Applying PCA with 3 components.
2025-02-11 16:11:07,275 - INFO - Applying tSNE with 2 components.
2025-02-11 16:11:07,275 - INFO - Perplexity not provided, setting to 30 based on sample size.
2025-02-11 16:11:16,228 - INFO - Applying tSNE with 3 components.
2025-02-11 16:11:16,230 - INFO - Perplexity not provided, setting to 30 based on sample size.


In [10]:
# Preview the first rows, including the new hdbscan_* coordinate columns and hdbscan_id.
df.head()

Unnamed: 0,app_id_name,recommendationid,playtime_at_review_minutes,last_played,review_text,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,...,hdbscan_PCA_2D_y,hdbscan_PCA_3D_x,hdbscan_PCA_3D_y,hdbscan_PCA_3D_z,hdbscan_tSNE_2D_x,hdbscan_tSNE_2D_y,hdbscan_tSNE_3D_x,hdbscan_tSNE_3D_y,hdbscan_tSNE_3D_z,hdbscan_id
0,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,179387177,1775,1735228376,Very detailed and it has lot's to do.,1732047413,True,0,0,0.0,...,-0.538176,2.341202,-0.538176,0.224469,36.038742,4.344414,6.135602,-2.857599,7.672416,-1
1,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,0.56695,-2.910132,0.56695,-0.460475,-43.029846,-6.428859,-10.736487,10.554285,-2.86188,27
2,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,-1.436639,3.877381,-1.436639,-0.180017,48.162617,-41.702019,11.515228,-15.288798,-3.505596,21
3,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178350425,2492,1736827844,i love this game one phone and pc. i just wish...,1730675154,True,0,0,0.0,...,2.218104,1.598334,2.218104,-0.814631,30.663876,55.568203,10.934895,11.204454,-5.23413,11
4,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,177351085,3678,1729362177,"""It's wonderful, simply one of the best horse ...",1729361303,True,0,0,0.0,...,-1.777535,-4.422598,-1.777535,-1.208463,-20.107914,-57.066879,-8.981243,-17.134008,-9.69514,16


## KMeans

In [11]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

from helper.cluster_analysis import dimensionality_reduction


# Adjustable parameters
dimensionality_methods = ['UMAP', 'PCA', 'tSNE']
kmeans_clusters = [15, 21, 25, 50]


# Keep only rows whose embedding is a non-empty list. `df` is the frame left by the previous cell.
# The explicit .copy() makes the result an independent frame, so the column assignments below
# do not trigger SettingWithCopyWarning when rows were actually filtered out.
has_embedding = df['embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
df = df[has_embedding].copy()
print(f"Loaded {len(df)} valid entries with embeddings.")

# Extract embeddings
mat = np.array(df['embedding'].tolist())  # shape (n_samples, n_dimensions)

# --- KMeans Clustering (High-Dimensional) ---
# One model per candidate cluster count; random_state is fixed so the labels are repeatable.
for n_clusters in kmeans_clusters:
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans_model.fit_predict(mat)

    logger.info(f"Found {len(np.unique(cluster_labels))} clusters for KMeans with {n_clusters} clusters.")

    # Store the labels in a column named by cluster count
    df[f'kmeans_{n_clusters}_id'] = cluster_labels

# --- Dimensionality Reduction ---
# We do each method in 2D and 3D exactly once
reduction_results = {}

for method in dimensionality_methods:
    # 2D
    coords_2d = dimensionality_reduction(mat, method, n_components=2)
    reduction_results[f'kmeans_{method}_2D'] = {
        'x': coords_2d[:, 0],
        'y': coords_2d[:, 1]
    }

    # 3D
    coords_3d = dimensionality_reduction(mat, method, n_components=3)
    reduction_results[f'kmeans_{method}_3D'] = {
        'x': coords_3d[:, 0],
        'y': coords_3d[:, 1],
        'z': coords_3d[:, 2]
    }

# --- Add Dimensional Coordinates to DataFrame ---
# Arrays are assigned positionally, one value per row of `mat`.
for method_dim, coords in reduction_results.items():
    for axis, values in coords.items():
        df[f'{method_dim}_{axis}'] = values



2025-02-11 16:12:30,320 - INFO - Found 15 clusters for KMeans with 15 clusters.
2025-02-11 16:12:30,344 - INFO - Found 21 clusters for KMeans with 21 clusters.
2025-02-11 16:12:30,376 - INFO - Found 25 clusters for KMeans with 25 clusters.
2025-02-11 16:12:30,440 - INFO - Found 50 clusters for KMeans with 50 clusters.
2025-02-11 16:12:30,446 - INFO - Applying UMAP with 2 components.


Loaded 5797 valid entries with embeddings.


2025-02-11 16:12:31,372 - INFO - Applying UMAP with 3 components.
2025-02-11 16:12:32,163 - INFO - Applying PCA with 2 components.
2025-02-11 16:12:32,166 - INFO - Applying PCA with 3 components.
2025-02-11 16:12:32,169 - INFO - Applying tSNE with 2 components.
2025-02-11 16:12:32,171 - INFO - Perplexity not provided, setting to 30 based on sample size.
2025-02-11 16:12:40,977 - INFO - Applying tSNE with 3 components.
2025-02-11 16:12:40,979 - INFO - Perplexity not provided, setting to 30 based on sample size.


In [12]:
# Preview the first rows, now also carrying the kmeans_* label and coordinate columns.
df.head()

Unnamed: 0,app_id_name,recommendationid,playtime_at_review_minutes,last_played,review_text,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,...,kmeans_PCA_2D_x,kmeans_PCA_2D_y,kmeans_PCA_3D_x,kmeans_PCA_3D_y,kmeans_PCA_3D_z,kmeans_tSNE_2D_x,kmeans_tSNE_2D_y,kmeans_tSNE_3D_x,kmeans_tSNE_3D_y,kmeans_tSNE_3D_z
0,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,179387177,1775,1735228376,Very detailed and it has lot's to do.,1732047413,True,0,0,0.0,...,2.341202,-0.538176,2.341202,-0.538176,0.224469,36.038742,4.344414,6.135602,-2.857599,7.672416
1,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,-2.910132,0.56695,-2.910132,0.56695,-0.460475,-43.029846,-6.428859,-10.736487,10.554285,-2.86188
2,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,3.877381,-1.436639,3.877381,-1.436639,-0.180017,48.162617,-41.702019,11.515228,-15.288798,-3.505596
3,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178350425,2492,1736827844,i love this game one phone and pc. i just wish...,1730675154,True,0,0,0.0,...,1.598334,2.218104,1.598334,2.218104,-0.814631,30.663876,55.568203,10.934895,11.204454,-5.23413
4,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,177351085,3678,1729362177,"""It's wonderful, simply one of the best horse ...",1729361303,True,0,0,0.0,...,-4.422598,-1.777535,-4.422598,-1.777535,-1.208463,-20.107914,-57.066879,-8.981243,-17.134008,-9.69514


In [13]:
# Save the clustered data.
# NOTE: Both clustering algorithms and the dimensionality reduction are optional. Generally, prefer HDBSCAN on
# reduced dimensions, and use KMeans only if you know the number of clusters up front.
save_df_as_json(df, path_db_clustered)

2025-02-11 16:13:46,493 - INFO - Saving data to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Cluster_tests\db_clustered.json


# Cluster Naming

In [15]:
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

from helper.cluster_naming import *

# Client and chat model handed to generate_cluster_name; name_clusters reads this as a module-level global.
api_settings = {"client": client, "model": chat_model_name}

def name_clusters(
    df,
    cluster_columns,
    embedding_col="embedding",
    text_col="sentence",
    top_k=25,
    skip_noise_label=-1
):
    """Add a ``<column>_name`` column with a generated name for every cluster.

    For each cluster-id column, every cluster (except the noise label) is
    summarised by the ``top_k`` texts whose embeddings have the smallest
    cosine distance to the cluster's mean embedding. Those texts are passed
    to ``generate_cluster_name`` together with the module-level
    ``api_settings``.

    Args:
        df: DataFrame holding the cluster-id, embedding and text columns.
        cluster_columns: names of the cluster-id columns to process.
        embedding_col: column containing one embedding (list of numbers) per row.
        text_col: column containing the text shown to the naming function.
        top_k: maximum number of representative texts per cluster.
        skip_noise_label: cluster id that is not named; ``None`` disables
            the skip.

    Returns:
        The same ``df`` object, modified in place. Rows whose cluster id
        received no name (e.g. the skipped noise label) get "Noise".
    """
    noise_filter_active = skip_noise_label is not None

    for col in cluster_columns:
        logger.info((f'Preparing to name clusters in column "{col}"'))

        # Maps cluster id -> generated name for the current column.
        names_by_id = {}

        for cluster_id in df[col].unique():
            if noise_filter_active and cluster_id == skip_noise_label:
                continue

            # Rows that belong to this cluster.
            members = df.loc[df[col] == cluster_id]
            if members.empty:
                continue

            # Mean embedding, kept 2-D (1 x dim) so it can be passed to cosine_distances directly.
            vectors = np.array(members[embedding_col].tolist())
            center = vectors.mean(axis=0, dtype=np.float32, keepdims=True)

            # Positions of the top_k members closest to the mean embedding.
            dist_to_center = cosine_distances(center, vectors).flatten()
            nearest = np.argsort(dist_to_center)[:top_k]
            samples = members.iloc[nearest][text_col].tolist()

            names_by_id[cluster_id] = generate_cluster_name(samples, api_settings)

        def _lookup(cid, _names=names_by_id):
            # Ids without a generated name fall back to "Noise".
            return _names.get(cid, "Noise")

        df[f"{col}_name"] = df[col].apply(_lookup)

    return df


# Reload the clustered records from disk so naming starts from the saved clustering output.
data = read_json(path_db_clustered)  # data is probably a list of dicts
df = pd.DataFrame(data)              # Convert to DataFrame

# Cluster-id columns written by the HDBSCAN and KMeans cells.
cluster_columns = ['hdbscan_id', 'kmeans_15_id', 'kmeans_21_id', 'kmeans_25_id', 'kmeans_50_id']

# name_clusters adds one "<column>_name" column per entry above and returns the same (mutated) DataFrame.
df_named = name_clusters(
    df,
    cluster_columns,
    embedding_col="embedding",
    text_col="sentence",
    top_k=10,
    skip_noise_label=-1  # for HDBSCAN noise
)


2025-02-11 16:14:53,293 - INFO - Preparing to name clusters in column "hdbscan_id"
2025-02-11 16:14:53,803 - INFO - Generated cluster name: Interactive Horse Management Experience
2025-02-11 16:14:53,803 - INFO - Tokens used so far: Prompt Tokens: 2488, Completion Tokens: 75
2025-02-11 16:14:54,368 - INFO - Generated cluster name: Passionate Game Praise
2025-02-11 16:14:54,368 - INFO - Tokens used so far: Prompt Tokens: 2630, Completion Tokens: 80
2025-02-11 16:14:55,169 - INFO - Generated cluster name: Mobile Monetization vs. Desktop Pricing
2025-02-11 16:14:55,169 - INFO - Tokens used so far: Prompt Tokens: 2878, Completion Tokens: 88
2025-02-11 16:14:55,667 - INFO - Generated cluster name: Top Horse Games Overview
2025-02-11 16:14:55,683 - INFO - Tokens used so far: Prompt Tokens: 3078, Completion Tokens: 93
2025-02-11 16:14:56,358 - INFO - Generated cluster name: Celebrating a Great Game
2025-02-11 16:14:56,358 - INFO - Tokens used so far: Prompt Tokens: 3215, Completion Tokens: 99

In [16]:
# Preview the first rows together with the generated *_name columns.
df_named.head()

Unnamed: 0,app_id_name,recommendationid,playtime_at_review_minutes,last_played,review_text,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,...,kmeans_tSNE_2D_x,kmeans_tSNE_2D_y,kmeans_tSNE_3D_x,kmeans_tSNE_3D_y,kmeans_tSNE_3D_z,hdbscan_id_name,kmeans_15_id_name,kmeans_21_id_name,kmeans_25_id_name,kmeans_50_id_name
0,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,179387177,1775,1735228376,Very detailed and it has lot's to do.,1732047413,True,0,0,0.0,...,36.038742,4.344414,6.135602,-2.857599,7.672416,Noise,Dynamic Task Progression and Fun,Addictive and Engaging Gameplay,Engaging and Addictive Gameplay Experience,Engaging Story-Driven Quest Game
1,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,-43.029846,-6.428859,-10.736487,10.554285,-2.86188,Interactive Horse Management Experience,"""Equestrian Game Customization and Trading""",Horse Care and Customization Insights,Enhanced Equestrian Gaming Experience,Horse Care and Interaction Features
2,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,48.162617,-41.702019,11.515228,-15.288798,-3.505596,Passionate Game Praise,Enjoyable and Fun Gaming Experience,Game Love and Enthusiasm,Ultimate Game Appreciation,Game Love and Enthusiasm
3,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178350425,2492,1736827844,i love this game one phone and pc. i just wish...,1730675154,True,0,0,0.0,...,30.663876,55.568203,10.934895,11.204454,-5.23413,Mobile Monetization vs. Desktop Pricing,Cross-Platform Update Discrepancies,"""PC and Mobile Synchronization""",PC Version Update and Improvement,Mobile Update Disparity
4,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,177351085,3678,1729362177,"""It's wonderful, simply one of the best horse ...",1729361303,True,0,0,0.0,...,-20.107914,-57.066879,-8.981243,-17.134008,-9.69514,Top Horse Games Overview,Engaging Single-Player Horse Games,Horse Racing Game Excellence,Best Horse Racing Games,Top Horse Game Experiences


In [17]:
# Write the named clusters to path_db_final.
# NOTE(review): presumably the file consumed by the Streamlit app — confirm against save_data_for_streamlit.
save_data_for_streamlit(df_named, path_db_final)

2025-02-11 16:17:08,992 - INFO - Saving updated data to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Cluster_tests\db_final.json
2025-02-11 16:17:09,390 - INFO - Data saved successfully.
