In [1]:
import os
import openai
from numpy.ma.core import shape

from helper.utils import *
from dotenv import load_dotenv

# Load variables from a local .env file so that os.getenv can see OPENAI_API_KEY.
load_dotenv()
openai.api_key = openai_api_key = os.getenv("OPENAI_API_KEY")
client = openai.Client()

# Model identifiers. The chat model is handed to configure_api together with the client;
# embed_model_name is only defined here and not used by the cells shown in this notebook.
chat_model_name, embed_model_name = 'gpt-4o-mini', "all-MiniLM-L6-v2"
configure_api(client, chat_model_name)

# Input folder (analysed / embedded reviews) and output folder (clustering results).
# NOTE(review): these are absolute, machine-specific paths.
data_source = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Steam'
data_storage = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Cluster_tests'

# Files read from the source folder.
path_db_analysed, path_db_embedded = (
    os.path.join(data_source, file_name)
    for file_name in ("db_analysed.json", "db_embedded.json")
)
# Files written to the storage folder.
path_db_clustered, path_db_final = (
    os.path.join(data_storage, file_name)
    for file_name in ("db_clustered.json", "db_final.json")
)

# Extract unique topics

In [2]:
# Clustering algorithms may split data points even when they are identical, so later
# cells replace the embeddings of entries sharing a topic with the mean of their embeddings.
# This cell only loads the embedded records (a list of dicts) and collects the distinct topic names.
data = read_json(path_db_embedded)
unique_topics = {entry['topic'] for entry in data}

# Compare the number of records with the number of distinct topic labels.
print(f'Original topics: {len(data)}')
print(f'Unique topics: {len(unique_topics)}')

Original topics: 5797
Unique topics: 2578


# Reduce dimensions

In [3]:
# Reduce the embedding dimensionality with UMAP and overwrite each entry's
# 'embedding' with the reduced vector (t-SNE is kept below as a commented-out alternative).
from sklearn.manifold import TSNE
import umap
import matplotlib.pyplot as plt
import numpy as np  # imported here so this cell does not depend on a later cell or a star import

data = read_json(path_db_embedded)
# Gather all embeddings
embeddings = [entry['embedding'] for entry in data]

# Convert to numpy array
X = np.array(embeddings)

# Alternative: t-SNE
# X_embedded = TSNE(n_components=3).fit_transform(X)

# Perform UMAP. UMAP is stochastic, so a fixed random_state is passed to make the
# reduced embeddings (and everything clustered from them) reproducible across runs.
# Per the UMAP documentation, fixing the seed trades away some parallelism.
X_embedded = umap.UMAP(n_components=12, random_state=42).fit_transform(X)

# Update the embeddings in the data (rows of X_embedded are in the same order as `data`)
for entry, reduced_vector in zip(data, X_embedded):
    entry['embedding'] = reduced_vector.tolist()

# NOTE: the reduced embeddings only live in `data` in memory; nothing is written to disk in this cell.



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Inspect the first record; its 'embedding' should now hold the reduced vector.
data[0]

{'app_id_name': '1166860_Rival_Stars_Horse_Racing_Desktop_Edition',
 'recommendationid': 179387177,
 'playtime_at_review_minutes': 1775,
 'last_played': 1735228376,
 'review_text': "Very detailed and it has lot's to do.",
 'timestamp_updated': 1732047413,
 'voted_up': True,
 'votes_up': 0,
 'votes_funny': 0,
 'weighted_vote_score': 0.0,
 'steam_purchase': True,
 'received_for_free': False,
 'written_during_early_access': False,
 'language': 'english',
 'topic': 'Game Detail',
 'sentiment': 'Positive',
 'category': 'fact',
 'sentence': "Very detailed and it has lot's to do.",
 'embedding': [10.250261306762695,
  0.9153907895088196,
  3.830810785293579,
  4.515744686126709,
  3.7529430389404297,
  4.707149982452393,
  6.164806365966797,
  4.536752700805664,
  4.263311386108398,
  3.573075771331787,
  4.3914618492126465,
  5.210045337677002]}

In [5]:
# Keep an independent snapshot of the records with their reduced embeddings.
# A plain `save_data = data` would only alias the same list of dicts, so the
# in-place embedding replacement performed in a later cell would change the
# snapshot as well; a deep copy keeps it intact.
import copy

save_data = copy.deepcopy(data)

In [6]:
import numpy as np

# Start with an empty bucket for every known topic ...
topic_to_embeddings = {}
for topic in unique_topics:
    topic_to_embeddings[topic] = []

# ... then file each record's embedding under its topic.
for entry in data:
    topic_to_embeddings[entry['topic']].append(entry['embedding'])

In [7]:
# Spot-check: list every embedding collected for the 'Game Detail' topic.
topic_to_embeddings['Game Detail']

[[10.250261306762695,
  0.9153907895088196,
  3.830810785293579,
  4.515744686126709,
  3.7529430389404297,
  4.707149982452393,
  6.164806365966797,
  4.536752700805664,
  4.263311386108398,
  3.573075771331787,
  4.3914618492126465,
  5.210045337677002],
 [10.250589370727539,
  0.9104607105255127,
  3.827085256576538,
  4.510655879974365,
  3.7494828701019287,
  4.708189964294434,
  6.161110877990723,
  4.535309314727783,
  4.260833740234375,
  3.573715925216675,
  4.3884148597717285,
  5.207934379577637],
 [10.248376846313477,
  0.9143823981285095,
  3.8335189819335938,
  4.520477771759033,
  3.7542688846588135,
  4.707557678222656,
  6.1696882247924805,
  4.536090850830078,
  4.267821788787842,
  3.5718727111816406,
  4.390283584594727,
  5.207627773284912],
 [10.24856948852539,
  0.9095073342323303,
  3.8342905044555664,
  4.51556396484375,
  3.7518787384033203,
  4.708734035491943,
  6.176413059234619,
  4.532730579376221,
  4.264269828796387,
  3.571284532546997,
  4.38809299468

In [8]:
# Average the embeddings of each topic component-wise (axis 0), so the result has
# the same length as a single embedding, and store it as a plain Python list.
topic_to_mean_embedding = {
    topic: np.array(vectors).mean(axis=0).tolist()
    for topic, vectors in topic_to_embeddings.items()
}

In [9]:
# Spot-check: the mean embedding computed for the 'Game Detail' topic.
topic_to_mean_embedding['Game Detail']

[10.249449253082275,
 0.912435308098793,
 3.8314263820648193,
 4.515610575675964,
 3.752143383026123,
 4.7079079151153564,
 6.168004631996155,
 4.5352208614349365,
 4.2640591859817505,
 3.572487235069275,
 4.389563322067261,
 5.2080641984939575]

In [57]:
# Scratch calculation: arithmetic mean of four hard-coded values, presumably a manual
# sanity check of one averaged embedding component.
# NOTE(review): the execution count (57) is out of sequence with the surrounding cells
# and the operands are not derived from the current data — TODO remove or replace with an assertion.
(-2.168487071990967 -2.173760414123535 -2.168487071990967 -2.173734426498413) / 4

-2.1711172461509705

In [10]:
# Overwrite every record's embedding with the mean embedding of its topic.
# All records of one topic end up referencing the same list object.
for entry in data:
    entry['embedding'] = topic_to_mean_embedding[entry['topic']]


In [14]:
# Inspect the first record again; its 'embedding' should now be the mean embedding of its topic.
data[0]

{'app_id_name': '1166860_Rival_Stars_Horse_Racing_Desktop_Edition',
 'recommendationid': 179387177,
 'playtime_at_review_minutes': 1775,
 'last_played': 1735228376,
 'review_text': "Very detailed and it has lot's to do.",
 'timestamp_updated': 1732047413,
 'voted_up': True,
 'votes_up': 0,
 'votes_funny': 0,
 'weighted_vote_score': 0.0,
 'steam_purchase': True,
 'received_for_free': False,
 'written_during_early_access': False,
 'language': 'english',
 'topic': 'Game Detail',
 'sentiment': 'Positive',
 'category': 'fact',
 'sentence': "Very detailed and it has lot's to do.",
 'embedding': [10.249449253082275,
  0.912435308098793,
  3.8314263820648193,
  4.515610575675964,
  3.752143383026123,
  4.7079079151153564,
  6.168004631996155,
  4.5352208614349365,
  4.2640591859817505,
  3.572487235069275,
  4.389563322067261,
  5.2080641984939575]}

# Cluster analysis

In [15]:
# The clustering did not perform too well: some data points that clearly belong to a cluster
# (judging by eye and by their topic name) were not assigned to it, and points inside a dense
# cluster were rather often labelled as noise.
# To improve this, the embeddings are dimension-reduced first and clustered afterwards, the
# reasoning being that in high dimensions the data may be too sparse for the clustering
# algorithm to work properly.

from helper.cluster_analysis import *
from helper.utils import *

# Adjustable parameters
dimensionality_methods = ['UMAP', 'PCA', 'tSNE']
hdbscan_params = {"min_cluster_size": 50, "min_samples": 2, "cluster_selection_epsilon": 0.15}

df = pd.DataFrame(data)
# Keep only rows whose embedding is a non-empty list. The explicit .copy() makes the
# filtered frame independent, so the column assignments below never write to a slice of
# the unfiltered frame (avoids pandas' SettingWithCopyWarning when rows are dropped).
df = df[df['embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)].copy()
print(f"Loaded {len(df)} valid entries with embeddings.")

# Extract embeddings as a 2-D array, one row per DataFrame row
mat = np.array(df['embedding'].tolist())

# Cluster on the embeddings themselves; the 2D/3D projections computed below are
# stored as extra columns only and do not influence the cluster labels.
hdbscan_clusterer = hdbscan.HDBSCAN(**hdbscan_params)
cluster_labels = hdbscan_clusterer.fit_predict(mat)

reduction_results = {}

for method in dimensionality_methods:
    # 2D reduction
    coords_2d = dimensionality_reduction(mat, method, n_components=2)
    reduction_results[f'{method}_2D'] = {
        'x': coords_2d[:, 0],
        'y': coords_2d[:, 1]
    }

    # 3D reduction
    coords_3d = dimensionality_reduction(mat, method, n_components=3)
    reduction_results[f'{method}_3D'] = {
        'x': coords_3d[:, 0],
        'y': coords_3d[:, 1],
        'z': coords_3d[:, 2]
    }

# Add dimensional coordinates to DataFrame (columns named '<method>_<2D|3D>_<axis>')
for method_dim, coords in reduction_results.items():
    for axis, values in coords.items():
        df[f'{method_dim}_{axis}'] = values

# Add the cluster labels to the DataFrame
df['cluster_id'] = cluster_labels


2025-01-30 09:09:15,304 - INFO - Applying UMAP with 2 components.


Loaded 5797 valid entries with embeddings.


[5.45539342e-15 1.48931072e-06 1.50321232e-06 1.23879171e-05]
not reaching the requested tolerance 1.817941665649414e-06.
Use iteration 1149 instead with accuracy 
1.0064173185583427e-06.

  _, diffusion_map = lobpcg(
[2.29703430e-15 4.86256749e-07 5.30348495e-07 3.00907056e-06]
not reaching the requested tolerance 1.817941665649414e-06.
  _, diffusion_map = lobpcg(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
2025-01-30 09:09:17,496 - INFO - Applying UMAP with 3 components.
[6.70244769e-13 4.90045056e-07 4.54334645e-07 4.75139506e-07
 7.50753537e-06]
not reaching the requested tole

In [17]:
# Preview the first rows, including the projection columns and cluster_id added above.
df.head()

Unnamed: 0,app_id_name,recommendationid,playtime_at_review_minutes,last_played,review_text,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,...,PCA_2D_y,PCA_3D_x,PCA_3D_y,PCA_3D_z,tSNE_2D_x,tSNE_2D_y,tSNE_3D_x,tSNE_3D_y,tSNE_3D_z,cluster_id
0,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,179387177,1775,1735228376,Very detailed and it has lot's to do.,1732047413,True,0,0,0.0,...,-0.568135,0.681735,-0.568135,0.795651,-11.443602,-5.308172,-10.599676,-0.169257,-11.246475,-1
1,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,1.695222,-11.549797,1.695222,-0.141733,-57.855373,20.379818,-20.790731,1.832306,-8.828346,6
2,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,1.803372,3.46444,1.803372,-16.562259,-7.79351,45.615433,3.165056,0.021274,-26.633314,4
3,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178350425,2492,1736827844,i love this game one phone and pc. i just wish...,1730675154,True,0,0,0.0,...,1.45444,2.097897,1.45444,1.276368,21.608898,24.011642,8.558475,15.149671,5.031723,35
4,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,177351085,3678,1729362177,"""It's wonderful, simply one of the best horse ...",1729361303,True,0,0,0.0,...,2.763658,3.153257,2.763658,2.007338,68.333778,9.927643,12.30941,2.273433,32.916172,-1


In [18]:
# Persist the clustered DataFrame to path_db_clustered; the cluster-naming cell reads it back from there.
save_df_as_json(df, path_db_clustered)

2025-01-30 09:10:22,913 - INFO - Saving data to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Cluster_tests\db_clustered.json


# Cluster Naming

In [19]:
# Name every non-noise cluster: pick the sentences closest to the cluster centroid and
# pass them to generate_cluster_name together with the API settings.
from helper.cluster_naming import *

api_settings = {"client": client, "model": chat_model_name}

data_cluster = read_json(path_db_clustered)

# Maps cluster_id -> generated cluster name
unique_cluster_names = {}

df = pd.DataFrame(data_cluster)
# Keep only rows with a non-empty list embedding. The .copy() makes the filtered frame
# independent, so columns added to `df` later never write to a slice of the unfiltered frame.
df = df[df['embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)].copy()
print(f"Loaded {len(df)} valid entries with embeddings.")


unique_clusters = df['cluster_id'].unique()

for cluster_id in unique_clusters:
    if cluster_id == -1:  # Skip noise clusters
        continue
    if cluster_id in unique_cluster_names:
        # Already named: skip before doing the centroid/distance work and the API call.
        continue

    # Find the 10 most central points of the cluster (smallest distance to the centroid)
    cluster_data = df[df['cluster_id'] == cluster_id]
    cluster_embeddings = np.array(cluster_data['embedding'].tolist())
    cluster_centroid = np.mean(cluster_embeddings, axis=0)
    cluster_centroid = cluster_centroid.tolist()

    # NOTE(review): centrality is measured with cosine distance. If the stored embeddings are
    # the dimension-reduced vectors that were clustered upstream, a Euclidean distance may
    # match the clustering better — TODO confirm which metric is intended.
    distances = cosine_distances([cluster_centroid], cluster_embeddings).flatten()
    closest_indices = np.argsort(distances)[:10]
    # The representatives are the 'sentence' texts of those closest rows.
    representative_topics = cluster_data.iloc[closest_indices]['sentence'].tolist()

    # Generate and remember the cluster name
    cluster_name = generate_cluster_name(representative_topics, api_settings)
    unique_cluster_names[cluster_id] = cluster_name



Loaded 5797 valid entries with embeddings.


2025-01-30 09:10:29,542 - INFO - Generated cluster name: Horse Ownership and Breeding Dynamics
2025-01-30 09:10:29,542 - INFO - Tokens used so far: Prompt Tokens: 205, Completion Tokens: 7
2025-01-30 09:10:30,126 - INFO - Generated cluster name: Game Appreciation and Enthusiasm
2025-01-30 09:10:30,126 - INFO - Tokens used so far: Prompt Tokens: 345, Completion Tokens: 13
2025-01-30 09:10:30,913 - INFO - Generated cluster name: "Update Delays: Mobile vs. PC"
2025-01-30 09:10:30,913 - INFO - Tokens used so far: Prompt Tokens: 564, Completion Tokens: 24
2025-01-30 09:10:33,798 - INFO - Generated cluster name: Mobile vs. PC Game Experience
2025-01-30 09:10:33,798 - INFO - Tokens used so far: Prompt Tokens: 797, Completion Tokens: 31
2025-01-30 09:10:34,398 - INFO - Generated cluster name: Horse Game Opinions and Feedback
2025-01-30 09:10:34,398 - INFO - Tokens used so far: Prompt Tokens: 938, Completion Tokens: 37
2025-01-30 09:10:35,021 - INFO - Generated cluster name: Addictive and Enjoy

In [20]:
# Show the generated name for each cluster id.
unique_cluster_names

{6: 'Horse Ownership and Breeding Dynamics',
 4: 'Game Appreciation and Enthusiasm',
 35: '"Update Delays: Mobile vs. PC"',
 24: 'Mobile vs. PC Game Experience',
 10: 'Horse Game Opinions and Feedback',
 13: 'Addictive and Enjoyable Gaming Experience',
 11: 'Horse Customization Options and Features',
 22: 'Immersive Horse Racing Experience',
 0: 'Value Perception in Gaming',
 20: 'Engaging and Addictive Gaming Experience',
 8: 'Desire for Diverse Breeds',
 28: 'Diverse Racing Gameplay Experience',
 40: 'Social Interaction and Horse Care',
 30: 'Lag and Performance Issues',
 26: 'Racing Experience: Fun vs. Repetitiveness',
 1: 'Horse Racing Game Enthusiasts',
 21: 'Outstanding Horse Simulation Games',
 34: 'Horse Game Modes and Features',
 15: 'Chill and Entertaining Gaming Experience',
 2: 'Game Graphics Praise',
 25: 'Cross Country Enthusiasm and Features',
 14: 'Highly Recommended Horse Game',
 7: 'Diverse Horse Colors and Variations',
 37: 'Game Feedback and Improvement Suggestions'

In [21]:
# Attach a human-readable name to every row; ids without a generated name
# (such as the skipped noise label -1) fall back to "Unknown".
df['cluster_name'] = df['cluster_id'].apply(lambda cid: unique_cluster_names.get(cid, "Unknown"))

In [22]:
# Preview the first rows with the new cluster_name column.
df.head()

Unnamed: 0,app_id_name,recommendationid,playtime_at_review_minutes,last_played,review_text,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,...,PCA_3D_x,PCA_3D_y,PCA_3D_z,tSNE_2D_x,tSNE_2D_y,tSNE_3D_x,tSNE_3D_y,tSNE_3D_z,cluster_id,cluster_name
0,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,179387177,1775,1735228376,Very detailed and it has lot's to do.,1732047413,True,0,0,0.0,...,0.681735,-0.568135,0.795651,-11.443602,-5.308172,-10.599676,-0.169257,-11.246475,-1,Unknown
1,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,-11.549797,1.695222,-0.141733,-57.855373,20.379818,-20.790731,1.832306,-8.828346,6,Horse Ownership and Breeding Dynamics
2,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,3.46444,1.803372,-16.562259,-7.79351,45.615433,3.165056,0.021274,-26.633314,4,Game Appreciation and Enthusiasm
3,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178350425,2492,1736827844,i love this game one phone and pc. i just wish...,1730675154,True,0,0,0.0,...,2.097897,1.45444,1.276368,21.608898,24.011642,8.558475,15.149671,5.031723,35,"""Update Delays: Mobile vs. PC"""
4,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,177351085,3678,1729362177,"""It's wonderful, simply one of the best horse ...",1729361303,True,0,0,0.0,...,3.153257,2.763658,2.007338,68.333778,9.927643,12.30941,2.273433,32.916172,-1,Unknown


In [23]:
# Write the named clusters to path_db_final via the project's save_data_for_streamlit helper.
save_data_for_streamlit(df, path_db_final)

2025-01-30 09:23:58,134 - INFO - Saving updated data to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Cluster_tests\db_final.json
2025-01-30 09:23:58,345 - INFO - Data saved successfully.
