In [5]:
import os
import openai
from numpy.ma.core import shape

from helper.utils import *
from dotenv import load_dotenv

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

chat_model_name = 'gpt-4o-mini'
embed_model_name = "all-MiniLM-L6-v2"
configure_api(client, chat_model_name)

data_source = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Steam'
data_storage = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Cluster_tests'

path_db_analysed = os.path.join(data_source, "db_analysed.json")
path_db_embedded = os.path.join(data_source, "db_embedded.json")
path_db_clustered = os.path.join(data_storage, "db_clustered.json")
path_db_final = os.path.join(data_storage, "db_final.json")

# Extract unique topics

In [15]:
# Cluster algorithms force split datapoints even if they are identical. I will replace the embeddings of identical datapoints with the mean of their embeddings.
# Load the JSON data
data = read_json(path_db_embedded)
# data is your JSON array (list of dicts)
unique_topics = set()

for entry in data:
    unique_topics.add(entry['topic'])

# Now unique_topics is a set of all topic names
print(f'Original topics: {len(data)}')
print(f'Unique topics: {len(unique_topics)}')

Original topics: 5797
Unique topics: 2578


# reduce dimensions

In [49]:
# Reduce the dimensions with t-SNE and replace the old embeddings with the new ones
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

data = read_json(path_db_embedded)
# Gather all embeddings
embeddings = [entry['embedding'] for entry in data]

# Convert to numpy array
X = np.array(embeddings)

# Perform t-SNE
X_embedded = TSNE(n_components=3).fit_transform(X)

# Update the embeddings in the data
for i, entry in enumerate(data):
    entry['embedding'] = X_embedded[i].tolist()

# Save the updated data



In [50]:
data[0]

{'app_id_name': '1166860_Rival_Stars_Horse_Racing_Desktop_Edition',
 'recommendationid': 179387177,
 'playtime_at_review_minutes': 1775,
 'last_played': 1735228376,
 'review_text': "Very detailed and it has lot's to do.",
 'timestamp_updated': 1732047413,
 'voted_up': True,
 'votes_up': 0,
 'votes_funny': 0,
 'weighted_vote_score': 0.0,
 'steam_purchase': True,
 'received_for_free': False,
 'written_during_early_access': False,
 'language': 'english',
 'topic': 'Game Detail',
 'sentiment': 'Positive',
 'category': 'fact',
 'sentence': "Very detailed and it has lot's to do.",
 'embedding': [-2.168487071990967, -7.140398025512695, -1.3362581729888916]}

In [51]:
save_data = data

In [52]:
import numpy as np

# Dictionary to store all embeddings for each topic
topic_to_embeddings = {topic: [] for topic in unique_topics}

# Gather embeddings by topic
for entry in data:
    topic = entry['topic']
    # print(topic)
    embedding = entry['embedding']
    # print(embedding)
    topic_to_embeddings[topic].append(embedding)
    # print(topic_to_embeddings[topic])

In [53]:
topic_to_embeddings['Game Detail']

[[-2.168487071990967, -7.140398025512695, -1.3362581729888916],
 [-2.173760414123535, -7.140198707580566, -1.3354815244674683],
 [-2.168487071990967, -7.140398025512695, -1.3362581729888916],
 [-2.173734426498413, -7.140158653259277, -1.3354891538619995]]

In [54]:
# Calculate mean embedding for each topic
topic_to_mean_embedding = {}

for topic, embeddings_list in topic_to_embeddings.items():
    # Convert to a numpy array for easy mean calculation
    np_embeddings = np.array(embeddings_list)
    mean_embedding = np.mean(np_embeddings, axis=0)  # Shape will match original embedding dimension

    # Store the mean embedding (convert back to list if you want plain Python types)
    topic_to_mean_embedding[topic] = mean_embedding.tolist()

In [56]:
topic_to_mean_embedding['Game Detail']

[-2.1711172461509705, -7.140288352966309, -1.3358717560768127]

In [57]:
(-2.168487071990967 -2.173760414123535 -2.168487071990967 -2.173734426498413) / 4

-2.1711172461509705

In [58]:
for entry in data:
    topic = entry['topic']
    entry['embedding'] = topic_to_mean_embedding[topic]


In [61]:
data['embedding']

TypeError: list indices must be integers or slices, not str

# Cluster analysis

In [64]:
# The clustering does not perform to good. Some datapoints that clearly should be in a cluster based on eyeballing and their topic name but they are not. Rather often some points in a dense cluster are categorized as noise.
# I will try to improve this by first perform a dimension reduction and then perform clustering. Reason being, that in high dimensions the data might be too sparse for the clustering algorithm to work properly.

from helper.cluster_analysis import *
from helper.utils import *

# Adjustable parameters
dimensionality_methods = ['UMAP', 'PCA', 'tSNE']
hdbscan_params = {"min_cluster_size": 50, "min_samples": 2, "cluster_selection_epsilon": 0.15}

df = pd.DataFrame(data)
df = df[df['embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)]
print(f"Loaded {len(df)} valid entries with embeddings.")

# Extract embeddings
mat = np.array(df['embedding'].tolist())

hdbscan_clusterer = hdbscan.HDBSCAN(**hdbscan_params)
cluster_labels = hdbscan_clusterer.fit_predict(mat)

# Add the cluster labels to the DataFrame
df['cluster_id'] = cluster_labels


Loaded 5797 valid entries with embeddings.




In [65]:
df['cluster_id'].value_counts()

cluster_id
-1     1314
 26     281
 32     252
 3      226
 0      220
 24     204
 13     190
 37     171
 19     170
 29     164
 23     164
 28     145
 27     137
 33     136
 12     136
 2      126
 30     126
 20     119
 35     110
 14     108
 22      97
 8       95
 11      93
 15      84
 6       81
 21      74
 25      72
 31      71
 7       70
 36      64
 18      63
 9       58
 34      57
 16      57
 17      56
 1       55
 4       51
 5       50
 10      50
Name: count, dtype: int64

In [66]:
save_df_as_json(df, path_db_clustered)

2025-01-29 15:37:24,362 - INFO - Saving data to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Cluster_tests\db_clustered.json


# Cluster Naming

In [72]:
from helper.cluster_naming import *

api_settings = {"client": client, "model": chat_model_name}

data_cluster = read_json(path_db_clustered)

unique_cluster_names = {}

df = pd.DataFrame(data_cluster)
df = df[df['embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)]
print(f"Loaded {len(df)} valid entries with embeddings.")


unique_clusters = df['cluster_id'].unique()

for cluster_id in unique_clusters:
    if cluster_id == -1:  # Skip noise clusters
        continue

    # Find the 10 most centric points in the cluster and store them in a list named 'topics'
    cluster_data = df[df['cluster_id'] == cluster_id]
    cluster_embeddings = np.array(cluster_data['embedding'].tolist())
    cluster_centroid = np.mean(cluster_embeddings, axis=0)
    cluster_centroid = cluster_centroid.tolist()

    distances = cosine_distances([cluster_centroid], cluster_embeddings).flatten()
    closest_indices = np.argsort(distances)[:10]
    representative_topics = cluster_data.iloc[closest_indices]['sentence'].tolist()

    #print(representative_topics)

    # Generate or retrieve cluster name
    if cluster_id not in unique_cluster_names:
        cluster_name = generate_cluster_name(representative_topics, api_settings)
        unique_cluster_names[cluster_id] = cluster_name



Loaded 5797 valid entries with embeddings.


2025-01-29 16:37:29,829 - INFO - Generated cluster name: AI Racing Challenges and Frustrations
2025-01-29 16:37:29,829 - INFO - Tokens used so far: Prompt Tokens: 300839, Completion Tokens: 102
2025-01-29 16:37:30,355 - INFO - Generated cluster name: Horse Models and Game Mechanics
2025-01-29 16:37:30,371 - INFO - Tokens used so far: Prompt Tokens: 301060, Completion Tokens: 108
2025-01-29 16:37:30,822 - INFO - Generated cluster name: Ultimate Game Praise
2025-01-29 16:37:30,822 - INFO - Tokens used so far: Prompt Tokens: 301202, Completion Tokens: 112
2025-01-29 16:37:31,291 - INFO - Generated cluster name: Ongoing Game Updates and Improvements
2025-01-29 16:37:31,291 - INFO - Tokens used so far: Prompt Tokens: 301434, Completion Tokens: 119
2025-01-29 16:37:31,762 - INFO - Generated cluster name: Game Ratings and Reviews
2025-01-29 16:37:31,777 - INFO - Tokens used so far: Prompt Tokens: 301593, Completion Tokens: 124
2025-01-29 16:37:32,279 - INFO - Generated cluster name: Mobile vs

In [73]:
unique_cluster_names

{32: 'AI Racing Challenges and Frustrations',
 26: 'Horse Models and Game Mechanics',
 0: 'Ultimate Game Praise',
 35: 'Ongoing Game Updates and Improvements',
 5: 'Game Ratings and Reviews',
 28: 'Mobile vs. PC Game Experience',
 12: '"Passion for Amazing Games"',
 13: 'Game Enjoyment and Fun Factor',
 9: 'Horse Customization Options',
 7: 'Value Perception of Video Games',
 8: 'Game Experience Insights',
 29: 'Cloud and Mobile Gaming Experience',
 33: 'Gameplay Difficulty and Engagement Balance',
 37: 'Dynamic Equestrian Customization and Fun',
 30: 'Story Engagement and Interaction Issues',
 23: 'Horse Breeding Enthusiasm',
 18: 'Character Interaction in Horse Games',
 20: 'Fast Gold Earning Strategies',
 27: 'Storage Management in Gaming',
 24: 'Competitive Horse Racing Experience',
 1: "Horse Lovers' Gaming Experience",
 25: 'Horse Breeding and Racing Dynamics',
 4: 'Engaging Horse Racing Fun',
 3: 'Impressive Graphics in Gaming',
 2: 'Highly Recommended Horse Game',
 11: 'Equine 

In [74]:
# store the cluster names
df['cluster_name'] = df['cluster_id'].apply(lambda x: unique_cluster_names[x] if x in unique_cluster_names else "Unknown")

In [76]:
df.head()

Unnamed: 0,app_id_name,recommendationid,playtime_at_review_minutes,last_played,review_text,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,...,received_for_free,written_during_early_access,language,topic,sentiment,category,sentence,embedding,cluster_id,cluster_name
0,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,179387177,1775,1735228376,Very detailed and it has lot's to do.,1732047413,True,0,0,0.0,...,False,False,english,Game Detail,Positive,fact,Very detailed and it has lot's to do.,"[-2.1711172461509705, -7.140288352966309, -1.3...",32,AI Racing Challenges and Frustrations
1,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,False,False,english,Horse Care Mechanics,Negative,request,I DO wish you could care for your horses (e.g....,"[18.323121070861816, 6.4783313274383545, -5.73...",26,Horse Models and Game Mechanics
2,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,False,False,english,Overall Enjoyment,Positive,fact,"definitely a favourite go-to game of mine, I c...","[-23.07435281059959, -10.297671816565774, 6.20...",0,Ultimate Game Praise
3,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178350425,2492,1736827844,i love this game one phone and pc. i just wish...,1730675154,True,0,0,0.0,...,False,False,english,Cross-Platform Updates,Negative,request,I just wish it would update at the same time!,"[1.4599988460540771, 18.51365025838216, 3.0994...",35,Ongoing Game Updates and Improvements
4,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,177351085,3678,1729362177,"""It's wonderful, simply one of the best horse ...",1729361303,True,0,0,0.0,...,False,False,french,Overall Quality,Positive,fact,"It's wonderful, simply one of the best horse g...","[-23.896740408504712, 9.22358378242044, -0.476...",5,Game Ratings and Reviews


In [77]:
save_data_for_streamlit(df, path_db_final)

2025-01-29 16:40:22,416 - INFO - Saving updated data to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Cluster_tests\db_final.json
2025-01-29 16:40:22,558 - INFO - Data saved successfully.
