In [1]:
import os
import openai
from numpy.ma.core import shape

from helper.utils import *
from dotenv import load_dotenv

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

chat_model_name = 'gpt-4o-mini'
embed_model_name = "all-MiniLM-L6-v2"
configure_api(client, chat_model_name)

data_source = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Steam'
data_storage = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Cluster_tests'

path_db_analysed = os.path.join(data_source, "db_analysed.json")
path_db_embedded = os.path.join(data_source, "db_embedded.json")
path_db_clustered = os.path.join(data_storage, "db_clustered.json")
path_db_final = os.path.join(data_storage, "db_final.json")

# Extract unique topics

In [2]:
# Cluster algorithms force split datapoints even if they are identical. I will replace the embeddings of identical datapoints with the mean of their embeddings.
# Load the JSON data
data = read_json(path_db_embedded)
# data is your JSON array (list of dicts)
unique_topics = set()

for entry in data:
    unique_topics.add(entry['topic'])

# Now unique_topics is a set of all topic names
print(f'Original topics: {len(data)}')
print(f'Unique topics: {len(unique_topics)}')

Original topics: 5797
Unique topics: 2578


# Use OpenAI to generate embeddings

In [15]:
data = read_json(path_db_analysed)
data[0]['topics']


[{'topic': 'Game Detail',
  'sentiment': 'Positive',
  'category': 'fact',
  'sentence': "Very detailed and it has lot's to do."}]

In [2]:

def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   embedding = client.embeddings.create(input = [text], model=model).data[0].embedding
   return embedding



embed_key = "sentence"  # "topic" or "sentence"
data = read_json(path_db_analysed)

def process_embedding(data, embed_key):
    for i in range(0, len(data)):
        if i % 10 == 0:
            logger.info(f"Processing entry {i}")

        for d_topic in data[i]["topics"]:
            if isinstance(d_topic, dict):
                d_topic["embedding"] = get_embedding(d_topic[embed_key], model="text-embedding-3-small")
    return data


data_embedded = process_embedding(data, embed_key)


# Flatten
def flatten_data(data):
    flattened = []
    for entry in data:
        base_copy = dict(entry)
        topics = base_copy.pop("topics", [])

        for topic in topics:
            new_entry = dict(base_copy)
            new_entry.update(topic)
            flattened.append(new_entry)
    return flattened


data_flattened = flatten_data(data_embedded)

# Save the embedded data
with open(path_db_embedded, "w") as output_file:
    json.dump(data_flattened, output_file, indent=4)


2025-02-04 07:59:42,574 - INFO - Processing entry 0
2025-02-04 07:59:53,961 - INFO - Processing entry 10
2025-02-04 08:00:04,901 - INFO - Processing entry 20
2025-02-04 08:00:12,861 - INFO - Processing entry 30
2025-02-04 08:00:25,006 - INFO - Processing entry 40
2025-02-04 08:00:37,008 - INFO - Processing entry 50
2025-02-04 08:00:48,422 - INFO - Processing entry 60
2025-02-04 08:00:56,452 - INFO - Processing entry 70
2025-02-04 08:01:06,392 - INFO - Processing entry 80
2025-02-04 08:01:19,296 - INFO - Processing entry 90
2025-02-04 08:01:35,694 - INFO - Processing entry 100
2025-02-04 08:01:48,046 - INFO - Processing entry 110
2025-02-04 08:02:02,472 - INFO - Processing entry 120
2025-02-04 08:02:13,103 - INFO - Processing entry 130
2025-02-04 08:02:22,537 - INFO - Processing entry 140
2025-02-04 08:02:34,791 - INFO - Processing entry 150
2025-02-04 08:04:30,950 - INFO - Retrying request to /embeddings in 0.390738 seconds
2025-02-04 08:04:40,841 - INFO - Processing entry 160
2025-02-

# Local Embedding

In [None]:
from helper.embedding import *

embed_key = "topic"  # "topic" or "sentence"

data = read_json(path_db_analysed)
embed_model = initialize_embedding_model(embed_model_name)


def process_embedding(data, embed_key):
    for i in range(0, len(data)):
        if i % 10 == 0:
            logger.info(f"Processing entry {i}")

        for d_topic in data[i]["topics"]:
            if isinstance(d_topic, dict):
                d_topic["embedding"] = embed_text(d_topic[embed_key], embed_model)
    return data


data_embedded = process_embedding(data, embed_key)


# Flatten
def flatten_data(data):
    flattened = []
    for entry in data:
        base_copy = dict(entry)
        topics = base_copy.pop("topics", [])

        for topic in topics:
            new_entry = dict(base_copy)
            new_entry.update(topic)
            flattened.append(new_entry)
    return flattened


data_flattened = flatten_data(data_embedded)

# Save the embedded data
with open(path_db_embedded, "w") as output_file:
    json.dump(data_flattened, output_file, indent=4)

# reduce dimensions

In [3]:
# Reduce the dimensions with t-SNE and replace the old embeddings with the new ones
from sklearn.manifold import TSNE
import umap
import matplotlib.pyplot as plt

data = read_json(path_db_embedded)
# Gather all embeddings
embeddings = [entry['embedding'] for entry in data]

# Convert to numpy array
X = np.array(embeddings)

# Perform t-SNE
# X_embedded = TSNE(n_components=3).fit_transform(X)

# Perform UMAP
X_embedded = umap.UMAP(n_components=12).fit_transform(X)

# Update the embeddings in the data
for i, entry in enumerate(data):
    entry['embedding'] = X_embedded[i].tolist()

# Save the updated data





In [4]:
print(shape(embeddings))

(5797, 1536)


In [5]:
data[5]

{'app_id_name': '1166860_Rival_Stars_Horse_Racing_Desktop_Edition',
 'recommendationid': 177351085,
 'playtime_at_review_minutes': 3678,
 'last_played': 1729362177,
 'review_text': '"It\'s wonderful, simply one of the best horse games I\'ve ever played, although for the price, they could pay more attention to the PC version."',
 'timestamp_updated': 1729361303,
 'voted_up': True,
 'votes_up': 0,
 'votes_funny': 0,
 'weighted_vote_score': 0.0,
 'steam_purchase': True,
 'received_for_free': False,
 'written_during_early_access': False,
 'language': 'french',
 'topic': 'PC Version Optimization',
 'sentiment': 'Negative',
 'category': 'request',
 'sentence': 'Although for the price, they could pay more attention to the PC version.',
 'embedding': [6.496478080749512,
  5.828295707702637,
  4.334468841552734,
  5.238367557525635,
  4.680978775024414,
  2.032257080078125,
  3.6438417434692383,
  6.340383529663086,
  4.686120986938477,
  6.606916904449463,
  3.5240108966827393,
  8.15054798126

In [5]:
save_data = data

In [8]:
import numpy as np

# Dictionary to store all embeddings for each topic
topic_to_embeddings = {topic: [] for topic in unique_topics}

# Gather embeddings by topic
for entry in data:
    topic = entry['topic']
    # print(topic)
    embedding = entry['embedding']
    # print(embedding)
    topic_to_embeddings[topic].append(embedding)
    # print(topic_to_embeddings[topic])

NameError: name 'unique_topics' is not defined

In [6]:
topic_to_embeddings['Game Detail']

NameError: name 'topic_to_embeddings' is not defined

In [9]:
# Calculate mean embedding for each topic
topic_to_mean_embedding = {}

for topic, embeddings_list in topic_to_embeddings.items():
    # Convert to a numpy array for easy mean calculation
    np_embeddings = np.array(embeddings_list)
    mean_embedding = np.mean(np_embeddings, axis=0)  # Shape will match original embedding dimension

    # Store the mean embedding (convert back to list if you want plain Python types)
    topic_to_mean_embedding[topic] = mean_embedding.tolist()

NameError: name 'topic_to_embeddings' is not defined

In [9]:
topic_to_mean_embedding['Game Detail']

[10.249449253082275,
 0.912435308098793,
 3.8314263820648193,
 4.515610575675964,
 3.752143383026123,
 4.7079079151153564,
 6.168004631996155,
 4.5352208614349365,
 4.2640591859817505,
 3.572487235069275,
 4.389563322067261,
 5.2080641984939575]

In [57]:
(-2.168487071990967 -2.173760414123535 -2.168487071990967 -2.173734426498413) / 4

-2.1711172461509705

In [10]:
for entry in data:
    topic = entry['topic']
    entry['embedding'] = topic_to_mean_embedding[topic]


In [14]:
data[0]

{'app_id_name': '1166860_Rival_Stars_Horse_Racing_Desktop_Edition',
 'recommendationid': 179387177,
 'playtime_at_review_minutes': 1775,
 'last_played': 1735228376,
 'review_text': "Very detailed and it has lot's to do.",
 'timestamp_updated': 1732047413,
 'voted_up': True,
 'votes_up': 0,
 'votes_funny': 0,
 'weighted_vote_score': 0.0,
 'steam_purchase': True,
 'received_for_free': False,
 'written_during_early_access': False,
 'language': 'english',
 'topic': 'Game Detail',
 'sentiment': 'Positive',
 'category': 'fact',
 'sentence': "Very detailed and it has lot's to do.",
 'embedding': [10.249449253082275,
  0.912435308098793,
  3.8314263820648193,
  4.515610575675964,
  3.752143383026123,
  4.7079079151153564,
  6.168004631996155,
  4.5352208614349365,
  4.2640591859817505,
  3.572487235069275,
  4.389563322067261,
  5.2080641984939575]}

# Cluster analysis

In [10]:
# The clustering does not perform to good. Some datapoints that clearly should be in a cluster based on eyeballing and their topic name but they are not. Rather often some points in a dense cluster are categorized as noise.
# I will try to improve this by first perform a dimension reduction and then perform clustering. Reason being, that in high dimensions the data might be too sparse for the clustering algorithm to work properly.

from helper.cluster_analysis import *
from helper.utils import *

# Adjustable parameters
dimensionality_methods = ['UMAP', 'PCA', 'tSNE']
hdbscan_params = {"min_cluster_size": 50, "min_samples": 2, "cluster_selection_epsilon": 0.15}

df = pd.DataFrame(data)
df = df[df['embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)]
print(f"Loaded {len(df)} valid entries with embeddings.")

# Extract embeddings
mat = np.array(df['embedding'].tolist())

hdbscan_clusterer = hdbscan.HDBSCAN(**hdbscan_params)
cluster_labels = hdbscan_clusterer.fit_predict(mat)

reduction_results = {}

for method in dimensionality_methods:
    coords_2d = dimensionality_reduction(mat, method, n_components=2)
    reduction_results[f'{method}_2D'] = {
        'x': coords_2d[:, 0],
        'y': coords_2d[:, 1]
    }

# 3D Reduction
    coords_3d = dimensionality_reduction(mat, method, n_components=3)
    reduction_results[f'{method}_3D'] = {
        'x': coords_3d[:, 0],
        'y': coords_3d[:, 1],
        'z': coords_3d[:, 2]
    }

# Add dimensional coordinates to DataFrame
for method_dim, coords in reduction_results.items():
    for axis, values in coords.items():
        df[f'{method_dim}_{axis}'] = values

# Add the cluster labels to the DataFrame
df['cluster_id'] = cluster_labels


2025-02-04 13:21:40,947 - INFO - Applying UMAP with 2 components.


Loaded 5797 valid entries with embeddings.


2025-02-04 13:21:41,962 - INFO - Applying UMAP with 3 components.
2025-02-04 13:21:43,040 - INFO - Applying PCA with 2 components.
2025-02-04 13:21:43,048 - INFO - Applying PCA with 3 components.
2025-02-04 13:21:43,050 - INFO - Applying tSNE with 2 components.
2025-02-04 13:21:43,050 - INFO - Perplexity not provided, setting to 30 based on sample size.
2025-02-04 13:21:51,783 - INFO - Applying tSNE with 3 components.
2025-02-04 13:21:51,783 - INFO - Perplexity not provided, setting to 30 based on sample size.


In [11]:
df.head()

Unnamed: 0,app_id_name,recommendationid,playtime_at_review_minutes,last_played,review_text,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,...,PCA_2D_y,PCA_3D_x,PCA_3D_y,PCA_3D_z,tSNE_2D_x,tSNE_2D_y,tSNE_3D_x,tSNE_3D_y,tSNE_3D_z,cluster_id
0,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,179387177,1775,1735228376,Very detailed and it has lot's to do.,1732047413,True,0,0,0.0,...,-0.696643,2.389605,-0.696643,0.14972,29.548246,-15.199729,9.003181,-3.183665,5.976941,-1
1,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,0.786879,-2.908823,0.786879,-0.537476,-43.810314,-4.599304,-5.191478,8.579439,-2.87197,0
2,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,-1.625069,3.840906,-1.625069,-0.378133,51.323654,-40.445629,14.036543,-15.618323,-4.650846,19
3,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178350425,2492,1736827844,i love this game one phone and pc. i just wish...,1730675154,True,0,0,0.0,...,2.16951,1.868521,2.16951,-0.677289,30.164398,45.05862,9.804322,11.004961,-6.132709,13
4,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,177351085,3678,1729362177,"""It's wonderful, simply one of the best horse ...",1729361303,True,0,0,0.0,...,-1.919069,-4.255278,-1.919069,-0.899337,-26.896746,-56.909252,-12.050323,-17.088345,-8.572276,0


In [12]:
save_df_as_json(df, path_db_clustered)

2025-02-04 13:22:09,201 - INFO - Saving data to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Cluster_tests\db_clustered.json


# Cluster Naming

In [13]:
from helper.cluster_naming import *

api_settings = {"client": client, "model": chat_model_name}

data_cluster = read_json(path_db_clustered)

unique_cluster_names = {}

df = pd.DataFrame(data_cluster)
df = df[df['embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)]
print(f"Loaded {len(df)} valid entries with embeddings.")


unique_clusters = df['cluster_id'].unique()

for cluster_id in unique_clusters:
    if cluster_id == -1:  # Skip noise clusters
        continue

    # Find the 10 most centric points in the cluster and store them in a list named 'topics'
    cluster_data = df[df['cluster_id'] == cluster_id]
    cluster_embeddings = np.array(cluster_data['embedding'].tolist())
    cluster_centroid = np.mean(cluster_embeddings, axis=0)
    cluster_centroid = cluster_centroid.tolist()

    distances = cosine_distances([cluster_centroid], cluster_embeddings).flatten()
    closest_indices = np.argsort(distances)[:10]
    representative_topics = cluster_data.iloc[closest_indices]['sentence'].tolist()

    #print(representative_topics)

    # Generate or retrieve cluster name
    if cluster_id not in unique_cluster_names:
        cluster_name = generate_cluster_name(representative_topics, api_settings)
        unique_cluster_names[cluster_id] = cluster_name



Loaded 5797 valid entries with embeddings.


2025-02-04 13:22:27,894 - INFO - Generated cluster name: Horse Breeding and Racing Dreams
2025-02-04 13:22:27,894 - INFO - Tokens used so far: Prompt Tokens: 234, Completion Tokens: 7
2025-02-04 13:22:28,856 - INFO - Generated cluster name: Ultimate Game Affection
2025-02-04 13:22:28,869 - INFO - Tokens used so far: Prompt Tokens: 395, Completion Tokens: 12
2025-02-04 13:22:32,477 - INFO - Generated cluster name: Mobile Gaming Monetization Issues
2025-02-04 13:22:32,477 - INFO - Tokens used so far: Prompt Tokens: 636, Completion Tokens: 18
2025-02-04 13:22:33,543 - INFO - Generated cluster name: "Highly Praised Game Reviews"
2025-02-04 13:22:33,543 - INFO - Tokens used so far: Prompt Tokens: 761, Completion Tokens: 26
2025-02-04 13:22:34,519 - INFO - Generated cluster name: Enjoyable Gaming Experience
2025-02-04 13:22:34,519 - INFO - Tokens used so far: Prompt Tokens: 895, Completion Tokens: 31
2025-02-04 13:22:35,988 - INFO - Generated cluster name: Gameplay Repetition and Stagnation


In [14]:
unique_cluster_names

{0: 'Horse Breeding and Racing Dreams',
 19: 'Ultimate Game Affection',
 13: 'Mobile Gaming Monetization Issues',
 18: '"Highly Praised Game Reviews"',
 17: 'Enjoyable Gaming Experience',
 16: 'Gameplay Repetition and Stagnation',
 9: 'Value for Entertainment Experience',
 6: 'Gaming Playtime Experiences',
 8: 'Universal Game Recommendations',
 5: 'Game Crashes and Bugs Troubleshooting',
 4: 'Instant Upgrades and No Waiting',
 2: 'Stunning Graphics in Gaming',
 14: 'Relaxing Gaming Experience',
 15: 'Game Improvement and Feature Requests',
 3: 'Multiplayer Limitations and Solo Focus',
 1: 'Incredibly Realistic Gaming Experience',
 12: 'Engaging Narrative Adventures',
 11: 'Price vs. Value Assessment',
 7: 'Strong Recommendations',
 10: '"Wait for Sale Recommendations"'}

In [15]:
# store the cluster names
df['cluster_name'] = df['cluster_id'].apply(lambda x: unique_cluster_names[x] if x in unique_cluster_names else "Unknown")

In [16]:
df.head()

Unnamed: 0,app_id_name,recommendationid,playtime_at_review_minutes,last_played,review_text,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,...,PCA_3D_x,PCA_3D_y,PCA_3D_z,tSNE_2D_x,tSNE_2D_y,tSNE_3D_x,tSNE_3D_y,tSNE_3D_z,cluster_id,cluster_name
0,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,179387177,1775,1735228376,Very detailed and it has lot's to do.,1732047413,True,0,0,0.0,...,2.389605,-0.696643,0.14972,29.548246,-15.199729,9.003181,-3.183665,5.976941,-1,Unknown
1,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,-2.908823,0.786879,-0.537476,-43.810314,-4.599304,-5.191478,8.579439,-2.87197,0,Horse Breeding and Racing Dreams
2,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,3.840906,-1.625069,-0.378133,51.323654,-40.445629,14.036543,-15.618323,-4.650846,19,Ultimate Game Affection
3,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178350425,2492,1736827844,i love this game one phone and pc. i just wish...,1730675154,True,0,0,0.0,...,1.868521,2.16951,-0.677289,30.164398,45.05862,9.804322,11.004961,-6.132709,13,Mobile Gaming Monetization Issues
4,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,177351085,3678,1729362177,"""It's wonderful, simply one of the best horse ...",1729361303,True,0,0,0.0,...,-4.255278,-1.919069,-0.899337,-26.896746,-56.909252,-12.050323,-17.088345,-8.572276,0,Horse Breeding and Racing Dreams


In [17]:
save_data_for_streamlit(df, path_db_final)

2025-02-04 13:23:28,685 - INFO - Saving updated data to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Cluster_tests\db_final.json
2025-02-04 13:23:28,862 - INFO - Data saved successfully.
