In [1]:
import os
import openai
from numpy.ma.core import shape

from helper.utils import *
from dotenv import load_dotenv

# Read variables from a local .env file into the environment, pass the OpenAI
# key to the SDK, and create the client object shared by the rest of the notebook.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

# Model identifiers: the chat model is handed to configure_api (helper, effect
# not visible here); the embedding model name is used by the local embedding cell.
chat_model_name, embed_model_name = 'gpt-4o-mini', "all-MiniLM-L6-v2"
configure_api(client, chat_model_name)

# Input and output directories.
# NOTE(review): these are absolute paths on a mapped drive; the notebook will
# only run where that drive layout exists.
data_source = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Steam'
data_storage = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Cluster_tests'

# JSON files read from / written to the source directory ...
path_db_analysed, path_db_embedded = (
    os.path.join(data_source, file_name)
    for file_name in ("db_analysed.json", "db_embedded.json")
)
# ... and the ones kept in the storage directory.
path_db_clustered, path_db_final = (
    os.path.join(data_storage, file_name)
    for file_name in ("db_clustered.json", "db_final.json")
)

# Extract unique topics

In [2]:
# Motivation: cluster algorithms can force-split identical datapoints, so
# entries that share a topic later receive the mean of their embeddings.
# This cell only counts how many distinct topic names exist.
data = read_json(path_db_embedded)

# Every distinct 'topic' value found across the loaded entries.
unique_topics = {record['topic'] for record in data}

print(f'Original topics: {len(data)}')
print(f'Unique topics: {len(unique_topics)}')

Original topics: 5797
Unique topics: 2578


# Use OpenAI to generate embeddings

In [15]:
# Reload the analysed reviews and display the 'topics' value of the first entry.
data = read_json(path_db_analysed)
data[0]['topics']


[{'topic': 'Game Detail',
  'sentiment': 'Positive',
  'category': 'fact',
  'sentence': "Very detailed and it has lot's to do."}]

In [2]:

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced by spaces before the request is made.
    The request goes through the module-level ``client``; the embedding of the
    first (and only) input item in the response is returned.

    Args:
        text: String to embed.
        model: Name of the embedding model to request.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding



# Key of each topic dict whose text is sent to the embedding function.
embed_key = "sentence"  # "topic" or "sentence"
# Analysed reviews; each entry is expected to carry a "topics" list (see process_embedding).
data = read_json(path_db_analysed)

def process_embedding(data, embed_key):
    """Attach an OpenAI embedding to every dict-typed topic of every entry.

    The topic dicts are modified in place (an "embedding" key is added) and
    the same ``data`` object is returned. Topics that are not dicts are left
    untouched. Progress is logged every 10th entry.

    Args:
        data: Sequence of entries, each with a "topics" iterable.
        embed_key: Key of the topic dict whose text is embedded.
    """
    for i, record in enumerate(data):
        if i % 10 == 0:
            logger.info(f"Processing entry {i}")

        for topic_item in record["topics"]:
            if not isinstance(topic_item, dict):
                continue
            topic_item["embedding"] = get_embedding(topic_item[embed_key], model="text-embedding-3-small")
    return data


# process_embedding edits the topic dicts in place and returns its argument,
# so data_embedded and data refer to the same object.
data_embedded = process_embedding(data, embed_key)


# Flatten
def flatten_data(data):
    """Expand each entry into one flat record per topic.

    For every entry, the "topics" value is set aside and each topic dict in it
    is merged onto a shallow copy of the entry's remaining fields (topic keys
    win on a name collision). The input entries are not modified.

    Topics that are not dicts are skipped: the embedding step only annotates
    dict topics, and merging anything else would raise inside ``dict.update``.
    A missing or ``None`` "topics" value is treated as "no topics", so such an
    entry contributes no rows.

    Args:
        data: Iterable of dict entries, each optionally holding a "topics" list.

    Returns:
        A new list with one dict per (entry, dict-topic) pair, in input order.
    """
    flattened = []
    for entry in data:
        # Everything except the nested topic list is repeated on every row.
        base_fields = {key: value for key, value in entry.items() if key != "topics"}

        for topic in entry.get("topics") or []:
            if not isinstance(topic, dict):
                continue
            flattened.append({**base_fields, **topic})
    return flattened


# One flat record per topic of each entry.
data_flattened = flatten_data(data_embedded)

# Save the embedded data
# NOTE(review): `json` does not appear in this notebook's visible import lines;
# presumably it arrives through `from helper.utils import *` — confirm.
with open(path_db_embedded, "w") as output_file:
    json.dump(data_flattened, output_file, indent=4)


2025-02-04 07:59:42,574 - INFO - Processing entry 0
2025-02-04 07:59:53,961 - INFO - Processing entry 10
2025-02-04 08:00:04,901 - INFO - Processing entry 20
2025-02-04 08:00:12,861 - INFO - Processing entry 30
2025-02-04 08:00:25,006 - INFO - Processing entry 40
2025-02-04 08:00:37,008 - INFO - Processing entry 50
2025-02-04 08:00:48,422 - INFO - Processing entry 60
2025-02-04 08:00:56,452 - INFO - Processing entry 70
2025-02-04 08:01:06,392 - INFO - Processing entry 80
2025-02-04 08:01:19,296 - INFO - Processing entry 90
2025-02-04 08:01:35,694 - INFO - Processing entry 100
2025-02-04 08:01:48,046 - INFO - Processing entry 110
2025-02-04 08:02:02,472 - INFO - Processing entry 120
2025-02-04 08:02:13,103 - INFO - Processing entry 130
2025-02-04 08:02:22,537 - INFO - Processing entry 140
2025-02-04 08:02:34,791 - INFO - Processing entry 150
2025-02-04 08:04:30,950 - INFO - Retrying request to /embeddings in 0.390738 seconds
2025-02-04 08:04:40,841 - INFO - Processing entry 160
2025-02-

# Local Embedding

In [None]:
from helper.embedding import *

# Key of each topic dict whose text is embedded.
embed_key = "topic"  # "topic" or "sentence"

# Reload the analysed reviews and build the local embedding model from the
# model name set in the configuration cell.
data = read_json(path_db_analysed)
embed_model = initialize_embedding_model(embed_model_name)


def process_embedding(data, embed_key):
    """Attach a locally computed embedding to every dict-typed topic.

    Each topic dict gains an "embedding" key holding the result of
    ``embed_text`` called with the topic's ``embed_key`` text and the
    module-level ``embed_model``. The dicts are changed in place and the same
    ``data`` object is returned; non-dict topics are left untouched. Progress
    is logged every 10th entry.

    Args:
        data: Sequence of entries, each with a "topics" iterable.
        embed_key: Key of the topic dict whose text is embedded.
    """
    for i, record in enumerate(data):
        if i % 10 == 0:
            logger.info(f"Processing entry {i}")

        for topic_item in record["topics"]:
            if not isinstance(topic_item, dict):
                continue
            topic_item["embedding"] = embed_text(topic_item[embed_key], embed_model)
    return data


# process_embedding edits the topic dicts in place and returns its argument,
# so data_embedded and data refer to the same object.
data_embedded = process_embedding(data, embed_key)


# Flatten
def flatten_data(data):
    """Expand each entry into one flat record per topic.

    For every entry, the "topics" value is set aside and each topic dict in it
    is merged onto a shallow copy of the entry's remaining fields (topic keys
    win on a name collision). The input entries are not modified.

    Topics that are not dicts are skipped: the embedding step only annotates
    dict topics, and merging anything else would raise inside ``dict.update``.
    A missing or ``None`` "topics" value is treated as "no topics", so such an
    entry contributes no rows.

    Args:
        data: Iterable of dict entries, each optionally holding a "topics" list.

    Returns:
        A new list with one dict per (entry, dict-topic) pair, in input order.
    """
    flattened = []
    for entry in data:
        # Everything except the nested topic list is repeated on every row.
        base_fields = {key: value for key, value in entry.items() if key != "topics"}

        for topic in entry.get("topics") or []:
            if not isinstance(topic, dict):
                continue
            flattened.append({**base_fields, **topic})
    return flattened


# One flat record per topic of each entry.
data_flattened = flatten_data(data_embedded)

# Save the embedded data
# Note: this writes to the same path_db_embedded file as the OpenAI embedding
# cell, so whichever of the two cells runs last determines the file's content.
with open(path_db_embedded, "w") as output_file:
    json.dump(data_flattened, output_file, indent=4)

# Reduce dimensions

In [3]:
# Reduce the dimensions with UMAP (t-SNE is kept below as a commented-out alternative) and replace the old embeddings with the new ones
from sklearn.manifold import TSNE
import umap
import matplotlib.pyplot as plt

# Load the flattened records that carry one embedding each.
data = read_json(path_db_embedded)
# Gather all embeddings
embeddings = [entry['embedding'] for entry in data]

# Convert to numpy array
X = np.array(embeddings)

# Alternative reducer, kept for reference (t-SNE):
# X_embedded = TSNE(n_components=3).fit_transform(X)

# Perform UMAP down to 12 components. UMAP is stochastic; random_state is
# pinned (same seed value as the KMeans step) so that re-running the cell
# yields the same reduced embeddings.
X_embedded = umap.UMAP(n_components=12, random_state=42).fit_transform(X)

# Update the embeddings in the data (row i of X_embedded belongs to data[i])
for i, entry in enumerate(data):
    entry['embedding'] = X_embedded[i].tolist()

# NOTE(review): the reduced embeddings are only held in memory at this point;
# no save call follows in this cell.





In [13]:
print(shape(embeddings))

(5797, 1536)


In [5]:
data[5]

{'app_id_name': '1166860_Rival_Stars_Horse_Racing_Desktop_Edition',
 'recommendationid': 177351085,
 'playtime_at_review_minutes': 3678,
 'last_played': 1729362177,
 'review_text': '"It\'s wonderful, simply one of the best horse games I\'ve ever played, although for the price, they could pay more attention to the PC version."',
 'timestamp_updated': 1729361303,
 'voted_up': True,
 'votes_up': 0,
 'votes_funny': 0,
 'weighted_vote_score': 0.0,
 'steam_purchase': True,
 'received_for_free': False,
 'written_during_early_access': False,
 'language': 'french',
 'topic': 'PC Version Optimization',
 'sentiment': 'Negative',
 'category': 'request',
 'sentence': 'Although for the price, they could pay more attention to the PC version.',
 'embedding': [6.496478080749512,
  5.828295707702637,
  4.334468841552734,
  5.238367557525635,
  4.680978775024414,
  2.032257080078125,
  3.6438417434692383,
  6.340383529663086,
  4.686120986938477,
  6.606916904449463,
  3.5240108966827393,
  8.15054798126

In [5]:
# Keep a second name for the current records. This binds the same object
# (no copy is made), so later in-place edits to `data` are visible through
# `save_data` as well.
save_data = data

In [14]:
import numpy as np

# Start every known topic with its own empty bucket, then file each record's
# embedding under the record's topic. A topic missing from unique_topics
# raises KeyError.
topic_to_embeddings = {topic_name: list() for topic_name in unique_topics}

for record in data:
    topic_to_embeddings[record['topic']].append(record['embedding'])

In [15]:
topic_to_embeddings['Game Detail']

[[6.85319185256958,
  3.694009780883789,
  2.739372968673706,
  6.121905326843262,
  3.57608962059021,
  1.6551101207733154,
  2.864488124847412,
  5.983946323394775,
  4.5477705001831055,
  6.262317657470703,
  3.809366226196289,
  7.028370380401611],
 [7.30991268157959,
  3.059624433517456,
  1.8706492185592651,
  6.117043972015381,
  4.280555248260498,
  1.7277992963790894,
  2.91556715965271,
  6.469871520996094,
  5.118198871612549,
  5.977120876312256,
  4.279849529266357,
  7.181881427764893],
 [7.25780725479126,
  3.139129161834717,
  1.6743881702423096,
  6.093160152435303,
  4.175294399261475,
  1.7032465934753418,
  2.8823049068450928,
  6.412700653076172,
  5.125767230987549,
  5.880000114440918,
  4.477870464324951,
  7.1107707023620605],
 [7.268645286560059,
  3.140059232711792,
  1.65809965133667,
  6.085731029510498,
  4.159914493560791,
  1.6944435834884644,
  2.878206968307495,
  6.415842533111572,
  5.123677730560303,
  5.866689682006836,
  4.512704372406006,
  7.093

In [16]:
# Per topic: stack the collected embeddings, average them component-wise
# (axis=0 keeps the embedding dimension) and store the result as a plain list.
topic_to_mean_embedding = {
    topic_name: np.mean(np.array(vectors), axis=0).tolist()
    for topic_name, vectors in topic_to_embeddings.items()
}

In [17]:
topic_to_mean_embedding['Game Detail']

[7.172389268875122,
 3.2582056522369385,
 1.9856275022029877,
 6.104460120201111,
 4.047963440418243,
 1.6951498985290527,
 2.8851417899131775,
 6.320590257644653,
 4.9788535833358765,
 5.996532082557678,
 4.269947648048401,
 7.103637456893921]

In [57]:
(-2.168487071990967 -2.173760414123535 -2.168487071990967 -2.173734426498413) / 4

-2.1711172461509705

In [10]:
# Overwrite every record's embedding with the mean vector of its topic.
# Records with the same topic end up referencing the same list object.
for record in data:
    record['embedding'] = topic_to_mean_embedding[record['topic']]


In [14]:
data[0]

{'app_id_name': '1166860_Rival_Stars_Horse_Racing_Desktop_Edition',
 'recommendationid': 179387177,
 'playtime_at_review_minutes': 1775,
 'last_played': 1735228376,
 'review_text': "Very detailed and it has lot's to do.",
 'timestamp_updated': 1732047413,
 'voted_up': True,
 'votes_up': 0,
 'votes_funny': 0,
 'weighted_vote_score': 0.0,
 'steam_purchase': True,
 'received_for_free': False,
 'written_during_early_access': False,
 'language': 'english',
 'topic': 'Game Detail',
 'sentiment': 'Positive',
 'category': 'fact',
 'sentence': "Very detailed and it has lot's to do.",
 'embedding': [10.249449253082275,
  0.912435308098793,
  3.8314263820648193,
  4.515610575675964,
  3.752143383026123,
  4.7079079151153564,
  6.168004631996155,
  4.5352208614349365,
  4.2640591859817505,
  3.572487235069275,
  4.389563322067261,
  5.2080641984939575]}

# Cluster analysis

## HDBSCAN

In [3]:
# The clustering does not perform too well. Some datapoints that clearly should be in a cluster (based on eyeballing and their topic name) are not. Quite often, points in a dense cluster are categorized as noise.
# I will try to improve this by first performing a dimension reduction and then clustering. The reason is that in high dimensions the data might be too sparse for the clustering algorithm to work properly.

from helper.cluster_analysis import *
from helper.utils import *

# Tunable settings: reducers used for the plot coordinates and the keyword
# arguments forwarded to HDBSCAN.
dimensionality_methods = ['UMAP', 'PCA', 'tSNE']
hdbscan_params = {"min_cluster_size": 50, "min_samples": 2, "cluster_selection_epsilon": 0.15}

# Load the stored records and keep only rows whose embedding is a non-empty list.
data = read_json(path_db_clustered)
df = pd.DataFrame(data)
has_embedding = df['embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
df = df[has_embedding]
print(f"Loaded {len(df)} valid entries with embeddings.")

# Embedding matrix, one row per remaining entry.
mat = np.array(df['embedding'].tolist())

# Cluster on the embedding matrix itself; the reductions below only supply
# 2D/3D coordinates and do not feed into the clustering.
hdbscan_clusterer = hdbscan.HDBSCAN(**hdbscan_params)
cluster_labels = hdbscan_clusterer.fit_predict(mat)

# For each reducer compute a 2D and a 3D projection (2D first) and store the
# individual axes under a key naming the method and dimensionality.
reduction_results = {}
for method in dimensionality_methods:
    coords_2d = dimensionality_reduction(mat, method, n_components=2)
    coords_3d = dimensionality_reduction(mat, method, n_components=3)
    reduction_results.update({
        f'hdbscan_{method}_2D': {'x': coords_2d[:, 0], 'y': coords_2d[:, 1]},
        f'hdbscan_{method}_3D': {'x': coords_3d[:, 0], 'y': coords_3d[:, 1], 'z': coords_3d[:, 2]},
    })

# One DataFrame column per stored axis, named "<method_dim>_<axis>".
for method_dim, coords in reduction_results.items():
    for axis, values in coords.items():
        df[f'{method_dim}_{axis}'] = values

# Cluster labels go in last so the id column follows the coordinate columns.
df['hdbscan_id'] = cluster_labels


2025-02-10 08:17:21,423 - INFO - Applying UMAP with 2 components.


Loaded 5797 valid entries with embeddings.


2025-02-10 08:17:36,106 - INFO - Applying UMAP with 3 components.
2025-02-10 08:17:36,981 - INFO - Applying PCA with 2 components.
2025-02-10 08:17:36,993 - INFO - Applying PCA with 3 components.
2025-02-10 08:17:36,995 - INFO - Applying tSNE with 2 components.
2025-02-10 08:17:36,995 - INFO - Perplexity not provided, setting to 30 based on sample size.
2025-02-10 08:17:45,774 - INFO - Applying tSNE with 3 components.
2025-02-10 08:17:45,774 - INFO - Perplexity not provided, setting to 30 based on sample size.


In [4]:
df.head()

Unnamed: 0,app_id_name,recommendationid,playtime_at_review_minutes,last_played,review_text,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,...,hdbscan_PCA_2D_y,hdbscan_PCA_3D_x,hdbscan_PCA_3D_y,hdbscan_PCA_3D_z,hdbscan_tSNE_2D_x,hdbscan_tSNE_2D_y,hdbscan_tSNE_3D_x,hdbscan_tSNE_3D_y,hdbscan_tSNE_3D_z,hdbscan_id
0,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,179387177,1775,1735228376,Very detailed and it has lot's to do.,1732047413,True,0,0,0.0,...,-0.679641,2.452605,-0.679641,-0.155295,37.977051,-8.820072,9.040452,-2.834458,-6.749606,-1
1,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,0.627039,-2.773887,0.627039,0.419794,-44.862949,-2.636445,-6.351776,5.714776,3.649575,13
2,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,-1.750435,3.868619,-1.750435,0.231251,53.838345,-43.816601,11.260092,-15.390235,2.928083,21
3,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178350425,2492,1736827844,i love this game one phone and pc. i just wish...,1730675154,True,0,0,0.0,...,2.065378,1.794274,2.065378,0.907473,30.898285,53.160809,9.040145,10.655698,6.071946,12
4,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,177351085,3678,1729362177,"""It's wonderful, simply one of the best horse ...",1729361303,True,0,0,0.0,...,-1.937389,-4.134001,-1.937389,0.981432,-24.105824,-57.374065,-12.308875,-17.023937,8.806252,22


## KMeans

In [5]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

from helper.cluster_analysis import dimensionality_reduction


# Tunable settings: reducers used for the plot coordinates and the cluster
# counts to try with KMeans.
dimensionality_methods = ['UMAP', 'PCA', 'tSNE']
kmeans_clusters = [15, 20, 25, 50]

# Reuses the `df` already present in the kernel; keep only rows whose
# embedding is a non-empty list.
has_embedding = df['embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
df = df[has_embedding]
print(f"Loaded {len(df)} valid entries with embeddings.")

# Embedding matrix, one row per remaining entry.
mat = np.array(df['embedding'].tolist())  # shape (n_samples, n_dimensions)

# --- KMeans on the embedding matrix, once per requested cluster count ---
for n_clusters in kmeans_clusters:
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans_model.fit(mat)
    cluster_labels = kmeans_model.labels_

    logger.info(f"Found {len(np.unique(cluster_labels))} clusters for KMeans with {n_clusters} clusters.")

    # Labels land in a column named after the cluster count.
    df[f'kmeans_{n_clusters}_id'] = cluster_labels

# --- Dimensionality reduction: each method once in 2D and once in 3D ---
reduction_results = {}
for method in dimensionality_methods:
    coords_2d = dimensionality_reduction(mat, method, n_components=2)
    coords_3d = dimensionality_reduction(mat, method, n_components=3)
    reduction_results.update({
        f'kmeans_{method}_2D': {'x': coords_2d[:, 0], 'y': coords_2d[:, 1]},
        f'kmeans_{method}_3D': {'x': coords_3d[:, 0], 'y': coords_3d[:, 1], 'z': coords_3d[:, 2]},
    })

# --- One DataFrame column per stored axis, named "<method_dim>_<axis>" ---
for method_dim, coords in reduction_results.items():
    for axis, values in coords.items():
        df[f'{method_dim}_{axis}'] = values



2025-02-10 08:19:30,669 - INFO - Found 15 clusters for KMeans with 15 clusters.
2025-02-10 08:19:30,685 - INFO - Found 20 clusters for KMeans with 20 clusters.
2025-02-10 08:19:30,712 - INFO - Found 25 clusters for KMeans with 25 clusters.
2025-02-10 08:19:30,756 - INFO - Found 50 clusters for KMeans with 50 clusters.
2025-02-10 08:19:30,758 - INFO - Applying UMAP with 2 components.


Loaded 5797 valid entries with embeddings.


2025-02-10 08:19:32,050 - INFO - Applying UMAP with 3 components.
2025-02-10 08:19:32,844 - INFO - Applying PCA with 2 components.
2025-02-10 08:19:32,846 - INFO - Applying PCA with 3 components.
2025-02-10 08:19:32,848 - INFO - Applying tSNE with 2 components.
2025-02-10 08:19:32,848 - INFO - Perplexity not provided, setting to 30 based on sample size.
2025-02-10 08:19:41,628 - INFO - Applying tSNE with 3 components.
2025-02-10 08:19:41,628 - INFO - Perplexity not provided, setting to 30 based on sample size.


In [6]:
df.head()

Unnamed: 0,app_id_name,recommendationid,playtime_at_review_minutes,last_played,review_text,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,...,kmeans_PCA_2D_x,kmeans_PCA_2D_y,kmeans_PCA_3D_x,kmeans_PCA_3D_y,kmeans_PCA_3D_z,kmeans_tSNE_2D_x,kmeans_tSNE_2D_y,kmeans_tSNE_3D_x,kmeans_tSNE_3D_y,kmeans_tSNE_3D_z
0,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,179387177,1775,1735228376,Very detailed and it has lot's to do.,1732047413,True,0,0,0.0,...,2.452605,-0.679641,2.452605,-0.679641,-0.155295,37.977051,-8.820072,9.040452,-2.834458,-6.749606
1,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,-2.773887,0.627039,-2.773887,0.627039,0.419794,-44.862949,-2.636445,-6.351776,5.714776,3.649575
2,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,3.868619,-1.750435,3.868619,-1.750435,0.231251,53.838345,-43.816601,11.260092,-15.390235,2.928083
3,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178350425,2492,1736827844,i love this game one phone and pc. i just wish...,1730675154,True,0,0,0.0,...,1.794274,2.065378,1.794274,2.065378,0.907473,30.898285,53.160809,9.040145,10.655698,6.071946
4,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,177351085,3678,1729362177,"""It's wonderful, simply one of the best horse ...",1729361303,True,0,0,0.0,...,-4.134001,-1.937389,-4.134001,-1.937389,0.981432,-24.105824,-57.374065,-12.308875,-17.023937,8.806252


In [7]:
save_df_as_json(df, path_db_clustered)

2025-02-10 08:20:57,325 - INFO - Saving data to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Cluster_tests\db_clustered.json


# Cluster Naming

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

from helper.cluster_naming import *

# Client and chat model bundled for generate_cluster_name; name_clusters reads
# this as a module-level name rather than taking it as a parameter.
api_settings = {"client": client, "model": chat_model_name}

def name_clusters(
    df,
    cluster_columns,
    embedding_col="embedding",
    text_col="sentence",
    top_k=10,
    skip_noise_label=-1
):
    """Add a "<column>_name" column for every cluster-id column in ``df``.

    For each cluster id in a column, the rows of that cluster are collected,
    the mean of their embeddings is taken as centroid, and the ``top_k`` rows
    with the smallest cosine distance to the centroid supply the texts passed
    to ``generate_cluster_name`` (together with the module-level
    ``api_settings``). Rows whose cluster id received no name — the skipped
    noise label, or an id that matches no rows — are labelled "Noise".

    Args:
        df: DataFrame holding the cluster-id, embedding and text columns.
            It is modified in place.
        cluster_columns: Names of the cluster-id columns to process.
        embedding_col: Column holding one embedding per row.
        text_col: Column holding the text shown to the naming function.
        top_k: Maximum number of representative texts per cluster.
        skip_noise_label: Cluster id to leave unnamed; ``None`` disables the skip.

    Returns:
        The same ``df`` object, with the name columns added.
    """
    for col in cluster_columns:
        logger.info((f'Preparing to name clusters in column "{col}"'))

        # cluster id -> generated name, for this column only
        names_by_id = {}

        for cluster_id in df[col].unique():
            is_noise = skip_noise_label is not None and cluster_id == skip_noise_label
            if is_noise:
                continue

            members = df[df[col] == cluster_id]
            if members.empty:
                continue

            # Centroid keeps a leading axis (keepdims) so it can be passed
            # straight to cosine_distances as a one-row matrix.
            vectors = np.array(members[embedding_col].tolist())
            centroid = vectors.mean(axis=0, dtype=np.float32, keepdims=True)

            # Member positions ordered from closest to farthest from the centroid.
            ranked = np.argsort(cosine_distances(centroid, vectors).flatten())
            nearest_texts = members.iloc[ranked[:top_k]][text_col].tolist()

            names_by_id[cluster_id] = generate_cluster_name(nearest_texts, api_settings)

        df[f"{col}_name"] = df[col].apply(lambda cid: names_by_id.get(cid, "Noise"))

    return df


# Reload the clustered records from disk and wrap them in a DataFrame.
data = read_json(path_db_clustered)  # presumably a list of dicts — confirm against read_json
df = pd.DataFrame(data)              # Convert to DataFrame

# Cluster-id columns to name: the HDBSCAN labels plus one column per KMeans cluster count.
cluster_columns = ['hdbscan_id', 'kmeans_15_id', 'kmeans_20_id', 'kmeans_25_id', 'kmeans_50_id']

# name_clusters adds a "<column>_name" column per listed column and returns the
# frame it was given, so df_named and df are the same object.
df_named = name_clusters(
    df,
    cluster_columns,
    embedding_col="embedding",
    text_col="sentence",
    top_k=10,
    skip_noise_label=-1  # for HDBSCAN noise
)


In [14]:
df_named.head()

Unnamed: 0,app_id_name,recommendationid,playtime_at_review_minutes,last_played,review_text,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,...,kmeans_tSNE_2D_x,kmeans_tSNE_2D_y,kmeans_tSNE_3D_x,kmeans_tSNE_3D_y,kmeans_tSNE_3D_z,hdbscan_id_name,kmeans_15_id_name,kmeans_20_id_name,kmeans_25_id_name,kmeans_50_id_name
0,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,179387177,1775,1735228376,Very detailed and it has lot's to do.,1732047413,True,0,0,0.0,...,37.977051,-8.820072,9.040452,-2.834458,-6.749606,Noise,Endlessly Engaging Gameplay Experience,Endlessly Engaging Gameplay Experience,Endlessly Engaging Gameplay Experience,Charming Quests and Story-driven Adventures
1,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,-44.862949,-2.636445,-6.351776,5.714776,3.649575,Horse Interaction and Care体验,Horse Ownership and Player Interaction,Horse Ranch Customization and Social Features,Enhanced Horse Care Features,Limited Horse Interaction Experience
2,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,53.838345,-43.816601,11.260092,-15.390235,2.928083,Beloved Favorite Games,Game Appreciation and Enjoyment,Game Love and Appreciation,Game Enthusiasm and Praise,Game Love and Appreciation
3,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178350425,2492,1736827844,i love this game one phone and pc. i just wish...,1730675154,True,0,0,0.0,...,30.898285,53.160809,9.040145,10.655698,6.071946,Mobile vs. Desktop Gaming Experiences,Cross-Platform Gaming Enhancements,Cross-Platform Game Improvements,PC and Mobile Game Disparities,Mobile Version Update Disparity
4,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,177351085,3678,1729362177,"""It's wonderful, simply one of the best horse ...",1729361303,True,0,0,0.0,...,-24.105824,-57.374065,-12.308875,-17.023937,8.806252,Favorite Horse Games Reviewed,Realistic Horse Gaming Experience,Ultimate Horse Gaming Experience,Horse Game Enthusiast Community,Top Horse Management Games


In [15]:
save_data_for_streamlit(df_named, path_db_final)

2025-02-10 08:45:13,670 - INFO - Saving updated data to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Cluster_tests\db_final.json
2025-02-10 08:45:14,144 - INFO - Data saved successfully.
