In [5]:
# General modules
import os
import openai
from dotenv import load_dotenv


# Language models
load_dotenv()  # load variables such as OPENAI_API_KEY from a local .env file
openai_api_key = os.environ.get("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

chat_model_name = "gpt-4o-mini"
embed_model_name = "all-MiniLM-L6-v2"


# Paths
root_dir = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC'
steam_title = "Community"  # sub-folder of root_dir used by this section

# Raw input of this section.
path_input = os.path.join(root_dir, steam_title, "Transcript_pinehaven_stream.txt")

# One JSON file per pipeline stage, all inside <root_dir>/<steam_title>.
(
    path_db_prepared,
    path_db_translated,
    path_db_analysed,
    path_db_embedded,
    path_db_clustered,
    path_db_final,
) = (
    os.path.join(root_dir, steam_title, file_name)
    for file_name in (
        "db_prepared.json",
        "db_translated.json",
        "db_analysed.json",
        "db_embedded.json",
        "db_clustered.json",
        "db_final.json",
    )
)

In [7]:
from helper.utils import *

# Hand the OpenAI client and the chat model name to the helper module.
# NOTE(review): presumably this populates the `api_settings` mapping read by later cells — confirm in helper.utils.
configure_api(client, chat_model_name)

# Transcript Preparation

In [None]:
# Load the raw transcript as a list of lines (line endings are kept).
with open(path_input, encoding="utf-8") as transcript_file:
    transcript = list(transcript_file)

In [None]:
# Build overlapping text chunks from the transcript lines.
# A new chunk starts every CHUNK_STEP_LINES lines and spans up to CHUNK_WINDOW_LINES
# lines, so consecutive chunks share CHUNK_WINDOW_LINES - CHUNK_STEP_LINES lines.
# Each line is stripped of surrounding whitespace (including the trailing '\n')
# and the lines of one chunk are joined with single spaces.
CHUNK_WINDOW_LINES = 100  # lines per chunk
CHUNK_STEP_LINES = 40     # distance between two chunk starts

transcript_joined = [
    ' '.join(line.strip() for line in transcript[start:start + CHUNK_WINDOW_LINES])
    for start in range(0, len(transcript), CHUNK_STEP_LINES)
]


In [None]:
# Number of chunks built from the transcript.
len(transcript_joined)

In [None]:
# Character length of one sample chunk (index 5), as a quick size check.
len(transcript_joined[5])

# Analysis

In [None]:
from helper.utils import *
from helper.prompt_templates import *

configure_api(client, chat_model_name)

# Topic entries collected from the responses of all chunks.
all_entries = []

# The chunk gets its own name on purpose: binding it to `transcript` would
# overwrite the list of raw transcript lines, and re-running the chunking cell
# would then slice a single string instead of the line list.
for i, transcript_chunk in enumerate(transcript_joined):
    logger.info(f"Processing text {i}")

    prompt_influencer = prompt_template_influencer.format(transcript=transcript_chunk)
    response = api_settings["client"].chat.completions.create(
        model=api_settings["model"],
        messages=[
            {"role": "system", "content": "You are an expert in extracting video game topics from Youtube Transcripts."},
            {"role": "user", "content": prompt_influencer},
        ],
        response_format={"type": "json_object"},
        max_tokens=4096
    )
    response_json = json.loads(response.choices[0].message.content)

    # The root key of the response is not fixed, so every list-valued root
    # entry is collected; anything else is reported and ignored.
    if not isinstance(response_json, dict):
        logger.warning(f"Unexpected response structure for text {i}: {response_json}")
        continue

    for key, value in response_json.items():
        if isinstance(value, list):
            all_entries.extend(value)
        else:
            logger.warning(f"Unexpected format for key '{key}' in response {i}")

# save the entries
with open(path_db_prepared, "w") as output_file:
    json.dump(all_entries, output_file, indent=4)

In [None]:
# Generate a unique ID per entry

from helper.utils import *

# A unique ID is generated in the new column / key "response_ID".
# The prepared file is read, passed through generate_ID and written back to the same path.
data = read_json(path_db_prepared)
data_prepared = generate_ID(data)
save_to_json(data_prepared, path_db_prepared)

# Sentiment Analysis

In [None]:
# Entries that received a sentiment label, with their keys renamed to lower case.
all_entries = []
data_prepared = read_json(path_db_prepared)

for i, entry in enumerate(data_prepared):
    logger.info(f"Process Sentiment for text {i}")
    try:
        prompt_sentiment = prompt_template_sentiment.format(
            review=entry["Context"], topic=entry["Topic"]
        )
        response = api_settings["client"].chat.completions.create(
            model=api_settings["model"],
            messages=[
                {"role": "system", "content": "You are an expert for sentiment analysis."},
                {"role": "user", "content": prompt_sentiment},
            ],
            max_tokens=1024,
        )
        sentiment = response.choices[0].message.content.strip()

        # Rename keys: add the lower-case keys first (this fixes their order in
        # the saved JSON), then drop the original capitalised ones.
        entry["topic"] = entry["Topic"]
        entry["sentiment"] = sentiment
        entry["category"] = entry["Category"]
        entry["sentence"] = entry["Context"]
        for old_key in ("Context", "Category", "Topic"):
            entry.pop(old_key)

        all_entries.append(entry)
    except Exception as e:
        # .get() is used so that a missing "Topic" / "response_ID" key cannot make
        # the handler itself raise and hide the original error.
        logger.error(f"Error analyzing sentiment for topic '{entry.get('Topic')}' (Entry ID {entry.get('response_ID')}): {e}")
        raise



In [None]:
# Persist the sentiment-annotated entries as indented JSON.
with open(path_db_analysed, "w") as analysed_file:
    analysed_file.write(json.dumps(all_entries, indent=4))

# Embedding

In [None]:
from helper.embedding import *

# Key of each entry whose text gets embedded.
embed_key = "topic"  # "topic" or "sentence"

# Load the analysed entries and set up the embedding model named in the config cell.
data = read_json(path_db_analysed)
embed_model = initialize_embedding_model(embed_model_name)

def process_embedding(data, embed_key):
    """Attach an embedding of ``entry[embed_key]`` to every entry of ``data``.

    The entries are modified in place (an ``"embedding"`` key is set) and the
    same list object is returned. Uses the module-level ``embed_text``,
    ``embed_model`` and ``logger``; progress is logged for every tenth entry.
    """
    for position, record in enumerate(data):
        if position % 10 == 0:
            logger.info(f"Processing entry {position}")
        record["embedding"] = embed_text(record[embed_key], embed_model)
    return data

# Embed every entry (in place) and keep a handle on the result.
data_embedded = process_embedding(data, embed_key)

# Write the embedded entries as indented JSON.
with open(path_db_embedded, "w") as embedded_file:
    embedded_file.write(json.dumps(data_embedded, indent=4))


# Clustering

In [None]:
from helper.cluster_analysis import *

# Adjustable parameters
# Names of the dimensionality-reduction methods handed to apply_hdbscan.
dimensionality_methods = ['UMAP','PCA', 'tSNE']
# Settings passed to apply_hdbscan as `hdbscan_params`.
hdbscan_params = {"min_cluster_size": 7, "min_samples": 2, "cluster_selection_epsilon": 0.4}

# Load data
df_total = load_embedded_data(path_db_embedded)
# Collect the per-row embeddings into a single NumPy array.
mat = np.array(df_total['embedding'].tolist())

# Apply HDBSCAN
# NOTE(review): presumably include_2d / include_3d make the helper add 2- and 3-component
# projections for each method in dimensionality_methods — confirm in helper.cluster_analysis.
df_total = apply_hdbscan(
    df_total,
    mat,
    dimensionality_methods,
    hdbscan_params=hdbscan_params,
    include_2d=True,
    include_3d=True
)

# Save results
save_df_as_json(df_total, path_db_clustered)

# Cluster Naming

In [None]:
from helper.cluster_naming import *

# Parameters
dimensionality_methods = ["UMAP",'PCA', "tSNE"]
clustering_algorithms = ["hdbscan"]  # No KMeans here
# NOTE(review): presumably the maximum number of topics per cluster used for naming — confirm in helper.cluster_naming.
max_centers = 10

#kmeans_clusters = [15, 20, 25, 50]  # Number of clusters for KMeans

# Load data
df_total = load_json_into_df(path_db_clustered)

# Process clusters and generate names
# `api_settings` is not defined in this cell; it must already exist in the kernel namespace.
df_total = process_clusters(
    df_total,
    dimensionality_methods,
    clustering_algorithms,
    max_centers,
    api_settings) # insert kmeans_clusters in the function when needed


# Save results
save_data_for_streamlit(df_total, path_db_final)

# HRC Steam reviews

In [5]:
# General modules
import os
import openai
from dotenv import load_dotenv

load_dotenv()  # load variables such as OPENAI_API_KEY from a local .env file
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

chat_model_name = 'gpt-4o-mini'
embed_model_name = "all-MiniLM-L6-v2"

# Paths
# This section works on the Steam review data, so all stage files live in
# <root_dir>/Steam. (The former duplicate "Paths" block that first set
# steam_title = 'Community' was dead code: it was overwritten right away.)
root_dir = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC'
steam_title = 'Steam'

# One JSON file per pipeline stage.
path_db_prepared = os.path.join(root_dir, steam_title, "db_prepared.json")
path_db_translated = os.path.join(root_dir, steam_title, "db_translated.json")
path_db_analysed = os.path.join(root_dir, steam_title, "db_analysed.json")
path_db_embedded = os.path.join(root_dir, steam_title, "db_embedded.json")
path_db_clustered = os.path.join(root_dir, steam_title, "db_clustered.json")
path_db_final = os.path.join(root_dir, steam_title, "db_final.json")

In [8]:
from helper.utils import *

# Hand the OpenAI client and the chat model name to the helper module.
# NOTE(review): presumably this populates the `api_settings` mapping read by later cells — confirm in helper.utils.
configure_api(client, chat_model_name)

## Redshift query

In [4]:
# My imports
from helper.redshift_conector_standalone import *

# Store page of the queried title:
# https://store.steampowered.com/app/1166860/Rival_Stars_Horse_Racing_Desktop_Edition/

# SQL query for Redshift: all columns of every stored review for this app.
sql_query = """
SELECT *
FROM steam_review
where app_id_name = '1166860_Rival_Stars_Horse_Racing_Desktop_Edition'
"""
logger.info(f"Query Redshift with: {sql_query}")

try:
    # The helper returns the result twice: as a JSON string and as a DataFrame.
    results_json, results_df = fetch_query_results(sql_query)
    # Log the shape of the returned DataFrame
    logger.info("Successfully fetched query results, with shape: %s", results_df.shape)
except Exception as e:
    # Log for context, then re-raise so the cell stops on failure.
    logger.error(f"Error fetching query results: {e}")
    raise

# Parse the JSON string returned by the query helper ...
parsed_json = json.loads(results_json)

# ... and store it as the "prepared" stage file.
save_to_json(parsed_json, path_db_prepared)

2025-01-22 09:50:19,868 - INFO - Query Redshift with: 
SELECT *
FROM steam_review
where app_id_name = '1166860_Rival_Stars_Horse_Racing_Desktop_Edition'

2025-01-22 09:50:32,716 - INFO - Successfully fetched query results, with shape: (3235, 14)
2025-01-22 09:50:32,886 - INFO - Data successfully saved to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Steam\db_prepared.json


# Translation

In [None]:
from helper.data_analysis import translate_reviews

id_column = "recommendationid"              # The column that contains unique IDs
text_col = "review_text"                    # The column that contains the text to be translated
language_col = "language"                   # The column that contains the language tag

# Translate the review texts.
# NOTE(review): `results_df` comes from the Redshift query cell, so that cell must
# have run in the current kernel session.
# NOTE(review): `file_path` is presumably used by the helper for intermediate results —
# confirm in helper.data_analysis.
data_translated = translate_reviews(df=results_df,
                                    file_path=path_db_translated,
                                    id_column=id_column,
                                    text_column=text_col,
                                    language_column=language_col)

# Save the translated data
save_df_as_json(data_translated, path_db_translated)

# Data Analysis

In [None]:
import os
from helper.utils import *
from helper.prompt_templates import *
from helper.data_analysis import normalize_topics_key, process_entry

# Configure API
configure_api(client, chat_model_name)

data_prepared = read_json(path_db_translated)

id_column = "recommendationid"              # The column that contains unique IDs
columns_of_interest = ["review_text"]       # The column(s) that are going to be analyzed
all_entries = []                            # List to store all processed entries
processed_ids = set()                       # Set to store IDs of processed entries

# Resume support: if the analysed file already exists, load it and remember
# which IDs are done so they are skipped below.
if os.path.exists(path_db_analysed):
    all_entries = read_json(path_db_analysed)
    processed_ids = {entry[id_column] for entry in all_entries}  # set for O(1) membership checks

# Process all unprocessed entries
try:
    for i, entry in enumerate(data_prepared):
        current_id = entry[id_column]

        # If we've already processed this entry, skip it
        if current_id in processed_ids:
            logger.info(f"Skipping entry {i} (ID: {current_id}) - already processed.")
            continue

        # Otherwise, process and append. The entry is appended only after
        # process_entry returned, so a failing entry is never stored.
        process_entry(
            entry,
            id_column,
            prompt_template_topic,
            prompt_template_sentiment,
            api_settings,
            columns_of_interest
        )
        all_entries.append(entry)
        processed_ids.add(current_id)  # mark as processed

        # Save intermediate progress every 10 entries
        if (i % 10) == 0 and i != 0:
            save_to_json(all_entries, path_db_analysed)
            logger.info(f"Progress saved at index {i}.")
finally:
    # Runs after a complete pass as well as after an error or interrupt, so the
    # entries processed since the last checkpoint are not lost and the next run
    # can resume from them.
    save_to_json(all_entries, path_db_analysed)

# Only reached when the loop finished without an exception.
logger.info("All entries processed and final results saved.")


# Embedding

In [None]:
# Load the analysed Steam reviews for inspection.
data = read_json(path_db_analysed)

In [None]:
# Show the topics stored for the first review.
data[0]['topics']

In [None]:
from helper.embedding import *

# Key of each topic dict whose text gets embedded.
embed_key = "topic"  # "topic" or "sentence"

# Load the analysed reviews and set up the embedding model named in the config cell.
data = read_json(path_db_analysed)
embed_model = initialize_embedding_model(embed_model_name)

def process_embedding(data, embed_key):
    """Attach an embedding to every topic dict nested under ``entry["topics"]``.

    For each entry of ``data`` the items of its ``"topics"`` list are visited;
    items that are dicts receive an ``"embedding"`` key computed from
    ``item[embed_key]``, all other items are left untouched. The data is
    modified in place and the same list object is returned. Uses the
    module-level ``embed_text``, ``embed_model`` and ``logger``; progress is
    logged for every tenth entry.
    """
    for position, record in enumerate(data):
        if position % 10 == 0:
            logger.info(f"Processing entry {position}")

        topic_dicts = (item for item in record["topics"] if isinstance(item, dict))
        for topic_dict in topic_dicts:
            topic_dict["embedding"] = embed_text(topic_dict[embed_key], embed_model)
    return data

# Embed the topics of every review; `data` itself is modified and returned.
data_embedded = process_embedding(data, embed_key)

# Flatten
def flatten_data(data):
    """Expand each entry's ``"topics"`` list into one flat record per topic.

    Every returned record is a shallow copy of the parent entry (without its
    ``"topics"`` key) merged with the keys of one topic dict; on key collisions
    the topic's value wins. Entries without topics produce no records. The
    input entries are not modified.

    Topics that are not dicts are skipped, matching ``process_embedding``, which
    ignores them as well; passing such a value to ``dict.update`` would
    typically raise. A ``"topics"`` value of ``None`` is treated like an empty
    list.

    Args:
        data: iterable of dict entries, each optionally holding a ``"topics"`` list.

    Returns:
        A new list of flat dict records.
    """
    flattened = []
    for entry in data:
        base_copy = dict(entry)
        # `or []` also covers an explicit None stored under "topics".
        topics = base_copy.pop("topics", None) or []

        for topic in topics:
            if not isinstance(topic, dict):
                continue  # nothing that could be merged into a record
            flattened.append({**base_copy, **topic})
    return flattened

# One flat record per (review, topic) pair.
data_flattened = flatten_data(data_embedded)


# Write the flattened, embedded records as indented JSON.
with open(path_db_embedded, "w") as embedded_file:
    embedded_file.write(json.dumps(data_flattened, indent=4))


# Clustering

In [3]:
from helper.cluster_analysis import *
from helper.utils import *

# Adjustable parameters
# Names of the dimensionality-reduction methods handed to apply_hdbscan.
dimensionality_methods = ['UMAP','PCA', 'tSNE']
# Settings passed to apply_hdbscan as `hdbscan_params`.
hdbscan_params = {"min_cluster_size": 30, "min_samples": 15, "cluster_selection_epsilon": 0.4}

# Load data
df_total = load_embedded_data(path_db_embedded)
# Collect the per-row embeddings into a single NumPy array.
mat = np.array(df_total['embedding'].tolist())

# Apply HDBSCAN
# NOTE(review): presumably include_2d / include_3d make the helper add 2- and 3-component
# projections for each method in dimensionality_methods — confirm in helper.cluster_analysis.
df_total = apply_hdbscan(
    df_total,
    mat,
    dimensionality_methods,
    hdbscan_params=hdbscan_params,
    include_2d=True,
    include_3d=True
)

# Save results
save_df_as_json(df_total, path_db_clustered)

  from .autonotebook import tqdm as notebook_tqdm
2025-01-22 13:45:19,273 - INFO - Loading data from S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Steam\db_embedded.json
2025-01-22 13:45:20,671 - INFO - Loaded 5797 valid entries with embeddings.
2025-01-22 13:45:20,734 - INFO - Applying HDBSCAN with: {'min_cluster_size': 30, 'min_samples': 15, 'cluster_selection_epsilon': 0.4}
2025-01-22 13:45:27,849 - INFO - Found 35 clusters.
2025-01-22 13:45:27,849 - INFO - Applying UMAP with 2 components.
2025-01-22 13:45:42,677 - INFO - Applying UMAP with 3 components.
2025-01-22 13:45:43,879 - INFO - Applying PCA with 2 components.
2025-01-22 13:45:43,943 - INFO - Applying PCA with 3 components.
2025-01-22 13:45:44,002 - INFO - Applying tSNE with 2 components.
2025-01-22 13:45:44,002 - INFO - Perplexity not provided, setting to 30 based on sample size.
2025-01-22 13:45:53,389 - INFO - Applying tSNE with 3 components.
2025-01-22 13:45:53,391 - INF

# Cluster Naming

In [9]:
from helper.cluster_naming import *

# Parameters
dimensionality_methods = ["UMAP",'PCA', "tSNE"]
clustering_algorithms = ["hdbscan"]  # No KMeans here
# NOTE(review): presumably the maximum number of topics per cluster used for naming — confirm in helper.cluster_naming.
max_centers = 10
# Client and model bundled in the mapping that process_clusters receives.
api_settings = {"client": client, "model": chat_model_name}

#kmeans_clusters = [15, 20, 25, 50]  # Number of clusters for KMeans

# Load data
df_total = load_json_into_df(path_db_clustered)

# Process clusters and generate names
df_total = process_clusters(
    df_total,
    dimensionality_methods,
    clustering_algorithms,
    max_centers,
    api_settings) # insert kmeans_clusters in the function when needed


# Save results
save_data_for_streamlit(df_total, path_db_final)

2025-01-22 13:58:11,455 - INFO - Loading data from S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Steam\db_clustered.json
2025-01-22 13:58:12,909 - INFO - Found 10 Topics for hdbscan_cluster_id ID: 28
2025-01-22 13:58:14,039 - INFO - Generated cluster name: Game Enthusiasm and Enjoyment
2025-01-22 13:58:14,040 - INFO - Tokens used so far: Prompt Tokens: 144, Completion Tokens: 7
2025-01-22 13:58:14,044 - INFO - Found 10 Topics for hdbscan_cluster_id ID: 27
2025-01-22 13:58:14,536 - INFO - Generated cluster name: Equestrian Game Enthusiasm
2025-01-22 13:58:14,537 - INFO - Tokens used so far: Prompt Tokens: 315, Completion Tokens: 13
2025-01-22 13:58:14,540 - INFO - Found 10 Topics for hdbscan_cluster_id ID: 5
2025-01-22 13:58:15,264 - INFO - Generated cluster name: Customization in Equestrian Gameplay
2025-01-22 13:58:15,266 - INFO - Tokens used so far: Prompt Tokens: 508, Completion Tokens: 20
2025-01-22 13:58:15,271 - INFO - Found 10 T

# Combining the data

In [2]:
# General modules
import os
import openai
from dotenv import load_dotenv

load_dotenv()  # load variables such as OPENAI_API_KEY from a local .env file
openai_api_key = os.environ.get("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

chat_model_name = "gpt-4o-mini"
embed_model_name = "all-MiniLM-L6-v2"

# Paths
root_dir = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC'
influencer = "Community"
steam_reviews = "Steam"

# Analysed stage file of each data source (these two names hold file paths here).
influencer_data, steam_data = (
    os.path.join(root_dir, source_dir, "db_analysed.json")
    for source_dir in (influencer, steam_reviews)
)

# Stage files of the combined data set, stored directly in root_dir.
path_db_analysed, path_db_embedded, path_db_clustered, path_db_final = (
    os.path.join(root_dir, file_name)
    for file_name in ("db_analysed.json", "db_embedded.json", "db_clustered.json", "db_final.json")
)


In [2]:
from helper.utils import *

# Load the analysed (not yet embedded) entries of both data sources.
# The file paths are rebuilt here rather than read from `influencer_data` /
# `steam_data`: those names are rebound to the loaded lists below, so using them
# as paths made this cell fail as soon as it was run a second time.
influencer_data = read_json(os.path.join(root_dir, influencer, "db_analysed.json"))
steam_data = read_json(os.path.join(root_dir, steam_reviews, "db_analysed.json"))

In [6]:
# flatten steam data
def flatten_data(data):
    """Expand each entry's ``"topics"`` list into one flat record per topic.

    Every returned record is a shallow copy of the parent entry (without its
    ``"topics"`` key) merged with the keys of one topic dict; on key collisions
    the topic's value wins. Entries without topics produce no records. The
    input entries are not modified.

    Topics that are not dicts are skipped, because passing such a value to
    ``dict.update`` would typically raise. A ``"topics"`` value of ``None`` is
    treated like an empty list.

    Args:
        data: iterable of dict entries, each optionally holding a ``"topics"`` list.

    Returns:
        A new list of flat dict records.
    """
    flattened = []
    for entry in data:
        base_copy = dict(entry)
        # `or []` also covers an explicit None stored under "topics".
        topics = base_copy.pop("topics", None) or []

        for topic in topics:
            if not isinstance(topic, dict):
                continue  # nothing that could be merged into a record
            flattened.append({**base_copy, **topic})
    return flattened

# Flatten only while the nested "topics" lists are still present. Flattening
# already-flat records yields an empty list, so re-running this cell used to
# wipe `steam_data` and overwrite the file below with [].
if any("topics" in entry for entry in steam_data):
    steam_data = flatten_data(steam_data)

# NOTE(review): this stores the Steam-only records under the combined-data path;
# the cell that combines both sources writes to the same file again later.
with open(path_db_analysed, "w") as output_file:
    json.dump(steam_data, output_file, indent=4)

In [7]:
# Show one flattened Steam record to check its keys.
steam_data[1]

{'app_id_name': '1166860_Rival_Stars_Horse_Racing_Desktop_Edition',
 'recommendationid': 178743676,
 'playtime_at_review_minutes': 1016,
 'last_played': 1731228389,
 'review_text': "definitely a favourite go-to game of mine, I can't really fault anything in it. I DO wish you could care for your horses (e.g, feed them, water them, wash them, etc) but nonetheless it's a great game",
 'timestamp_updated': 1731227648,
 'voted_up': True,
 'votes_up': 9,
 'votes_funny': 0,
 'weighted_vote_score': 0.65208226442337,
 'steam_purchase': True,
 'received_for_free': False,
 'written_during_early_access': False,
 'language': 'english',
 'topic': 'Horse Care Mechanics',
 'sentiment': 'Negative',
 'category': 'request',
 'sentence': 'I DO wish you could care for your horses (e.g., feed them, water them, wash them, etc).'}

In [8]:
# Show one influencer record to compare its keys with the Steam record.
influencer_data[1]

{'response_ID': 2,
 'topic': 'Rival Stars Update',
 'sentiment': 'Positive',
 'category': 'fact',
 'sentence': 'There has been an update on Rival stars as you saw by the thumbnail and the title um they have added show jumping.'}

# Add tags to all JSON entries in the influencer data and the Steam data

In [9]:
# Mark every influencer record with its origin (records are changed in place).
for entry in influencer_data:
    entry.update(data_source="influencer")

In [10]:
# Mark every Steam record with its origin (records are changed in place).
for entry in steam_data:
    entry.update(data_source="steam")

# Combine the data

In [11]:
# Influencer records first, Steam records after them, in one new list.
combined_data = [*influencer_data, *steam_data]

# Store the combined records as the analysed stage file of the combined data set.
save_to_json(combined_data, path_db_analysed)

2025-01-23 08:22:48,114 - INFO - Data successfully saved to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\db_analysed.json


In [12]:
# Record counts per source and in total, as a sanity check of the combination.
for label, records in (
    ("Influencer", influencer_data),
    ("Steam", steam_data),
    ("Combined", combined_data),
):
    print(f"{label} data: {len(records)}")

Influencer data: 753
Steam data: 5797
Combined data: 6550


# Embed the combined data

In [15]:
# Reload the combined records from disk for inspection.
data = read_json(path_db_analysed)

In [17]:
# Show one record from the combined list (index 1000) to check its keys.
data[1000]

{'app_id_name': '1166860_Rival_Stars_Horse_Racing_Desktop_Edition',
 'recommendationid': 185300807,
 'playtime_at_review_minutes': 157,
 'last_played': 1736891522,
 'review_text': 'I love this game, one of the best I have ever played!!!',
 'timestamp_updated': 1736551776,
 'voted_up': True,
 'votes_up': 0,
 'votes_funny': 0,
 'weighted_vote_score': 0.0,
 'steam_purchase': True,
 'received_for_free': False,
 'written_during_early_access': False,
 'language': 'english',
 'topic': 'Overall Satisfaction',
 'sentiment': 'Positive',
 'category': 'fact',
 'sentence': 'I love this game, one of the best I have ever played!!!',
 'data_source': 'steam'}

In [21]:
from helper.embedding import *

# Key of each record whose text gets embedded.
embed_key = "topic"  # "topic" or "sentence"

# Load the combined records and set up the embedding model named in the config cell.
data = read_json(path_db_analysed)
embed_model = initialize_embedding_model(embed_model_name)

def process_embedding(data, embed_key):
    """Attach an embedding of ``record[embed_key]`` to every record of ``data``.

    The records are modified in place (an ``"embedding"`` key is set) and the
    same list object is returned. Uses the module-level ``embed_text``,
    ``embed_model`` and ``logger``; progress is logged for every tenth record.
    """
    for position, record in enumerate(data):
        if position % 10 == 0:
            logger.info(f"Processing entry {position}")

        record["embedding"] = embed_text(record[embed_key], embed_model)

    return data

# Embed every combined record; `data` itself is modified and returned.
combined_data = process_embedding(data, embed_key)

# Save the embedded data
save_to_json(combined_data, path_db_embedded)

2025-01-23 08:31:17,566 - INFO - Loading embedding model: all-MiniLM-L6-v2
2025-01-23 08:31:17,598 - INFO - Use pytorch device_name: cpu
2025-01-23 08:31:17,598 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-01-23 08:31:20,211 - INFO - Processing entry 0
2025-01-23 08:31:20,306 - INFO - Processing entry 10
2025-01-23 08:31:20,400 - INFO - Processing entry 20
2025-01-23 08:31:20,495 - INFO - Processing entry 30
2025-01-23 08:31:20,605 - INFO - Processing entry 40
2025-01-23 08:31:20,700 - INFO - Processing entry 50
2025-01-23 08:31:20,796 - INFO - Processing entry 60
2025-01-23 08:31:20,891 - INFO - Processing entry 70
2025-01-23 08:31:20,985 - INFO - Processing entry 80
2025-01-23 08:31:21,079 - INFO - Processing entry 90
2025-01-23 08:31:21,174 - INFO - Processing entry 100
2025-01-23 08:31:21,260 - INFO - Processing entry 110
2025-01-23 08:31:21,355 - INFO - Processing entry 120
2025-01-23 08:31:21,466 - INFO - Processing entry 130
2025-01-23 08:31:21,560 - INFO 

# Clustering

In [23]:
# The clustering does not perform too well: some data points that clearly belong to a cluster (judging by eye and by their topic name) are not assigned to it, and points inside a dense cluster are rather often labelled as noise.
# To improve this, the dimensionality is reduced first and the clustering is run afterwards. The reasoning is that in high dimensions the data may be too sparse for the clustering algorithm to work properly.

from helper.cluster_analysis import *
from helper.utils import *

# Adjustable parameters
dimensionality_methods = ['UMAP', 'PCA', 'tSNE']
hdbscan_params = {"min_cluster_size": 50, "min_samples": 2, "cluster_selection_epsilon": 0.15}
umap_cluster_components = 50  # number of dimensions HDBSCAN clusters in
umap_random_state = 42        # fixed seed so that re-runs produce the same reduced embeddings

# Load data
df_total = load_embedded_data(path_db_embedded)

# Extract embeddings
mat = np.array(df_total['embedding'].tolist())

# Reduce the embeddings to `umap_cluster_components` dimensions with UMAP before clustering.
# UMAP is stochastic; without a fixed random_state the reduced space, and with it the
# cluster assignments, differ from run to run.
reducer = umap.UMAP(n_components=umap_cluster_components, random_state=umap_random_state)
mat_reduced = reducer.fit_transform(mat)

# Apply HDBSCAN
df_total = apply_hdbscan(
    df_total,
    mat_reduced,  # Use reduced embeddings for clustering
    dimensionality_methods,
    hdbscan_params=hdbscan_params,
    include_2d=True,
    include_3d=True
)

# Save results
save_df_as_json(df_total, path_db_clustered)

2025-01-23 16:27:44,952 - INFO - Loading data from S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\db_embedded.json
2025-01-23 16:27:46,961 - INFO - Loaded 6550 valid entries with embeddings.
2025-01-23 16:27:50,450 - INFO - Applying HDBSCAN with: {'min_cluster_size': 50, 'min_samples': 2, 'cluster_selection_epsilon': 0.15}
2025-01-23 16:27:50,832 - INFO - Found 45 clusters.
2025-01-23 16:27:50,832 - INFO - Applying UMAP with 2 components.
2025-01-23 16:27:52,003 - INFO - Applying UMAP with 3 components.
2025-01-23 16:27:53,003 - INFO - Applying PCA with 2 components.
2025-01-23 16:27:53,007 - INFO - Applying PCA with 3 components.
2025-01-23 16:27:53,010 - INFO - Applying tSNE with 2 components.
2025-01-23 16:27:53,010 - INFO - Perplexity not provided, setting to 30 based on sample size.
2025-01-23 16:28:03,542 - INFO - Applying tSNE with 3 components.
2025-01-23 16:28:03,542 - INFO - Perplexity not provided, setting to 30 based on samp

# Cluster Naming

In [24]:
from helper.cluster_naming import *

# Parameters
dimensionality_methods = ["UMAP",'PCA', "tSNE"]
clustering_algorithms = ["hdbscan"]  # No KMeans here
# NOTE(review): presumably the maximum number of topics per cluster used for naming — confirm in helper.cluster_naming.
max_centers = 10
# Client and model bundled in the mapping that process_clusters receives.
api_settings = {"client": client, "model": chat_model_name}

#kmeans_clusters = [15, 20, 25, 50]  # Number of clusters for KMeans

# Load data
df_total = load_json_into_df(path_db_clustered)

# Process clusters and generate names
df_total = process_clusters(
    df_total,
    dimensionality_methods,
    clustering_algorithms,
    max_centers,
    api_settings) # insert kmeans_clusters in the function when needed

# Remove the embedding column before export; the final file is written without it.
df_total = df_total.drop(columns=['embedding'])

# Save results
save_data_for_streamlit(df_total, path_db_final)

2025-01-23 16:35:30,002 - INFO - Loading data from S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\db_clustered.json
2025-01-23 16:35:31,683 - INFO - Found 10 Topics for hdbscan_cluster_id ID: 21
2025-01-23 16:35:32,414 - INFO - Generated cluster name: Horse Aesthetics and Customization
2025-01-23 16:35:32,415 - INFO - Tokens used so far: Prompt Tokens: 64487, Completion Tokens: 2008
2025-01-23 16:35:32,418 - INFO - Found 10 Topics for hdbscan_cluster_id ID: 36
2025-01-23 16:35:32,924 - INFO - Generated cluster name: Ongoing Game Updates and Content
2025-01-23 16:35:32,926 - INFO - Tokens used so far: Prompt Tokens: 64718, Completion Tokens: 2015
2025-01-23 16:35:32,930 - INFO - Found 10 Topics for hdbscan_cluster_id ID: 24
2025-01-23 16:35:34,037 - INFO - Generated cluster name: Show Jumping Gameplay Feedback
2025-01-23 16:35:34,037 - INFO - Tokens used so far: Prompt Tokens: 65009, Completion Tokens: 2021
2025-01-23 16:35:34,053 - INFO