In [1]:
# General modules
import os
import openai
from dotenv import load_dotenv

# Language models
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

chat_model_name = 'gpt-4o-mini'
embed_model_name = "all-MiniLM-L6-v2"

# Paths
root_dir = r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps'
steam_title = 'Market'

path_db_prepared = os.path.join(root_dir, steam_title, "db_prepared.json")
path_db_translated = os.path.join(root_dir, steam_title, "db_translated.json")
path_db_analysed = os.path.join(root_dir, steam_title, "db_analysed.json")
path_db_embedded = os.path.join(root_dir, steam_title, "db_embedded.json")
path_db_clustered = os.path.join(root_dir, steam_title, "db_clustered.json")
path_db_final = os.path.join(root_dir, steam_title, "db_final.json")

In [2]:
# my imports
from helper.utils import *
from helper.data_analysis import *
from helper.prompt_templates import *
from helper.embedding import *
from helper.cluster_analysis import *
from helper.cluster_naming import *
from helper.steam_scraper import *
from helper.redshift_conector_standalone import *

configure_api(client, chat_model_name)

# Scrape Steam reviews

In [18]:
# https: // store.steampowered.com / app / 455690 / Pixel_Puzzles_Junior_Jigsaw /
# https://store.steampowered.com/app/2093920/Dread_Dawn/
# https://store.steampowered.com/app/1465460/Infection_Free_Zone/
# https://store.steampowered.com/app/1259420/Days_Gone/
appid = '1928980'
n_reviews = 60000     # Number of reviews to scrape

params = {
    'json': 1,
    'filter': 'all',
    'language': 'english',
    'day_range': 9223372036854775807,              # Dont ask, just dont touch this number
    'review_type': 'all',
    'purchase_type': 'all'
}


reviews = get_n_reviews(appid, params, n_reviews)
print(f"Total reviews: {len(reviews)}")

2024-12-11 13:26:19,104 - INFO - Retrieved 100 reviews in API call. Total so far: 100
2024-12-11 13:26:19,957 - INFO - Retrieved 100 reviews in API call. Total so far: 200
2024-12-11 13:26:20,564 - INFO - Retrieved 100 reviews in API call. Total so far: 300
2024-12-11 13:26:21,114 - INFO - Retrieved 100 reviews in API call. Total so far: 400
2024-12-11 13:26:21,652 - INFO - Retrieved 100 reviews in API call. Total so far: 500
2024-12-11 13:26:22,412 - INFO - Retrieved 100 reviews in API call. Total so far: 600
2024-12-11 13:26:22,994 - INFO - Retrieved 100 reviews in API call. Total so far: 700
2024-12-11 13:26:23,589 - INFO - Retrieved 100 reviews in API call. Total so far: 800
2024-12-11 13:26:24,135 - INFO - Retrieved 100 reviews in API call. Total so far: 900
2024-12-11 13:26:25,035 - INFO - Retrieved 100 reviews in API call. Total so far: 1000
2024-12-11 13:26:25,629 - INFO - Retrieved 100 reviews in API call. Total so far: 1100
2024-12-11 13:26:26,213 - INFO - Retrieved 100 revie

Total reviews: 10076


# Redshift connector 

In [4]:
# Define your SQL query
# Get the count of reviews for unique app_id_name combinations
sql_query = """
SELECT app_id_name, COUNT(*) AS review_count
FROM steam_review
GROUP BY app_id_name
ORDER BY review_count DESC
"""

# Fetch results
reviews_json, reviews_df = fetch_query_results(sql_query)

# Print the JSON output
print("JSON Output:")
print(reviews_json)

# Display the DataFrame
print("DataFrame Output:")
print(reviews_df)



JSON Output:
[
    {
        "app_id_name": "1326470_Sons_Of_The_Forest",
        "review_count": 135240
    },
    {
        "app_id_name": "1203620_Enshrouded",
        "review_count": 46237
    },
    {
        "app_id_name": "2080690_Sunkenland",
        "review_count": 13430
    },
    {
        "app_id_name": "1928980_Nightingale",
        "review_count": 11906
    },
    {
        "app_id_name": "1458140_Pacific_Drive",
        "review_count": 11718
    },
    {
        "app_id_name": "2163330_Yet_Another_Zombie_Survivors",
        "review_count": 6353
    },
    {
        "app_id_name": "934700_Dead_Island_2",
        "review_count": 5856
    },
    {
        "app_id_name": "1963370_No_One_Survived",
        "review_count": 4949
    },
    {
        "app_id_name": "1641960_Forever_Skies",
        "review_count": 3594
    },
    {
        "app_id_name": "1766060_HumanitZ",
        "review_count": 2612
    },
    {
        "app_id_name": "2351560_Apocalypse_Party",
        "revie

NameError: name 'redshift_connect_scope' is not defined

# Filter Data 

#### - 

In [19]:
# Specify the key where the review text is stored
review_key = 'review'

# Filter the reviews
filtered_data = filter_reviews(reviews, review_key)

2024-12-11 13:27:33,485 - INFO - Total entries removed: 601


In [20]:
# Generate sample and save
sample_size = 4500
seed = 42
sample_data = get_random_sample(reviews, sample_size, seed=seed)
save_to_json(sample_data, path_db_prepared)

2024-12-11 13:27:43,875 - INFO - Generating a random sample of size 4500 with seed 42.
2024-12-11 13:27:44,104 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Market\db_prepared.json


# Translate reviews

#### This transformation is not pretty but it is used for now to keep using the same helpers as before

In [21]:
# chang key from 'language' to 'player_language'
data = load_json(path_db_prepared)
def rename_key_in_json(obj, old_key, new_key):
    if isinstance(obj, dict):
        return {new_key if k == old_key else k: rename_key_in_json(v, old_key, new_key) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [rename_key_in_json(i, old_key, new_key) for i in obj]
    return obj

# Rename 'language' to 'player_language'
updated_data = rename_key_in_json(data, 'language', 'player_language')
updated_data = rename_key_in_json(updated_data, 'review', 'player_response')


2024-12-11 13:27:49,452 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Market\db_prepared.json


In [17]:
updated_data[0]

{'app_id_name': '1928980_Nightingale',
 'recommendationid': 181673699,
 'playtime_at_review_minutes': 346,
 'last_played': 1733235591,
 'review_text': '画的饼看着是好的，玩起来都是糙的，新手教程巨长，做完直接不能退款了，神坑',
 'timestamp_updated': 1733235831,
 'voted_up': False,
 'votes_up': 0,
 'votes_funny': 0,
 'weighted_vote_score': 0.0,
 'steam_purchase': True,
 'received_for_free': False,
 'written_during_early_access': True}

In [22]:
id_col = 'recommendationid'
columns_of_interest = ['player_response']
translated_data = translate_data(updated_data, id_col, prompt_template_translation, api_settings, columns_of_interest)

In [23]:
save_to_json(translated_data, path_db_translated)

2024-12-11 13:27:59,252 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Market\db_translated.json


# Topic extraction and sentiment analysis

In [6]:
# Parameters
id_column = 'recommendationid'                # Column name for entry IDs
columns_of_interest = ["player_response"]     # Which cols should be analyzed?
batch_size = 10                               # Fail-safe batching. The higher the number, the less often the progress is saved.

prepared_data = read_json(path_db_translated)

# Run analysis
analyse_data(
    translated_data=prepared_data,
    id_column=id_column,
    output_path=path_db_analysed,
    prompt_template_topic=prompt_template_topic_steam,
    prompt_template_sentiment=prompt_template_sentiment,
    api_settings=api_settings,
    columns_of_interest=columns_of_interest,
    batch_size=batch_size
)

2024-12-11 16:21:43,791 - INFO - Loading existing progress from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Market\db_analysed.json
2024-12-11 16:21:43,807 - INFO - Skipping already processed entry ID 181537175
2024-12-11 16:21:43,807 - INFO - Skipping already processed entry ID 159046186
2024-12-11 16:21:43,807 - INFO - Skipping already processed entry ID 171360654
2024-12-11 16:21:43,807 - INFO - Skipping already processed entry ID 159376789
2024-12-11 16:21:43,807 - INFO - Skipping already processed entry ID 159362247
2024-12-11 16:21:43,807 - INFO - Skipping already processed entry ID 166544997
2024-12-11 16:21:43,807 - INFO - Skipping already processed entry ID 168649247
2024-12-11 16:21:43,807 - INFO - Skipping already processed entry ID 181119052
2024-12-11 16:21:43,807 - INFO - Skipping already processed entry ID 160155372
2024-12-11 16:21:43,821 - INFO - Skipping already processed entry ID 158855733
2024-12-11 16:21:43,821 - INFO - Skipping alre

# Embed reviews

In [5]:
batch_size = 50
b_override = False  # Change to True if embeddings should be overwritten
embed_key = "topic"  # topic or "sentence"

# embed_model_name = 'sentence-transformers/all-mpnet-base-v2'
# embed_model_name = 'dunzhang/stella_en_1.5B_v5'

# Load the JSON data 
data = read_json(path_db_analysed)

# Initialize the embedding model once
embed_model = initialize_embedding_model(model_name=embed_model_name)

# Initialize an empty list for all processed results
processed_results = []

# Process data in batches
for batch_start in range(0, len(data), batch_size):
    batch_end = min(batch_start + batch_size, len(data))
    batch = data[batch_start:batch_end]
    logger.info(f"Processing batch {batch_start // batch_size + 1} ({batch_start} to {batch_end})")
    processed_batch = process_batch(batch, embed_model, b_override, embed_key=embed_key)
    processed_results.extend(processed_batch)  # Collect processed batch results

# Convert all processed results to a DataFrame at once
df_table = json_to_table(processed_results)

# Save the final JSON table
save_df_as_json(df_table, path_db_embedded)
logger.info("Embedding and conversion to table format completed.")


2024-12-12 08:53:14,531 - INFO - Loading embedding model: all-MiniLM-L6-v2
  embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=model_name))
2024-12-12 08:53:59,874 - INFO - PyTorch version 2.4.0+cu124 available.
2024-12-12 08:53:59,874 - INFO - Polars version 1.12.0 available.
2024-12-12 08:53:59,874 - INFO - Duckdb version 1.1.2 available.
2024-12-12 08:54:01,443 - INFO - Use pytorch device_name: cuda
2024-12-12 08:54:01,443 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-12-12 08:54:08,479 - INFO - Processing batch 1 (0 to 50)
  attn_output = torch.nn.functional.scaled_dot_product_attention(
2024-12-12 08:54:22,663 - INFO - Processing batch 2 (50 to 100)
2024-12-12 08:54:35,505 - INFO - Processing batch 3 (100 to 150)
2024-12-12 08:54:48,143 - INFO - Processing batch 4 (150 to 200)
2024-12-12 08:55:00,622 - INFO - Processing batch 5 (200 to 250)
2024-12-12 08:55:13,751 - INFO - Processing batch 6 (250 to 300)
2024-12-12 08:55:27,421 - INFO - Proces

In [10]:
import openai
import logging
from helper.utils import *

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   embedding = client.embeddings.create(input = [text], model=model).data[0].embedding
   return embedding


def flatten_and_embed(json_data, embed_model_name="text-embedding-3-large", embed_key="sentence"):
    """
    Flattens the topics in the JSON data and embeds the sentences.
    Args:
        json_data (list): List of JSON entries with nested topics.
        embed_model_name (str): The OpenAI embedding model name.
    Returns:
        list: A flattened list of JSON entries with embeddings.
    """
    flattened_data = []
    counter = 0
    
    for entry in json_data:
        # Extract common fields
        common_fields = {key: value for key, value in entry.items() if key != "topics"}
        
        if "topics" in entry and isinstance(entry["topics"], list):
            for topic in entry["topics"]:
                # Combine common fields with topic-specific fields
                flattened_entry = {**common_fields, **topic}
                #print(flattened_entry)
                # Generate embedding for the sentence
                if embed_key in topic:
                    flattened_entry["embedding"] = get_embedding(topic[embed_key], model=embed_model_name)
                    counter += 1
                    if counter % 10 == 0:
                        logger.info(f"Processed {counter} entries")
                else:
                    flattened_entry["embedding"] = None
                    logger.info(f"No sentence found in entry: {entry.get('recommendationid', 'Unknown')}")
                # Append the flattened entry to the list
                flattened_data.append(flattened_entry)
        else:
            logger.warning(f"No topics found in entry: {entry.get('recommendationid', 'Unknown')}")
    
    return flattened_data


data = read_json(path_db_analysed)

# Process the data
logger.info("Flattening and embedding data...")
processed_data = flatten_and_embed(data)

# Save the processed data
save_df_as_json(processed_data, path_db_embedded)

logger.info("Data flattening and embedding completed successfully.")


2024-12-05 09:18:37,644 - INFO - Flattening and embedding data...
2024-12-05 09:18:44,192 - INFO - Processed 10 entries
2024-12-05 09:18:51,024 - INFO - Processed 20 entries
2024-12-05 09:18:57,504 - INFO - Processed 30 entries
2024-12-05 09:19:03,624 - INFO - Processed 40 entries
2024-12-05 09:19:10,385 - INFO - Processed 50 entries
2024-12-05 09:19:16,609 - INFO - Processed 60 entries
2024-12-05 09:19:22,641 - INFO - Processed 70 entries
2024-12-05 09:19:27,997 - INFO - Processed 80 entries
2024-12-05 09:19:34,091 - INFO - Processed 90 entries
2024-12-05 09:19:39,609 - INFO - Processed 100 entries
2024-12-05 09:19:45,695 - INFO - Processed 110 entries
2024-12-05 09:19:51,966 - INFO - Processed 120 entries
2024-12-05 09:19:57,542 - INFO - Processed 130 entries
2024-12-05 09:20:03,248 - INFO - Processed 140 entries
2024-12-05 09:20:09,043 - INFO - Processed 150 entries
2024-12-05 09:20:14,913 - INFO - Processed 160 entries
2024-12-05 09:20:19,947 - INFO - Processed 170 entries
2024-12-

# Optional Data transofrmation operations
##### - maybe just filter out all reviews that have category = 'bugs' 
##### - Remove embedding key (reduce size)


In [3]:
data = read_json(path_db_analysed)

# Filter out all reviews that have category = 'bugs'
bug_entries = [entry for entry in data if entry.get('category') == 'bug']
filtered_entries = [entry for entry in data if entry.get('category') != 'bug']

In [8]:
# Save the entries with "category": "bug" to a separate file
output_path = os.path.join(root_dir, steam_title, "bugs.json")
save_to_json(bug_entries, output_path)

output_path = os.path.join(root_dir, steam_title, "no_bugs.json")
# Save the filtered JSON back to the original file or a new file
save_to_json(filtered_entries, output_path)

2024-12-04 12:17:25,345 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\bugs.json
2024-12-04 12:17:33,747 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\no_bugs.json


In [4]:
len(bug_entries)

0

# Cluster Analysis


In [16]:
# Adjustable parameters
dimensionality_methods = ['UMAP', 'tSNE']
hdbscan_params = {"min_cluster_size": 50, "min_samples": 30, "cluster_selection_epsilon": 0.2}

# Load data
input_path = os.path.join(root_dir, steam_title, "no_bugs.json")
df_total = load_embedded_data(input_path)
mat = np.array(df_total['embedding'].tolist())

# Apply HDBSCAN
df_total = apply_hdbscan(
    df_total,
    mat,
    dimensionality_methods,
    hdbscan_params=hdbscan_params,
    include_2d=True,
    include_3d=True
)

output_path = path_db_clustered = os.path.join(root_dir, steam_title, "no_bugs.json")
# Save results
save_df_as_json(df_total, output_path)
logger.info(f"Results saved to {output_path}")

# how many unique cluster ids are in the data?
length = len(df_total['hdbscan_UMAP_2D'].unique())
print(f'Number of unique clusters: {length}')

2024-12-04 13:23:21,545 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\no_bugs.json
2024-12-04 13:23:27,481 - INFO - Loaded 13091 valid entries with embeddings.
2024-12-04 13:23:27,886 - INFO - Applying HDBSCAN in the original high-dimensional space with params: {'min_cluster_size': 50, 'min_samples': 30, 'cluster_selection_epsilon': 0.2}
2024-12-04 13:24:51,641 - INFO - Applying UMAP for 2D visualization.
2024-12-04 13:24:51,643 - INFO - Applying UMAP with 2 components.
  warn(
2024-12-04 13:25:11,421 - INFO - Applying UMAP for 3D visualization.
2024-12-04 13:25:11,423 - INFO - Applying UMAP with 3 components.
  warn(
2024-12-04 13:25:30,270 - INFO - Applying tSNE for 2D visualization.
2024-12-04 13:25:30,271 - INFO - Applying tSNE with 2 components.
2024-12-04 13:25:30,271 - INFO - Perplexity not provided, setting to 30 based on sample size.
2024-12-04 13:26:26,232 - INFO - Applying tSNE for 3D visualization.
2024-12-0

AttributeError: 'DataFrame' object has no attribute 'unique'

In [18]:
length = len(df_total['hdbscan_UMAP_2D'].unique())
length

AttributeError: 'DataFrame' object has no attribute 'unique'

# Improve HDBSCAN 
##### - Reduce the sample size
##### - build a sklearn loop to optimize the hdbscan parameters
##### -  benchmark: reduce noise (smaller Unknown cluster) and improve cluster assignment (big topics are not split while preserving small topics)

In [6]:
# Sample paths
sample_db_embedded = os.path.join(root_dir, steam_title, "sample_db_embedded.json")
sample_db_clustered = os.path.join(root_dir, steam_title, "sample_db_clustered.json")


In [7]:
# Get a random sample
n = 400
seed = 42
df_total = load_embedded_data(path_db_embedded)
df_sample = df_total.sample(n, random_state=seed)

# Save the sample
save_df_as_json(df_sample, sample_db_embedded)
len(df_sample)

2024-12-12 09:01:41,745 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Market\db_embedded.json
2024-12-12 09:01:45,307 - INFO - Loaded 7168 valid entries with embeddings.
2024-12-12 09:01:45,319 - INFO - Saving data to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Market\sample_db_embedded.json


400

In [8]:
# Build matrix
from sklearn.model_selection import ParameterGrid

param_grid = {
    'min_cluster_size': [3, 5, 7, 10, 15],
    'min_samples': [1, 2, 3, 4, 5, 7, 10],
    'cluster_selection_epsilon': [0.1, 0.3, 0.5, 0.7, 0.9]
}
grid = ParameterGrid(param_grid)


In [9]:
sample_df = read_json(sample_db_embedded)

In [10]:
sampled_mat = np.array([entry['embedding'] for entry in sample_df])

In [12]:
# Reduce dimensions

# Step 3: Reduce Dimensionality with UMAP
# Reduce the original 3075 dimensions to 20 dimensions
reducer = umap.UMAP(n_components=20, random_state=42)
mat_reduced = reducer.fit_transform(sampled_mat)

  warn(


In [13]:
mat_reduced.shape

(400, 20)

In [14]:
# Step 4: Define Hyperparameter Grid
param_grid = {
    'min_cluster_size': [5, 10, 15, 20],
    'min_samples': [1, 5, 10],
    'cluster_selection_epsilon': [0.1, 0.3, 0.5, 0.7]
}
grid = ParameterGrid(param_grid)

# Step 5: Apply HDBSCAN to Each Parameter Combination and Evaluate Results
results = []

for params in grid:
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=params['min_cluster_size'],
        min_samples=params['min_samples'],
        cluster_selection_epsilon=params['cluster_selection_epsilon']
    )
    labels = clusterer.fit_predict(mat_reduced)
    #print(labels)

    # Number of clusters (excluding noise, which is labeled as -1)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    # Proportion of noise points
    noise_ratio = sum(labels == -1) / len(labels)

    # Store the results for analysis
    results.append({
        'params': params,
        'n_clusters': n_clusters,
        'noise_ratio': noise_ratio
    })

# Convert results to DataFrame for easier analysis
results_df = pd.DataFrame(results)

In [15]:
# Step 6: Analyze the Results
# Sort by noise ratio (ascending) and number of clusters (descending)
optimal_results = results_df.sort_values(by=['noise_ratio', 'n_clusters'], ascending=[True, False])
print(optimal_results)  # Print the top parameter combinations

                                               params  n_clusters  noise_ratio
36  {'cluster_selection_epsilon': 0.7, 'min_cluste...          15       0.0125
24  {'cluster_selection_epsilon': 0.5, 'min_cluste...          22       0.0375
37  {'cluster_selection_epsilon': 0.7, 'min_cluste...          13       0.0525
12  {'cluster_selection_epsilon': 0.3, 'min_cluste...          33       0.0675
0   {'cluster_selection_epsilon': 0.1, 'min_cluste...          36       0.0900
39  {'cluster_selection_epsilon': 0.7, 'min_cluste...          10       0.0975
40  {'cluster_selection_epsilon': 0.7, 'min_cluste...          10       0.1125
25  {'cluster_selection_epsilon': 0.5, 'min_cluste...          21       0.1175
10  {'cluster_selection_epsilon': 0.1, 'min_cluste...           3       0.1175
22  {'cluster_selection_epsilon': 0.3, 'min_cluste...           3       0.1175
34  {'cluster_selection_epsilon': 0.5, 'min_cluste...           3       0.1175
46  {'cluster_selection_epsilon': 0.7, 'min_cluste..

In [17]:
# Step 7: Choose Optimal Parameters and Apply to Full Dataset
# Assuming the optimal parameters are in the first row of optimal_results
best_params = optimal_results.iloc[0]
print(f"Best Parameters: {best_params}")


Best Parameters: params         {'cluster_selection_epsilon': 0.7, 'min_cluste...
n_clusters                                                    15
noise_ratio                                               0.0125
Name: 36, dtype: object


In [None]:
# Adjustable parameters
dimensionality_methods = ['UMAP','PCA', 'tSNE']
hdbscan_params = {"min_cluster_size": 5, "min_samples": 1, "cluster_selection_epsilon": 0.7}

# Load data
df_total = load_embedded_data(sample_db_embedded)
mat = np.array(df_total['embedding'].tolist())

# Apply HDBSCAN
df_total = apply_hdbscan(
    df_total,
    mat_reduced,
    dimensionality_methods,
    hdbscan_params=hdbscan_params,
    include_2d=True,
    include_3d=True
)

# Save results
save_df_as_json(df_total, sample_db_clustered)
logger.info(f"Results saved to {sample_db_clustered}")

# how many unique cluster ids are in the data?
length = len(df_total['hdbscan_UMAP_2D'].unique())
print(f'Number of unique clusters: {length}')

# Clustering 

In [19]:
db_embedded = read_json(path_db_embedded)
mat = np.array([entry['embedding'] for entry in db_embedded])

# Reduce the original 3075 dimensions to 20 dimensions
reducer = umap.UMAP(n_components=20, random_state=42)
mat_reduced = reducer.fit_transform(mat)

  warn(


In [3]:
# Adjustable parameters
dimensionality_methods = ['UMAP', 'PCA', 'tSNE']
hdbscan_params = {"min_cluster_size": 10, "min_samples": 6, "cluster_selection_epsilon": 0.7}

# Load data
df_total = load_embedded_data(path_db_embedded)
mat = np.array(df_total['embedding'].tolist())

# Apply HDBSCAN
df_total = apply_hdbscan(
    df_total,
    mat,
    dimensionality_methods,
    hdbscan_params=hdbscan_params,
    include_2d=True,
    include_3d=True
)

# Save results
save_df_as_json(df_total, path_db_clustered)
logger.info(f"Results saved to {path_db_clustered}")

# Check number of unique clusters
length = len(df_total['hdbscan_cluster_id'].unique())
print(f'Number of unique clusters: {length}')

2024-12-12 15:45:51,779 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Market\db_embedded.json
2024-12-12 15:45:54,178 - INFO - Loaded 7168 valid entries with embeddings.
2024-12-12 15:45:54,299 - INFO - Applying HDBSCAN with params: {'min_cluster_size': 10, 'min_samples': 6, 'cluster_selection_epsilon': 0.7}
2024-12-12 15:45:54,299 - INFO - Clustering is performed in 384 Dimensions.
2024-12-12 15:46:19,177 - INFO - Applying UMAP with 2 components.
  warn(
2024-12-12 15:46:57,430 - INFO - Applying UMAP with 3 components.
  warn(
2024-12-12 15:47:15,912 - INFO - Applying PCA with 2 components.
2024-12-12 15:47:16,061 - INFO - Applying PCA with 3 components.
2024-12-12 15:47:16,187 - INFO - Applying tSNE with 2 components.
2024-12-12 15:47:16,189 - INFO - Perplexity not provided, setting to 30 based on sample size.
2024-12-12 15:47:49,601 - INFO - Applying tSNE with 3 components.
2024-12-12 15:47:49,602 - INFO - Perplexity not provi

KeyError: 'cluster_id'

In [10]:
# Apply KMeans (if needed)
dimensionality_methods = ['UMAP', 'tSNE']
kmeans_clusters = [14, 35, 50]

df_total = load_embedded_data(path_db_clustered)
df_total = apply_kmeans(
    df_total,
    mat,
    dimensionality_methods,
    kmeans_clusters,
    include_2d=True,
    include_3d=True
)

# Save results
save_df_as_json(df_total, path_db_clustered)
logger.info(f"Results saved to {path_db_clustered}")

2024-12-05 16:46:58,076 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\db_clustered.json
2024-12-05 16:47:55,407 - INFO - Loaded 13767 valid entries with embeddings.
2024-12-05 16:47:58,663 - INFO - Applying KMeans with 14 clusters in high-dimensional space.
2024-12-05 16:48:03,127 - INFO - Applying UMAP in 2D.
2024-12-05 16:48:03,129 - INFO - Applying UMAP with 2 components.
  warn(
2024-12-05 16:48:16,022 - INFO - Applying UMAP in 3D.
2024-12-05 16:48:16,022 - INFO - Applying UMAP with 3 components.
  warn(
2024-12-05 16:48:28,508 - INFO - Applying tSNE in 2D.
2024-12-05 16:48:28,508 - INFO - Applying tSNE with 2 components.
2024-12-05 16:48:28,508 - INFO - Perplexity not provided, setting to 30 based on sample size.
2024-12-05 16:49:31,796 - INFO - Applying tSNE in 3D.
2024-12-05 16:49:31,796 - INFO - Applying tSNE with 3 components.
2024-12-05 16:49:31,798 - INFO - Perplexity not provided, setting to 30 based on samp

# Cluster naming

### - OpenAI 

In [5]:
# Parameters
dimensionality_methods = ["UMAP", 'PCA', "tSNE"]
clustering_algorithms = ["hdbscan"]  # Or include "kmeans"
max_centers = 10


# Load data
df_total = load_json_into_df(path_db_clustered)

# Process clusters and generate names
df_total = process_clusters(
    df_total, 
    dimensionality_methods, 
    clustering_algorithms, 
    max_centers, 
    api_settings # Optional kmeans_clusters
)

# Save results
save_data_for_streamlit(df_total, path_db_final)

2024-12-12 15:50:05,568 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Market\db_clustered.json
2024-12-12 15:50:07,192 - INFO - Cluster naming process completed.
2024-12-12 15:50:07,193 - INFO - Saving updated data to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Market\db_final.json
2024-12-12 15:50:07,854 - INFO - Data saved successfully.


In [10]:
# Parameters
dimensionality_methods = ["UMAP",'PCA', "tSNE"]
clustering_algorithms = ["hdbscan"]  # No KMeans here
max_centers = 10

#kmeans_clusters = [15, 20, 25, 50]  # Number of clusters for KMeans

# Load data
df_total = load_json_into_df(path_db_clustered)

# Process clusters and generate names
df_total = process_clusters(
    df_total, 
    dimensionality_methods, 
    clustering_algorithms, 
    max_centers, 
    api_settings) # insert kmeans_clusters in the function when needed


# Save results
save_data_for_streamlit(df_total, path_db_final)

2024-12-12 14:04:39,939 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Market\db_clustered.json
2024-12-12 14:04:41,902 - INFO - Found 10 Topics for hdbscan_UMAP_2D ID: 57


KeyboardInterrupt: 

In [11]:
len_topic = len(df_total['topic'].unique())
print(f'Number of unique topics: {len_topic}')

len_clusters = len(df_total['hdbscan_UMAP_2D'].unique())
print(f'Number of unique clusters: {len_clusters}')

Number of unique topics: 3434
Number of unique clusters: 60


# Cluster naming

### - Word frequency

# Remove embeddings to save space

In [24]:
# remove embedding key again to save space

def remove_embeddings(data):
    """
    Removes the 'embedding' key from each entry in the dataset.
    Args:
        data (list): List of JSON entries.
    Returns:
        list: Dataset with 'embedding' key removed.
    """
    for entry in data:
        if "embedding" in entry:
            del entry["embedding"]
    return data

data = read_json(path_db_final)
data = remove_embeddings(data)
# transform to df
df = pd.DataFrame(data)
save_data_for_streamlit(df, path_db_final)

2024-12-12 09:45:52,508 - INFO - Saving updated data to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Market\db_final.json
2024-12-12 09:45:52,666 - INFO - Data saved successfully.
