In [1]:
# General modules
import os
import openai
from dotenv import load_dotenv

# Language models
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

chat_model_name = 'gpt-4o-mini'
embed_model_name = "all-MiniLM-L6-v2"

# Paths
root_dir = r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps'
steam_title = 'Days Gone'

path_db_prepared = os.path.join(root_dir, steam_title, "db_prepared.json")
path_db_translated = os.path.join(root_dir, steam_title, "db_translated.json")
path_db_analysed = os.path.join(root_dir, steam_title, "db_analysed.json")
path_db_embedded = os.path.join(root_dir, steam_title, "db_embedded.json")
path_db_clustered = os.path.join(root_dir, steam_title, "db_clustered.json")
path_db_final = os.path.join(root_dir, steam_title, "db_final.json")

In [2]:
# my imports
from helper.utils import *
from helper.data_analysis import *
from helper.prompt_templates import *
from helper.embedding import *
from helper.cluster_analysis import *
from helper.cluster_naming import *
from helper.steam_scraper import *

configure_api(client, chat_model_name)

# Scrape Steam reviews

In [3]:
# https: // store.steampowered.com / app / 455690 / Pixel_Puzzles_Junior_Jigsaw /
# https://store.steampowered.com/app/2093920/Dread_Dawn/
# https://store.steampowered.com/app/1465460/Infection_Free_Zone/
# https://store.steampowered.com/app/1259420/Days_Gone/
appid = '1259420'
n_reviews = 60000     # Number of reviews to scrape

params = {
    'json': 1,
    'filter': 'all',
    'language': 'english',
    'day_range': 9223372036854775807,              # Dont ask, just dont touch this number
    'review_type': 'all',
    'purchase_type': 'all'
}


reviews = get_n_reviews(appid, params, n_reviews)
print(f"Total reviews: {len(reviews)}")

2024-12-03 10:20:48,325 - INFO - Retrieved 100 reviews in API call. Total so far: 100
2024-12-03 10:20:49,279 - INFO - Retrieved 100 reviews in API call. Total so far: 200
2024-12-03 10:20:50,128 - INFO - Retrieved 100 reviews in API call. Total so far: 300
2024-12-03 10:20:50,857 - INFO - Retrieved 100 reviews in API call. Total so far: 400
2024-12-03 10:20:51,551 - INFO - Retrieved 100 reviews in API call. Total so far: 500
2024-12-03 10:20:52,360 - INFO - Retrieved 100 reviews in API call. Total so far: 600
2024-12-03 10:20:53,180 - INFO - Retrieved 100 reviews in API call. Total so far: 700
2024-12-03 10:20:54,195 - INFO - Retrieved 100 reviews in API call. Total so far: 800
2024-12-03 10:20:55,012 - INFO - Retrieved 100 reviews in API call. Total so far: 900
2024-12-03 10:20:55,898 - INFO - Retrieved 100 reviews in API call. Total so far: 1000
2024-12-03 10:20:56,764 - INFO - Retrieved 100 reviews in API call. Total so far: 1100
2024-12-03 10:20:57,675 - INFO - Retrieved 100 revie

Total reviews: 40816


# Filter Data 

#### - 

In [4]:
# Specify the key where the review text is stored
review_key = 'review'

# Filter the reviews
filtered_data = filter_reviews(reviews, review_key)

2024-12-03 10:26:50,115 - INFO - Total entries removed: 7250


In [5]:
# Generate sample and save
sample_size = 4500
seed = 42
sample_data = get_random_sample(reviews, sample_size, seed=seed)
save_to_json(sample_data, path_db_prepared)

2024-12-03 10:27:05,238 - INFO - Generating a random sample of size 4500 with seed 42.
2024-12-03 10:27:05,456 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\db_prepared.json


# Translate reviews

#### This transformation is not pretty but it is used for now to keep using the same helpers as before

In [6]:
# chang key from 'language' to 'player_language'
data = load_json(path_db_prepared)
def rename_key_in_json(obj, old_key, new_key):
    if isinstance(obj, dict):
        return {new_key if k == old_key else k: rename_key_in_json(v, old_key, new_key) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [rename_key_in_json(i, old_key, new_key) for i in obj]
    return obj

# Rename 'language' to 'player_language'
updated_data = rename_key_in_json(data, 'language', 'player_language')
updated_data = rename_key_in_json(updated_data, 'review', 'player_response')


2024-12-03 10:27:36,515 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\db_prepared.json


In [7]:
updated_data[0]

{'recommendationid': '125541284',
 'author': {'steamid': '76561199170144950',
  'num_games_owned': 0,
  'num_reviews': 5,
  'playtime_forever': 5012,
  'playtime_last_two_weeks': 0,
  'playtime_at_review': 211,
  'last_played': 1680153828},
 'player_language': 'english',
 'player_response': "Great game despite past criticism! Being an Oregon native I am a little biased, but I love this game so much that I'm revisiting it after playing it once before on console. Everything from the story to the environment is top notch.",
 'timestamp_created': 1668546971,
 'timestamp_updated': 1668546971,
 'voted_up': True,
 'votes_up': 0,
 'votes_funny': 0,
 'weighted_vote_score': 0,
 'comment_count': 0,
 'steam_purchase': False,
 'received_for_free': False,
 'written_during_early_access': False,
 'primarily_steam_deck': False}

In [8]:
id_col = 'recommendationid'
columns_of_interest = ['player_response']
translated_data = translate_data(updated_data, id_col, prompt_template_translation, api_settings, columns_of_interest)

In [9]:
save_to_json(translated_data, path_db_translated)

2024-12-03 10:27:58,846 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\db_translated.json


# Topic extraction and sentiment analysis

In [5]:
# Parameters
id_column = 'recommendationid'                # Column name for entry IDs
columns_of_interest = ["player_response"]     # Which cols should be analyzed?
batch_size = 10                               # Fail-safe batching. The higher the number, the less often the progress is saved.

prepared_data = read_json(path_db_translated)

# Run analysis
analyse_data(
    translated_data=prepared_data,
    id_column=id_column,
    output_path=path_db_analysed,
    prompt_template_topic=prompt_template_topic_steam,
    prompt_template_sentiment=prompt_template_sentiment,
    api_settings=api_settings,
    columns_of_interest=columns_of_interest,
    batch_size=batch_size
)

2024-12-03 10:50:35,514 - INFO - Loading existing progress from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\db_analysed.json
2024-12-03 10:50:35,519 - INFO - Skipping already processed entry ID 125541284
2024-12-03 10:50:35,520 - INFO - Skipping already processed entry ID 123380860
2024-12-03 10:50:35,520 - INFO - Skipping already processed entry ID 92881039
2024-12-03 10:50:35,521 - INFO - Skipping already processed entry ID 123625248
2024-12-03 10:50:35,521 - INFO - Skipping already processed entry ID 156502410
2024-12-03 10:50:35,522 - INFO - Skipping already processed entry ID 105851648
2024-12-03 10:50:35,522 - INFO - Skipping already processed entry ID 130928364
2024-12-03 10:50:35,523 - INFO - Skipping already processed entry ID 121169390
2024-12-03 10:50:35,523 - INFO - Skipping already processed entry ID 140697667
2024-12-03 10:50:35,523 - INFO - Skipping already processed entry ID 101173137
2024-12-03 10:50:35,524 - INFO - Skipping al

# Embed reviews

In [8]:
batch_size = 50
b_override = False  # Change to True if embeddings should be overwritten
embed_key = "sentence"  # topic or "sentence"

# embed_model_name = 'sentence-transformers/all-mpnet-base-v2'
# embed_model_name = 'dunzhang/stella_en_1.5B_v5'

# Load the JSON data
data = read_json(path_db_analysed)

# Initialize the embedding model once
embed_model = initialize_embedding_model(model_name=embed_model_name)

# Initialize an empty list for all processed results
processed_results = []

# Process data in batches
for batch_start in range(0, len(data), batch_size):
    batch_end = min(batch_start + batch_size, len(data))
    batch = data[batch_start:batch_end]
    logger.info(f"Processing batch {batch_start // batch_size + 1} ({batch_start} to {batch_end})")
    processed_batch = process_batch(batch, embed_model, b_override, embed_key=embed_key)
    processed_results.extend(processed_batch)  # Collect processed batch results

# Convert all processed results to a DataFrame at once
df_table = json_to_table(processed_results)

# Save the final JSON table
save_df_as_json(df_table, path_db_embedded)
logger.info("Embedding and conversion to table format completed.")


2024-12-03 15:31:51,122 - INFO - Loading embedding model: all-MiniLM-L6-v2
  embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=model_name))
2024-12-03 15:32:08,591 - INFO - PyTorch version 2.4.0+cu124 available.
2024-12-03 15:32:08,604 - INFO - Polars version 1.12.0 available.
2024-12-03 15:32:08,620 - INFO - Duckdb version 1.1.2 available.
2024-12-03 15:32:10,313 - INFO - Use pytorch device_name: cuda
2024-12-03 15:32:10,314 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-12-03 15:32:13,781 - INFO - Processing batch 1 (0 to 50)
  attn_output = torch.nn.functional.scaled_dot_product_attention(
2024-12-03 15:32:28,568 - INFO - Processing batch 2 (50 to 100)
2024-12-03 15:32:42,834 - INFO - Processing batch 3 (100 to 150)
2024-12-03 15:32:57,176 - INFO - Processing batch 4 (150 to 200)
2024-12-03 15:33:11,277 - INFO - Processing batch 5 (200 to 250)
2024-12-03 15:33:24,227 - INFO - Processing batch 6 (250 to 300)
2024-12-03 15:33:37,529 - INFO - Proces

In [10]:
import openai
import logging
from helper.utils import *

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   embedding = client.embeddings.create(input = [text], model=model).data[0].embedding
   return embedding


def flatten_and_embed(json_data, embed_model_name="text-embedding-3-large", embed_key="sentence"):
    """
    Flattens the topics in the JSON data and embeds the sentences.
    Args:
        json_data (list): List of JSON entries with nested topics.
        embed_model_name (str): The OpenAI embedding model name.
    Returns:
        list: A flattened list of JSON entries with embeddings.
    """
    flattened_data = []
    counter = 0
    
    for entry in json_data:
        # Extract common fields
        common_fields = {key: value for key, value in entry.items() if key != "topics"}
        
        if "topics" in entry and isinstance(entry["topics"], list):
            for topic in entry["topics"]:
                # Combine common fields with topic-specific fields
                flattened_entry = {**common_fields, **topic}
                #print(flattened_entry)
                # Generate embedding for the sentence
                if embed_key in topic:
                    flattened_entry["embedding"] = get_embedding(topic[embed_key], model=embed_model_name)
                    counter += 1
                    if counter % 10 == 0:
                        logger.info(f"Processed {counter} entries")
                else:
                    flattened_entry["embedding"] = None
                    logger.info(f"No sentence found in entry: {entry.get('recommendationid', 'Unknown')}")
                # Append the flattened entry to the list
                flattened_data.append(flattened_entry)
        else:
            logger.warning(f"No topics found in entry: {entry.get('recommendationid', 'Unknown')}")
    
    return flattened_data


data = read_json(path_db_analysed)

# Process the data
logger.info("Flattening and embedding data...")
processed_data = flatten_and_embed(data)

# Save the processed data
save_df_as_json(processed_data, path_db_embedded)

logger.info("Data flattening and embedding completed successfully.")


2024-12-05 09:18:37,644 - INFO - Flattening and embedding data...
2024-12-05 09:18:44,192 - INFO - Processed 10 entries
2024-12-05 09:18:51,024 - INFO - Processed 20 entries
2024-12-05 09:18:57,504 - INFO - Processed 30 entries
2024-12-05 09:19:03,624 - INFO - Processed 40 entries
2024-12-05 09:19:10,385 - INFO - Processed 50 entries
2024-12-05 09:19:16,609 - INFO - Processed 60 entries
2024-12-05 09:19:22,641 - INFO - Processed 70 entries
2024-12-05 09:19:27,997 - INFO - Processed 80 entries
2024-12-05 09:19:34,091 - INFO - Processed 90 entries
2024-12-05 09:19:39,609 - INFO - Processed 100 entries
2024-12-05 09:19:45,695 - INFO - Processed 110 entries
2024-12-05 09:19:51,966 - INFO - Processed 120 entries
2024-12-05 09:19:57,542 - INFO - Processed 130 entries
2024-12-05 09:20:03,248 - INFO - Processed 140 entries
2024-12-05 09:20:09,043 - INFO - Processed 150 entries
2024-12-05 09:20:14,913 - INFO - Processed 160 entries
2024-12-05 09:20:19,947 - INFO - Processed 170 entries
2024-12-

# Optional Data transofrmation operations
##### - maybe just filter out all reviews that have category = 'bugs' 
##### - Remove embedding key (reduce size)


In [3]:
data = read_json(path_db_embedded)

# Filter out all reviews that have category = 'bugs'
bug_entries = [entry for entry in data if entry.get('category') == 'bug']
filtered_entries = [entry for entry in data if entry.get('category') != 'bug']

In [8]:
# Save the entries with "category": "bug" to a separate file
output_path = os.path.join(root_dir, steam_title, "bugs.json")
save_to_json(bug_entries, output_path)

output_path = os.path.join(root_dir, steam_title, "no_bugs.json")
# Save the filtered JSON back to the original file or a new file
save_to_json(filtered_entries, output_path)

2024-12-04 12:17:25,345 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\bugs.json
2024-12-04 12:17:33,747 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\no_bugs.json


In [6]:
len(bug_entries)

676

# Cluster Analysis


In [16]:
# Adjustable parameters
dimensionality_methods = ['UMAP', 'tSNE']
hdbscan_params = {"min_cluster_size": 50, "min_samples": 30, "cluster_selection_epsilon": 0.2}

# Load data
input_path = os.path.join(root_dir, steam_title, "no_bugs.json")
df_total = load_embedded_data(input_path)
mat = np.array(df_total['embedding'].tolist())

# Apply HDBSCAN
df_total = apply_hdbscan(
    df_total,
    mat,
    dimensionality_methods,
    hdbscan_params=hdbscan_params,
    include_2d=True,
    include_3d=True
)

output_path = path_db_clustered = os.path.join(root_dir, steam_title, "no_bugs.json")
# Save results
save_df_as_json(df_total, output_path)
logger.info(f"Results saved to {output_path}")

# how many unique cluster ids are in the data?
length = len(df_total['hdbscan_UMAP_2D'].unique())
print(f'Number of unique clusters: {length}')

2024-12-04 13:23:21,545 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\no_bugs.json
2024-12-04 13:23:27,481 - INFO - Loaded 13091 valid entries with embeddings.
2024-12-04 13:23:27,886 - INFO - Applying HDBSCAN in the original high-dimensional space with params: {'min_cluster_size': 50, 'min_samples': 30, 'cluster_selection_epsilon': 0.2}
2024-12-04 13:24:51,641 - INFO - Applying UMAP for 2D visualization.
2024-12-04 13:24:51,643 - INFO - Applying UMAP with 2 components.
  warn(
2024-12-04 13:25:11,421 - INFO - Applying UMAP for 3D visualization.
2024-12-04 13:25:11,423 - INFO - Applying UMAP with 3 components.
  warn(
2024-12-04 13:25:30,270 - INFO - Applying tSNE for 2D visualization.
2024-12-04 13:25:30,271 - INFO - Applying tSNE with 2 components.
2024-12-04 13:25:30,271 - INFO - Perplexity not provided, setting to 30 based on sample size.
2024-12-04 13:26:26,232 - INFO - Applying tSNE for 3D visualization.
2024-12-0

AttributeError: 'DataFrame' object has no attribute 'unique'

In [18]:
length = len(df_total['hdbscan_UMAP_2D'].unique())
length

AttributeError: 'DataFrame' object has no attribute 'unique'

# Improve HDBSCAN 
##### - Reduce the sample size
##### - build a sklearn loop to optimize the hdbscan parameters
##### -  benchmark: reduce noise (smaller Unknown cluster) and improve cluster assignment (big topics are not split while preserving small topics)

In [9]:
# Sample paths
sample_db_embedded = os.path.join(root_dir, steam_title, "sample_db_embedded.json")
sample_db_clustered = os.path.join(root_dir, steam_title, "sample_db_clustered.json")


In [25]:
# Get a random sample
n = 400
seed = 42
df_total = load_embedded_data(path_db_embedded)
df_sample = df_total.sample(n, random_state=seed)

# Save the sample
save_df_as_json(df_sample, sample_db_embedded)
len(df_sample)

2024-12-06 14:24:16,247 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\db_embedded.json
2024-12-06 14:25:39,837 - INFO - Loaded 13767 valid entries with embeddings.
2024-12-06 14:25:42,473 - INFO - Saving data to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\sample_db_embedded.json


400

In [11]:
# Build matrix
from sklearn.model_selection import ParameterGrid

param_grid = {
    'min_cluster_size': [3, 5, 7, 10, 15],
    'min_samples': [1, 2, 3, 4, 5, 7, 10],
    'cluster_selection_epsilon': [0.1, 0.3, 0.5, 0.7, 0.9]
}
grid = ParameterGrid(param_grid)


In [26]:
sample_df = read_json(sample_db_embedded)

In [27]:
sampled_mat = np.array([entry['embedding'] for entry in sample_df])

In [37]:
# Reduce dimensions

# Step 3: Reduce Dimensionality with UMAP
# Reduce the original 3075 dimensions to 20 dimensions
reducer = umap.UMAP(n_components=20, random_state=42)
mat_reduced = reducer.fit_transform(mat)

  warn(


In [38]:
mat_reduced.shape

(400, 20)

In [40]:
# Step 4: Define Hyperparameter Grid
param_grid = {
    'min_cluster_size': [5, 10, 15, 20],
    'min_samples': [1, 5, 10],
    'cluster_selection_epsilon': [0.1, 0.3, 0.5, 0.7]
}
grid = ParameterGrid(param_grid)

# Step 5: Apply HDBSCAN to Each Parameter Combination and Evaluate Results
results = []

for params in grid:
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=params['min_cluster_size'],
        min_samples=params['min_samples'],
        cluster_selection_epsilon=params['cluster_selection_epsilon']
    )
    labels = clusterer.fit_predict(mat_reduced)
    #print(labels)

    # Number of clusters (excluding noise, which is labeled as -1)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    # Proportion of noise points
    noise_ratio = sum(labels == -1) / len(labels)

    # Store the results for analysis
    results.append({
        'params': params,
        'n_clusters': n_clusters,
        'noise_ratio': noise_ratio
    })

# Convert results to DataFrame for easier analysis
results_df = pd.DataFrame(results)

[14 22 24 20  0 10  3 11 22  0 20  6 -1 22 24 11 11  0 16  2 23 17 16  7
 -1  3  5 16 11 16 24 -1 14 -1 11 -1 19 11  0  3  4 21 19 15  3  4  1 -1
  3 19 16  1 17 20  5 -1  3 19  5 -1 -1 11  4 10 11 -1  9 -1 17  3 20 16
  9  4 -1  9  3 16  3 -1  0  5 20 -1 20 -1 13 13 24  2 16 14 23 -1  5 20
  5  1 -1  3 -1 -1 -1 23  3 16 13 16 14 22 13 23 14 21  5 16 11  4 17  0
 -1  3 24 13 16 17  1  5 -1  5 23  5 17 14  2  5 13 20 18 24 11 24 -1 22
 23 -1  3 17 -1 16 11 16  1 14 -1  3  9 21 10  0  3 23 16 10 19 20 -1 14
  4 20  0 18 23  3 11 -1 15 20 -1 21  3  5 24  3 14 -1 -1 -1  5  1 22  0
  8 16  5  0 16 18 16 20 11 10 23 -1 23 10 -1 16 22  8  7 15 -1 15  3  5
 -1 -1  5 20  4 12 10 22  8  0 22 16 11 10  9 23 -1  2  9 16 -1 16 18 16
 16  9 -1 -1 16 11  3 20 22 11  7  2 -1 11 14 -1 16 -1 14 16 16 10 14 12
 13 10  6 23 18 13  4 19 11  5 24 20  6  3 16 -1 16 13 22 -1 14 16 16 16
 20 17 13  5 14 23 16  0 13  4 20  4  3 16  5 11 13 19 22 12  5  3 -1  0
  4  6 10  2 16 22  4 11 12 11 16  9 11 -1 11 20  8

In [41]:
# Step 6: Analyze the Results
# Sort by noise ratio (ascending) and number of clusters (descending)
optimal_results = results_df.sort_values(by=['noise_ratio', 'n_clusters'], ascending=[True, False])
print(optimal_results)  # Print the top parameter combinations

                                                                           params  \
36    {'cluster_selection_epsilon': 0.7, 'min_cluster_size': 5, 'min_samples': 1}   
39   {'cluster_selection_epsilon': 0.7, 'min_cluster_size': 10, 'min_samples': 1}   
42   {'cluster_selection_epsilon': 0.7, 'min_cluster_size': 15, 'min_samples': 1}   
45   {'cluster_selection_epsilon': 0.7, 'min_cluster_size': 20, 'min_samples': 1}   
24    {'cluster_selection_epsilon': 0.5, 'min_cluster_size': 5, 'min_samples': 1}   
33   {'cluster_selection_epsilon': 0.5, 'min_cluster_size': 20, 'min_samples': 1}   
27   {'cluster_selection_epsilon': 0.5, 'min_cluster_size': 10, 'min_samples': 1}   
37    {'cluster_selection_epsilon': 0.7, 'min_cluster_size': 5, 'min_samples': 5}   
40   {'cluster_selection_epsilon': 0.7, 'min_cluster_size': 10, 'min_samples': 5}   
43   {'cluster_selection_epsilon': 0.7, 'min_cluster_size': 15, 'min_samples': 5}   
46   {'cluster_selection_epsilon': 0.7, 'min_cluster_size': 20, '

In [30]:
# Step 7: Choose Optimal Parameters and Apply to Full Dataset
# Assuming the optimal parameters are in the first row of optimal_results
best_params = optimal_results.iloc[0]['params']
print(f"Best Parameters: {best_params}")


Best Parameters: {'cluster_selection_epsilon': 0.1, 'min_cluster_size': 10, 'min_samples': 1}


In [None]:
# Adjustable parameters
dimensionality_methods = ['UMAP','PCA', 'tSNE']
hdbscan_params = {"min_cluster_size": 5, "min_samples": 4, "cluster_selection_epsilon": 0.5}

# Load data
df_total = load_embedded_data(sample_db_embedded)
mat = np.array(df_total['embedding'].tolist())

# Apply HDBSCAN
df_total = apply_hdbscan(
    df_total,
    mat_reduced,
    dimensionality_methods,
    hdbscan_params=hdbscan_params,
    include_2d=True,
    include_3d=True
)

# Save results
save_df_as_json(df_total, sample_db_clustered)
logger.info(f"Results saved to {sample_db_clustered}")

# how many unique cluster ids are in the data?
length = len(df_total['hdbscan_UMAP_2D'].unique())
print(f'Number of unique clusters: {length}')

# Clustering 

In [3]:
db_embedded = read_json(path_db_embedded)
mat = np.array([entry['embedding'] for entry in db_embedded])

# Reduce the original 3075 dimensions to 20 dimensions
reducer = umap.UMAP(n_components=20, random_state=42)
mat_reduced = reducer.fit_transform(mat)

  warn(


In [6]:
# Adjustable parameters
dimensionality_methods = ['UMAP','PCA', 'tSNE']
hdbscan_params = {"min_cluster_size": 30, "min_samples": 10, "cluster_selection_epsilon": 0.4}

# Load data
df_total = load_embedded_data(path_db_embedded)
mat = np.array(df_total['embedding'].tolist())

# Apply HDBSCAN
df_total = apply_hdbscan(
    df_total,
    mat_reduced,
    dimensionality_methods,
    hdbscan_params=hdbscan_params,
    include_2d=True,
    include_3d=True
)

# Save results
save_df_as_json(df_total, path_db_clustered)
logger.info(f"Results saved to {path_db_clustered}")

# how many unique cluster ids are in the data?
length = len(df_total['hdbscan_UMAP_2D'].unique())
print(f'Number of unique clusters: {length}')

2024-12-06 15:45:29,165 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\db_embedded.json
2024-12-06 15:46:52,831 - INFO - Loaded 13767 valid entries with embeddings.
2024-12-06 15:46:58,108 - INFO - Applying HDBSCAN in the original high-dimensional space with params: {'min_cluster_size': 30, 'min_samples': 10, 'cluster_selection_epsilon': 0.2}
2024-12-06 15:46:59,398 - INFO - Applying UMAP for 2D visualization.
2024-12-06 15:46:59,414 - INFO - Applying UMAP with 2 components.
  warn(
2024-12-06 15:47:08,520 - INFO - Applying UMAP for 3D visualization.
2024-12-06 15:47:08,520 - INFO - Applying UMAP with 3 components.
  warn(
2024-12-06 15:47:17,434 - INFO - Applying PCA for 2D visualization.
2024-12-06 15:47:17,434 - INFO - Applying PCA with 2 components.
2024-12-06 15:47:17,467 - INFO - Applying PCA for 3D visualization.
2024-12-06 15:47:17,467 - INFO - Applying PCA with 3 components.
2024-12-06 15:47:17,467 - INFO - Appl

Number of unique clusters: 83


In [10]:
# Apply KMeans (if needed)
dimensionality_methods = ['UMAP', 'tSNE']
kmeans_clusters = [14, 35, 50]

df_total = load_embedded_data(path_db_clustered)
df_total = apply_kmeans(
    df_total,
    mat,
    dimensionality_methods,
    kmeans_clusters,
    include_2d=True,
    include_3d=True
)

# Save results
save_df_as_json(df_total, path_db_clustered)
logger.info(f"Results saved to {path_db_clustered}")

2024-12-05 16:46:58,076 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\db_clustered.json
2024-12-05 16:47:55,407 - INFO - Loaded 13767 valid entries with embeddings.
2024-12-05 16:47:58,663 - INFO - Applying KMeans with 14 clusters in high-dimensional space.
2024-12-05 16:48:03,127 - INFO - Applying UMAP in 2D.
2024-12-05 16:48:03,129 - INFO - Applying UMAP with 2 components.
  warn(
2024-12-05 16:48:16,022 - INFO - Applying UMAP in 3D.
2024-12-05 16:48:16,022 - INFO - Applying UMAP with 3 components.
  warn(
2024-12-05 16:48:28,508 - INFO - Applying tSNE in 2D.
2024-12-05 16:48:28,508 - INFO - Applying tSNE with 2 components.
2024-12-05 16:48:28,508 - INFO - Perplexity not provided, setting to 30 based on sample size.
2024-12-05 16:49:31,796 - INFO - Applying tSNE in 3D.
2024-12-05 16:49:31,796 - INFO - Applying tSNE with 3 components.
2024-12-05 16:49:31,798 - INFO - Perplexity not provided, setting to 30 based on samp

# Cluster naming

In [10]:
# Parameters
dimensionality_methods = ["UMAP",'PCA', "tSNE"]
clustering_algorithms = ["hdbscan"]  # No KMeans here
max_centers = 15

#kmeans_clusters = [15, 20, 25, 50]  # Number of clusters for KMeans

# Load data
df_total = load_json_into_df(path_db_clustered)

# Process clusters and generate names
df_total = process_clusters(
    df_total, 
    dimensionality_methods, 
    clustering_algorithms, 
    max_centers, 
    api_settings) # insert kmeans_clusters in the function when needed


# Save results
save_data_for_streamlit(df_total, path_db_final)

2024-12-06 16:09:32,887 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\db_clustered.json
2024-12-06 16:10:23,600 - INFO - Found 15 Topics for hdbscan_UMAP_2D ID: 37
2024-12-06 16:10:24,301 - INFO - Generated cluster name: Outstanding Game Experiences
2024-12-06 16:10:24,301 - INFO -  Tokens used so far: Prompt Tokens: 58380, Completion Tokens: 1592
2024-12-06 16:10:24,301 - INFO - HDBSCAN Cluster ID 37 (UMAP 2D): Outstanding Game Experiences
2024-12-06 16:10:24,348 - INFO - Found 15 Topics for hdbscan_UMAP_2D ID: 55
2024-12-06 16:10:24,785 - INFO - Generated cluster name: Cross-Platform Game Passion
2024-12-06 16:10:24,785 - INFO -  Tokens used so far: Prompt Tokens: 58756, Completion Tokens: 1597
2024-12-06 16:10:24,785 - INFO - HDBSCAN Cluster ID 55 (UMAP 2D): Cross-Platform Game Passion
2024-12-06 16:10:24,865 - INFO - Found 15 Topics for hdbscan_UMAP_2D ID: 79
2024-12-06 16:10:25,649 - INFO - Generated cluster name: 

In [4]:
# remove embedding key again to save space

def remove_embeddings(data):
    """
    Removes the 'embedding' key from each entry in the dataset.
    Args:
        data (list): List of JSON entries.
    Returns:
        list: Dataset with 'embedding' key removed.
    """
    for entry in data:
        if "embedding" in entry:
            del entry["embedding"]
    return data

data = read_json(path_db_final)
data = remove_embeddings(data)
# transform to df
df = pd.DataFrame(data)
save_data_for_streamlit(df, path_db_final)

2024-12-09 09:41:12,309 - INFO - Saving updated data to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Days Gone\db_final.json
2024-12-09 09:41:12,736 - INFO - Data saved successfully.
