In [1]:
# General modules
import os
import openai
from dotenv import load_dotenv

from helper.data_preparation import load_json

# Language models
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

chat_model_name = 'gpt-4o-mini'
embed_model_name = "all-MiniLM-L6-v2"

# Paths
root_dir = r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps'
steam_title = 'Market'

path_db_analysed = os.path.join(root_dir, steam_title, "db_analysed.json")
path_db_embedded = os.path.join(root_dir, steam_title, "db_embedded.json")

path_db_clustered = os.path.join(root_dir, steam_title, "db_clustered.json")
path_db_named = os.path.join(root_dir, steam_title, "db_named.json")


# my imports
from helper.utils import *

configure_api(client, chat_model_name)

# Random sample

In [2]:
sample_for_embedding = os.path.join(root_dir, steam_title, "sample_for_embedding.json")

# data = read_json(path_db_analysed)
# sample_data = get_random_sample(data, 50, seed=42)
# save_to_json(sample_data, sample_for_embedding)


In [None]:
import openai
import logging
from helper.utils import *

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   embedding = client.embeddings.create(input = [text], model=model).data[0].embedding
   return embedding


def flatten_and_embed(json_data, embed_model_name="text-embedding-3-large", embed_key="sentence"):
    """
    Flattens the topics in the JSON data and embeds the sentences.
    Args:
        json_data (list): List of JSON entries with nested topics.
        embed_model_name (str): The OpenAI embedding model name.
    Returns:
        list: A flattened list of JSON entries with embeddings.
    """
    flattened_data = []
    counter = 0
    
    for entry in json_data:
        # Extract common fields
        common_fields = {key: value for key, value in entry.items() if key != "topics"}
        
        if "topics" in entry and isinstance(entry["topics"], list):
            for topic in entry["topics"]:
                # Combine common fields with topic-specific fields
                flattened_entry = {**common_fields, **topic}
                #print(flattened_entry)
                # Generate embedding for the sentence
                if embed_key in topic:
                    flattened_entry["embedding"] = get_embedding(topic[embed_key], model=embed_model_name)
                    counter += 1
                    if counter % 10 == 0:
                        logger.info(f"Processed {counter} entries")
                else:
                    flattened_entry["embedding"] = None
                    logger.info(f"No sentence found in entry: {entry.get('recommendationid', 'Unknown')}")
                # Append the flattened entry to the list
                flattened_data.append(flattened_entry)
        else:
            logger.warning(f"No topics found in entry: {entry.get('recommendationid', 'Unknown')}")
    
    return flattened_data


data = read_json(sample_for_embedding)

# Process the data
logger.info("Flattening and embedding data...")
processed_data = flatten_and_embed(data)

# Save the processed data

output_file = os.path.join(root_dir, steam_title, "openai_embedding.json")


save_df_as_json(processed_data, output_file)

logger.info("Data flattening and embedding completed successfully.")


In [2]:
# Build matrix
from sklearn.model_selection import ParameterGrid
from helper.cluster_analysis import *

sample_for_embedding = os.path.join(root_dir, steam_title, "sample_for_embedding.json")
open_ai_embedding = os.path.join(root_dir, steam_title, "openai_embedding.json")

param_grid = {
    'min_cluster_size': [3, 5, 7, 10, 15],
    'min_samples': [1, 2, 3, 4, 5, 7, 10],
    'cluster_selection_epsilon': [0.1, 0.3, 0.5, 0.7, 0.9]
}
grid = ParameterGrid(param_grid)

sample_df = read_json(open_ai_embedding)
sampled_mat = np.array([entry['embedding'] for entry in sample_df])
# Reduce dimensions

# Step 3: Reduce Dimensionality with UMAP
# Reduce the original 3075 dimensions to 20 dimensions
reducer = umap.UMAP(n_components=20, random_state=42)
mat_reduced = reducer.fit_transform(sampled_mat)

# Step 5: Apply HDBSCAN to Each Parameter Combination and Evaluate Results
results = []

for params in grid:
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=params['min_cluster_size'],
        min_samples=params['min_samples'],
        cluster_selection_epsilon=params['cluster_selection_epsilon']
    )
    labels = clusterer.fit_predict(mat_reduced)
    #print(labels)

    # Number of clusters (excluding noise, which is labeled as -1)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    # Proportion of noise points
    noise_ratio = sum(labels == -1) / len(labels)

    # Store the results for analysis
    results.append({
        'params': params,
        'n_clusters': n_clusters,
        'noise_ratio': noise_ratio
    })

# Convert results to DataFrame for easier analysis
results_df = pd.DataFrame(results)
optimal_results = results_df.sort_values(by=['noise_ratio', 'n_clusters'], ascending=[True, False])
best_params = optimal_results.iloc[0]

  warn(


In [3]:
results_df

Unnamed: 0,params,n_clusters,noise_ratio
0,"{'cluster_selection_epsilon': 0.1, 'min_cluste...",35,0.188755
1,"{'cluster_selection_epsilon': 0.1, 'min_cluste...",26,0.180723
2,"{'cluster_selection_epsilon': 0.1, 'min_cluste...",16,0.253012
3,"{'cluster_selection_epsilon': 0.1, 'min_cluste...",16,0.317269
4,"{'cluster_selection_epsilon': 0.1, 'min_cluste...",13,0.289157
...,...,...,...
170,"{'cluster_selection_epsilon': 0.9, 'min_cluste...",2,0.100402
171,"{'cluster_selection_epsilon': 0.9, 'min_cluste...",2,0.088353
172,"{'cluster_selection_epsilon': 0.9, 'min_cluste...",2,0.128514
173,"{'cluster_selection_epsilon': 0.9, 'min_cluste...",2,0.124498


In [6]:
# Adjustable parameters
dimensionality_methods = ['UMAP','PCA', 'tSNE']
hdbscan_params = {"min_cluster_size": 10, "min_samples": 4, "cluster_selection_epsilon": 0.5}

sample_df = load_embedded_data(open_ai_embedding)
# Apply HDBSCAN
df_total = apply_hdbscan(
    sample_df,
    mat_reduced,
    dimensionality_methods,
    hdbscan_params=hdbscan_params,
    include_2d=True,
    include_3d=True
)

# Save results
output_file = os.path.join(root_dir, steam_title, "openai_2_cluster.json")
save_df_as_json(df_total, output_file)
logger.info(f"Results saved to {output_file}")

2024-12-13 16:52:56,515 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Market\openai_embedding.json
2024-12-13 16:52:57,223 - INFO - Loaded 249 valid entries with embeddings.
2024-12-13 16:52:57,223 - INFO - Applying HDBSCAN with params: {'min_cluster_size': 10, 'min_samples': 4, 'cluster_selection_epsilon': 0.5}
2024-12-13 16:52:57,223 - INFO - Clustering is performed in 20 Dimensions.
2024-12-13 16:52:57,223 - INFO - Number of clusters found: 8
2024-12-13 16:52:57,223 - INFO - Applying UMAP with 2 components.
2024-12-13 16:52:57,374 - INFO - Applying UMAP with 3 components.


[ 6  3  3  3 -1  3 -1 -1  6  2 -1 -1  0  0  1  0  3  2 -1  3  5  2  0  0
 -1  5  5  5  6  5 -1  5 -1  1  6 -1  5  5 -1  6  6  5 -1  5  6  3 -1  0
  1  0  3 -1  2 -1  5  0 -1 -1  2  2  0  6  6  6 -1  0  0  0 -1 -1  5 -1
  4  1  1 -1  6  5  6  1  6  3  5  5  5  6 -1  3 -1 -1 -1  1 -1 -1 -1 -1
  6 -1  6 -1 -1 -1  0 -1 -1 -1 -1  5  5  6  1  0  1  6  6  1 -1  3  3  2
  6  4 -1  0  6 -1  5 -1  0 -1  6  1  1  0  3  0  0  5  4  6  6  6  1  1
  0  6  6  1  2  0 -1  0  2 -1 -1 -1  3  0  0  3 -1  0 -1 -1  3  3 -1  1
  0 -1 -1  3  5 -1  5  5  6 -1  5  3  3 -1 -1 -1  2  0  0  5  5  4  6  1
  3  3  3 -1  3 -1  5  6  1  0  1  6 -1  6 -1 -1 -1  3  0  0  2 -1  6  4
  2  1  1  1  1  4  6  6  5 -1  6  6  6 -1 -1  4  4  4  6  4  6  1  1  0
  1  6  0  2  4  1  3  3  3]


2024-12-13 16:52:57,581 - INFO - Applying PCA with 2 components.
2024-12-13 16:52:57,583 - INFO - Applying PCA with 3 components.
2024-12-13 16:52:57,585 - INFO - Applying tSNE with 2 components.
2024-12-13 16:52:57,586 - INFO - Perplexity not provided, setting to 30 based on sample size.
2024-12-13 16:52:58,483 - INFO - Applying tSNE with 3 components.
2024-12-13 16:52:58,483 - INFO - Perplexity not provided, setting to 30 based on sample size.
2024-12-13 16:52:59,536 - INFO - HDBSCAN clustering and dimensionality reduction completed.
2024-12-13 16:52:59,538 - INFO - Saving data to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Market\openai_2_cluster.json
2024-12-13 16:53:00,814 - INFO - Results saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Market\openai_2_cluster.json


Number of unique clusters: 8


In [3]:
from helper.cluster_naming import *
# Parameters
dimensionality_methods = ["UMAP",'PCA', "tSNE"]
clustering_algorithms = ["hdbscan"]  # No KMeans here
max_centers = 10

#kmeans_clusters = [15, 20, 25, 50]  # Number of clusters for KMeans

# Load data
df_total = load_json_into_df(path_db_clustered)

# Process clusters and generate names
df_total = process_clusters(
    df_total, 
    dimensionality_methods, 
    clustering_algorithms, 
    max_centers, 
    api_settings) # insert kmeans_clusters in the function when needed


# Save results
output_file = os.path.join(root_dir, steam_title, "openai_3_named.json")
save_data_for_streamlit(df_total, output_file)

2024-12-16 09:34:06,053 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\Market\db_clustered.json
2024-12-16 09:34:07,324 - INFO - Found 10 Topics for hdbscan_cluster_id ID: 99
2024-12-16 09:34:07,878 - INFO - Generated cluster name: Balanced Challenge in Gaming
2024-12-16 09:34:07,878 - INFO - Tokens used so far: Prompt Tokens: 32069, Completion Tokens: 828
2024-12-16 09:34:07,894 - INFO - Found 10 Topics for hdbscan_cluster_id ID: 159
2024-12-16 09:34:08,508 - INFO - Generated cluster name: Enjoyable Gameplay with Mixed Engagement
2024-12-16 09:34:08,508 - INFO - Tokens used so far: Prompt Tokens: 32251, Completion Tokens: 834
2024-12-16 09:34:08,527 - INFO - Found 10 Topics for hdbscan_cluster_id ID: 120
2024-12-16 09:34:09,254 - INFO - Generated cluster name: Crafting Mechanics and Player Experience
2024-12-16 09:34:09,258 - INFO - Tokens used so far: Prompt Tokens: 32448, Completion Tokens: 840
2024-12-16 09:34:09,270 - INFO - 

KeyboardInterrupt: 