In [1]:
# My modules
from helper.data_preparation import *
from helper.prompt_templates import *
from helper.data_analysis import *
from helper.embedding import *
from helper.cluster_analysis import *
from helper.cluster_naming import *

# General modules
import os
import openai
from dotenv import load_dotenv

# Language models
# Read variables from a .env file (if one exists) into the process environment,
# so the API key does not have to be written into the notebook itself.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

chat_model_name = 'gpt-4o-mini'  # Passed to configure_api (section 2) and process_clusters (section 5)
embed_model_name = "all-MiniLM-L6-v2"  # Passed to initialize_embedding_model (section 3)
# embed_model_name = 'sentence-transformers/all-mpnet-base-v2'


# Paths

# Project root. Set CLUSTER_ANALYSIS_ROOT (in the environment or in .env) to run
# the notebook from another machine or checkout; when it is unset or empty the
# original location is used, so existing setups keep working unchanged.
root_dir = os.getenv("CLUSTER_ANALYSIS_ROOT") or r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis'
path_input = prepare_file_path(root_dir, "Data/2024 Trimester 1.xlsx")  # Raw Excel input (section 1)
path_db_prepared = os.path.join(root_dir, "Data", "db_prepared_HRC.json")  # Written in section 1, read in section 2

path_db_analysed = os.path.join(root_dir, "Data", "db_analysed_empty.json")  # Written in section 2, read in section 3
path_db_progress_backup = os.path.join(root_dir, "Data", "db_progress_backup.json")  # Resume file for section 2

path_db_embedded = os.path.join(root_dir, "Data", "db_embedded.json")  # Written in section 3, read in section 4
path_db_clustered = os.path.join(root_dir, "Data", "db_clustered.json")  # Written in section 4, read in section 5
path_db_final = os.path.join(root_dir, "Data", "db_final.json")  # Written in section 5

## 1. Prepare the Data

In [3]:
sample_size = 25  # Number of entries to draw from the cleaned data
set_seed = 42  # Seed for the sampling step; change it here to draw a different sample

# Load the raw Excel file, clean it, draw a seeded sample and persist the
# result as the input for the analysis step.
input_data = load_excel_to_data(path_input)
cleaned_data = clean_data(input_data)
# Use the configured seed rather than a second hard-coded literal, so that
# editing `set_seed` above actually changes the sample.
sampled_data = sample_data(cleaned_data, sample_size=sample_size, seed=set_seed)
save_data_to_json(sampled_data, path_db_prepared)

2024-11-19 08:36:07,644 - INFO - Loading Excel file: C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data/Into the Dead Our Darkest Days_1511resposnes.xlsx
2024-11-19 08:36:08,451 - INFO - Excel data successfully loaded and converted to dictionary.
2024-11-19 08:36:08,451 - INFO - Cleaning data.
2024-11-19 08:36:08,473 - INFO - Data cleaning completed. Entries removed: 239
2024-11-19 08:36:08,473 - INFO - Setting random seed to: 42
2024-11-19 08:36:08,473 - INFO - Sampling data. Sample size: 25
2024-11-19 08:36:08,473 - INFO - Data sampling completed with unique IDs assigned.
2024-11-19 08:36:08,473 - INFO - Saving data to JSON: C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_prepared.json
2024-11-19 08:36:08,473 - INFO - Data successfully saved to JSON.


## 2. Analyse the Data

In [6]:
# Load db_prepared
db = load_json(path_db_prepared)

batch_size = 10  # How many entries to process before a backup is made.

# Configure OpenAI API client and model
configure_api(api_client=client, model_name=chat_model_name)

# Load existing progress so a previous (interrupted) run can be resumed
processed_data, processed_ids = load_existing_progress(path_db_progress_backup)

# Process entries in batches
try:
    # NOTE(review): the original call was cut off after `path_db_progress_backup,`
    # and did not parse. `batch_size` is defined above and used nowhere else, so it
    # is assumed to be the remaining positional argument — confirm against the
    # signature of analyse_data in the helper module.
    analyse_data(db, processed_data, processed_ids, path_db_progress_backup, batch_size)
except KeyboardInterrupt:
    logger.warning("Interrupted by user. Progress saved.")
except Exception as e:
    # Best effort: log the failure and fall through, so whatever is held in
    # processed_data is still written out below.
    logger.error(f"Unexpected error: {e}")

# Save final output after all processing
save_progress(processed_data, path_db_analysed)
logger.info(f"Processing completed. Final data saved to {path_db_analysed}")

2024-11-19 09:11:34,551 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_prepared.json
2024-11-19 09:11:34,551 - INFO - Loading existing progress from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_progress_backup.json
2024-11-19 09:11:34,551 - INFO - Skipping already processed entry ID 1
2024-11-19 09:11:34,551 - INFO - Skipping already processed entry ID 2
2024-11-19 09:11:34,551 - INFO - Skipping already processed entry ID 3
2024-11-19 09:11:34,551 - INFO - Processing entry ID 4
2024-11-19 09:11:34,551 - INFO - Extracting topics for entry ID 4
2024-11-19 09:11:37,268 - INFO - Analyzing sentiment for topic 'Gameplay' (Entry ID 4)
2024-11-19 09:11:37,635 - INFO - Analyzing sentiment for topic 'Tutorials' (Entry ID 4)
2024-11-19 09:11:38,113 - INFO - Analyzing sentiment for topic 'Game Mechanics' (Entry ID 4)
2024-11-19 09:11:38,626 - INFO - Analyzing sentiment for topic 'Graphics' (Entry ID 4)
2024-11-19 09:11:39,156

## 3. Embed the Data

In [7]:
# --- Embedding settings ---
# Embedding runs batch by batch; per the original author's note the cache is
# cleared after each batch to avoid memory issues, so keep this number low on
# weak hardware.
batch_size = 10
b_override = False  # Set to True if existing embeddings should be overwritten
embed_key = "topic"  # Set to "sentence" to embed sentences instead

# Read the analysed entries produced by the previous section
data = load_json(path_db_analysed)

# Create the embedding model a single time and reuse it for every batch
embed_model = initialize_embedding_model(model_name=embed_model_name)

# Walk over `data` in windows of `batch_size`, writing each processed window
# back into the same positions.
for batch_number, batch_start in enumerate(range(0, len(data), batch_size), start=1):
    batch_end = min(batch_start + batch_size, len(data))
    logger.info(f"Processing batch {batch_number} ({batch_start} to {batch_end})")
    batch = process_batch(
        data[batch_start:batch_end],
        embed_model,
        b_override,
        embed_key=embed_key,
    )
    data[batch_start:batch_end] = batch

# Flatten the entries into table form and persist the result
df_table = convert_to_table(data)
save_to_json(df_table, path_db_embedded)
logger.info("Embedding and conversion to table format completed.")

2024-11-19 09:49:32,329 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_analysed.json
2024-11-19 09:49:32,336 - INFO - Loading embedding model: all-MiniLM-L6-v2
  embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=model_name))
2024-11-19 09:49:35,683 - INFO - PyTorch version 2.4.0+cu124 available.
2024-11-19 09:49:35,695 - INFO - Polars version 1.12.0 available.
2024-11-19 09:49:35,695 - INFO - Duckdb version 1.1.2 available.
2024-11-19 09:49:36,696 - INFO - Use pytorch device_name: cuda
2024-11-19 09:49:36,696 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-11-19 09:49:39,779 - INFO - Processing batch 1 (0 to 4)
  attn_output = torch.nn.functional.scaled_dot_product_attention(
2024-11-19 09:49:44,511 - INFO - Converting JSON data to a table format.
2024-11-19 09:49:44,527 - INFO - Conversion to table format completed.
2024-11-19 09:49:44,527 - INFO - Saving data to C:\Users\fbohm\Desktop\Projects\DataSc

## 4. Cluster the Data

In [8]:
# --- Tunable settings ---
dimensionality_methods = ['UMAP', 'PCA', 'tSNE']  # Dimensionality reduction methods to run
kmeans_clusters = [5, 10, 15]  # Cluster counts to try with KMeans
kmeans_seed = 42  # Fixed seed so KMeans results are reproducible
include_2d = True  # Produce 2D results
include_3d = True  # Produce 3D results
hdbscan_params = dict(  # HDBSCAN params
    min_cluster_size=5,
    min_samples=3,
    cluster_selection_epsilon=0.2,
)

# t-SNE specific parameter
# NOTE(review): `perplexity` is assigned here but not passed to apply_clustering
# below — confirm whether the helper is meant to receive it.
perplexity = 15  # Set to a default or user-defined value

# Load the embedded table and stack the per-row embeddings into one array.
# NOTE(review): `np` is not imported in this notebook's import cell; presumably it
# is re-exported by one of the `from helper... import *` lines — verify.
df_total = load_embedded_data(path_db_embedded)
embedding_rows = df_total['embedding'].tolist()
mat = np.array(embedding_rows)

# Run dimensionality reduction + clustering; results go to path_db_clustered
clustering_options = dict(
    hdbscan_params=hdbscan_params,
    kmeans_seed=kmeans_seed,
    include_2d=include_2d,
    include_3d=include_3d,
)
apply_clustering(
    df_total,
    mat,
    dimensionality_methods,
    kmeans_clusters,
    path_db_clustered,
    **clustering_options,
)

2024-11-19 09:50:00,539 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_embedded.json
2024-11-19 09:50:00,555 - INFO - Loaded 23 valid entries with embeddings.
2024-11-19 09:50:00,555 - INFO - Applying UMAP with 2 components.
  warn(
2024-11-19 09:50:06,339 - INFO - Applying HDBSCAN on UMAP 2D with params: {'min_cluster_size': 5, 'min_samples': 3, 'cluster_selection_epsilon': 0.2}
2024-11-19 09:50:06,339 - INFO - Applying KMeans with 5 clusters on UMAP 2D.
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

2024-11-19 09:50:06,664 - INFO - Applying KMeans with 10 clusters on UMAP 2D.
2024

## 5. Name the Clusters

In [2]:
# Adjustable parameters
dimensionality_methods = ["UMAP", "PCA", "tSNE"]
clustering_algorithms = ["hdbscan", "kmeans"]
# NOTE(review): the clustering cell (section 4) sets kmeans_clusters = [5, 10, 15];
# the extra values 20 and 35 here may refer to results that were never produced by
# that cell — confirm the two lists are meant to differ.
kmeans_clusters = [5, 10, 15, 20, 35]  # Number of clusters for KMeans
max_centers = 8  # Maximum number of topics for naming
# `chat_model_name` is deliberately not redefined here: it is set once in the
# configuration cell at the top of the notebook, so a model change there applies
# to both the analysis step and cluster naming.

# Load data
df_total = load_data_into_df(path_db_clustered)

# Process clusters and generate names
df_total = process_clusters(
    df_total,
    dimensionality_methods,
    clustering_algorithms,
    kmeans_clusters,
    max_centers,
    chat_model_name,
)

# Save results
save_data(df_total, path_db_final)

2024-11-19 10:03:10,119 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_clustered.json
2024-11-19 10:03:10,150 - INFO - Found 4 representative topics for cluster ID 2 in column kmeans_5_UMAP_2D.
2024-11-19 10:03:10,150 - INFO - Generating cluster name using OpenAI API.
2024-11-19 10:03:10,978 - INFO - Generated cluster name: Social Gaming Engagement Strategies
2024-11-19 10:03:10,993 - INFO - Found 4 representative topics for cluster ID 0 in column kmeans_5_UMAP_2D.
2024-11-19 10:03:10,993 - INFO - Generating cluster name using OpenAI API.
2024-11-19 10:03:11,940 - INFO - Generated cluster name: Weapon Aesthetics and Durability Features
2024-11-19 10:03:11,956 - INFO - Found 7 representative topics for cluster ID 1 in column kmeans_5_UMAP_2D.
2024-11-19 10:03:11,956 - INFO - Generating cluster name using OpenAI API.
2024-11-19 10:03:12,459 - INFO - Generated cluster name: Covert Operations and Surprise Tactics
2024-11-19 10:03:12,463 - IN