In [1]:
# General modules
import os
import openai
from dotenv import load_dotenv

# Language models
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

chat_model_name = 'gpt-4o-mini'
embed_model_name = "all-MiniLM-L6-v2"
# embed_model_name = 'sentence-transformers/all-mpnet-base-v2'


# Paths
root_dir = r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis'
path_input = os.path.join(root_dir, "Data", "2024 Trimester 1.xlsx")

path_db_prepared = os.path.join(root_dir, "Data", "db_prepared.json")
path_db_analysed = os.path.join(root_dir, "Data", "db_analysed.json")
path_db_embedded = os.path.join(root_dir, "Data", "db_embedded.json")
path_db_clustered = os.path.join(root_dir, "Data", "db_clustered.json")
path_db_final = os.path.join(root_dir, "Data", "db_final.json")

In [2]:
# my imports
from helper.utils import *
from helper.data_analysis import *
from helper.prompt_templates import *
from helper.embedding import *
from helper.cluster_analysis import *
from helper.cluster_naming import *


columns_of_interest = [
    "24_[OPTIONAL] Is there anything you'd like to share about the game (good, bad, frustrating, improvement, wishlist, etc.)?"
]

configure_api(client, chat_model_name)

In [24]:
data = load_excel_to_data(path_input)

2024-11-21 17:12:02,234 - INFO - Loading Excel file: C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\2024 Trimester 1.xlsx
2024-11-21 17:12:12,129 - INFO - Removing the first entry of the dataset.
2024-11-21 17:12:12,129 - INFO - Excel data successfully loaded and converted to dictionary.


In [26]:
data_cleaned = clean_json_data(data)

2024-11-21 17:15:53,530 - INFO - Cleaned 0 entries from the dataset.


### Filter and enrich the data
#### - This function checks if the cols of interest contain strings that are longer than 3 words
#### - The removed entries in the logger are all the rows where this condition did NOT hold
#### - All entries where it holds get a new JSON key (column) called "player_response"

In [27]:
# This function checks if the cols of interest contain strings that are longer than 3 words
# The removed entries in the logger are all the rows where this condition did NOT hold
# All entries where it holds get a new JSON key (column) called "player_response"
filtered_data = filter_and_enrich_data(data_cleaned, columns_of_interest)

2024-11-21 17:15:57,267 - INFO - Total entries removed: 2518


In [28]:
len(filtered_data)

4254

### Optional: generate a sample size

In [7]:
sample_size = 1100
seed = 42
sample_data = get_random_sample(filtered_data, sample_size, seed=seed)

In [8]:
# save the sample or the cleaned data as JSON
save_to_json(sample_data, path_db_prepared)
# save_to_json(filtered_data, path_db_prepared)

2024-11-21 11:38:43,434 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_prepared.json


# Translation


In [9]:
# Define col of interest and the col where the ID is specified
col_of_interest = ["player_response"]
id_col = "Unnamed: 0"

In [10]:
data = read_json(path_db_prepared)

In [11]:
# detect the language of the player response
language = detect_player_language(data, id_col, col_of_interest)

In [12]:
language[0]

{'Unnamed: 0': 1520,
 'Respondent ID': 114587000000,
 'Collector ID': 431039728,
 'Start Date': '2024-04-26 21:01:16',
 'End Date': '2024-04-26 21:09:56',
 'IP Address': '172.97.49.205',
 'Email Address': '',
 'First Name': '',
 'Last Name': '',
 'Custom Data 1': '',
 '1_Rate_Overall Rival Stars Horse Racing': '5 (Strongly Like)',
 '1_Rate_Breeding': '1 (Strongly Dislike)',
 '1_Rate_Training my horses': '1 (Strongly Dislike)',
 '1_Rate_Racing - Story': '3 (Neutral)',
 '1_Rate_Steeplechase': '3 (Neutral)',
 '1_Rate_Cross Country': '3 (Neutral)',
 '1_Rate_Free Roam': '3 (Neutral)',
 '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': 2,
 '1_Rate_Completing Story Goals': 4,
 '1_Rate_Foal Caring': '1 (Strongly Dislike)',
 '1_Rate_Pasture': 2,
 '1_Rate_Foaling season event': '5 (Strongly Like)',
 '1_Rate_Steeplechase Stars': '3 (Neutral)',
 '1_Rate_Arabian Days': '1 (Strongly Dislike)',
 '1_Rate_Empowering my horse with skills': 4,
 '1_Rate_Customizing my horse with tack': 4,

#### API calls for translation

In [13]:
translated_data = translate_data(language, id_col, prompt_template_translation, api_settings, columns_of_interest)

2024-11-21 11:39:48,537 - INFO - Translating entry ID 3444 (Language: german)
2024-11-21 11:39:49,528 - INFO - Translating entry ID 1446 (Language: german)
2024-11-21 11:39:50,502 - INFO - Translating entry ID 3686 (Language: spanish)
2024-11-21 11:39:51,516 - INFO - Translating entry ID 3460 (Language: french)
2024-11-21 11:39:55,745 - INFO - Translating entry ID 5527 (Language: german)
2024-11-21 11:39:57,728 - INFO - Translating entry ID 6157 (Language: german)
2024-11-21 11:39:58,670 - INFO - Translating entry ID 6253 (Language: german)
2024-11-21 11:39:59,191 - INFO - Translating entry ID 3492 (Language: german)
2024-11-21 11:39:59,894 - INFO - Translating entry ID 2228 (Language: french)
2024-11-21 11:40:00,376 - INFO - Translating entry ID 1841 (Language: german)
2024-11-21 11:40:01,366 - INFO - Translating entry ID 6004 (Language: spanish)
2024-11-21 11:40:01,975 - INFO - Translating entry ID 3407 (Language: german)
2024-11-21 11:40:02,476 - INFO - Translating entry ID 432 (Lan

In [16]:
# Check out the translations
matching_entry = [entry for entry in translated_data if entry.get(id_col, "unknown") == 5784]
matching_entry


[{'Unnamed: 0': 5784,
  'Respondent ID': 114585000000,
  'Collector ID': 431039728,
  'Start Date': '2024-04-24 17:03:56',
  'End Date': '2024-04-24 17:15:42',
  'IP Address': '91.179.214.141',
  'Email Address': '',
  'First Name': '',
  'Last Name': '',
  'Custom Data 1': '',
  '1_Rate_Overall Rival Stars Horse Racing': '3 (Neutral)',
  '1_Rate_Breeding': 2,
  '1_Rate_Training my horses': '3 (Neutral)',
  '1_Rate_Racing - Story': '3 (Neutral)',
  '1_Rate_Steeplechase': '3 (Neutral)',
  '1_Rate_Cross Country': 2,
  '1_Rate_Free Roam': '3 (Neutral)',
  '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': '3 (Neutral)',
  '1_Rate_Completing Story Goals': '3 (Neutral)',
  '1_Rate_Foal Caring': '3 (Neutral)',
  '1_Rate_Pasture': '3 (Neutral)',
  '1_Rate_Foaling season event': '3 (Neutral)',
  '1_Rate_Steeplechase Stars': '3 (Neutral)',
  '1_Rate_Arabian Days': '1 (Strongly Dislike)',
  '1_Rate_Empowering my horse with skills': '3 (Neutral)',
  '1_Rate_Customizing my horse wi

# Topic Extraction and Sentiment Analysis

In [17]:
# Parameters
id_column = "Unnamed: 0"                # Column name for entry IDs
columns_of_interest = ["player_response"]     # Which cols should be analyzed?
batch_size = 50                          # Fail-safe batching. The higher the number, the less often the progress is saved.

prepared_data = read_json(path_db_prepared)

# Run analysis
analyse_data(
    translated_data=prepared_data,
    id_column=id_column,
    output_path=path_db_analysed,
    prompt_template_topic=prompt_template_topic,
    prompt_template_sentiment=prompt_template_sentiment,
    api_settings=api_settings,
    columns_of_interest=columns_of_interest,
    batch_size=batch_size
)

2024-11-21 11:42:02,361 - INFO - No existing progress found. Starting fresh.
2024-11-21 11:42:02,362 - INFO - Tokens used so far: Prompt Tokens: 0, Completion Tokens: 0
2024-11-21 11:42:02,363 - INFO - Extracting topics for entry ID 1520
2024-11-21 11:42:04,007 - INFO - Analyzing sentiment for topic 'Hidden Coats' (Entry ID 1520)
2024-11-21 11:42:04,449 - INFO - Analyzing sentiment for topic 'Breeding Difficulty' (Entry ID 1520)
2024-11-21 11:42:04,845 - INFO - Analyzing sentiment for topic 'Breeding Resources' (Entry ID 1520)
2024-11-21 11:42:05,283 - INFO - Tokens used so far: Prompt Tokens: 1101, Completion Tokens: 89
2024-11-21 11:42:05,283 - INFO - Extracting topics for entry ID 320
2024-11-21 11:42:06,452 - INFO - Analyzing sentiment for topic 'Mobile Version' (Entry ID 320)
2024-11-21 11:42:07,285 - INFO - Analyzing sentiment for topic 'Desktop Version' (Entry ID 320)
2024-11-21 11:42:07,917 - INFO - Tokens used so far: Prompt Tokens: 1986, Completion Tokens: 166
2024-11-21 11:4

# Embedding

In [5]:
batch_size = 50
b_override = False  # Change to True if embeddings should be overwritten
embed_key = "sentence"  # topic or "sentence"

# embed_model_name = 'sentence-transformers/all-mpnet-base-v2'
# embed_model_name = 'dunzhang/stella_en_1.5B_v5'

# Load the JSON data
data = read_json(path_db_analysed)

# Initialize the embedding model once
embed_model = initialize_embedding_model(model_name=embed_model_name)

# Initialize an empty list for all processed results
processed_results = []

# Process data in batches
for batch_start in range(0, len(data), batch_size):
    batch_end = min(batch_start + batch_size, len(data))
    batch = data[batch_start:batch_end]
    logger.info(f"Processing batch {batch_start // batch_size + 1} ({batch_start} to {batch_end})")
    processed_batch = process_batch(batch, embed_model, b_override, embed_key=embed_key)
    processed_results.extend(processed_batch)  # Collect processed batch results

# Convert all processed results to a DataFrame at once
df_table = json_to_table(processed_results)

# Save the final JSON table
save_df_as_json(df_table, path_db_embedded)
logger.info("Embedding and conversion to table format completed.")


2024-11-25 09:17:37,030 - INFO - Loading embedding model: dunzhang/stella_en_1.5B_v5
2024-11-25 09:17:37,035 - INFO - Use pytorch device_name: cuda
2024-11-25 09:17:37,036 - INFO - Load pretrained SentenceTransformer: dunzhang/stella_en_1.5B_v5


modules.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/174k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

2_Dense_1024/config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

2024-11-25 09:24:31,712 - INFO - 2 prompts are loaded, with the keys: ['s2p_query', 's2s_query']
2024-11-25 09:24:31,758 - INFO - Processing batch 1 (0 to 50)
2024-11-25 09:26:27,247 - INFO - Processing batch 2 (50 to 100)
2024-11-25 09:27:50,557 - INFO - Processing batch 3 (100 to 150)
2024-11-25 09:29:40,107 - INFO - Processing batch 4 (150 to 200)
2024-11-25 09:31:20,044 - INFO - Processing batch 5 (200 to 250)
2024-11-25 09:32:58,735 - INFO - Processing batch 6 (250 to 300)
2024-11-25 09:34:55,423 - INFO - Processing batch 7 (300 to 350)
2024-11-25 09:36:42,718 - INFO - Processing batch 8 (350 to 400)
2024-11-25 09:38:13,951 - INFO - Processing batch 9 (400 to 450)
2024-11-25 09:39:51,398 - INFO - Processing batch 10 (450 to 500)
2024-11-25 09:41:16,014 - INFO - Processing batch 11 (500 to 550)
2024-11-25 09:43:00,109 - INFO - Processing batch 12 (550 to 600)
2024-11-25 09:44:23,338 - INFO - Processing batch 13 (600 to 650)
2024-11-25 09:46:20,425 - INFO - Processing batch 14 (650 

In [None]:
# Embedding with OpenAI
batch_size = 50



# Cluster Analysis

In [3]:
# Adjustable parameters
dimensionality_methods = ['UMAP', 'tSNE']
hdbscan_params = {"min_cluster_size": 20, "min_samples": 1, "cluster_selection_epsilon": 0.2}

# Load data
df_total = load_embedded_data(path_db_embedded)
mat = np.array(df_total['embedding'].tolist())

# Apply HDBSCAN
df_total = apply_hdbscan(
    df_total,
    mat,
    dimensionality_methods,
    path_db_clustered,
    hdbscan_params=hdbscan_params,
    include_2d=True,
    include_3d=True
)

# Save results
save_df_as_json(df_total, path_db_clustered)
logger.info(f"Results saved to {path_db_clustered}")

# how many unique cluster ids are in the data?
len(df_total['hdbscan_UMAP_2D'].unique())

2024-11-25 12:50:05,936 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_embedded.json
2024-11-25 12:50:08,935 - INFO - Loaded 2704 valid entries with embeddings.
2024-11-25 12:50:09,055 - INFO - Applying HDBSCAN in the original high-dimensional space with params: {'min_cluster_size': 20, 'min_samples': 1, 'cluster_selection_epsilon': 0.2}
2024-11-25 12:50:21,511 - INFO - Applying UMAP for 2D visualization.
2024-11-25 12:50:21,512 - INFO - Applying UMAP with 2 components.
  warn(
2024-11-25 12:50:37,916 - INFO - Applying UMAP for 3D visualization.
2024-11-25 12:50:37,916 - INFO - Applying UMAP with 3 components.
  warn(
2024-11-25 12:50:48,396 - INFO - Applying tSNE for 2D visualization.
2024-11-25 12:50:48,397 - INFO - Applying tSNE with 2 components.
2024-11-25 12:50:48,397 - INFO - Perplexity not provided, setting to 30 based on sample size.
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both librarie

In [6]:
# Apply KMeans (if needed)
dimensionality_methods = ['UMAP']
kmeans_clusters = [15, 20, 50]

df_total = load_embedded_data(path_db_clustered)
df_total = apply_kmeans(
    df_total,
    mat,
    dimensionality_methods,
    kmeans_clusters,
    path_db_clustered,
    include_2d=True,
    include_3d=True
)

# Save results
# save_df_as_json(df_total, path_db_clustered)
# logger.info(f"Results saved to {path_db_clustered}")

2024-11-25 12:57:36,488 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_clustered.json
2024-11-25 12:57:39,605 - INFO - Loaded 2704 valid entries with embeddings.
2024-11-25 12:57:39,611 - INFO - Applying UMAP with 2 components.
  warn(
2024-11-25 12:57:49,015 - INFO - Applying KMeans with 15 clusters on UMAP 2D.
2024-11-25 12:57:49,049 - INFO - Applying KMeans with 20 clusters on UMAP 2D.
2024-11-25 12:57:49,075 - INFO - Applying KMeans with 50 clusters on UMAP 2D.
2024-11-25 12:57:49,116 - INFO - Applying UMAP with 3 components.
  warn(
2024-11-25 12:57:59,539 - INFO - Applying KMeans with 15 clusters on UMAP 3D.
2024-11-25 12:57:59,557 - INFO - Applying KMeans with 20 clusters on UMAP 3D.
2024-11-25 12:57:59,580 - INFO - Applying KMeans with 50 clusters on UMAP 3D.
2024-11-25 12:57:59,640 - INFO - KMeans clustering completed.


In [7]:
save_df_as_json(df_total, path_db_clustered)
logger.info(f"Results saved to {path_db_clustered}")

2024-11-25 12:58:32,392 - INFO - Saving data to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_clustered.json
2024-11-25 12:58:37,976 - INFO - Results saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_clustered.json


# Cluster Naming

In [3]:
# Parameters
dimensionality_methods = ["UMAP", "tSNE"]
clustering_algorithms = ["hdbscan", "kmeans"]  # No KMeans here
max_centers = 8

kmeans_clusters = [15, 20, 25, 50]  # Number of clusters for KMeans

# Load data
df_total = load_json_into_df(path_db_clustered)

# Process clusters and generate names
df_total = process_clusters(df_total, dimensionality_methods, clustering_algorithms, max_centers, api_settings, kmeans_clusters)


# Save results
save_data_for_streamlit(df_total, path_db_final)

2024-11-25 13:38:34,489 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_clustered.json
2024-11-25 13:38:36,059 - INFO - Found 8 Topics for hdbscan_UMAP_2D ID: 0
2024-11-25 13:38:36,639 - INFO - Generated cluster name: Equestrian Sports and Equipment
2024-11-25 13:38:36,639 - INFO -  Tokens used so far: Prompt Tokens: 88, Completion Tokens: 5
2024-11-25 13:38:36,654 - INFO - HDBSCAN Cluster ID 0 (UMAP 2D): Equestrian Sports and Equipment
2024-11-25 13:38:36,655 - INFO - Found 8 Topics for hdbscan_UMAP_2D ID: 1
2024-11-25 13:38:37,292 - INFO - Generated cluster name: Game Development Elements and Design
2024-11-25 13:38:37,292 - INFO -  Tokens used so far: Prompt Tokens: 168, Completion Tokens: 10
2024-11-25 13:38:37,292 - INFO - HDBSCAN Cluster ID 1 (UMAP 2D): Game Development Elements and Design
2024-11-25 13:38:37,307 - INFO - Found 8 Topics for kmeans_15_UMAP_2D ID: 5
2024-11-25 13:38:37,850 - INFO - Generated cluster name: Equine Breed

In [8]:
# Paths and parameters
dimensionality_methods = ["UMAP", "tSNE"]
clustering_algorithms = ["hdbscan"]

max_centers = 8  # Maximum number of topics for naming

# Load data
df_total = load_json_into_df(path_db_clustered)

# Process clusters and generate names
df_total = process_clusters(df_total, dimensionality_methods, clustering_algorithms, kmeans_clusters, max_centers,
                            api_settings)

# Save results
save_data_for_streamlit(df_total, path_db_final)

2024-11-25 10:08:00,783 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_clustered.json


NameError: name 'kmeans_clusters' is not defined