In [8]:
# General modules
import os
import openai
from dotenv import load_dotenv

# Language models
# load_dotenv() makes variables from a local .env file available via os.getenv,
# so the API key does not have to be written into the notebook.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")  # None when the variable is not set
openai.api_key = openai_api_key
client = openai.Client()

# Model identifiers: chat_model_name is handed to configure_api(),
# embed_model_name to initialize_embedding_model() in the embedding section.
chat_model_name = 'gpt-4o-mini'
embed_model_name = "all-MiniLM-L6-v2"

# Paths
# The data directory can be overridden with the HRC_DATA_DIR environment variable
# (for example in the .env file loaded above), so the notebook also runs on machines
# other than the author's. Without the variable the original location is used.
root_dir = os.getenv(
    "HRC_DATA_DIR",
    r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC',
)
path_input = os.path.join(root_dir, "HRC SJ data.xlsx")  # raw survey export

# One JSON file per pipeline stage; each section writes its result to one of these
# and the following section reads it back.
path_db_prepared = os.path.join(root_dir, "db_prepared.json")
path_db_translated = os.path.join(root_dir, "db_translated.json")
path_db_analysed = os.path.join(root_dir, "db_analysed.json")
path_db_embedded = os.path.join(root_dir, "db_embedded.json")
path_db_clustered = os.path.join(root_dir, "db_clustered.json")
path_db_final = os.path.join(root_dir, "db_final.json")

In [9]:
# my imports
from helper.utils import *
from helper.data_analysis import *
from helper.prompt_templates import *
from helper.embedding import *
from helper.cluster_analysis import *
from helper.cluster_naming import *


# Raw survey column(s) holding the free-text answers; passed to
# filter_and_enrich_data() further down.
columns_of_interest = [
    "[Optional] Is there anything you currently find frustrating in the Show Jumping?"
]

# Hand the OpenAI client and the chat model name to the helper layer.
# NOTE(review): `api_settings`, which later cells pass to the helpers, is never
# assigned in this notebook - presumably it comes from one of the star-imported
# helper modules and is populated by configure_api(); confirm.
configure_api(client, chat_model_name)

In [10]:
# Load the survey export. According to the log output the loader removes the
# first entry of the dataset and converts the sheet to a dictionary.
data = load_excel_to_data(path_input)

2024-11-28 11:09:20,167 - INFO - Loading Excel file: C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC\HRC SJ data.xlsx
2024-11-28 11:09:21,185 - INFO - Removing the first entry of the dataset.
2024-11-28 11:09:21,186 - INFO - Excel data successfully loaded and converted to dictionary.


In [11]:
# Clean the loaded data; the helper logs how many entries it cleaned.
data_cleaned = clean_json_data(data)

2024-11-28 11:09:22,182 - INFO - Cleaned 0 entries from the dataset.


### Filter and enrich the data
- `filter_and_enrich_data` checks whether the columns of interest contain strings that are longer than 3 words.
- The "Total entries removed" count in the log is the number of rows where this condition did NOT hold.
- All entries where it holds get a new JSON key (column) called "player_response".

In [12]:
# Keep only the rows whose columns of interest contain a string longer than 3 words.
# The "Total entries removed" log line counts the rows where this condition did NOT hold.
# Every entry that is kept gets a new JSON key (column) called "player_response".
filtered_data = filter_and_enrich_data(data_cleaned, columns_of_interest)

2024-11-28 11:09:23,968 - INFO - Total entries removed: 1813


In [13]:
# Number of entries that passed the filter.
len(filtered_data)

125

### Optional: draw a random sample

In [24]:
sample_size = 15  # number of entries to draw from filtered_data
seed = 43  # passed to the sampler; presumably makes the draw repeatable - confirm in helper
sample_data = get_random_sample(filtered_data, sample_size, seed=seed)

In [25]:
# Save the data set that the translation section reads back from path_db_prepared.
# Currently the random sample is saved; use the commented-out line instead to
# continue with the full filtered data.
save_to_json(sample_data, path_db_prepared)
# save_to_json(filtered_data, path_db_prepared)

2024-11-28 11:19:40,941 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC\db_prepared.json


# Translation


In [26]:
# Column that holds the text to work on (the "player_response" key added during
# filtering) and the column that carries each entry's ID.
col_of_interest = ["player_response"]
id_col = "Unnamed: 0"

In [27]:
# Reload the prepared entries from disk.
# NOTE: this rebinds `data`, which earlier held the raw Excel load.
data = read_json(path_db_prepared)

In [28]:
# Detect the language of each player response.
# The result is indexed by position in the next cell and is the input of translate_data().
language = detect_player_language(data, id_col, col_of_interest)

In [30]:
# Spot-check a single entry of the detection result.
language[12]

{'Unnamed: 0': 1352,
 'Start Date': '2024-11-10 23:48:00',
 'End Date': '2024-11-10 23:54:00',
 'Have you ever played Show Jumping Arena in Rival Stars Horse Racing? https://surveymonkey-assets.s3.amazonaws.com/survey/520321022/rte/c1a2b9d4-a68b-4299-a55b-2360b21235af.jpg': 'Yes',
 'Why have you not played Show Jumping Arena yet?': '',
 'Other (please specify)': '',
 'Please rate your overall experience playing Show Jumping Arena': 5.0,
 'Have you tried the new Show Jumping Switzerland course? https://surveymonkey-assets.s3.amazonaws.com/survey/520321022/rte/52ed4522-cc62-4a32-9448-e31c03dc036c.jpg': 'Yes',
 'How do you feel about the new Switzerland course?': 'Love it',
 '[Optional] Please explain why you rated the above course the way you did.': 'I love horses ',
 'Refusals on_Please tick the following options you have tried around Show Jumping controls in new Switzerland course. [Select all that apply]': 'Refusals on',
 'Refusals off': '',
 'Equestrian controls on': 'Equestrian cont

#### API calls for translation

In [31]:
# Translate the non-English player responses via the chat model.
# Pass `col_of_interest` (["player_response"]) - the column list that language
# detection ran on - instead of `columns_of_interest`, which in a top-to-bottom
# run still holds the raw survey question from the setup cell and only becomes
# ["player_response"] later, in the analysis section.
translated_data = translate_data(language, id_col, prompt_template_translation, api_settings, col_of_interest)

2024-11-28 11:20:15,119 - INFO - Translating entry ID 1524 (Language: german)
2024-11-28 11:20:16,348 - INFO - Translating entry ID 1904 (Language: german)


In [33]:
# Persist the translated entries; the analysis section reads this file back.
save_to_json(translated_data, path_db_translated)

2024-11-28 11:20:36,485 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC\db_translated.json


In [15]:
# Inspect the translation result for one entry, selected by its ID
matching_entry = list(
    filter(lambda record: record.get(id_col, "unknown") == 1904, translated_data)
)
matching_entry


[{'Unnamed: 0': 1904,
  'Start Date': '2024-11-06 12:58:00',
  'End Date': '2024-11-06 13:02:00',
  'Have you ever played Show Jumping Arena in Rival Stars Horse Racing? https://surveymonkey-assets.s3.amazonaws.com/survey/520321022/rte/c1a2b9d4-a68b-4299-a55b-2360b21235af.jpg': 'Yes',
  'Why have you not played Show Jumping Arena yet?': '',
  'Other (please specify)': '',
  'Please rate your overall experience playing Show Jumping Arena': 4.0,
  'Have you tried the new Show Jumping Switzerland course? https://surveymonkey-assets.s3.amazonaws.com/survey/520321022/rte/52ed4522-cc62-4a32-9448-e31c03dc036c.jpg': 'Yes',
  'How do you feel about the new Switzerland course?': 'Neutral',
  '[Optional] Please explain why you rated the above course the way you did.': "I like it, but I'd wish for different jumping levels, and jumps not so tight, and it's be easier on PC",
  'Refusals on_Please tick the following options you have tried around Show Jumping controls in new Switzerland course. [Selec

# Topic Extraction and Sentiment Analysis

In [16]:
# Parameters
id_column = "Unnamed: 0"                # Column name for entry IDs (same value as `id_col` in the translation section)
columns_of_interest = ["player_response"]     # Which cols should be analyzed? (rebinds the list from the setup cell)
batch_size = 10                          # Fail-safe batching. The higher the number, the less often the progress is saved.

# Read the translated entries back from disk.
prepared_data = read_json(path_db_translated)

# Run analysis. Per the log output this extracts topics for every entry and then
# analyses the sentiment of each extracted topic; output_path is the file the
# embedding section reads afterwards.
analyse_data(
    translated_data=prepared_data,
    id_column=id_column,
    output_path=path_db_analysed,
    prompt_template_topic=prompt_template_topic,
    prompt_template_sentiment=prompt_template_sentiment,
    api_settings=api_settings,
    columns_of_interest=columns_of_interest,
    batch_size=batch_size
)

2024-11-28 08:59:37,940 - INFO - No existing progress found. Starting fresh.
2024-11-28 08:59:37,940 - INFO - Tokens used so far: Prompt Tokens: 0, Completion Tokens: 0
2024-11-28 08:59:37,940 - INFO - Extracting topics for entry ID 1446
2024-11-28 08:59:40,080 - INFO - Analyzing sentiment for topic 'Slow Button Redundancy' (Entry ID 1446)
2024-11-28 08:59:40,518 - INFO - Analyzing sentiment for topic 'Button Burden' (Entry ID 1446)
2024-11-28 08:59:40,997 - INFO - Tokens used so far: Prompt Tokens: 1051, Completion Tokens: 122
2024-11-28 08:59:40,997 - INFO - Extracting topics for entry ID 193
2024-11-28 08:59:42,925 - INFO - Analyzing sentiment for topic 'Control Sensitivity' (Entry ID 193)
2024-11-28 08:59:43,322 - INFO - Analyzing sentiment for topic 'Turning Difficulty' (Entry ID 193)
2024-11-28 08:59:43,702 - INFO - Tokens used so far: Prompt Tokens: 2031, Completion Tokens: 232
2024-11-28 08:59:43,702 - INFO - Extracting topics for entry ID 28
2024-11-28 08:59:44,559 - INFO - An

# Embedding

In [17]:
# --- Embedding settings ---
batch_size = 50
b_override = False  # set to True to overwrite embeddings
embed_key = "topic"  # field to embed: "topic" or "sentence"

# Alternative embedding models:
# embed_model_name = 'sentence-transformers/all-mpnet-base-v2'
# embed_model_name = 'dunzhang/stella_en_1.5B_v5'

# Analysed entries produced by the previous section
data = read_json(path_db_analysed)

# The model is created a single time and shared by every batch
embed_model = initialize_embedding_model(model_name=embed_model_name)

# Accumulates the processed entries of all batches
processed_results = []

# Walk over the data in slices of `batch_size` entries
for batch_start in range(0, len(data), batch_size):
    batch_end = min(len(data), batch_start + batch_size)
    logger.info(f"Processing batch {batch_start // batch_size + 1} ({batch_start} to {batch_end})")
    processed_results += process_batch(
        data[batch_start:batch_end], embed_model, b_override, embed_key=embed_key
    )

# Turn the collected results into one table and write it to disk
df_table = json_to_table(processed_results)
save_df_as_json(df_table, path_db_embedded)
logger.info("Embedding and conversion to table format completed.")


2024-11-28 09:37:20,442 - INFO - Loading embedding model: all-MiniLM-L6-v2
  embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=model_name))
2024-11-28 09:37:33,303 - INFO - PyTorch version 2.4.0+cu124 available.
2024-11-28 09:37:33,319 - INFO - Polars version 1.12.0 available.
2024-11-28 09:37:33,334 - INFO - Duckdb version 1.1.2 available.
2024-11-28 09:37:34,848 - INFO - Use pytorch device_name: cuda
2024-11-28 09:37:34,848 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-11-28 09:37:38,354 - INFO - Processing batch 1 (0 to 50)
  attn_output = torch.nn.functional.scaled_dot_product_attention(
2024-11-28 09:37:49,436 - INFO - Processing batch 2 (50 to 100)
2024-11-28 09:38:01,049 - INFO - Processing batch 3 (100 to 125)
2024-11-28 09:38:06,540 - INFO - Converting JSON data to a table format.
2024-11-28 09:38:06,550 - INFO - Conversion to table format completed.
2024-11-28 09:38:06,550 - INFO - Saving data to C:\Users\fbohm\Desktop\Projects\DataScienc

# Cluster Analysis

In [4]:
# Adjustable parameters
dimensionality_methods = ['UMAP', 'tSNE']  # projections requested from apply_hdbscan
hdbscan_params = {"min_cluster_size": 15, "min_samples": 10, "cluster_selection_epsilon": 0.2}

# Load data and stack the per-row embedding lists into one NumPy array
df_total = load_embedded_data(path_db_embedded)
mat = np.array(df_total['embedding'].tolist())

# Apply HDBSCAN
df_total = apply_hdbscan(
    df_total,
    mat,
    dimensionality_methods,
    hdbscan_params=hdbscan_params,
    include_2d=True,
    include_3d=True
)

# Save results
save_df_as_json(df_total, path_db_clustered)
logger.info(f"Results saved to {path_db_clustered}")

# How many unique HDBSCAN cluster ids are in the data?
# len() accepts exactly one argument; the previous call passed a label string as a
# second argument, which raises TypeError. The bare expression is displayed as the
# cell output.
len(df_total['hdbscan_UMAP_2D'].unique())

2024-11-28 10:00:25,826 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC\db_embedded.json
2024-11-28 10:00:25,889 - INFO - Loaded 204 valid entries with embeddings.
2024-11-28 10:00:25,889 - INFO - Applying HDBSCAN in the original high-dimensional space with params: {'min_cluster_size': 15, 'min_samples': 10, 'cluster_selection_epsilon': 0.2}
2024-11-28 10:00:25,936 - INFO - Applying UMAP for 2D visualization.
2024-11-28 10:00:25,936 - INFO - Applying UMAP with 2 components.
  warn(
2024-11-28 10:00:32,262 - INFO - Applying UMAP for 3D visualization.
2024-11-28 10:00:32,278 - INFO - Applying UMAP with 3 components.
  warn(
2024-11-28 10:00:32,576 - INFO - Applying tSNE for 2D visualization.
2024-11-28 10:00:32,576 - INFO - Applying tSNE with 2 components.
2024-11-28 10:00:32,576 - INFO - Perplexity not provided, setting to 30 based on sample size.
2024-11-28 10:00:33,320 - INFO - Applying tSNE for 3D visualization.
2024-11-28 10:00:33,320

3

In [5]:
# Apply KMeans (if needed)
dimensionality_methods = ['UMAP', 'tSNE']
kmeans_clusters = [5, 8, 12, 15, 20, 30]  # cluster counts to run

# Continue from the file written by the HDBSCAN cell.
df_total = load_embedded_data(path_db_clustered)
# Rebuild the embedding matrix from the frame that was just loaded instead of
# reusing `mat` left over from the HDBSCAN cell: this cell then also works on a
# fresh kernel, and the matrix rows are guaranteed to line up with df_total.
mat = np.array(df_total['embedding'].tolist())
df_total = apply_kmeans(
    df_total,
    mat,
    dimensionality_methods,
    kmeans_clusters,
    include_2d=True,
    include_3d=True
)

# The result is written back to path_db_clustered by the following cell.

2024-11-28 10:01:00,453 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC\db_clustered.json
2024-11-28 10:01:00,516 - INFO - Loaded 204 valid entries with embeddings.
2024-11-28 10:01:00,516 - INFO - Applying KMeans with 5 clusters in high-dimensional space.
2024-11-28 10:01:00,532 - INFO - Applying UMAP in 2D.
2024-11-28 10:01:00,532 - INFO - Applying UMAP with 2 components.
  warn(
2024-11-28 10:01:00,876 - INFO - Applying UMAP in 3D.
2024-11-28 10:01:00,892 - INFO - Applying UMAP with 3 components.
  warn(
2024-11-28 10:01:01,203 - INFO - Applying tSNE in 2D.
2024-11-28 10:01:01,203 - INFO - Applying tSNE with 2 components.
2024-11-28 10:01:01,203 - INFO - Perplexity not provided, setting to 30 based on sample size.
2024-11-28 10:01:02,161 - INFO - Applying tSNE in 3D.
2024-11-28 10:01:02,161 - INFO - Applying tSNE with 3 components.
2024-11-28 10:01:02,161 - INFO - Perplexity not provided, setting to 30 based on sample size.
2024-11-28

In [6]:
# Write the frame with the clustering results back to path_db_clustered;
# the cluster-naming section loads it from there.
save_df_as_json(df_total, path_db_clustered)
logger.info(f"Results saved to {path_db_clustered}")

2024-11-28 10:01:21,462 - INFO - Saving data to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC\db_clustered.json
2024-11-28 10:01:21,673 - INFO - Results saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC\db_clustered.json


# Cluster Naming

In [7]:
# Parameters
dimensionality_methods = ["UMAP", "tSNE"]
clustering_algorithms = ["hdbscan", 'kmeans']  # Both HDBSCAN and KMeans clusterings are named
max_centers = 8  # presumably the max. number of topics used per cluster for naming (log: "Found 8 Topics") - confirm in helper

# Number of clusters for KMeans; same values as in the KMeans cell. Previously tried: [15, 20, 25, 50]
kmeans_clusters = [5, 8, 12, 15, 20, 30]
# Load data
df_total = load_json_into_df(path_db_clustered)

# Process clusters and generate names.
# kmeans_clusters is presumably only needed while 'kmeans' is listed in clustering_algorithms - confirm in helper.
df_total = process_clusters(df_total, dimensionality_methods, clustering_algorithms, max_centers, api_settings, kmeans_clusters)


# Save results
save_data_for_streamlit(df_total, path_db_final)

2024-11-28 10:03:52,773 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC\db_clustered.json
2024-11-28 10:03:52,867 - INFO - Found 8 Topics for hdbscan_UMAP_2D ID: 0
2024-11-28 10:03:53,667 - INFO - Generated cluster name: Jump Timing and Mechanics Insights
2024-11-28 10:03:53,667 - INFO -  Tokens used so far: Prompt Tokens: 85, Completion Tokens: 5
2024-11-28 10:03:53,667 - INFO - HDBSCAN Cluster ID 0 (UMAP 2D): Jump Timing and Mechanics Insights
2024-11-28 10:03:53,667 - INFO - Found 8 Topics for hdbscan_UMAP_2D ID: 1
2024-11-28 10:03:54,110 - INFO - Generated cluster name: Course Design and Navigation Essentials
2024-11-28 10:03:54,110 - INFO -  Tokens used so far: Prompt Tokens: 166, Completion Tokens: 10
2024-11-28 10:03:54,110 - INFO - HDBSCAN Cluster ID 1 (UMAP 2D): Course Design and Navigation Essentials
2024-11-28 10:03:54,126 - INFO - Found 8 Topics for kmeans_5_UMAP_2D ID: 4
2024-11-28 10:03:54,561 - INFO - Generated cluster nam