In [1]:
# General modules
import os
import openai
from dotenv import load_dotenv

# Language models
# load_dotenv() pulls variables from a local .env file into the environment,
# so the API key is read from there instead of being written into the notebook.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

chat_model_name = 'gpt-4o-mini'
embed_model_name = "all-MiniLM-L6-v2"
# embed_model_name = 'sentence-transformers/all-mpnet-base-v2'

# Paths
# The project root can be overridden with the CLUSTER_ANALYSIS_ROOT environment
# variable (e.g. in the .env file) so the notebook also runs on other machines;
# the original local path is kept as the default, so existing setups are unaffected.
root_dir = os.getenv(
    "CLUSTER_ANALYSIS_ROOT",
    r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis',
)
data_dir = os.path.join(root_dir, "Data")

# Raw survey export (input of the pipeline)
path_input = os.path.join(data_dir, "HRC SJ data.xlsx")

# One JSON file per pipeline stage, in the order the stages run
path_db_prepared = os.path.join(data_dir, "db_prepared.json")
path_db_translated = os.path.join(data_dir, "db_translated.json")
path_db_analysed = os.path.join(data_dir, "db_analysed.json")
path_db_embedded = os.path.join(data_dir, "db_embedded.json")
path_db_clustered = os.path.join(data_dir, "db_clustered.json")
path_db_final = os.path.join(data_dir, "db_final.json")

In [2]:
# my imports
from helper.utils import *
from helper.data_analysis import *
from helper.prompt_templates import *
from helper.embedding import *
from helper.cluster_analysis import *
from helper.cluster_naming import *


# Survey question column(s) whose free-text answers are filtered and analysed below.
columns_of_interest = [
    "[Optional] Is there anything you currently find frustrating in the Show Jumping?"
]

# Hand the OpenAI client and the chat model name to the helper's API configuration.
# NOTE(review): `api_settings`, which later cells pass to the helpers, is never assigned
# in this notebook — presumably it comes from this call or from one of the star imports; confirm.
configure_api(client, chat_model_name)

In [3]:
# Read the survey export (Excel file at `path_input`) into the dataset used by the next cells.
data = load_excel_to_data(path_input)

2024-11-27 11:03:25,455 - INFO - Loading Excel file: C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC SJ data.xlsx
2024-11-27 11:03:26,765 - INFO - Removing the first entry of the dataset.
2024-11-27 11:03:26,765 - INFO - Excel data successfully loaded and converted to dictionary.


In [4]:
# Run the helper's cleaning pass over the loaded data; the result is stored under a separate name.
data_cleaned = clean_json_data(data)

2024-11-27 11:03:29,908 - INFO - Cleaned 0 entries from the dataset.


### Filter and enrich the data
#### - This function checks whether the columns of interest contain strings longer than 3 words
#### - The "entries removed" count in the log covers all rows where this condition did NOT hold
#### - Every entry where it holds gets a new JSON key (column) called "player_response"

In [5]:
# Keep only the entries whose columns of interest contain a string longer than 3 words.
# The "entries removed" count in the log covers all rows where this condition did NOT hold.
# Every entry that is kept gets a new JSON key (column) called "player_response".
filtered_data = filter_and_enrich_data(data_cleaned, columns_of_interest)

2024-11-27 11:03:33,015 - INFO - Total entries removed: 1813


In [6]:
# Number of entries that remain after filtering
len(filtered_data)

125

### Optional: draw a random sample

In [7]:
# Draw a random sample with a fixed seed so the selection can be repeated.
# NOTE(review): 125 equals the number of filtered entries reported above, so in this run
# the "sample" presumably contains every entry — lower sample_size for a quicker test run.
sample_size = 125
seed = 42
sample_data = get_random_sample(filtered_data, sample_size, seed=seed)

In [9]:
# Save the sample as the "prepared" JSON that the later stages read from disk.
save_to_json(sample_data, path_db_prepared)
# Alternative: save the full filtered data instead of the sample
# save_to_json(filtered_data, path_db_prepared)

2024-11-27 11:11:50,943 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_prepared.json


# Translation


In [10]:
# Column that holds the text to inspect, and the column that holds each entry's ID.
# "player_response" is the key added by the filter/enrich step earlier in the notebook.
col_of_interest = ["player_response"]
id_col = "Unnamed: 0"

In [11]:
# Reload the prepared dataset from disk.
# Note: this rebinds `data`, which earlier in the notebook held the raw Excel load.
data = read_json(path_db_prepared)

In [12]:
# Detect the language of each player response (text taken from `col_of_interest`,
# entries identified through `id_col`).
language = detect_player_language(data, id_col, col_of_interest)

In [13]:
# Peek at the first entry returned by the language detection step
language[0]

{'Unnamed: 0': 1446,
 'Start Date': '2024-11-10 06:27:00',
 'End Date': '2024-11-10 06:42:00',
 'Have you ever played Show Jumping Arena in Rival Stars Horse Racing? https://surveymonkey-assets.s3.amazonaws.com/survey/520321022/rte/c1a2b9d4-a68b-4299-a55b-2360b21235af.jpg': 'Yes',
 'Why have you not played Show Jumping Arena yet?': '',
 'Other (please specify)': '',
 'Please rate your overall experience playing Show Jumping Arena': 4.0,
 'Have you tried the new Show Jumping Switzerland course? https://surveymonkey-assets.s3.amazonaws.com/survey/520321022/rte/52ed4522-cc62-4a32-9448-e31c03dc036c.jpg': 'Yes',
 'How do you feel about the new Switzerland course?': 'Love it',
 '[Optional] Please explain why you rated the above course the way you did.': 'It kind of looked like there was a graveyard to one side, which I didnâ€™t mind! But might be kinda weird for other players. I believe these are meant to be stones? Or tree trunks? ',
 'Refusals on_Please tick the following options you have 

#### API calls for translation

In [14]:
# Translate the entries through the chat API, using the translation prompt template.
# NOTE(review): `columns_of_interest` still holds the survey-question column defined near the
# top of the notebook (unless the analysis cell further down was run first and rebound it),
# whereas language detection used `col_of_interest` (["player_response"]).
# Confirm which column(s) translate_data is expected to receive.
translated_data = translate_data(language, id_col, prompt_template_translation, api_settings, columns_of_interest)

2024-11-27 11:12:52,618 - INFO - Translating entry ID 1904 (Language: german)
2024-11-27 11:12:53,569 - INFO - Translating entry ID 1524 (Language: german)
2024-11-27 11:12:54,167 - INFO - Translating entry ID 1521 (Language: german)


In [15]:
# Persist the translation result as its own pipeline stage file
save_to_json(translated_data, path_db_translated)

2024-11-27 11:12:58,179 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_translated.json


In [16]:
# Spot-check the translation of one specific entry, looked up by its ID
target_id = 5784
matching_entry = [
    record
    for record in translated_data
    if record.get(id_col, "unknown") == target_id
]
matching_entry


[{'Unnamed: 0': 5784,
  'Respondent ID': 114585000000,
  'Collector ID': 431039728,
  'Start Date': '2024-04-24 17:03:56',
  'End Date': '2024-04-24 17:15:42',
  'IP Address': '91.179.214.141',
  'Email Address': '',
  'First Name': '',
  'Last Name': '',
  'Custom Data 1': '',
  '1_Rate_Overall Rival Stars Horse Racing': '3 (Neutral)',
  '1_Rate_Breeding': 2,
  '1_Rate_Training my horses': '3 (Neutral)',
  '1_Rate_Racing - Story': '3 (Neutral)',
  '1_Rate_Steeplechase': '3 (Neutral)',
  '1_Rate_Cross Country': 2,
  '1_Rate_Free Roam': '3 (Neutral)',
  '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': '3 (Neutral)',
  '1_Rate_Completing Story Goals': '3 (Neutral)',
  '1_Rate_Foal Caring': '3 (Neutral)',
  '1_Rate_Pasture': '3 (Neutral)',
  '1_Rate_Foaling season event': '3 (Neutral)',
  '1_Rate_Steeplechase Stars': '3 (Neutral)',
  '1_Rate_Arabian Days': '1 (Strongly Dislike)',
  '1_Rate_Empowering my horse with skills': '3 (Neutral)',
  '1_Rate_Customizing my horse wi

# Topic Extraction and Sentiment Analysis

In [16]:
# Parameters
id_column = "Unnamed: 0"                # Column name for entry IDs
columns_of_interest = ["player_response"]     # Which cols should be analyzed?
batch_size = 10                          # Fail-safe batching. The higher the number, the less often the progress is saved.

# Feed the analysis with the output of the translation stage. Previously this cell
# read the *prepared* file, so the translations saved to path_db_translated were never used.
# If the translation stage was skipped (no translated file on disk), fall back to the
# prepared data so the cell still runs.
# NOTE(review): this assumes translate_data returns the complete dataset (not only the
# entries it translated) — confirm against the helper.
analysis_input_path = path_db_translated if os.path.exists(path_db_translated) else path_db_prepared
prepared_data = read_json(analysis_input_path)

# Run topic extraction and per-topic sentiment analysis; results are written to
# `path_db_analysed` by the helper.
analyse_data(
    translated_data=prepared_data,
    id_column=id_column,
    output_path=path_db_analysed,
    prompt_template_topic=prompt_template_topic,
    prompt_template_sentiment=prompt_template_sentiment,
    api_settings=api_settings,
    columns_of_interest=columns_of_interest,
    batch_size=batch_size
)

2024-11-27 11:13:20,497 - INFO - No existing progress found. Starting fresh.
2024-11-27 11:13:20,497 - INFO - Tokens used so far: Prompt Tokens: 0, Completion Tokens: 0
2024-11-27 11:13:20,512 - INFO - Extracting topics for entry ID 1446
2024-11-27 11:13:22,252 - INFO - Analyzing sentiment for topic 'Slow Button Functionality' (Entry ID 1446)
2024-11-27 11:13:22,692 - INFO - Analyzing sentiment for topic 'Button Burden' (Entry ID 1446)
2024-11-27 11:13:23,330 - INFO - Tokens used so far: Prompt Tokens: 1035, Completion Tokens: 109
2024-11-27 11:13:23,330 - INFO - Extracting topics for entry ID 193
2024-11-27 11:13:25,007 - INFO - Analyzing sentiment for topic 'Control Sensitivity' (Entry ID 193)
2024-11-27 11:13:25,455 - INFO - Analyzing sentiment for topic 'Turning Difficulty' (Entry ID 193)
2024-11-27 11:13:25,876 - INFO - Tokens used so far: Prompt Tokens: 2014, Completion Tokens: 218
2024-11-27 11:13:25,876 - INFO - Extracting topics for entry ID 28
2024-11-27 11:13:26,890 - INFO -

# Embedding

In [17]:
batch_size = 50
b_override = False  # Set to True to overwrite embeddings that already exist
embed_key = "topic"  # topic or "sentence"

# Alternative embedding models:
# embed_model_name = 'sentence-transformers/all-mpnet-base-v2'
# embed_model_name = 'dunzhang/stella_en_1.5B_v5'

# Input of this stage: the analysed entries written by the previous stage
data = read_json(path_db_analysed)

# The model is created a single time and shared by every batch
embed_model = initialize_embedding_model(model_name=embed_model_name)

# Walk over the data in slices of `batch_size` and gather everything in one list
processed_results = []
for batch_start in range(0, len(data), batch_size):
    batch_end = min(batch_start + batch_size, len(data))
    logger.info(f"Processing batch {batch_start // batch_size + 1} ({batch_start} to {batch_end})")
    processed_results += process_batch(
        data[batch_start:batch_end],
        embed_model,
        b_override,
        embed_key=embed_key,
    )

# Build the table from all collected results in a single step, then store it
df_table = json_to_table(processed_results)
save_df_as_json(df_table, path_db_embedded)
logger.info("Embedding and conversion to table format completed.")


2024-11-27 11:19:28,917 - INFO - Loading embedding model: all-MiniLM-L6-v2
  embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=model_name))
2024-11-27 11:19:41,038 - INFO - PyTorch version 2.4.0+cu124 available.
2024-11-27 11:19:41,054 - INFO - Polars version 1.12.0 available.
2024-11-27 11:19:41,070 - INFO - Duckdb version 1.1.2 available.
2024-11-27 11:19:42,666 - INFO - Use pytorch device_name: cuda
2024-11-27 11:19:42,666 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-11-27 11:19:46,072 - INFO - Processing batch 1 (0 to 50)
  attn_output = torch.nn.functional.scaled_dot_product_attention(
2024-11-27 11:19:57,426 - INFO - Processing batch 2 (50 to 100)
2024-11-27 11:20:08,165 - INFO - Processing batch 3 (100 to 125)
2024-11-27 11:20:13,310 - INFO - Converting JSON data to a table format.
2024-11-27 11:20:13,326 - INFO - Conversion to table format completed.
2024-11-27 11:20:13,326 - INFO - Saving data to C:\Users\fbohm\Desktop\Projects\DataScienc

In [None]:
# Embedding with OpenAI
# TODO: this cell is an unfinished, never-executed stub — it only sets the batch size so far.
batch_size = 50



# Cluster Analysis

In [18]:
# Settings to tune for this stage
dimensionality_methods = ["UMAP", "tSNE"]
hdbscan_params = dict(
    min_cluster_size=8,
    min_samples=4,
    cluster_selection_epsilon=0.2,
)

# Read the embedded entries and stack their embedding vectors into one array
df_total = load_embedded_data(path_db_embedded)
mat = np.array(df_total["embedding"].tolist())

# Cluster with HDBSCAN, including the 2D and 3D variants
df_total = apply_hdbscan(
    df_total, mat, dimensionality_methods, path_db_clustered,
    hdbscan_params=hdbscan_params, include_2d=True, include_3d=True,
)

# Store the clustered table
save_df_as_json(df_total, path_db_clustered)
logger.info(f"Results saved to {path_db_clustered}")

# Count of distinct cluster IDs in the HDBSCAN / UMAP 2D column
len(df_total["hdbscan_UMAP_2D"].unique())

2024-11-27 11:20:50,490 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_embedded.json
2024-11-27 11:20:50,572 - INFO - Loaded 204 valid entries with embeddings.
2024-11-27 11:20:50,576 - INFO - Applying HDBSCAN in the original high-dimensional space with params: {'min_cluster_size': 8, 'min_samples': 4, 'cluster_selection_epsilon': 0.2}
2024-11-27 11:20:50,607 - INFO - Applying UMAP for 2D visualization.
2024-11-27 11:20:50,622 - INFO - Applying UMAP with 2 components.
  warn(
2024-11-27 11:20:57,006 - INFO - Applying UMAP for 3D visualization.
2024-11-27 11:20:57,006 - INFO - Applying UMAP with 3 components.
  warn(
2024-11-27 11:20:57,327 - INFO - Applying tSNE for 2D visualization.
2024-11-27 11:20:57,327 - INFO - Applying tSNE with 2 components.
2024-11-27 11:20:57,327 - INFO - Perplexity not provided, setting to 30 based on sample size.
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries 

4

In [19]:
# Apply KMeans (if needed)
dimensionality_methods = ['UMAP', 'tSNE']
kmeans_clusters = [5, 8, 12]  # cluster counts to try

df_total = load_embedded_data(path_db_clustered)

# Rebuild the embedding matrix from the frame that was just loaded instead of reusing
# the `mat` left over from the HDBSCAN cell: this keeps the rows of `mat` aligned with
# `df_total` and lets the cell run on a fresh kernel without the HDBSCAN cell.
mat = np.array(df_total['embedding'].tolist())

df_total = apply_kmeans(
    df_total,
    mat,
    dimensionality_methods,
    kmeans_clusters,
    path_db_clustered,
    include_2d=True,
    include_3d=True
)

# Save results (the save is performed in the following cell)
# save_df_as_json(df_total, path_db_clustered)
# logger.info(f"Results saved to {path_db_clustered}")

2024-11-27 11:21:39,224 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_clustered.json
2024-11-27 11:21:39,302 - INFO - Loaded 204 valid entries with embeddings.
2024-11-27 11:21:39,302 - INFO - Applying UMAP with 2 components.
  warn(
2024-11-27 11:21:39,611 - INFO - Applying KMeans with 5 clusters on UMAP 2D.
2024-11-27 11:21:39,643 - INFO - Applying KMeans with 8 clusters on UMAP 2D.
2024-11-27 11:21:39,643 - INFO - Applying KMeans with 12 clusters on UMAP 2D.
2024-11-27 11:21:39,658 - INFO - Applying UMAP with 3 components.
  warn(
2024-11-27 11:21:39,990 - INFO - Applying KMeans with 5 clusters on UMAP 3D.
2024-11-27 11:21:39,990 - INFO - Applying KMeans with 8 clusters on UMAP 3D.
2024-11-27 11:21:40,005 - INFO - Applying KMeans with 12 clusters on UMAP 3D.
2024-11-27 11:21:40,021 - INFO - Applying tSNE with 2 components.
2024-11-27 11:21:40,021 - INFO - Perplexity not provided, setting to 30 based on sample size.
2024-11-27 11:21:4

In [20]:
# Write the table, now including the KMeans results, back to the clustered stage file
save_df_as_json(df_total, path_db_clustered)
logger.info(f"Results saved to {path_db_clustered}")

2024-11-27 11:21:46,280 - INFO - Saving data to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_clustered.json
2024-11-27 11:21:46,465 - INFO - Results saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_clustered.json


# Cluster Naming

In [22]:
# Parameters
dimensionality_methods = ["UMAP", "tSNE"]
clustering_algorithms = ["hdbscan", 'kmeans']  # both algorithms are passed to process_clusters
max_centers = 8

# Alternative cluster counts for KMeans:
# kmeans_clusters = [15, 20, 25, 50]
# NOTE(review): presumably these must match the cluster counts used in the KMeans cell — confirm.
kmeans_clusters = [5, 8, 12]
# Load data
df_total = load_json_into_df(path_db_clustered)

# Process clusters and generate names
df_total = process_clusters(df_total, dimensionality_methods, clustering_algorithms, max_centers, api_settings, kmeans_clusters)  # kmeans_clusters is passed because 'kmeans' is in clustering_algorithms


# Save results in the file read by the Streamlit app
save_data_for_streamlit(df_total, path_db_final)

2024-11-27 11:24:40,582 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_clustered.json
2024-11-27 11:24:40,644 - INFO - Found 8 Topics for hdbscan_UMAP_2D ID: 1
2024-11-27 11:24:41,344 - INFO - Generated cluster name: Jump Strategy and Execution
2024-11-27 11:24:41,344 - INFO -  Tokens used so far: Prompt Tokens: 325, Completion Tokens: 16
2024-11-27 11:24:41,344 - INFO - HDBSCAN Cluster ID 1 (UMAP 2D): Jump Strategy and Execution
2024-11-27 11:24:41,344 - INFO - Found 8 Topics for hdbscan_UMAP_2D ID: 0
2024-11-27 11:24:41,780 - INFO - Generated cluster name: Speed and Turning Control Techniques
2024-11-27 11:24:41,780 - INFO -  Tokens used so far: Prompt Tokens: 406, Completion Tokens: 21
2024-11-27 11:24:41,780 - INFO - HDBSCAN Cluster ID 0 (UMAP 2D): Speed and Turning Control Techniques
2024-11-27 11:24:41,786 - INFO - Found 8 Topics for hdbscan_UMAP_2D ID: 2
2024-11-27 11:24:42,266 - INFO - Generated cluster name: Effective Course Des