In [1]:
# General modules
import os
import openai
from dotenv import load_dotenv

# Language models
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

chat_model_name = 'gpt-4o-mini'
embed_model_name = "all-MiniLM-L6-v2"
# embedding_model_name = 'sentence-transformers/all-mpnet-base-v2'


# Paths
root_dir = r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis'
path_input = os.path.join(root_dir, "Data", "2024 Trimester 1.xlsx")

path_db_prepared = os.path.join(root_dir, "Data", "db_prepared.json")
path_db_analysed = os.path.join(root_dir, "Data", "db_analysed.json")
path_db_embedded = os.path.join(root_dir, "Data", "db_embedded.json")
path_db_clustered = os.path.join(root_dir, "Data", "db_clustered.json")
path_db_final = os.path.join(root_dir, "Data", "db_final.json")

In [20]:
# my imports
from helper.utils import *
from helper.data_analysis import *
from helper.prompt_templates import *
from helper.data_embedding import *
from helper.cluster_analysis import *
from helper.name_clusters import *


columns_of_interest = [
    "24_[OPTIONAL] Is there anything you'd like to share about the game (good, bad, frustrating, improvement, wishlist, etc.)?"
]

In [3]:
data = load_excel_to_data(path_input)

2024-11-20 14:00:33,079 - INFO - Loading Excel file: C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\2024 Trimester 1.xlsx
2024-11-20 14:00:43,042 - INFO - Removing the first entry of the dataset.
2024-11-20 14:00:43,043 - INFO - Excel data successfully loaded and converted to dictionary.


In [4]:
data_cleaned = clean_json_data(data)

2024-11-20 14:00:46,882 - INFO - Cleaned 0 entries from the dataset.


### Filter and enrich the data
#### - This function checks if the cols of interest contain strings that are longer than 3 words
#### - The removed entries in the logger are all the rows where this condition did NOT hold
#### - All entries where it holds get a new JSON key (column) called "player_response"

In [5]:
# This function checks if the cols of interest contain strings that are longer than 3 words
# The removed entries in the logger are all the rows where this condition did NOT hold
# All entries where it holds get a new JSON key (column) called "player_response"
filtered_data = filter_and_enrich_data(data_cleaned, columns_of_interest)

2024-11-20 14:00:48,734 - INFO - Total entries removed: 4397


In [6]:
len(filtered_data)

2375

### Optional: generate a sample size

In [7]:
sample_size = 100
seed = 42
sample_data = get_random_sample(filtered_data, sample_size, seed=seed)

In [8]:
# save the sample or the cleaned data as JSON
save_to_json(sample_data, path_db_prepared)
# save_to_json(filtered_data, path_db_prepared)

2024-11-20 14:00:54,108 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_prepared.json


# Translation


In [9]:
# Define col of interest and the col where the ID is specified
col_of_interest = ["player_response"]
id_col = "Unnamed: 0"

In [10]:
data = read_json(path_db_prepared)

In [11]:
# detect the language of the player response
language = detect_player_language(data, id_col, col_of_interest)

In [12]:
language[0]

{'Unnamed: 0': 1520,
 'Respondent ID': 114587000000,
 'Collector ID': 431039728,
 'Start Date': '2024-04-26 21:01:16',
 'End Date': '2024-04-26 21:09:56',
 'IP Address': '172.97.49.205',
 'Email Address': '',
 'First Name': '',
 'Last Name': '',
 'Custom Data 1': '',
 '1_Rate_Overall Rival Stars Horse Racing': '5 (Strongly Like)',
 '1_Rate_Breeding': '1 (Strongly Dislike)',
 '1_Rate_Training my horses': '1 (Strongly Dislike)',
 '1_Rate_Racing - Story': '3 (Neutral)',
 '1_Rate_Steeplechase': '3 (Neutral)',
 '1_Rate_Cross Country': '3 (Neutral)',
 '1_Rate_Free Roam': '3 (Neutral)',
 '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': 2,
 '1_Rate_Completing Story Goals': 4,
 '1_Rate_Foal Caring': '1 (Strongly Dislike)',
 '1_Rate_Pasture': 2,
 '1_Rate_Foaling season event': '5 (Strongly Like)',
 '1_Rate_Steeplechase Stars': '3 (Neutral)',
 '1_Rate_Arabian Days': '1 (Strongly Dislike)',
 '1_Rate_Empowering my horse with skills': 4,
 '1_Rate_Customizing my horse with tack': 4,

In [13]:
configure_api(client, chat_model_name)

In [14]:
translated_data = translate_data(language, id_col, prompt_template_translation, api_settings, columns_of_interest)

2024-11-20 14:01:05,110 - INFO - Translating entry ID 3444 (Language: german)


In [15]:
# Check out the translations
matching_entry = [entry for entry in translated_data if entry.get(id_col, "unknown") == 3444]
matching_entry


[{'Unnamed: 0': 3444,
  'Respondent ID': 114585000000,
  'Collector ID': 431039728,
  'Start Date': '2024-04-25 04:46:57',
  'End Date': '2024-04-25 04:58:36',
  'IP Address': '145.53.23.6',
  'Email Address': '',
  'First Name': '',
  'Last Name': '',
  'Custom Data 1': '',
  '1_Rate_Overall Rival Stars Horse Racing': '5 (Strongly Like)',
  '1_Rate_Breeding': '5 (Strongly Like)',
  '1_Rate_Training my horses': 2,
  '1_Rate_Racing - Story': '3 (Neutral)',
  '1_Rate_Steeplechase': 4,
  '1_Rate_Cross Country': '5 (Strongly Like)',
  '1_Rate_Free Roam': '3 (Neutral)',
  '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': '5 (Strongly Like)',
  '1_Rate_Completing Story Goals': '3 (Neutral)',
  '1_Rate_Foal Caring': '3 (Neutral)',
  '1_Rate_Pasture': 2,
  '1_Rate_Foaling season event': '3 (Neutral)',
  '1_Rate_Steeplechase Stars': '3 (Neutral)',
  '1_Rate_Arabian Days': 4,
  '1_Rate_Empowering my horse with skills': 4,
  '1_Rate_Customizing my horse with tack': '3 (Neutral)',

# Topic Extraction and Sentiment Analysis

In [16]:
# Parameters
id_column = "Unnamed: 0"                # Column name for entry IDs
columns_of_interest = ["player_response"]     # Which cols should be analyzed?
batch_size = 5                          # Fail-safe batching. The higher the number, the less often the progress is saved.

prepared_data = read_json(path_db_prepared)

# Run analysis
analyse_data(
    translated_data=prepared_data,
    id_column=id_column,
    output_path=path_db_analysed,
    prompt_template_topic=prompt_template_topic,
    prompt_template_sentiment=prompt_template_sentiment,
    api_settings=api_settings,
    columns_of_interest=columns_of_interest,
    batch_size=batch_size
)

2024-11-20 14:01:25,286 - INFO - No existing progress found. Starting fresh.
2024-11-20 14:01:25,286 - INFO - Tokens used so far: Prompt Tokens: 0, Completion Tokens: 0
2024-11-20 14:01:25,286 - INFO - Extracting topics for entry ID 1520
2024-11-20 14:01:27,193 - INFO - Analyzing sentiment for topic 'Hidden Coats' (Entry ID 1520)
2024-11-20 14:01:27,607 - INFO - Analyzing sentiment for topic 'Breeding Difficulty' (Entry ID 1520)
2024-11-20 14:01:28,169 - INFO - Tokens used so far: Prompt Tokens: 962, Completion Tokens: 99
2024-11-20 14:01:28,169 - INFO - Extracting topics for entry ID 320
2024-11-20 14:01:29,637 - INFO - Analyzing sentiment for topic 'Game Enjoyment' (Entry ID 320)
2024-11-20 14:01:30,633 - INFO - Analyzing sentiment for topic 'Wishlist' (Entry ID 320)
2024-11-20 14:01:31,056 - INFO - Analyzing sentiment for topic 'Mobile Version' (Entry ID 320)
2024-11-20 14:01:31,560 - INFO - Analyzing sentiment for topic 'Desktop Version' (Entry ID 320)
2024-11-20 14:01:32,104 - INF

# Embedding

In [18]:
batch_size = 10
b_override = False  # Change to True if embeddings should be overwritten
embed_key = "topic"  # Change to "sentence" if you want to embed sentences

# Load the JSON data
data = read_json(path_db_analysed)

# Initialize the embedding model once
embed_model = initialize_embedding_model(model_name=embed_model_name)

# Process data in batches
for batch_start in range(0, len(data), batch_size):
    batch_end = min(batch_start + batch_size, len(data))
    batch = data[batch_start:batch_end]
    logger.info(f"Processing batch {batch_start // batch_size + 1} ({batch_start} to {batch_end})")
    batch = process_batch(batch, embed_model, b_override, embed_key=embed_key)
    data[batch_start:batch_end] = batch

# Convert the data to table format
df_table = convert_to_table(data)

# Save the final JSON table
save_to_json(df_table, path_db_embedded)
logger.info("Embedding and conversion to table format completed.")

2024-11-20 14:08:20,513 - INFO - Loading embedding model: all-MiniLM-L6-v2
  embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=model_name))
2024-11-20 14:08:23,327 - INFO - PyTorch version 2.4.0+cu124 available.
2024-11-20 14:08:23,327 - INFO - Polars version 1.12.0 available.
2024-11-20 14:08:23,327 - INFO - Duckdb version 1.1.2 available.
2024-11-20 14:08:24,243 - INFO - Use pytorch device_name: cuda
2024-11-20 14:08:24,243 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-11-20 14:08:27,323 - INFO - Processing batch 1 (0 to 10)
  attn_output = torch.nn.functional.scaled_dot_product_attention(
2024-11-20 14:08:33,721 - INFO - Processing batch 2 (10 to 20)
2024-11-20 14:08:38,964 - INFO - Processing batch 3 (20 to 30)
2024-11-20 14:08:44,772 - INFO - Processing batch 4 (30 to 40)
2024-11-20 14:08:48,776 - INFO - Processing batch 5 (40 to 50)
2024-11-20 14:08:53,617 - INFO - Processing batch 6 (50 to 60)
2024-11-20 14:08:57,635 - INFO - Processing batc

# Cluster Analysis

In [19]:

# Adjustable parameters
dimensionality_methods = ['UMAP', 'PCA', 'tSNE']  # Dimensionality reduction methods
kmeans_clusters = [15, 20, 25, 30]  # Number of clusters for KMeans
kmeans_seed = 42  # Seed for reproducibility
include_2d = True  # Whether to include 2D results
include_3d = True  # Whether to include 3D results
hdbscan_params = {"min_cluster_size": 8, "min_samples": 5, "cluster_selection_epsilon": 0.2}  # HDBSCAN params

# t-SNE specific parameter
perplexity = 15  # Set to a default or user-defined value

# Load data
df_total = load_embedded_data(path_db_embedded)
mat = np.array(df_total['embedding'].tolist())

# Apply dimensionality reduction and clustering
apply_clustering(
    df_total,
    mat,
    dimensionality_methods,
    kmeans_clusters,
    path_db_clustered,
    hdbscan_params=hdbscan_params,
    kmeans_seed=kmeans_seed,
    include_2d=include_2d,
    include_3d=include_3d
)


2024-11-20 14:13:51,901 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_embedded.json
2024-11-20 14:13:52,010 - INFO - Loaded 236 valid entries with embeddings.
2024-11-20 14:13:52,010 - INFO - Applying UMAP with 2 components.
  warn(
2024-11-20 14:13:58,030 - INFO - Applying HDBSCAN on UMAP 2D with params: {'min_cluster_size': 8, 'min_samples': 5, 'cluster_selection_epsilon': 0.2}
2024-11-20 14:13:58,030 - INFO - Applying KMeans with 15 clusters on UMAP 2D.
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

2024-11-20 14:13:58,353 - INFO - Applying KMeans with 20 clusters on UMAP 2D.
20

# Cluster Naming

In [22]:
dimensionality_methods = ["UMAP", "PCA", "tSNE"]
clustering_algorithms = ["hdbscan", "kmeans"]
kmeans_clusters = [15, 20, 25, 30]  # Number of clusters for KMeans
max_centers = 8  # Maximum number of topics for naming

# Load data
df_total = load_data_into_df(path_db_clustered)

# Process clusters and generate names
df_total = process_clusters(df_total, dimensionality_methods, clustering_algorithms, kmeans_clusters, max_centers, chat_model_name)

# Save results
save_data(df_total, path_db_final)


2024-11-20 14:17:35,630 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_clustered.json
2024-11-20 14:17:35,755 - INFO - Found 8 representative topics for cluster ID 7 in column hdbscan_UMAP_2D.
2024-11-20 14:17:35,755 - INFO - Generating cluster name using OpenAI API.
2024-11-20 14:17:36,586 - INFO - Generated cluster name: Character Appearance and Customization
2024-11-20 14:17:36,586 - INFO - Found 8 representative topics for cluster ID 0 in column hdbscan_UMAP_2D.
2024-11-20 14:17:36,586 - INFO - Generating cluster name using OpenAI API.
2024-11-20 14:17:37,078 - INFO - Generated cluster name: Breeding Concepts and Processes
2024-11-20 14:17:37,082 - INFO - Found 8 representative topics for cluster ID 4 in column hdbscan_UMAP_2D.
2024-11-20 14:17:37,083 - INFO - Generating cluster name using OpenAI API.
2024-11-20 14:17:38,483 - INFO - Generated cluster name: Game Enjoyment and Appreciation
2024-11-20 14:17:38,483 - INFO - Found 8 repr