In [1]:
# General modules
import os
import openai
from dotenv import load_dotenv

# Language models
# Model identifiers used by the rest of the notebook.
embed_model_name = "all-MiniLM-L6-v2"
# Alternative embedding model that was considered: 'sentence-transformers/all-mpnet-base-v2'
chat_model_name = 'gpt-4o-mini'

# Load variables from a local .env file (if any), then hand the OpenAI key to the
# openai package and create the client object used for the chat requests.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()


# Paths
# Directory that holds the survey export and every intermediate JSON file.
# Set CLUSTER_ANALYSIS_DATA_DIR (in the environment or in the .env file) to run the
# notebook on another machine; the original author's local folder stays the default,
# so existing setups behave exactly as before.
root_dir = os.getenv(
    "CLUSTER_ANALYSIS_DATA_DIR",
    r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\DRS',
)

# Raw survey export. The file name is kept verbatim (including its spelling) because
# it has to match the file on disk.
path_input = os.path.join(root_dir, "Into the Dead Our Darkest Days_1511resposnes.xlsx")

# One JSON file per pipeline stage, in the order the notebook writes them.
path_db_prepared = os.path.join(root_dir, "db_prepared.json")    # sampled entries (+ IDs)
path_db_analysed = os.path.join(root_dir, "db_analysed.json")    # output of analyse_data
path_db_embedded = os.path.join(root_dir, "db_embedded.json")    # embedded table
path_db_clustered = os.path.join(root_dir, "db_clustered.json")  # clustering results
path_db_final = os.path.join(root_dir, "db_final.json")          # named clusters for Streamlit

In [2]:
# my imports
from helper.utils import *
from helper.data_analysis import *
from helper.prompt_templates import *
from helper.embedding import *
from helper.cluster_analysis import *
from helper.cluster_naming import *


# The two free-text survey questions whose answers are analysed.
# The strings must match the column headers of the Excel export exactly.
columns_of_interest = [
    "Please tell us why you chose the rating above:", 
    "If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?"
]

# Hand the OpenAI client and the chat model name to the helper layer.
# NOTE(review): presumably this populates the `api_settings` object that later cells pass
# to the helpers — `api_settings` is never assigned in this notebook; confirm in helper.utils.
configure_api(client, chat_model_name)

In [3]:
# Load the raw survey export from the Excel file at path_input.
# NOTE: according to this cell's log output, the helper also removes the first entry
# of the dataset while loading.
data = load_excel_to_data(path_input)

2024-11-28 09:38:58,822 - INFO - Loading Excel file: C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\DRS\Into the Dead Our Darkest Days_1511resposnes.xlsx
2024-11-28 09:38:59,319 - INFO - Removing the first entry of the dataset.
2024-11-28 09:38:59,319 - INFO - Excel data successfully loaded and converted to dictionary.


In [4]:
# Run the cleaning helper on the loaded data; what counts as "cleaned" is defined in the
# helper itself, and the number of affected entries is reported in the log.
data_cleaned = clean_json_data(data)

2024-11-28 09:39:02,807 - INFO - Cleaned 0 entries from the dataset.


In [5]:
# filter_and_enrich_data checks whether the columns of interest contain answers that are
# longer than 3 words.
# The "entries removed" count in the log covers all rows where this condition did NOT hold.
# Every entry where it holds gets a new JSON key (column) called "player_response".
filtered_data = filter_and_enrich_data(data_cleaned, columns_of_interest)

2024-11-28 09:39:04,452 - INFO - Total entries removed: 220


In [6]:
# Number of entries that remain after filtering.
len(filtered_data)

1291

In [7]:
# Spot-check one filtered entry, including its added "player_response" key.
filtered_data[4]

{'Please rate your overall experience playing Into the Dead: Our Darkest Days': 8,
 'Please tell us why you chose the rating above:': 'simple but engaging gameplay. Easy to use inventory system. Tactical decision making. ',
 'If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?': 'distractions??? i did not play the whole demo yet so if there is distractions in game then Good Job :)',
 'Had you heard of Into the Dead before this demo?': 'Never heard of it',
 'What is your age group': '35-44',
 'What is your gender?': 'Man',
 'What are your favourite Steam games you have played in the last 3 months?': 'Vigor  Once Human',
 'player_response': 'simple but engaging gameplay. Easy to use inventory system. Tactical decision making. distractions??? i did not play the whole demo yet so if there is distractions in game then Good Job :)'}

In [8]:
# Draw a random subset of the filtered entries; the rest of the notebook works on this sample.
sample_size = 100  # number of entries to draw
seed = 42  # fixed seed handed to the sampler (presumably for a repeatable sample — confirm in helper)
sample_data = get_random_sample(filtered_data, sample_size, seed=seed)

In [9]:
# Persist the sample as the "prepared" dataset; the next stage reads it back from this file.
save_to_json(sample_data, path_db_prepared)

2024-11-28 09:39:23,180 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\DRS\db_prepared.json


# Translation

In [None]:
# The data has no ID column, so one is generated with the generate_ID function from helper.utils.

In [10]:
# Reload the prepared sample from disk.
data_prepared = read_json(path_db_prepared)

In [11]:
# A unique ID is generated in the new column / key "response_ID".
# NOTE(review): this reassigns data_prepared from itself and overwrites the prepared JSON
# file in place. Whether re-running the cell on data that already has IDs is harmless
# depends on generate_ID — confirm before re-executing.
data_prepared = generate_ID(data_prepared)
save_to_json(data_prepared, path_db_prepared)

2024-11-20 13:16:16,582 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_prepared.json


In [12]:
# Inspect the first prepared entry.
data_prepared[0]

{'Please rate your overall experience playing Into the Dead: Our Darkest Days': 10,
 'Please tell us why you chose the rating above:': 'Definitely a different approach to the "Into the Dead" series, I totally loved it. Makes me feel somewhat proud seeing how the series developed when I myself played the original Into the Dead on mobile.',
 'If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?': 'The durability of some weapons can be improved, and also adding a stealth kill even without a weapon can be useful because in my experience when scavenging without a weapon is really difficult considering I cannot do anything against the zombies and they take a lot of hits when attacking them unarmed.',
 'Had you heard of Into the Dead before this demo?': 'Played another Into the Dead game',
 'What is your age group': '18-24',
 'What is your gender?': 'Man',
 'What are your favourite Steam games you have played in the last 3 months?': "I

In [13]:
# Define the column of interest and the column where the ID is specified.
col_of_interest = ["player_response"]  # combined answer text added by filter_and_enrich_data
id_col = "response_ID"  # ID key added by generate_ID

In [14]:
# Detect the language of each player response (text taken from col_of_interest).
# NOTE(review): the logged run shows detection failing for entry #43; the translation log
# then lists that entry with language "error", so such entries may need a manual check.
language = detect_player_language(data_prepared, id_col, col_of_interest)

2024-11-20 13:16:35,113 - ERROR - Error detecting language for entry #43: 'NoneType' object has no attribute 'name'


In [15]:
# Same call, with the same arguments, as in the setup cell near the top of the notebook.
# NOTE(review): looks redundant in a top-to-bottom run unless client or chat_model_name
# were changed in between — confirm whether this cell is still needed.
configure_api(client, chat_model_name)

In [16]:
# Translate entries using the language detection result from the previous step.
# NOTE(review): detection was run on col_of_interest (["player_response"]), but this call
# passes columns_of_interest (the two raw survey questions) — confirm which columns
# translate_data is meant to translate.
# NOTE(review): prompt_template_translation and api_settings are not assigned in this
# notebook; presumably they come from the helper star-imports.
translated_data = translate_data(language, id_col, prompt_template_translation, api_settings, columns_of_interest)

2024-11-20 13:16:51,250 - INFO - Translating entry ID 30 (Language: french)
2024-11-20 13:16:55,148 - INFO - Translating entry ID 43 (Language: error)
2024-11-20 13:16:57,084 - INFO - Translating entry ID 81 (Language: spanish)
2024-11-20 13:16:57,804 - INFO - Translating entry ID 85 (Language: spanish)
2024-11-20 13:17:00,211 - INFO - Translating entry ID 94 (Language: french)


In [17]:
# Pull out the entry with ID 43 (the one whose language detection failed in the logged run)
# to see what it looks like after the translation step.
matching_entry = list(
    filter(lambda row: row.get(id_col, "unknown") == 43, translated_data)
)
matching_entry


[{'Please rate your overall experience playing Into the Dead: Our Darkest Days': 10,
  'Please tell us why you chose the rating above:': 'Атмосферно, хорошие звуки. Красивая картинка и приятный геймлпей боя. Очень хорошо ощущается каждый удар, особенно мне понравилась раздавливать головы зомби',
  'If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?': 'Можно добавить вариативность ударов в зависимости от персонажа. Допустим у какого-то из персонажей есть навыки боксера ( потому что по своей биографии он занимался боксом ) а какому-то персонажу дать навык карате, и этот персонаж отлично бьет ногой по голове ( опять же, потому что в его биографии есть занятие карате в былые годы ) Тем самым вы разнообразите боевую систему и у некоторых персонажей будут индивидуальные удары',
  'Had you heard of Into the Dead before this demo?': 'Heard of it',
  'What is your age group': '25-34',
  'What is your gender?': 'Man',
  'What are your f

# Topic Extraction and Sentiment Analysis

In [19]:
# Parameters
id_col = "response_ID"
# NOTE(review): col_of_interest is assigned here but never passed on — the call below uses
# columns_of_interest (the raw survey questions) instead. Confirm which one is intended.
col_of_interest = ["player_response"]
# Fail-safe batching. The higher the number, the less often the progress is saved.
batch_size = 5

# The analysis works on the output of the translation step.
prepared_data = translated_data

# Run analysis: topic extraction followed by per-topic sentiment, as shown in the log.
# Results are written to output_path; per the log, already processed entry IDs found
# in that file are skipped.
analyse_data(
    translated_data=prepared_data,
    columns_of_interest=columns_of_interest,
    id_column=id_col,
    prompt_template_topic=prompt_template_topic,
    prompt_template_sentiment=prompt_template_sentiment,
    api_settings=api_settings,
    batch_size=batch_size,
    output_path=path_db_analysed,
)

2024-11-20 13:18:02,334 - INFO - Loading existing progress from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_analysed.json
2024-11-20 13:18:02,334 - INFO - Skipping already processed entry ID 1
2024-11-20 13:18:02,334 - INFO - Skipping already processed entry ID 2
2024-11-20 13:18:02,334 - INFO - Skipping already processed entry ID 3
2024-11-20 13:18:02,334 - INFO - Tokens used so far: Prompt Tokens: 3551, Completion Tokens: 443
2024-11-20 13:18:02,334 - INFO - Extracting topics for entry ID 4
2024-11-20 13:18:09,091 - INFO - Analyzing sentiment for topic 'Controls' (Entry ID 4)
2024-11-20 13:18:09,516 - INFO - Analyzing sentiment for topic 'Gameplay' (Entry ID 4)
2024-11-20 13:18:09,965 - INFO - Analyzing sentiment for topic 'Zombies' (Entry ID 4)
2024-11-20 13:18:10,442 - INFO - Analyzing sentiment for topic 'World Design' (Entry ID 4)
2024-11-20 13:18:10,987 - INFO - Analyzing sentiment for topic 'Looting System' (Entry ID 4)
2024-11-20 13:18:11,364 - INFO - 

KeyboardInterrupt: 

# Embedding

In [25]:
# Embedding settings
embed_key = "topic"  # Change to "sentence" if you want to embed sentences
b_override = False  # Change to True if embeddings should be overwritten
batch_size = 2

# Load the analysed entries
data = read_json(path_db_analysed)

# Create the embedding model a single time and reuse it for every batch
embed_model = initialize_embedding_model(model_name=embed_model_name)

# Walk over the data in slices of batch_size and write each processed slice back in place
for batch_number, batch_start in enumerate(range(0, len(data), batch_size), start=1):
    batch_end = min(len(data), batch_start + batch_size)
    logger.info(f"Processing batch {batch_number} ({batch_start} to {batch_end})")
    batch = process_batch(
        data[batch_start:batch_end], embed_model, b_override, embed_key=embed_key
    )
    data[batch_start:batch_end] = batch

# Flatten the JSON structure into table form and store it for the clustering stage
df_table = json_to_table(data)
save_to_json(df_table, path_db_embedded)
logger.info("Embedding and conversion to table format completed.")

2024-11-20 13:51:24,382 - INFO - Loading embedding model: all-MiniLM-L6-v2
2024-11-20 13:51:24,382 - INFO - Use pytorch device_name: cuda
2024-11-20 13:51:24,382 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-11-20 13:51:26,860 - INFO - Processing batch 1 (0 to 2)
2024-11-20 13:51:28,435 - INFO - Processing batch 2 (2 to 4)
2024-11-20 13:51:31,517 - INFO - Converting JSON data to a table format.
2024-11-20 13:51:31,534 - INFO - Conversion to table format completed.
2024-11-20 13:51:31,534 - INFO - Saving data to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_embedded.json
2024-11-20 13:51:31,566 - INFO - Embedding and conversion to table format completed.


# Clustering

In [26]:
# Adjustable parameters
hdbscan_params = dict(min_cluster_size=20, min_samples=1, cluster_selection_epsilon=0.2)
dimensionality_methods = ['UMAP', 'tSNE']

# Load the embedded table and stack the per-row embeddings into one array
df_total = load_embedded_data(path_db_embedded)
mat = np.array(df_total['embedding'].tolist())

# Apply HDBSCAN for each dimensionality method, in both 2D and 3D.
# (The log shows that the helper runs KMeans as well.)
df_total = apply_hdbscan(
    df_total, mat, dimensionality_methods, path_db_clustered,
    hdbscan_params=hdbscan_params, include_2d=True, include_3d=True,
)

# Save results
# NOTE(review): path_db_clustered is also handed to apply_hdbscan above — check whether
# the helper already writes this file, which would make this save a duplicate.
save_df_as_json(df_total, path_db_clustered)
logger.info(f"Results saved to {path_db_clustered}")

# Number of distinct labels in the UMAP-2D HDBSCAN column
# (presumably this includes HDBSCAN's noise label, if any points were left unclustered).
len(df_total['hdbscan_UMAP_2D'].unique())

2024-11-20 13:53:15,963 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_embedded.json
2024-11-20 13:53:15,979 - INFO - Loaded 21 valid entries with embeddings.
2024-11-20 13:53:15,979 - INFO - Applying UMAP with 2 components.
  warn(
2024-11-20 13:53:21,993 - INFO - Applying HDBSCAN on UMAP 2D with params: {'min_cluster_size': 5, 'min_samples': 3, 'cluster_selection_epsilon': 0.2}
2024-11-20 13:53:22,009 - INFO - Applying KMeans with 5 clusters on UMAP 2D.
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

2024-11-20 13:53:22,387 - INFO - Applying KMeans with 10 clusters on UMAP 2D.
2024

# Cluster Naming

In [5]:
# Paths and parameters
max_centers = 8  # Maximum number of topics for naming
kmeans_clusters = [15, 20, 25, 50]  # Number of clusters for KMeans
clustering_algorithms = ["hdbscan", "kmeans"]
# NOTE(review): "PCA" is requested here, but the clustering cell only lists 'UMAP' and
# 'tSNE' — confirm that PCA-based cluster columns actually exist in the clustered data.
dimensionality_methods = ["UMAP", "PCA", "tSNE"]

# Load the clustering results
df_total = load_json_into_df(path_db_clustered)

# Generate a name for every cluster of every method/algorithm combination
# (the log shows one chat-model request per cluster, with token usage tracked).
df_total = process_clusters(
    df_total,
    dimensionality_methods,
    clustering_algorithms,
    kmeans_clusters,
    max_centers,
    api_settings,
)

# Store the named clusters in the final file, via the helper that saves for Streamlit
save_data_for_streamlit(df_total, path_db_final)

2024-11-22 08:27:47,788 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_clustered.json
2024-11-22 08:27:48,870 - INFO - Found 8 Topics for hdbscan_UMAP_2D ID: 18
2024-11-22 08:27:49,793 - INFO - Generated cluster name: Coat Color Variations and Patterns
2024-11-22 08:27:49,793 - INFO -  Tokens used so far: Prompt Tokens: 85, Completion Tokens: 7
2024-11-22 08:27:49,809 - INFO - Found 8 Topics for hdbscan_UMAP_2D ID: 17
2024-11-22 08:27:50,281 - INFO - Generated cluster name: Breeding Strategies and Practices
2024-11-22 08:27:50,281 - INFO -  Tokens used so far: Prompt Tokens: 166, Completion Tokens: 12
2024-11-22 08:27:50,297 - INFO - Found 8 Topics for hdbscan_UMAP_2D ID: 22


KeyboardInterrupt: 