In [1]:
# General modules
import os
import openai
from dotenv import load_dotenv

# Language models
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

chat_model_name = 'gpt-4o-mini'
embed_model_name = "all-MiniLM-L6-v2"

# Paths
root_dir = r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps'
steam_title = 'DreadDawn'

path_db_prepared = os.path.join(root_dir, steam_title, "db_prepared.json")
path_db_translated = os.path.join(root_dir, steam_title, "db_translated.json")
path_db_analysed = os.path.join(root_dir, steam_title, "db_analysed.json")
path_db_embedded = os.path.join(root_dir, steam_title, "db_embedded.json")
path_db_clustered = os.path.join(root_dir, steam_title, "db_clustered.json")
path_db_final = os.path.join(root_dir, steam_title, "db_final.json")

In [2]:
# my imports
from helper.utils import *
from helper.data_analysis import *
from helper.prompt_templates import *
from helper.embedding import *
from helper.cluster_analysis import *
from helper.cluster_naming import *
from helper.steam_scraper import *

configure_api(client, chat_model_name)

# Scrape Steam reviews

In [12]:
# https: // store.steampowered.com / app / 455690 / Pixel_Puzzles_Junior_Jigsaw /
# https://store.steampowered.com/app/2093920/Dread_Dawn/
appid = '2093920'
n_reviews = 2000     # Number of reviews to scrape

params = {
    'json': 1,
    'filter': 'all',
    'language': 'all',
    'day_range': 9223372036854775807,              # Dont ask, just dont touch this number
    'review_type': 'all',
    'purchase_type': 'all'
}


reviews = get_n_reviews(appid, params, n_reviews)
print(f"Total reviews: {len(reviews)}")

2024-11-28 15:39:22,106 - INFO - Retrieved 100 reviews in API call. Total so far: 100
2024-11-28 15:39:22,730 - INFO - Retrieved 100 reviews in API call. Total so far: 200
2024-11-28 15:39:23,383 - INFO - Retrieved 100 reviews in API call. Total so far: 300
2024-11-28 15:39:24,051 - INFO - Retrieved 100 reviews in API call. Total so far: 400
2024-11-28 15:39:24,733 - INFO - Retrieved 100 reviews in API call. Total so far: 500
2024-11-28 15:39:25,610 - INFO - Retrieved 100 reviews in API call. Total so far: 600
2024-11-28 15:39:26,479 - INFO - Retrieved 100 reviews in API call. Total so far: 700
2024-11-28 15:39:27,033 - INFO - Retrieved 100 reviews in API call. Total so far: 800
2024-11-28 15:39:27,549 - INFO - Retrieved 100 reviews in API call. Total so far: 900
2024-11-28 15:39:28,149 - INFO - Retrieved 100 reviews in API call. Total so far: 1000
2024-11-28 15:39:28,729 - INFO - Retrieved 100 reviews in API call. Total so far: 1100
2024-11-28 15:39:29,232 - INFO - Retrieved 100 revie

Total reviews: 1556


In [13]:
# Save reviews?
save_to_json(reviews, path_db_prepared)

2024-11-28 15:40:50,057 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\PixelPuzzles\db_prepared.json


In [15]:
# Generate sample and save
sample_size = 700
seed = 42
sample_data = get_random_sample(reviews, sample_size, seed=seed)
save_to_json(sample_data, path_db_prepared)

2024-11-28 15:41:45,226 - INFO - Generating a random sample of size 700 with seed 42.
2024-11-28 15:41:45,258 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\DreadDawn\db_prepared.json


# Translate reviews

#### This transformation is not pretty but it is used for now to keep using the same helpers as before

In [16]:
# chang key from 'language' to 'player_language'
data = load_json(path_db_prepared)
def rename_key_in_json(obj, old_key, new_key):
    if isinstance(obj, dict):
        return {new_key if k == old_key else k: rename_key_in_json(v, old_key, new_key) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [rename_key_in_json(i, old_key, new_key) for i in obj]
    return obj

# Rename 'language' to 'player_language'
updated_data = rename_key_in_json(data, 'language', 'player_language')
updated_data = rename_key_in_json(updated_data, 'review', 'player_response')


2024-11-28 15:42:16,985 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\DreadDawn\db_prepared.json


In [10]:
updated_data[0]

{'recommendationid': '144967552',
 'author': {'steamid': '76561198262833737',
  'num_games_owned': 3025,
  'num_reviews': 218,
  'playtime_forever': 438,
  'playtime_last_two_weeks': 0,
  'playtime_at_review': 438,
  'last_played': 1692944715},
 'player_language': 'brazilian',
 'player_response': "Jogo simples de quebra cabeça, são 50 níveis, perfeitos para quem curte o gênero, as imagens são fofinhas e sem coisas '' a mais '' então as crianças podem jogar e se divertir também.\n\nPara quem curte conquistas (eu), são 50 conquistas, todas relacionadas a completar cada quebra cabeça pelo menos 1 vez, podendo ser completado em qualquer dificuldade, incluindo 9 peças, se tornando extremamente fácil e rápido. \n\nRecomendo para quem curte o gênero ou queira conquistas sem maiores dificuldades.",
 'timestamp_created': 1692945306,
 'timestamp_updated': 1692945306,
 'voted_up': True,
 'votes_up': 11,
 'votes_funny': 0,
 'weighted_vote_score': '0.637903869152069092',
 'comment_count': 0,
 'stea

In [17]:
id_col = 'recommendationid'
columns_of_interest = ['player_response']
translated_data = translate_data(updated_data, id_col, prompt_template_translation, api_settings, columns_of_interest)

2024-11-28 15:42:28,726 - INFO - Translating entry ID 176460870 (Language: schinese)
2024-11-28 15:42:29,497 - INFO - Translating entry ID 176135921 (Language: schinese)
2024-11-28 15:42:30,623 - INFO - Translating entry ID 176343392 (Language: schinese)
2024-11-28 15:42:34,148 - INFO - Translating entry ID 176555290 (Language: russian)
2024-11-28 15:42:34,607 - INFO - Translating entry ID 176144715 (Language: schinese)
2024-11-28 15:42:36,635 - INFO - Translating entry ID 176816023 (Language: schinese)
2024-11-28 15:42:38,494 - INFO - Translating entry ID 177101970 (Language: schinese)
2024-11-28 15:42:38,924 - INFO - Translating entry ID 176574071 (Language: french)
2024-11-28 15:42:40,937 - INFO - Translating entry ID 176147064 (Language: schinese)
2024-11-28 15:42:41,422 - INFO - Translating entry ID 176359361 (Language: schinese)
2024-11-28 15:42:42,053 - INFO - Translating entry ID 176222165 (Language: schinese)
2024-11-28 15:42:43,028 - INFO - Translating entry ID 176144386 (Lan

In [18]:
save_to_json(translated_data, path_db_translated)

2024-11-28 16:09:05,068 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\DreadDawn\db_translated.json


# Topic extraction and sentiment analysis

In [3]:
# Parameters
id_column = 'recommendationid'                # Column name for entry IDs
columns_of_interest = ["player_response"]     # Which cols should be analyzed?
batch_size = 10                               # Fail-safe batching. The higher the number, the less often the progress is saved.

prepared_data = read_json(path_db_translated)

# Run analysis
analyse_data(
    translated_data=prepared_data,
    id_column=id_column,
    output_path=path_db_analysed,
    prompt_template_topic=prompt_template_topic_steam,
    prompt_template_sentiment=prompt_template_sentiment,
    api_settings=api_settings,
    columns_of_interest=columns_of_interest,
    batch_size=batch_size
)

2024-11-29 10:31:57,149 - INFO - Loading existing progress from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\DreadDawn\db_analysed.json
2024-11-29 10:31:57,149 - INFO - Skipping already processed entry ID 176460870
2024-11-29 10:31:57,149 - INFO - Skipping already processed entry ID 176135921
2024-11-29 10:31:57,149 - INFO - Skipping already processed entry ID 176343392
2024-11-29 10:31:57,149 - INFO - Skipping already processed entry ID 176555290
2024-11-29 10:31:57,149 - INFO - Skipping already processed entry ID 176128942
2024-11-29 10:31:57,149 - INFO - Skipping already processed entry ID 176144715
2024-11-29 10:31:57,149 - INFO - Skipping already processed entry ID 176273745
2024-11-29 10:31:57,149 - INFO - Skipping already processed entry ID 176816023
2024-11-29 10:31:57,149 - INFO - Skipping already processed entry ID 177101970
2024-11-29 10:31:57,149 - INFO - Tokens used so far: Prompt Tokens: 0, Completion Tokens: 0
2024-11-29 10:31:57,149 - INFO

# Embed reviews

In [4]:
batch_size = 10
b_override = False  # Change to True if embeddings should be overwritten
embed_key = "topic"  # topic or "sentence"

# embed_model_name = 'sentence-transformers/all-mpnet-base-v2'
# embed_model_name = 'dunzhang/stella_en_1.5B_v5'

# Load the JSON data
data = read_json(path_db_analysed)

# Initialize the embedding model once
embed_model = initialize_embedding_model(model_name=embed_model_name)

# Initialize an empty list for all processed results
processed_results = []

# Process data in batches
for batch_start in range(0, len(data), batch_size):
    batch_end = min(batch_start + batch_size, len(data))
    batch = data[batch_start:batch_end]
    logger.info(f"Processing batch {batch_start // batch_size + 1} ({batch_start} to {batch_end})")
    processed_batch = process_batch(batch, embed_model, b_override, embed_key=embed_key)
    processed_results.extend(processed_batch)  # Collect processed batch results

# Convert all processed results to a DataFrame at once
df_table = json_to_table(processed_results)

# Save the final JSON table
save_df_as_json(df_table, path_db_embedded)
logger.info("Embedding and conversion to table format completed.")


2024-11-29 11:23:11,426 - INFO - Loading embedding model: all-MiniLM-L6-v2
  embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=model_name))
2024-11-29 11:23:56,355 - INFO - PyTorch version 2.4.0+cu124 available.
2024-11-29 11:23:56,370 - INFO - Polars version 1.12.0 available.
2024-11-29 11:23:56,370 - INFO - Duckdb version 1.1.2 available.
2024-11-29 11:23:57,902 - INFO - Use pytorch device_name: cuda
2024-11-29 11:23:57,902 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-11-29 11:24:01,464 - INFO - Processing batch 1 (0 to 10)
  attn_output = torch.nn.functional.scaled_dot_product_attention(
2024-11-29 11:24:04,014 - INFO - Processing batch 2 (10 to 20)
2024-11-29 11:24:06,195 - INFO - Processing batch 3 (20 to 30)
2024-11-29 11:24:08,276 - INFO - Processing batch 4 (30 to 40)
2024-11-29 11:24:10,336 - INFO - Processing batch 5 (40 to 50)
2024-11-29 11:24:12,871 - INFO - Processing batch 6 (50 to 60)
2024-11-29 11:24:15,119 - INFO - Processing batc

# Cluster Analysis


In [5]:
# Adjustable parameters
dimensionality_methods = ['UMAP', 'tSNE']
hdbscan_params = {"min_cluster_size": 40, "min_samples": 20, "cluster_selection_epsilon": 0.2}

# Load data
df_total = load_embedded_data(path_db_embedded)
mat = np.array(df_total['embedding'].tolist())

# Apply HDBSCAN
df_total = apply_hdbscan(
    df_total,
    mat,
    dimensionality_methods,
    hdbscan_params=hdbscan_params,
    include_2d=True,
    include_3d=True
)

# Save results
save_df_as_json(df_total, path_db_clustered)
logger.info(f"Results saved to {path_db_clustered}")

# how many unique cluster ids are in the data?
length = len(df_total['hdbscan_UMAP_2D'].unique())
print(f'Number of unique clusters: {length}')

2024-11-29 11:27:19,207 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\DreadDawn\db_embedded.json
2024-11-29 11:27:20,259 - INFO - Loaded 2422 valid entries with embeddings.
2024-11-29 11:27:20,298 - INFO - Applying HDBSCAN in the original high-dimensional space with params: {'min_cluster_size': 40, 'min_samples': 20, 'cluster_selection_epsilon': 0.2}
2024-11-29 11:27:23,561 - INFO - Applying UMAP for 2D visualization.
2024-11-29 11:27:23,561 - INFO - Applying UMAP with 2 components.
  warn(
2024-11-29 11:27:38,453 - INFO - Applying UMAP for 3D visualization.
2024-11-29 11:27:38,454 - INFO - Applying UMAP with 3 components.
  warn(
2024-11-29 11:27:47,538 - INFO - Applying tSNE for 2D visualization.
2024-11-29 11:27:47,539 - INFO - Applying tSNE with 2 components.
2024-11-29 11:27:47,539 - INFO - Perplexity not provided, setting to 30 based on sample size.
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the sam

Number of unique clusters: 10


In [6]:
print(df_total['hdbscan_UMAP_2D'].unique())

[-1  6  1  7  8  3  5  0  2  4]


In [7]:
# Apply KMeans (if needed)
dimensionality_methods = ['UMAP', 'tSNE']
kmeans_clusters = [10, 20, 50]

df_total = load_embedded_data(path_db_clustered)
df_total = apply_kmeans(
    df_total,
    mat,
    dimensionality_methods,
    kmeans_clusters,
    include_2d=True,
    include_3d=True
)

# Save results
save_df_as_json(df_total, path_db_clustered)
logger.info(f"Results saved to {path_db_clustered}")

2024-11-29 11:29:03,277 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\DreadDawn\db_clustered.json
2024-11-29 11:29:04,393 - INFO - Loaded 2422 valid entries with embeddings.
2024-11-29 11:29:04,405 - INFO - Applying KMeans with 10 clusters in high-dimensional space.
2024-11-29 11:29:04,477 - INFO - Applying UMAP in 2D.
2024-11-29 11:29:04,477 - INFO - Applying UMAP with 2 components.
  warn(
2024-11-29 11:29:13,684 - INFO - Applying UMAP in 3D.
2024-11-29 11:29:13,686 - INFO - Applying UMAP with 3 components.
  warn(
2024-11-29 11:29:23,539 - INFO - Applying tSNE in 2D.
2024-11-29 11:29:23,540 - INFO - Applying tSNE with 2 components.
2024-11-29 11:29:23,540 - INFO - Perplexity not provided, setting to 30 based on sample size.
2024-11-29 11:29:31,895 - INFO - Applying tSNE in 3D.
2024-11-29 11:29:31,896 - INFO - Applying tSNE with 3 components.
2024-11-29 11:29:31,897 - INFO - Perplexity not provided, setting to 30 based on sampl

# Cluster naming

In [8]:
# Parameters
dimensionality_methods = ["UMAP", "tSNE"]
clustering_algorithms = ["hdbscan", 'kmeans']  # No KMeans here
max_centers = 12

#kmeans_clusters = [15, 20, 25, 50]  # Number of clusters for KMeans
# kmeans_clusters = [5, 8, 12, 15]
# Load data
df_total = load_json_into_df(path_db_clustered)

# Process clusters and generate names
df_total = process_clusters(df_total, dimensionality_methods, clustering_algorithms, max_centers, api_settings, kmeans_clusters) # insert kmeans_clusters in the function when needed


# Save results
save_data_for_streamlit(df_total, path_db_final)

2024-11-29 11:31:31,287 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\Steamapps\DreadDawn\db_clustered.json
2024-11-29 11:31:31,990 - INFO - Found 12 Topics for hdbscan_UMAP_2D ID: 6
2024-11-29 11:31:32,543 - INFO - Generated cluster name: Gameplay Experience and Mechanics
2024-11-29 11:31:32,559 - INFO -  Tokens used so far: Prompt Tokens: 81, Completion Tokens: 4
2024-11-29 11:31:32,559 - INFO - HDBSCAN Cluster ID 6 (UMAP 2D): Gameplay Experience and Mechanics
2024-11-29 11:31:32,559 - INFO - Found 12 Topics for hdbscan_UMAP_2D ID: 1
2024-11-29 11:31:33,020 - INFO - Generated cluster name: Frequent Updates Overview
2024-11-29 11:31:33,020 - INFO -  Tokens used so far: Prompt Tokens: 162, Completion Tokens: 8
2024-11-29 11:31:33,020 - INFO - HDBSCAN Cluster ID 1 (UMAP 2D): Frequent Updates Overview
2024-11-29 11:31:33,020 - INFO - Found 12 Topics for hdbscan_UMAP_2D ID: 7
2024-11-29 11:31:33,500 - INFO - Generated cluster name: Zombie Apo