In [3]:
# General modules
import os
import openai
from dotenv import load_dotenv
from nltk.chunk.named_entity import shape

# Language models
# Load variables from a local .env file, read the OpenAI key from the
# environment, register it with the SDK and create the client that is later
# passed to configure_api().
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

# Model identifiers: chat model for the LLM calls, embedding model for the
# embedding step.
chat_model_name = "gpt-4o-mini"
embed_model_name = 'all-MiniLM-L6-v2'

# Paths
# Every artefact of one data source lives in <root_dir>/<steam_title>/.
root_dir = r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC'
steam_title = 'Community'

# Directory shared by the raw input and all intermediate JSON files.
data_dir = os.path.join(root_dir, steam_title)

# Raw input, then one JSON file per pipeline stage.
path_input = os.path.join(data_dir, "Transcript_pinehaven_stream.txt")
path_db_prepared = os.path.join(data_dir, "db_prepared.json")
path_db_translated = os.path.join(data_dir, "db_translated.json")
path_db_analysed = os.path.join(data_dir, "db_analysed.json")
path_db_embedded = os.path.join(data_dir, "db_embedded.json")
path_db_clustered = os.path.join(data_dir, "db_clustered.json")
path_db_final = os.path.join(data_dir, "db_final.json")

# Transcript Preparation

In [3]:
# Load the raw transcript as a list of lines (line endings are kept).
with open(path_input, mode='r', encoding='utf-8') as transcript_file:
    transcript = transcript_file.readlines()

In [4]:
# Build text chunks: a new chunk starts every 40 lines and joins up to 100
# stripped lines (this also removes the trailing '\n') with single spaces.
# NOTE(review): the previous comment said "concatenate every 100 lines", but the
# step is 40, so consecutive chunks share lines — confirm the overlap is intended.

transcript_joined = [
    ' '.join(line.strip() for line in transcript[start:start + 100])
    for start in range(0, len(transcript), 40)
]


In [4]:
# Number of text chunks in transcript_joined.
len(transcript_joined)

137

In [12]:
# Character length of the chunk at index 5 (presumably a spot check of chunk size).
len(transcript_joined[5])

3759

# Analysis

In [None]:
from helper.utils import *
from helper.prompt_templates import *

configure_api(client, chat_model_name)

# Collects the entries extracted from every transcript chunk.
all_entries = []

# The loop variable is deliberately not named `transcript`: that name holds the
# list of raw transcript lines, and rebinding it to a single chunk string would
# make a later re-run of the chunking cell operate on characters instead of lines.
for i, transcript_chunk in enumerate(transcript_joined):
    logger.info(f"Processing text {i}")

    prompt_influencer = prompt_template_influencer.format(transcript=transcript_chunk)
    response = api_settings["client"].chat.completions.create(
        model=api_settings["model"],
        messages=[
            {"role": "system", "content": "You are an expert in extracting video game topics from Youtube Transcripts."},
            {"role": "user", "content": prompt_influencer},
        ],
        response_format={"type": "json_object"},
        max_tokens=4096
    )

    # Parse the reply; on failure, log which chunk was affected before
    # re-raising so the run still stops exactly as before.
    try:
        response_json = json.loads(response.choices[0].message.content)
    except (TypeError, json.JSONDecodeError) as e:
        logger.error(f"Could not parse JSON response for text {i}: {e}")
        raise

    # Dynamically handle varying keys at the root of the response:
    # every list-valued root key contributes its items to all_entries.
    if isinstance(response_json, dict):
        for key, value in response_json.items():
            if isinstance(value, list):  # Ensure the value is a list
                all_entries.extend(value)
            else:
                logger.warning(f"Unexpected format for key '{key}' in response {i}")
    else:
        logger.warning(f"Unexpected response structure for text {i}: {response_json}")

# save the entries
with open(path_db_prepared, "w") as output_file:
    json.dump(all_entries, output_file, indent=4)

In [21]:
# Generate unique IDs

from helper.utils import *

# Reload the prepared entries, pass them through generate_ID (a unique ID is
# generated in the new column / key "response_ID"), and write the result back
# to the same file, overwriting it.
data = read_json(path_db_prepared)
data_prepared = generate_ID(data)
save_to_json(data_prepared, path_db_prepared)

2025-01-16 10:39:18,182 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC\Community\db_prepared.json


# Sentiment Analysis

In [35]:
# Sentiment-annotated entries, in input order.
all_entries = []
data_prepared = read_json(path_db_prepared)

for i, entry in enumerate(data_prepared):
    logger.info(f"Process Sentiment for text {i}")

    # Capture the identifiers up front with .get(): the error handler below must
    # never raise a KeyError of its own (missing or already renamed keys), which
    # would hide the exception that actually caused the failure.
    topic_label = entry.get("Topic")
    entry_id = entry.get("response_ID")

    try:
        prompt_sentiment = prompt_template_sentiment.format(
            review=entry["Context"], topic=entry["Topic"]
        )
        response = api_settings["client"].chat.completions.create(
            model=api_settings["model"],
            messages=[
                {"role": "system", "content": "You are an expert for sentiment analysis."},
                {"role": "user", "content": prompt_sentiment},
            ],
            max_tokens=1024,
        )
        sentiment = response.choices[0].message.content.strip()

        # Rename keys to lower-case names; the capitalised originals are removed.
        # Resulting key order: remaining keys, topic, sentiment, category, sentence.
        entry["topic"] = entry.pop("Topic")
        entry["sentiment"] = sentiment
        entry["category"] = entry.pop("Category")
        entry["sentence"] = entry.pop("Context")

        all_entries.append(entry)
    except Exception as e:
        logger.error(f"Error analyzing sentiment for topic '{topic_label}' (Entry ID {entry_id}): {e}")
        raise



2025-01-16 13:27:17,662 - INFO - Process Sentiment for text 0
2025-01-16 13:27:18,325 - INFO - Process Sentiment for text 1
2025-01-16 13:27:18,749 - INFO - Process Sentiment for text 2
2025-01-16 13:27:19,328 - INFO - Process Sentiment for text 3
2025-01-16 13:27:19,868 - INFO - Process Sentiment for text 4
2025-01-16 13:27:20,366 - INFO - Process Sentiment for text 5
2025-01-16 13:27:20,877 - INFO - Process Sentiment for text 6
2025-01-16 13:27:21,378 - INFO - Process Sentiment for text 7
2025-01-16 13:27:21,836 - INFO - Process Sentiment for text 8
2025-01-16 13:27:24,972 - INFO - Process Sentiment for text 9
2025-01-16 13:27:25,394 - INFO - Process Sentiment for text 10
2025-01-16 13:27:25,886 - INFO - Process Sentiment for text 11
2025-01-16 13:27:26,514 - INFO - Process Sentiment for text 12
2025-01-16 13:27:27,212 - INFO - Process Sentiment for text 13
2025-01-16 13:27:27,720 - INFO - Process Sentiment for text 14
2025-01-16 13:27:28,154 - INFO - Process Sentiment for text 15
20

In [36]:
# Write the sentiment-annotated entries to disk as indented JSON.
with open(path_db_analysed, mode="w") as analysed_file:
    json.dump(all_entries, fp=analysed_file, indent=4)

# Embedding 

In [40]:
from helper.embedding import *

# Key of each entry whose text is embedded.
embed_key = "topic"  # "topic" or "sentence"

# Load the analysed entries and initialise the embedding model named in
# embed_model_name; process_embedding reads `embed_model` as a global.
data = read_json(path_db_analysed)
embed_model = initialize_embedding_model(embed_model_name)

def process_embedding(data, embed_key):
    """Attach an embedding to every entry of ``data``.

    For each entry, the value stored under ``embed_key`` is passed to
    ``embed_text`` together with the notebook-level ``embed_model``; the result
    is stored under the ``"embedding"`` key of that same entry.

    Args:
        data: List of dict entries; modified in place.
        embed_key: Key whose value is embedded.

    Returns:
        The same ``data`` object that was passed in.
    """
    for position, record in enumerate(data):
        # Progress log on every tenth entry.
        if position % 10 == 0:
            logger.info(f"Processing entry {position}")
        record["embedding"] = embed_text(record[embed_key], embed_model)
    return data

# Embed all entries (process_embedding returns the list it was given).
data_embedded = process_embedding(data, embed_key)

# Write the embedded entries to disk as indented JSON.
with open(path_db_embedded, mode="w") as embedded_file:
    json.dump(data_embedded, fp=embedded_file, indent=4)


2025-01-16 13:41:30,189 - INFO - Loading embedding model: all-MiniLM-L6-v2
2025-01-16 13:41:30,195 - INFO - Use pytorch device_name: cuda
2025-01-16 13:41:30,195 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-01-16 13:41:32,899 - INFO - Processing entry 0
2025-01-16 13:41:33,063 - INFO - Processing entry 10
2025-01-16 13:41:33,176 - INFO - Processing entry 20
2025-01-16 13:41:33,279 - INFO - Processing entry 30
2025-01-16 13:41:33,380 - INFO - Processing entry 40
2025-01-16 13:41:33,460 - INFO - Processing entry 50
2025-01-16 13:41:33,547 - INFO - Processing entry 60
2025-01-16 13:41:33,629 - INFO - Processing entry 70
2025-01-16 13:41:33,716 - INFO - Processing entry 80
2025-01-16 13:41:33,817 - INFO - Processing entry 90
2025-01-16 13:41:33,919 - INFO - Processing entry 100
2025-01-16 13:41:34,009 - INFO - Processing entry 110
2025-01-16 13:41:34,093 - INFO - Processing entry 120
2025-01-16 13:41:34,178 - INFO - Processing entry 130
2025-01-16 13:41:34,266 - INFO

# Clustering

In [41]:
from helper.cluster_analysis import *

# Tunable settings: dimensionality-reduction methods and HDBSCAN parameters.
dimensionality_methods = ["UMAP", "PCA", "tSNE"]
hdbscan_params = dict(min_cluster_size=7, min_samples=2, cluster_selection_epsilon=0.4)

# Read the embedded entries and collect their embeddings into one array.
df_total = load_embedded_data(path_db_embedded)
mat = np.array(df_total["embedding"].tolist())

# Run the clustering helper with both 2-D and 3-D output enabled.
df_total = apply_hdbscan(
    df_total, mat, dimensionality_methods,
    hdbscan_params=hdbscan_params, include_2d=True, include_3d=True,
)

# Store the clustered frame for the naming step.
save_df_as_json(df_total, path_db_clustered)

2025-01-16 13:42:13,926 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC\Community\db_embedded.json
2025-01-16 13:42:14,093 - INFO - Loaded 753 valid entries with embeddings.
2025-01-16 13:42:14,121 - INFO - Applying HDBSCAN with: {'min_cluster_size': 7, 'min_samples': 2, 'cluster_selection_epsilon': 0.4}
2025-01-16 13:42:14,527 - INFO - Found 24 clusters.
2025-01-16 13:42:14,538 - INFO - Applying UMAP with 2 components.
2025-01-16 13:42:15,232 - INFO - Applying UMAP with 3 components.
2025-01-16 13:42:15,854 - INFO - Applying PCA with 2 components.
2025-01-16 13:42:15,870 - INFO - Applying PCA with 3 components.
2025-01-16 13:42:15,884 - INFO - Applying tSNE with 2 components.
2025-01-16 13:42:15,886 - INFO - Perplexity not provided, setting to 30 based on sample size.
2025-01-16 13:42:17,911 - INFO - Applying tSNE with 3 components.
2025-01-16 13:42:17,911 - INFO - Perplexity not provided, setting to 30 based on sample size.
2025-01-16 

Number of unique clusters: 24


# Cluster Naming

In [42]:
from helper.cluster_naming import *

# Settings for the naming step.
dimensionality_methods = ["UMAP", "PCA", "tSNE"]
clustering_algorithms = ["hdbscan"]  # No KMeans here
max_centers = 10

#kmeans_clusters = [15, 20, 25, 50]  # Number of clusters for KMeans

# Read the clustered data back into a DataFrame.
df_total = load_json_into_df(path_db_clustered)

# Process clusters and generate names
# (insert kmeans_clusters in the function call when needed).
df_total = process_clusters(
    df_total, dimensionality_methods, clustering_algorithms, max_centers, api_settings
)

# Write the final dataset consumed by the Streamlit app.
save_data_for_streamlit(df_total, path_db_final)

2025-01-16 13:42:34,329 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC\Community\db_clustered.json
2025-01-16 13:42:34,452 - INFO - Found 10 Topics for hdbscan_cluster_id ID: 12
2025-01-16 13:42:35,410 - INFO - Generated cluster name: Equestrian Game Mechanics Discussion
2025-01-16 13:42:35,412 - INFO - Tokens used so far: Prompt Tokens: 322, Completion Tokens: 6
2025-01-16 13:42:35,417 - INFO - Found 10 Topics for hdbscan_cluster_id ID: 15
2025-01-16 13:42:35,855 - INFO - Generated cluster name: Coat Color Identification Challenges
2025-01-16 13:42:35,855 - INFO - Tokens used so far: Prompt Tokens: 652, Completion Tokens: 12
2025-01-16 13:42:35,855 - INFO - Found 9 Topics for hdbscan_cluster_id ID: 13
2025-01-16 13:42:36,594 - INFO - Generated cluster name: Horse Animation and Modeling Issues
2025-01-16 13:42:36,594 - INFO - Tokens used so far: Prompt Tokens: 899, Completion Tokens: 18
2025-01-16 13:42:36,602 - INFO - Found 10 Topics f

# HRC Steam reviews

In [5]:
# Paths
# Re-point all pipeline paths at the Steam-review data folder.
root_dir = r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC'
steam_title = 'Steam'

# Directory shared by all intermediate JSON files of this data source.
data_dir = os.path.join(root_dir, steam_title)

# NOTE(review): path_input still names the transcript file although this
# section pulls its input from Redshift — presumably a copy-paste leftover; confirm.
path_input = os.path.join(data_dir, "Transcript_pinehaven_stream.txt")
path_db_prepared = os.path.join(data_dir, "db_prepared.json")
path_db_translated = os.path.join(data_dir, "db_translated.json")
path_db_analysed = os.path.join(data_dir, "db_analysed.json")
path_db_embedded = os.path.join(data_dir, "db_embedded.json")
path_db_clustered = os.path.join(data_dir, "db_clustered.json")
path_db_final = os.path.join(data_dir, "db_final.json")

## Redshift query

In [31]:
# My imports
from helper.redshift_conector_standalone import *

# https://store.steampowered.com/app/1166860/Rival_Stars_Horse_Racing_Desktop_Edition/

# SQL Query Redshift: every row of steam_review for this app id.
sql_query = """
SELECT *
FROM steam_review
where app_id_name = '1166860_Rival_Stars_Horse_Racing_Desktop_Edition'
"""
logger.info(f"Query Redshift with: {sql_query}")

try:
    results_json, results_df = fetch_query_results(sql_query)
    # Log the shape of the returned DataFrame.
    logger.info("Successfully fetched query results, with shape: %s", results_df.shape)
except Exception as fetch_error:
    logger.error(f"Error fetching query results: {fetch_error}")
    raise

# Decode the JSON string returned by the query helper and store it on disk.
parsed_json = json.loads(results_json)
save_to_json(parsed_json, path_db_prepared)

2025-01-17 16:23:15,659 - INFO - Query Redshift with: 
SELECT *
FROM steam_review
where app_id_name = '1166860_Rival_Stars_Horse_Racing_Desktop_Edition'

2025-01-17 16:23:23,125 - INFO - Successfully fetched query results, with shape: (2051, 14)
2025-01-17 16:23:23,195 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC\Steam\db_prepared.json


# Translation

In [33]:
from helper.data_analysis import translate_reviews

id_column = "recommendationid"              # The column that contains unique IDs
text_col = "review_text"                    # The column that contains the text to be translated
language_col = "language"                   # The column that contains the language tag

# Translate the fetched reviews. The column names are passed through the
# variables defined above (language_col was previously defined but bypassed by
# a duplicated 'language' literal, so the two could silently drift apart).
data_translated = translate_reviews(df=results_df,
                                    file_path=path_db_translated,
                                    id_column=id_column,
                                    text_column=text_col,
                                    language_column=language_col)

# Save the translated data
save_df_as_json(data_translated, path_db_translated)

2025-01-17 16:24:32,743 - INFO - No existing file found. Starting fresh.
2025-01-17 16:24:32,747 - INFO - Found 2051 new reviews to process.
2025-01-17 16:24:32,751 - INFO - Translating review ID: 177351085 (Detected Language: french)
2025-01-17 16:24:33,779 - INFO - Translating review ID: 164987485 (Detected Language: french)
2025-01-17 16:24:34,341 - INFO - Translating review ID: 184017706 (Detected Language: spanish)
2025-01-17 16:24:35,478 - INFO - Translating review ID: 172578193 (Detected Language: french)
2025-01-17 16:24:36,070 - INFO - Translating review ID: 166297692 (Detected Language: german)
2025-01-17 16:24:38,746 - INFO - Translating review ID: 163322895 (Detected Language: french)
2025-01-17 16:24:41,306 - INFO - Translating review ID: 161924755 (Detected Language: german)
2025-01-17 16:24:42,297 - INFO - Translating review ID: 138333050 (Detected Language: german)
2025-01-17 16:24:43,067 - INFO - Translating review ID: 137109808 (Detected Language: german)
2025-01-17 1

TypeError: Object of type DataFrame is not JSON serializable

In [34]:
# Standalone save of the translated DataFrame.
# NOTE(review): presumably a manual re-run after the TypeError shown in the
# previous cell's output; it duplicates the save at the end of that cell —
# confirm before removing.
save_df_as_json(data_translated, path_db_translated)

2025-01-17 17:37:50,139 - INFO - Saving data to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC\Steam\db_translated.json


# Data Analysis

In [6]:
import os
from helper.utils import *
from helper.prompt_templates import *
from helper.data_analysis import normalize_topics_key, process_entry

# Configure API
configure_api(client, chat_model_name)

# Input of this stage: the translated reviews.
data_prepared = read_json(path_db_translated)

id_column = "recommendationid"              # The column that contains unique IDs
columns_of_interest = ["review_text"]       # The column(s) that are going to be analyzed
all_entries = []                            # List to store all processed entries
processed_ids = set()                       # Set to store IDs of processed entries

# If the analyzed file already exists, load it so the run resumes where it stopped
if os.path.exists(path_db_analysed):
    all_entries = read_json(path_db_analysed)
    processed_ids = {entry[id_column] for entry in all_entries}  # set for O(1) membership checks

# Process all unprocessed entries.
# The loop runs inside try/finally so that an exception or a kernel interrupt
# still writes every fully processed entry to disk; previously up to ten
# entries processed since the last checkpoint were lost.
try:
    for i, entry in enumerate(data_prepared):
        current_id = entry[id_column]

        # If we've already processed this entry, skip it
        if current_id in processed_ids:
            logger.info(f"Skipping entry {i} (ID: {current_id}) - already processed.")
            continue

        # Otherwise, process and append. The entry is appended only after
        # process_entry returns, so all_entries never holds a half-processed item.
        process_entry(
            entry,
            id_column,
            prompt_template_topic,
            prompt_template_sentiment,
            api_settings,
            columns_of_interest
        )
        all_entries.append(entry)
        processed_ids.add(current_id)  # mark as processed

        # Save intermediate progress whenever the input index is a non-zero multiple of 10
        if (i % 10) == 0 and i != 0:
            save_to_json(all_entries, path_db_analysed)
            logger.info(f"Progress saved at index {i}.")
finally:
    # Final save: runs after a complete pass and also when the loop is aborted.
    save_to_json(all_entries, path_db_analysed)

# Only reached when the loop finished without an exception.
logger.info("All entries processed and final results saved.")


2025-01-21 08:24:04,810 - INFO - Tokens used so far: Prompt Tokens: 0, Completion Tokens: 0
2025-01-21 08:24:04,811 - INFO - Extracting topics for entry ID 179387177
2025-01-21 08:24:06,610 - INFO - Analyzing sentiment for topic 'Game Detail' (Entry ID 179387177)
2025-01-21 08:24:07,136 - INFO - Tokens used so far: Prompt Tokens: 759, Completion Tokens: 46
2025-01-21 08:24:07,137 - INFO - Extracting topics for entry ID 178743676
2025-01-21 08:24:09,112 - INFO - Analyzing sentiment for topic 'Horse Care Mechanics' (Entry ID 178743676)
2025-01-21 08:24:09,536 - INFO - Analyzing sentiment for topic 'Overall Enjoyment' (Entry ID 178743676)
2025-01-21 08:24:09,997 - INFO - Tokens used so far: Prompt Tokens: 1728, Completion Tokens: 153
2025-01-21 08:24:09,999 - INFO - Extracting topics for entry ID 178350425
2025-01-21 08:24:11,039 - INFO - Analyzing sentiment for topic 'Cross-Platform Updates' (Entry ID 178350425)
2025-01-21 08:24:11,592 - INFO - Tokens used so far: Prompt Tokens: 2501, Co

# Embedding

In [8]:
# Load the analysed entries from disk.
data = read_json(path_db_analysed)

In [14]:
# Show the 'topics' value of the first analysed entry.
data[0]['topics']

[{'topic': 'Game Detail',
  'sentiment': 'Positive',
  'category': 'fact',
  'sentence': "Very detailed and it has lot's to do."}]

In [16]:
from helper.embedding import *

# Key of each topic dict whose text is embedded.
embed_key = "topic"  # "topic" or "sentence"

# Load the analysed entries and initialise the embedding model named in
# embed_model_name; process_embedding reads `embed_model` as a global.
data = read_json(path_db_analysed)
embed_model = initialize_embedding_model(embed_model_name)

def process_embedding(data, embed_key):
    """Attach an embedding to every topic dict nested inside ``data``.

    Each entry is expected to carry its topics as a list under ``"topics"``.
    For every topic that is a dict, the value stored under ``embed_key`` is
    passed to ``embed_text`` together with the notebook-level ``embed_model``
    and the result is stored under the topic's ``"embedding"`` key.

    Entries whose ``"topics"`` key is missing or ``None`` are left untouched,
    mirroring ``flatten_data``, which also tolerates a missing key. Topic items
    that are not dicts are skipped.

    Args:
        data: List of dict entries; modified in place.
        embed_key: Key inside each topic dict whose value is embedded.

    Returns:
        The same ``data`` object that was passed in.
    """
    for i, entry in enumerate(data):
        # Progress log on every tenth entry.
        if i % 10 == 0:
            logger.info(f"Processing entry {i}")

        # `or []` covers both a missing key and an explicit None value.
        for d_topic in entry.get("topics") or []:
            if isinstance(d_topic, dict):
                d_topic["embedding"] = embed_text(d_topic[embed_key], embed_model)
    return data

# process_embedding returns the list it was given, so `data_embedded` and
# `data` refer to the same object.
data_embedded = process_embedding(data, embed_key)

# Flatten
def flatten_data(data):
    """Expand each entry into one flat record per topic.

    Every entry is shallow-copied without its ``"topics"`` key; for each topic
    dict a new record is built from that copy and then updated with the topic's
    own keys (topic keys win on a name clash). The input entries themselves are
    not modified.

    Entries whose ``"topics"`` is missing, ``None`` or empty yield no records.
    Topic items that are not dicts are skipped, matching ``process_embedding``,
    which never embeds such items. Previously a non-dict item or a ``None``
    topics value raised here.

    Args:
        data: Iterable of dict entries, each optionally holding a list of topic
            dicts under ``"topics"``.

    Returns:
        A new list of flat dicts, in entry order and then topic order.
    """
    flattened = []
    for entry in data:
        base_copy = dict(entry)
        topics = base_copy.pop("topics", None) or []

        for topic in topics:
            if not isinstance(topic, dict):
                continue
            new_entry = dict(base_copy)
            new_entry.update(topic)
            flattened.append(new_entry)
    return flattened

# One flat record per (entry, topic) pair.
data_flattened = flatten_data(data_embedded)

# Write the flattened, embedded records to disk as indented JSON.
with open(path_db_embedded, mode="w") as embedded_file:
    json.dump(data_flattened, fp=embedded_file, indent=4)


2025-01-21 17:19:56,581 - INFO - Loading embedding model: all-MiniLM-L6-v2
2025-01-21 17:19:56,597 - INFO - Use pytorch device_name: cuda
2025-01-21 17:19:56,599 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-01-21 17:19:59,327 - INFO - Processing entry 0
2025-01-21 17:19:59,552 - INFO - Processing entry 10
2025-01-21 17:19:59,798 - INFO - Processing entry 20
2025-01-21 17:19:59,980 - INFO - Processing entry 30
2025-01-21 17:20:00,249 - INFO - Processing entry 40
2025-01-21 17:20:00,578 - INFO - Processing entry 50
2025-01-21 17:20:00,894 - INFO - Processing entry 60
2025-01-21 17:20:01,098 - INFO - Processing entry 70
2025-01-21 17:20:01,307 - INFO - Processing entry 80
2025-01-21 17:20:01,536 - INFO - Processing entry 90
2025-01-21 17:20:01,769 - INFO - Processing entry 100
2025-01-21 17:20:02,042 - INFO - Processing entry 110
2025-01-21 17:20:02,305 - INFO - Processing entry 120
2025-01-21 17:20:02,550 - INFO - Processing entry 130
2025-01-21 17:20:02,741 - INFO

# Clustering

In [18]:
from helper.cluster_analysis import *

# Tunable settings: dimensionality-reduction methods and HDBSCAN parameters.
dimensionality_methods = ["UMAP", "PCA", "tSNE"]
hdbscan_params = dict(min_cluster_size=15, min_samples=5, cluster_selection_epsilon=0.4)

# Read the embedded entries and collect their embeddings into one array.
df_total = load_embedded_data(path_db_embedded)
mat = np.array(df_total["embedding"].tolist())

# Run the clustering helper with both 2-D and 3-D output enabled.
df_total = apply_hdbscan(
    df_total, mat, dimensionality_methods,
    hdbscan_params=hdbscan_params, include_2d=True, include_3d=True,
)

# Store the clustered frame for the naming step.
save_df_as_json(df_total, path_db_clustered)

2025-01-21 17:23:49,505 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC\Steam\db_embedded.json
2025-01-21 17:23:50,932 - INFO - Loaded 5797 valid entries with embeddings.
2025-01-21 17:23:51,026 - INFO - Applying HDBSCAN with: {'min_cluster_size': 15, 'min_samples': 5, 'cluster_selection_epsilon': 0.4}
2025-01-21 17:24:06,098 - INFO - Found 77 clusters.
2025-01-21 17:24:06,099 - INFO - Applying UMAP with 2 components.
2025-01-21 17:24:26,195 - INFO - Applying UMAP with 3 components.
2025-01-21 17:24:29,271 - INFO - Applying PCA with 2 components.
2025-01-21 17:24:29,370 - INFO - Applying PCA with 3 components.
2025-01-21 17:24:29,467 - INFO - Applying tSNE with 2 components.
2025-01-21 17:24:29,468 - INFO - Perplexity not provided, setting to 30 based on sample size.
2025-01-21 17:24:52,750 - INFO - Applying tSNE with 3 components.
2025-01-21 17:24:52,752 - INFO - Perplexity not provided, setting to 30 based on sample size.
2025-01-21 17

# Cluster Naming

In [19]:
from helper.cluster_naming import *

# Settings for the naming step.
dimensionality_methods = ["UMAP", "PCA", "tSNE"]
clustering_algorithms = ["hdbscan"]  # No KMeans here
max_centers = 10

#kmeans_clusters = [15, 20, 25, 50]  # Number of clusters for KMeans

# Read the clustered data back into a DataFrame.
df_total = load_json_into_df(path_db_clustered)

# Process clusters and generate names
# (insert kmeans_clusters in the function call when needed).
df_total = process_clusters(
    df_total, dimensionality_methods, clustering_algorithms, max_centers, api_settings
)

# Write the final dataset consumed by the Streamlit app.
save_data_for_streamlit(df_total, path_db_final)

2025-01-21 17:27:24,707 - INFO - Loading data from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\HRC\Steam\db_clustered.json
2025-01-21 17:27:26,176 - INFO - Found 10 Topics for hdbscan_cluster_id ID: 66
2025-01-21 17:27:27,059 - INFO - Generated cluster name: Ultimate Game Love Fest
2025-01-21 17:27:27,059 - INFO - Tokens used so far: Prompt Tokens: 142, Completion Tokens: 5
2025-01-21 17:27:27,063 - INFO - Found 10 Topics for hdbscan_cluster_id ID: 17
2025-01-21 17:27:27,632 - INFO - Generated cluster name: Top Equestrian Game Reviews
2025-01-21 17:27:27,633 - INFO - Tokens used so far: Prompt Tokens: 301, Completion Tokens: 12
2025-01-21 17:27:27,636 - INFO - Found 10 Topics for hdbscan_cluster_id ID: 64
2025-01-21 17:27:28,352 - INFO - Generated cluster name: Game Enthusiasm and Enjoyment
2025-01-21 17:27:28,352 - INFO - Tokens used so far: Prompt Tokens: 413, Completion Tokens: 19
2025-01-21 17:27:28,360 - INFO - Found 10 Topics for hdbscan_cluster_id ID: 65
20