In [None]:
import numpy as np
import pandas as pd
import json
import os
os.environ["OMP_NUM_THREADS"] = '1'


from langchain.prompts import PromptTemplate
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain.embeddings import HuggingFaceEmbeddings

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from umap import umap_ as UMAP
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt


In [None]:
from dotenv import load_dotenv
load_dotenv()

import os
import openai

openai_api_key = os.getenv("OPENAI_API_KEY")

openai.api_key = openai_api_key
client = openai.Client()

chat_model_name = 'gpt-4o-mini'
embedding_model_name = 'sentence-transformers/all-mpnet-base-v2'

s_root = r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis/'
s_db_json = 'Data/survey_results_with_topics.json'

s_db_embed_json = 'Data/review_db_embed.json'
s_db_table_json = 'Data/review_db_table.json'
s_db_table_xlsx = 'Data/review_db_table.xlsx'
s_db_table_pca_json = 'Data/review_db_table_pca.json'
s_db_table_pca_xlsx = 'Data/review_db_table_pca.xlsx'
s_kmeans_centers = 'Data/kmeans_centers.json'
b_override = False

In [None]:

prompt_template_translation = PromptTemplate.from_template(
'''Please translate each section into English if it is not. The sections are separated by labels "REASON" and "WISH".

[h0]==================================================================[\h0]
REASON: "兄弟们，我把星空退款的钱拿来买这个了，我做的对吗"
WISH: "加动态模糊和垂直同步选项"

TRANSLATION:

REASON: "Brothers, I used the refund money from the stars to buy this. Did I do the right thing?"
WISH: "Add dynamic blur and vertical sync options."


[h0]==================================================================[\h0]
REASON: "My first D&D experience and I'm enjoying it a lot."
WISH: "I would like more guidance in the game."

TRANSLATION:

REASON: "My first D&D experience and I'm enjoying it a lot."
WISH: "I would like more guidance in the game."

[h0]==================================================================[\h0]
REASON: "{reason}"
WISH: "{wish}"

TRANSLATION:

'''
)

prompt_template_topic = PromptTemplate.from_template(
'''Please list the most important topics and their respective original context in the review of a game in a json format with "Topic", "Category", "Context" arguments.  No more than 10 topics.
Topics should be game features.  A feature in the game should be a noun rather than a verb or an adjective.
Each topic should be categorized as a "fact" or a "request".
Respond in JSON format.

[h0]==================================================================[\h0]
REVIEW: 

"The weapon durability in this game is frustrating; my sword breaks after just a few swings. The combat itself is fun, but I wish the durability lasted longer. Also, the audio effects are very immersive during battles."

TOPICS:

{{"Topics":
    [
        {{
            "Topic": "Weapon Durability",
            "Category": "request",
            "Context": "My sword breaks after just a few swings. I wish the durability lasted longer."
        }},
        {{
            "Topic": "Combat and Fighting",
            "Category": "fact",
            "Context": "The combat itself is fun."
        }},
        {{
            "Topic": "Audio",
            "Category": "fact",
            "Context": "The audio effects are very immersive during battles."
        }}
    ]
}}

[h0]==================================================================[\h0]
REVIEW: 

"Playing during the night adds a thrilling layer to the game. The lack of a proper save feature makes it hard to enjoy it though. Also, there are way too many random encounters that make progress difficult."

TOPICS:

{{"Topics":
    [
        {{
            "Topic": "Night",
            "Category": "fact",
            "Context": "Playing during the night adds a thrilling layer to the game."
        }},
        {{
            "Topic": "Save Feature",
            "Category": "request",
            "Context": "The lack of a proper save feature makes it hard to enjoy fully."
        }},
        {{
            "Topic": "Randomness",
            "Category": "request",
            "Context": "There are way too many random encounters that make progress difficult."
        }}
    ]
}}

[h0]==================================================================[\h0]
REVIEW: 

"{review}"

TOPICS:

'''
)

prompt_template_topic_view = PromptTemplate.from_template(
'''What's the sentiment of the review with regard to the topic?
Always answer with 'Positive' or 'Negative' or 'Inconclusive'.

REVIEW: My first D&D experience and I'm enjoying it a lot.
TOPIC: D&D
SENTIMENT: Positive 

REVIEW: This game lacks a proper ending or epilog
TOPIC: epilogue
SENTIMENT: Negative

REVIEW: Posted: August 8
TOPIC: release date
SENTIMENT: Inconclusive 

REVIEW: {review}
TOPIC: {topic}
SENTIMENT: '''
)

In [None]:
# Read in the JSON file with survey results 

with open(s_root + 'Data/survey_results_clean.json', 'r') as f:
    db = json.load(f)

## Translate reviews

In [None]:
import json
from lingua import Language, LanguageDetectorBuilder

# Load JSON data from file
input_file_path = 'Data/survey_results_clean.json'  # Adjust the path if needed
output_file_path = 'Data/survey_results_trans.json'  # New JSON with language and translations

# Initialize the language detector
detector = LanguageDetectorBuilder.from_languages(
    Language.ENGLISH, Language.SPANISH, Language.CHINESE, Language.GERMAN, Language.FRENCH
).build()

# Load JSON data
with open(input_file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Process each entry
for entry in data:
    # Get the values for reason and wish fields, making sure NaNs are handled properly
    reason_text = entry.get("Please tell us why you chose the rating above:")
    wish_text = entry.get("If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?")

    # Initialize detected language as unknown
    detected_language = "unknown"
    
    # Determine language only for fields with actual text
    if isinstance(reason_text, str) and reason_text.strip():
        detected_language_reason = detector.detect_language_of(reason_text).name.lower()
    else:
        detected_language_reason = "none"

    if isinstance(wish_text, str) and wish_text.strip():
        detected_language_wish = detector.detect_language_of(wish_text).name.lower()
    else:
        detected_language_wish = "none"
    
    # Set the overall detected language based on valid fields
    if detected_language_reason != "none" and detected_language_reason == detected_language_wish:
        detected_language = detected_language_reason
    elif detected_language_reason != "none" and detected_language_reason != detected_language_wish:
        detected_language = "mixed"
    elif detected_language_reason != "none":
        detected_language = detected_language_reason
    elif detected_language_wish != "none":
        detected_language = detected_language_wish
    
    # Save the detected language in the JSON entry
    entry["language"] = detected_language

    # Only proceed with translation if:
    # - There is text in either reason_text or wish_text
    # - The detected language is not English
    if detected_language not in ["english", "none"]:
        # Prepare the prompt with only the fields that have text
        reason_text_for_prompt = reason_text if detected_language_reason != "none" else "N/A"
        wish_text_for_prompt = wish_text if detected_language_wish != "none" else "N/A"
        
        prompt_translation = prompt_template_translation.format(
            reason=reason_text_for_prompt, 
            wish=wish_text_for_prompt
        )

        # Make the OpenAI API call to translate the review
        response = client.chat.completions.create(
            model=chat_model_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant expertised in game review analysis."},
                {"role": "user", "content": prompt_translation},
            ],
            max_tokens=1024,
        )

        # Extract the translation response
        translation_response = response.choices[0].message.content

        # Update the entry only with translated text if present
        if "REASON:" in translation_response and detected_language_reason != "none":
            reason_translation = translation_response.split("REASON:")[1].split("WISH:")[0].strip()
            entry["Please tell us why you chose the rating above:"] = reason_translation

        if "WISH:" in translation_response and detected_language_wish != "none":
            wish_translation = translation_response.split("WISH:")[1].strip()
            entry["If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?"] = wish_translation

# Save the modified data with translations
with open(output_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4, ensure_ascii=False)

print(f"Translated data saved to {output_file_path}")


## Extract Topics

In [None]:
with open(s_root + 'Data/survey_results_trans.json', 'r') as f:
    db = json.load(f)

entry = db[0]

# Extract important information from the 2nd and 3rd keys
review_text = entry["Please tell us why you chose the rating above:"]
additional_feedback = entry["If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?"]

# Combine both into a single review input for the prompt
combined_review = f"{review_text} {additional_feedback}"

# Format the prompt for the LLM
prompt_topic = prompt_template_topic.format(review=combined_review)

# Make the OpenAI API call
response = client.chat.completions.create(
    model=chat_model_name,
    messages=[
        {"role": "system", "content": "You are a helpful assistant expertised in game review analysis. Respond in JSON format."},
        {"role": "user", "content": prompt_topic},
    ],
    max_tokens=1024,
    response_format={
    "type": "json_object"
  }
)

# Print the response content
print(response.choices[0].message.content)

In [None]:
combined_review

## Sentiment Analysis

In [None]:
topic_response = response.choices[0].message.content
topics = json.loads(topic_response)

In [None]:
topics

In [None]:
# Iterate over each topic in the nested structure
for topic in topics['Topics']:
    topic_text = topic["Topic"]
    topic_context = topic["Context"]
    
    # Format the prompt for sentiment analysis
    prompt_sentiment = prompt_template_topic_view.format(review=topic_context, topic=topic_text)
    
    # Call the API for sentiment analysis
    sentiment_response = client.chat.completions.create(
        model=chat_model_name,
        messages=[
            {"role": "system", "content": "You are a helpful assistant expertised in sentiment analysis."},
            {"role": "user", "content": prompt_sentiment},
        ],
        max_tokens=1024,
    )
    
    # Extract the sentiment from the response
    sentiment = sentiment_response.choices[0].message.content.strip()
    
    # Ensure 'topics' key is initialized in the entry
    if "topics" not in entry:
        entry["topics"] = []

    # Append the topic information with sentiment to the "topics" list
    entry["topics"].append({
        "topic": topic_text,
        "context": topic_context,
        "category": topic["Category"],  # Add category from original data
        "sentiment": sentiment
    })

    # Print for confirmation
    print(f"Topic: {topic_text}\nContext: {topic_context}\nCategory: {topic['Category']}\nSentiment: {sentiment}\n")


In [None]:
output_file_path = s_root + 'Data/survey_results_with_topics.json'
with open(output_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(db, json_file, indent=4, ensure_ascii=False)

## Everything put together 
### in a loop

In [None]:
import json
import logging
from langchain.prompts import PromptTemplate
from lingua import Language, LanguageDetectorBuilder

# Initialize the language detector
detector = LanguageDetectorBuilder.from_languages(
    Language.ENGLISH, Language.SPANISH, Language.CHINESE, Language.GERMAN, Language.FRENCH
).build()

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize token counters
prompt_tokens = 0
completion_tokens = 0

# Load the survey results JSON file
input_file_path = s_root + 'Data/sample_size.json'
output_file_path = s_root + 'Data/sample_survey_results_with_topics.json'

with open(input_file_path, 'r', encoding='utf-8') as f:
    db = json.load(f)

# Initialize a counter for unique IDs
id_counter = 1

# Loop through each entry in the survey results
for entry in db:
    # Add a unique ID to the entry
    entry["ID"] = id_counter
    logging.info(f"Processing entry ID: {entry['ID']}")

    # Step 1: Detect language for each field and decide if translation is needed
    reason_text = entry.get("Please tell us why you chose the rating above:", "")
    wish_text = entry.get("If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?", "")
    
    # Initialize detected language as unknown
    detected_language = "unknown"
    
    # Detect language only for fields with actual text
    if isinstance(reason_text, str) and reason_text.strip():
        detected_language_reason = detector.detect_language_of(reason_text).name.lower()
    else:
        detected_language_reason = "none"

    if isinstance(wish_text, str) and wish_text.strip():
        detected_language_wish = detector.detect_language_of(wish_text).name.lower()
    else:
        detected_language_wish = "none"
    
    # Set the overall detected language based on valid fields
    if detected_language_reason != "none" and detected_language_reason == detected_language_wish:
        detected_language = detected_language_reason
    elif detected_language_reason != "none" and detected_language_reason != detected_language_wish:
        detected_language = "mixed"
    elif detected_language_reason != "none":
        detected_language = detected_language_reason
    elif detected_language_wish != "none":
        detected_language = detected_language_wish
    
    # Save the detected language in the JSON entry
    entry["language"] = detected_language

    # Only proceed to translation if:
    # - There is text in either reason_text or wish_text
    # - The detected language is not English
    if detected_language not in ["english", "none"]:
        # Prepare the translation prompt with only the fields that have text
        reason_text_for_prompt = reason_text if detected_language_reason != "none" else "N/A"
        wish_text_for_prompt = wish_text if detected_language_wish != "none" else "N/A"
        
        prompt_translation = prompt_template_translation.format(
            reason=reason_text_for_prompt, 
            wish=wish_text_for_prompt
        )

        # Make the OpenAI API call to translate the review
        translation_response = client.chat.completions.create(
            model=chat_model_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant for translation."},
                {"role": "user", "content": prompt_translation},
            ],
            max_tokens=1024,
        )
        
        # Track tokens used
        prompt_tokens += translation_response.usage.prompt_tokens
        completion_tokens += translation_response.usage.completion_tokens
        logging.info(f"Translation API call: Prompt tokens used: {translation_response.usage.prompt_tokens}, Completion tokens used: {translation_response.usage.completion_tokens}")

        # Parse and update entry with translated text, preserving original language info
        translation_text = translation_response.choices[0].message.content

        if "REASON:" in translation_text and detected_language_reason != "none":
            reason_translation = translation_text.split("REASON:")[1].split("WISH:")[0].strip()
            entry["Please tell us why you chose the rating above:"] = reason_translation

        if "WISH:" in translation_text and detected_language_wish != "none":
            wish_translation = translation_text.split("WISH:")[1].strip()
            entry["If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?"] = wish_translation

    # Rebuild combined_review with the updated (translated) values
    combined_review = f"{entry.get('Please tell us why you chose the rating above:', '')} {entry.get('If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?', '')}"

    # Step 2: Topic Extraction using the translated text
    prompt_topic = prompt_template_topic.format(review=combined_review)
    topic_response = client.chat.completions.create(
        model=chat_model_name,
        messages=[
            {"role": "system", "content": "You are a helpful assistant for game review analysis."},
            {"role": "user", "content": prompt_topic},
        ],
        max_tokens=1024,
        response_format={
            "type": "json_object"
          }
    )

    # Track tokens used
    prompt_tokens += topic_response.usage.prompt_tokens
    completion_tokens += topic_response.usage.completion_tokens
    logging.info(f"Topic Extraction API call: Prompt tokens used: {topic_response.usage.prompt_tokens}, Completion tokens used: {topic_response.usage.completion_tokens}")
    
    # Parse topics from JSON response
    topics = json.loads(topic_response.choices[0].message.content)  # JSON parse the response content

    # Initialize the "topics" key in the entry if it doesn't exist
    entry["topics"] = []

    # Step 3: Sentiment Analysis on each extracted topic
    for topic in topics["Topics"]:
        topic_text = topic["Topic"]
        topic_context = topic["Context"]
        topic_category = topic["Category"]  # Preserve "fact" or "request"

        prompt_sentiment = prompt_template_topic_view.format(review=topic_context, topic=topic_text)
        sentiment_response = client.chat.completions.create(
            model=chat_model_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant expertised in sentiment analysis."},
                {"role": "user", "content": prompt_sentiment},
            ],
            max_tokens=1024
        )
        
        # Track tokens used
        prompt_tokens += sentiment_response.usage.prompt_tokens
        completion_tokens += sentiment_response.usage.completion_tokens
        logging.info(f"Sentiment Analysis API call for topic '{topic_text}': Prompt tokens used: {sentiment_response.usage.prompt_tokens}, Completion tokens used: {sentiment_response.usage.completion_tokens}")

        sentiment = sentiment_response.choices[0].message.content.strip()
        
        # Append the topic information with sentiment and category to the "topics" list
        entry["topics"].append({
            "topic": topic_text,
            "sentiment": sentiment,
            "category": topic_category,
            "sentence": topic_context
        })
        
    logging.info(f"Completed processing entry ID: {entry['ID']}")
    
    # Increment the ID counter for the next entry
    id_counter += 1

# Save the final JSON with topics and sentiments
with open(output_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(db, json_file, indent=4, ensure_ascii=False)

# Print or log the total tokens used
logging.info(f"Total prompt tokens used: {prompt_tokens}")
logging.info(f"Total completion tokens used: {completion_tokens}")
logging.info(f"Processed data with topics, sentiments, language info, and IDs saved to {output_file_path}")


In [None]:
import json
import logging
from langchain.prompts import PromptTemplate
from lingua import Language, LanguageDetectorBuilder

# Initialize the language detector
detector = LanguageDetectorBuilder.from_languages(
    Language.ENGLISH, Language.SPANISH, Language.CHINESE, Language.GERMAN, Language.FRENCH
).build()

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize token counters
prompt_tokens = 0
completion_tokens = 0

# Track corrupted entries
corrupted_entries = []

# Load the survey results JSON file
input_file_path = s_root + 'Data/sample_size.json'
output_file_path = s_root + 'Data/sample_survey_results_with_topics.json'

with open(input_file_path, 'r', encoding='utf-8') as f:
    db = json.load(f)

# Initialize a counter for unique IDs
id_counter = 1

# Loop through each entry in the survey results
for entry in db:
    # Add a unique ID to the entry
    entry["ID"] = id_counter
    logging.info(f"Processing entry ID: {entry['ID']}")

    # Step 1: Detect language for each field and decide if translation is needed
    reason_text = entry.get("Please tell us why you chose the rating above:", "")
    wish_text = entry.get("If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?", "")

    # Detect language only for fields with actual text
    detected_language_reason = detector.detect_language_of(reason_text).name.lower() if isinstance(reason_text, str) and reason_text.strip() else "none"
    detected_language_wish = detector.detect_language_of(wish_text).name.lower() if isinstance(wish_text, str) and wish_text.strip() else "none"

    # Determine the overall detected language
    if detected_language_reason != "none" and detected_language_reason == detected_language_wish:
        detected_language = detected_language_reason
    elif detected_language_reason != "none" and detected_language_reason != detected_language_wish:
        detected_language = "mixed"
    elif detected_language_reason != "none":
        detected_language = detected_language_reason
    elif detected_language_wish != "none":
        detected_language = detected_language_wish
    else:
        detected_language = "unknown"

    # Save detected language in the JSON entry
    entry["language"] = detected_language

    # Translation step if necessary
    if detected_language not in ["english", "none"]:
        try:
            reason_text_for_prompt = reason_text if detected_language_reason != "none" else "N/A"
            wish_text_for_prompt = wish_text if detected_language_wish != "none" else "N/A"
            
            prompt_translation = prompt_template_translation.format(
                reason=reason_text_for_prompt,
                wish=wish_text_for_prompt
            )

            # Make translation API call
            translation_response = client.chat.completions.create(
                model=chat_model_name,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant for translation."},
                    {"role": "user", "content": prompt_translation},
                ],
                max_tokens=1024
            )

            # Track tokens
            prompt_tokens += translation_response.usage.prompt_tokens
            completion_tokens += translation_response.usage.completion_tokens

            # Parse and update entry with translated text
            translation_text = translation_response.choices[0].message.content
            if "REASON:" in translation_text and detected_language_reason != "none":
                reason_translation = translation_text.split("REASON:")[1].split("WISH:")[0].strip()
                entry["Please tell us why you chose the rating above:"] = reason_translation
            if "WISH:" in translation_text and detected_language_wish != "none":
                wish_translation = translation_text.split("WISH:")[1].strip()
                entry["If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?"] = wish_translation

        except Exception as e:
            logging.error(f"Error translating entry ID: {entry['ID']}: {e}")
            corrupted_entries.append(entry)
            id_counter += 1
            continue

    # Rebuild combined_review with updated values
    combined_review = f"{entry.get('Please tell us why you chose the rating above:', '')} {entry.get('If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?', '')}"

    # Step 2: Topic Extraction
    try:
        prompt_topic = prompt_template_topic.format(review=combined_review)
        topic_response = client.chat.completions.create(
            model=chat_model_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant for game review analysis."},
                {"role": "user", "content": prompt_topic},
            ],
            max_tokens=1024,        
            response_format={
                "type": "json_object"
            }
        )

        # Track tokens used
        prompt_tokens += topic_response.usage.prompt_tokens
        completion_tokens += topic_response.usage.completion_tokens

        # Ensure JSON response parsing works correctly
        try:
            topics = json.loads(topic_response.choices[0].message.content)
        except json.JSONDecodeError as json_err:
            logging.error(f"JSON parsing error for entry ID {entry['ID']}: {json_err}")
            corrupted_entries.append(entry)
            id_counter += 1
            continue

        # Initialize the "topics" key
        entry["topics"] = []

        # Step 3: Sentiment Analysis for each topic
        for topic in topics["Topics"]:
            topic_text = topic["Topic"]
            topic_context = topic["Context"]
            topic_category = topic["Category"]

            prompt_sentiment = prompt_template_topic_view.format(review=topic_context, topic=topic_text)
            try:
                sentiment_response = client.chat.completions.create(
                    model=chat_model_name,
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant expertised in sentiment analysis."},
                        {"role": "user", "content": prompt_sentiment},
                    ],
                    max_tokens=1024
                )

                # Track tokens used
                prompt_tokens += sentiment_response.usage.prompt_tokens
                completion_tokens += sentiment_response.usage.completion_tokens

                # Extract and store sentiment result
                sentiment = sentiment_response.choices[0].message.content.strip()
                entry["topics"].append({
                    "topic": topic_text,
                    "sentiment": sentiment,
                    "category": topic_category,
                    "sentence": topic_context
                })

            except Exception as e:
                logging.error(f"Error processing sentiment analysis for entry ID {entry['ID']} topic '{topic_text}': {e}")
                corrupted_entries.append(entry)
                break  # Continue to next entry if sentiment analysis fails for any topic

        logging.info(f"Completed processing entry ID: {entry['ID']}")

    except Exception as e:
        logging.error(f"Error extracting topics for entry ID: {entry['ID']}: {e}")
        corrupted_entries.append(entry)
        id_counter += 1
        continue  # Move to next entry if topic extraction fails

    # Increment ID counter for the next entry
    id_counter += 1

# Save final JSON with topics and sentiments
with open(output_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(db, json_file, indent=4, ensure_ascii=False)

# Save corrupted entries separately
with open(s_root + 'Data/corrupted_entries.json', 'w', encoding='utf-8') as json_file:
    json.dump(corrupted_entries, json_file, indent=4, ensure_ascii=False)

# Log the total tokens used
logging.info(f"Total prompt tokens used: {prompt_tokens}")
logging.info(f"Total completion tokens used: {completion_tokens}")
logging.info(f"Processed data with topics, sentiments, language info, and IDs saved to {output_file_path}")
logging.info(f"Corrupted entries saved to {s_root + 'Data/corrupted_entries.json'}")


In [None]:
import json
import logging
from langchain.prompts import PromptTemplate
from lingua import Language, LanguageDetectorBuilder

# Initialize the language detector
detector = LanguageDetectorBuilder.from_languages(
    Language.ENGLISH, Language.SPANISH, Language.CHINESE, Language.GERMAN, Language.FRENCH
).build()

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize token counters
prompt_tokens = 0
completion_tokens = 0

# Track corrupted entries
corrupted_entries = []

# Load the survey results JSON file
input_file_path = s_root + 'Data/sample_size.json'
output_file_path = s_root + 'Data/sample_survey_results_with_topics.json'

with open(input_file_path, 'r', encoding='utf-8') as f:
    db = json.load(f)

# Initialize a counter for unique IDs
id_counter = 1

# Loop through each entry in the survey results
for entry in db:
    # Add a unique ID to the entry
    entry["ID"] = id_counter
    logging.info(f"Processing entry ID: {entry['ID']}")

    # Step 1: Detect language for each field and decide if translation is needed
    reason_text = entry.get("Please tell us why you chose the rating above:", "")
    wish_text = entry.get("If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?", "")

    # Detect language only for fields with actual text and skip if either field is None or empty
    if isinstance(reason_text, str) and reason_text.strip():
        try:
            detected_language_reason = detector.detect_language_of(reason_text)
            detected_language_reason = detected_language_reason.name.lower() if detected_language_reason else "none"
        except AttributeError:
            detected_language_reason = "none"
    else:
        detected_language_reason = "none"

    if isinstance(wish_text, str) and wish_text.strip():
        try:
            detected_language_wish = detector.detect_language_of(wish_text)
            detected_language_wish = detected_language_wish.name.lower() if detected_language_wish else "none"
        except AttributeError:
            detected_language_wish = "none"
    else:
        detected_language_wish = "none"

    # Set the overall detected language based on valid fields
    if detected_language_reason != "none" and detected_language_reason == detected_language_wish:
        detected_language = detected_language_reason
    elif detected_language_reason != "none" and detected_language_reason != detected_language_wish:
        detected_language = "mixed"
    elif detected_language_reason != "none":
        detected_language = detected_language_reason
    elif detected_language_wish != "none":
        detected_language = detected_language_wish
    else:
        detected_language = "unknown"

    # Save detected language in the JSON entry
    entry["language"] = detected_language

    # Translation step if necessary
    if detected_language not in ["english", "none"]:
        logging.info(f"Entry ID {entry['ID']}: Translation required (Language: {detected_language})")
        try:
            reason_text_for_prompt = reason_text if detected_language_reason != "none" else "N/A"
            wish_text_for_prompt = wish_text if detected_language_wish != "none" else "N/A"
            
            prompt_translation = prompt_template_translation.format(
                reason=reason_text_for_prompt,
                wish=wish_text_for_prompt
            )

            # Make translation API call
            translation_response = client.chat.completions.create(
                model=chat_model_name,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant for translation."},
                    {"role": "user", "content": prompt_translation},
                ],
                max_tokens=1024
            )

            # Track tokens
            prompt_tokens += translation_response.usage.prompt_tokens
            completion_tokens += translation_response.usage.completion_tokens

            # Parse and update entry with translated text
            translation_text = translation_response.choices[0].message.content
            if "REASON:" in translation_text and detected_language_reason != "none":
                reason_translation = translation_text.split("REASON:")[1].split("WISH:")[0].strip()
                entry["Please tell us why you chose the rating above:"] = reason_translation
            if "WISH:" in translation_text and detected_language_wish != "none":
                wish_translation = translation_text.split("WISH:")[1].strip()
                entry["If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?"] = wish_translation

        except Exception as e:
            logging.error(f"Error translating entry ID: {entry['ID']}: {e}")
            corrupted_entries.append(entry)
            id_counter += 1
            continue
    else:
        logging.info(f"Entry ID {entry['ID']}: No translation needed (Language: {detected_language})")

    # Rebuild combined_review with updated values
    combined_review = f"{entry.get('Please tell us why you chose the rating above:', '')} {entry.get('If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?', '')}"

    # Step 2: Topic Extraction
    logging.info(f"Entry ID {entry['ID']}: Starting topic extraction")
    try:
        prompt_topic = prompt_template_topic.format(review=combined_review)
        topic_response = client.chat.completions.create(
            model=chat_model_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant for game review analysis."},
                {"role": "user", "content": prompt_topic},
            ],
            max_tokens=1024,        
            response_format={
                "type": "json_object"
            }
        )

        # Track tokens used
        prompt_tokens += topic_response.usage.prompt_tokens
        completion_tokens += topic_response.usage.completion_tokens

        # Ensure JSON response parsing works correctly
        try:
            topics = json.loads(topic_response.choices[0].message.content)
        except json.JSONDecodeError as json_err:
            logging.error(f"JSON parsing error for entry ID {entry['ID']}: {json_err}")
            corrupted_entries.append(entry)
            id_counter += 1
            continue

        # Initialize the "topics" key
        entry["topics"] = []

        # Step 3: Sentiment Analysis for each topic
        for topic in topics["Topics"]:
            logging.info(f"Entry ID {entry['ID']}: Starting sentiment analysis for topic '{topic['Topic']}'")
            topic_text = topic["Topic"]
            topic_context = topic["Context"]
            topic_category = topic["Category"]

            prompt_sentiment = prompt_template_topic_view.format(review=topic_context, topic=topic_text)
            try:
                sentiment_response = client.chat.completions.create(
                    model=chat_model_name,
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant expertised in sentiment analysis."},
                        {"role": "user", "content": prompt_sentiment},
                    ],
                    max_tokens=1024
                )

                # Track tokens used
                prompt_tokens += sentiment_response.usage.prompt_tokens
                completion_tokens += sentiment_response.usage.completion_tokens

                # Extract and store sentiment result
                sentiment = sentiment_response.choices[0].message.content.strip()
                entry["topics"].append({
                    "topic": topic_text,
                    "sentiment": sentiment,
                    "category": topic_category,
                    "sentence": topic_context
                })

            except Exception as e:
                logging.error(f"Error processing sentiment analysis for entry ID {entry['ID']} topic '{topic_text}': {e}")
                corrupted_entries.append(entry)
                break  # Continue to next entry if sentiment analysis fails for any topic

        logging.info(f"Completed topic and sentiment analysis for entry ID: {entry['ID']}")

    except Exception as e:
        logging.error(f"Error extracting topics for entry ID: {entry['ID']}: {e}")
        corrupted_entries.append(entry)
        id_counter += 1
        continue  # Move to next entry if topic extraction fails

    # Increment ID counter for the next entry
    id_counter += 1

# Save final JSON with topics and sentiments
with open(output_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(db, json_file, indent=4, ensure_ascii=False)

# Save corrupted entries separately
with open(s_root + 'Data/corrupted_entries.json', 'w', encoding='utf-8') as json_file:
    json.dump(corrupted_entries, json_file, indent=4, ensure_ascii=False)

# Log the total tokens used
logging.info(f"Total prompt tokens used: {prompt_tokens}")
logging.info(f"Total completion tokens used: {completion_tokens}")
logging.info(f"Processed data with topics, sentiments, language info, and IDs saved to {output_file_path}")
logging.info(f"Corrupted entries saved to {s_root + 'Data/corrupted_entries.json'}")


In [None]:
try:
    topics = json.loads(topic_response.choices[0].message.content)
except json.JSONDecodeError as json_err:
    logging.error(f"JSON parsing error for entry ID {entry['ID']}: {json_err}")
    corrupted_entries.append(entry)
    id_counter += 1


In [None]:
topic_response.choices[0].message.content

## Cost Calculation

In [None]:
#Usage and pricing for GPT4o-mini
total_prompt_cost = (prompt_tokens / 1_000_000) * 0.15
total_completion_cost = (completion_tokens / 1_000_000) * 0.6


print(f"Total prompt tokens used: {prompt_tokens}")
print(f"Total completion tokens used: {completion_tokens}")
print(f"Total prompt token cost: ${total_prompt_cost:.4f}")
print(f"Total completion token cost: ${total_completion_cost:.4f}")



In [None]:
# Read in the JSON file with survey results
with open(s_root + 'Data/sample_survey_results_with_topics.json', 'r', encoding='utf-8') as f:
    db = json.load(f)
    

## Embed the reviews
### Only the topics of the reviews

In [None]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
print("Current CUDA device:", torch.cuda.current_device())
print("CUDA device name:", torch.cuda.get_device_name(torch.cuda.current_device()))


In [None]:
embed_MiniLM = 'sentence-transformers/all-MiniLM-L6-v2'

embed_MPNET = 'sentence-transformers/all-mpnet-base-v2'

s_db_json = 'Data/survey_results_with_topics.json'
s_db_json_sample = 'Data/sample_survey_results_with_topics.json'

In [38]:
def index_embedding(text, model_name=embed_MiniLM):
    text = text.encode(encoding='ASCII', errors='ignore').decode()
    embed_model = LangchainEmbedding(
        HuggingFaceEmbeddings(model_name=model_name)
    )
    vector = embed_model.get_text_embedding(text)
    return vector

In [39]:
embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=embed_MiniLM))

2024-11-14 11:05:18,633 - INFO - Use pytorch device_name: cuda
2024-11-14 11:05:18,633 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [42]:
import gc

batch_size = 10  # Adjust based on available memory and dataset size
b_embedding = True

if b_embedding:
    with open(s_root + s_db_json_sample, 'r', encoding='utf-8') as f:
        d_review_output = json.load(f)
        print('Loaded JSON data')

    for batch_start in range(0, len(d_review_output), batch_size):
        batch_end = min(batch_start + batch_size, len(d_review_output))
        batch = d_review_output[batch_start:batch_end]
        
        for i, review_entry in enumerate(batch):
            print(f"Processing Review {i + batch_start + 1}")
            
            if isinstance(review_entry, dict) and 'topics' in review_entry and isinstance(review_entry['topics'], list):
                d_topics = review_entry['topics']
                
                for d_topic in d_topics:
                    if isinstance(d_topic, dict):
                        if 'embedding' not in d_topic or b_override:
                            if 'topic' in d_topic:
                                d_topic['embedding'] = embed_model.get_text_embedding(d_topic['topic'])
                                
                                # Release memory
                                torch.cuda.empty_cache()
                                gc.collect()
                            else:
                                d_topic['embedding'] = 0
                print('.', end='')
        print(f"\nBatch {batch_start // batch_size + 1} processed.")

    # Save updated JSON with embeddings
    with open(s_root + s_db_embed_json, 'w', encoding='utf-8') as f:
        json.dump(d_review_output, f)
    print("Embeddings saved.")


Loaded JSON data
Processing Review 1
.Processing Review 2
.Processing Review 3
.Processing Review 4
.Processing Review 5
.Processing Review 6
.Processing Review 7
.Processing Review 8
.Processing Review 9
.Processing Review 10
.
Batch 1 processed.
Processing Review 11
.Processing Review 12
.Processing Review 13
.Processing Review 14
.Processing Review 15
.Processing Review 16
.Processing Review 17
.Processing Review 18
.Processing Review 19
.Processing Review 20
.
Batch 2 processed.
Processing Review 21
.Processing Review 22
.Processing Review 23
.Processing Review 24
.Processing Review 25
.Processing Review 26
.Processing Review 27
.Processing Review 28
.Processing Review 29
.Processing Review 30
.
Batch 3 processed.
Processing Review 31
.Processing Review 32
.Processing Review 33
.Processing Review 34
.Processing Review 35
.Processing Review 36
.Processing Review 37
.Processing Review 38
.Processing Review 39
.Processing Review 40
.
Batch 4 processed.
Processing Review 41
.Processing

In [43]:
torch.cuda.empty_cache()

## Convert to Table
#### One review can have multiple topics. If we want a table structure, we need to have every topic in one row, essentially duplicating the review information.

In [44]:
import pandas as pd
import json

b_to_table = True

if b_to_table:
    # Initialize an empty DataFrame to hold all topics
    df_total = pd.DataFrame()
    
    # Load the JSON data with embeddings
    with open(s_root + s_db_embed_json, 'r', encoding='utf-8') as f:
        d_review_output = json.load(f)
        print("Loaded JSON with embeddings")

    # Iterate over each review entry in the JSON data
    for review_entry in d_review_output:
        # Check if 'topics' exists in each review entry
        if 'topics' in review_entry and isinstance(review_entry['topics'], list):
            # Create a DataFrame for the current review's topics
            df_gp = pd.DataFrame(review_entry['topics'])
            
            # Add additional columns from the review entry
            for key, value in review_entry.items():
                if key != 'topics':  # Skip the topics column itself
                    df_gp[key] = value  # Assign each additional field to each row in df_gp

            # Concatenate this review's DataFrame to the total DataFrame
            df_total = pd.concat([df_total, df_gp], ignore_index=True)
    
    # Save the combined DataFrame to JSON and Excel
    df_total.to_json(s_root + s_db_table_json, orient='records')
    df_total.to_excel(s_root + s_db_table_xlsx, index=False)
    print("Data saved to JSON and Excel")


Loaded JSON with embeddings
Data saved to JSON and Excel


## PCA

In [None]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
import matplotlib.pyplot as plt

b_pca_kmeans = True
b_update_kmeans_centers = True

if b_pca_kmeans:
    keyword = 'zombie'
    
    # Load existing K-means centers if available
    if os.path.isfile(s_root + s_kmeans_centers):
        with open(s_root + s_kmeans_centers, 'r') as f:
            d_kmeans_centers = json.load(f)
    else:
        d_kmeans_centers = {}

    # Load the DataFrame with embeddings
    df_total = pd.read_json(s_root + s_db_table_json, orient='records')
    df_total = df_total[df_total['embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)]
    
    # Convert embeddings to matrix
    mat = np.array(df_total['embedding'].tolist())

    # Embed the keyword and compute similarity for each entry
    keyword_embed = index_embedding(keyword)
    df_total['similarity'] = mat @ keyword_embed  # Dot product for cosine similarity

    # Define number of clusters for main clustering and sub-clustering
    n_clusters = 20
    n_clusters_sub = 2
    s_key = str(n_clusters)
    
    # Initialize K-means clusters or use precomputed centers
    if s_key in d_kmeans_centers:
        mat_init = np.array(d_kmeans_centers[s_key])
    else:
        mat_init = np.zeros((n_clusters, mat.shape[1]))
        np.fill_diagonal(mat_init, 1.0)
    
    # Main K-means clustering
    kmeans = KMeans(n_clusters=n_clusters, init=mat_init, n_init=1).fit(mat)
    df_total['kmeans'] = kmeans.labels_
    
    if b_update_kmeans_centers:
        d_kmeans_centers[s_key] = kmeans.cluster_centers_.tolist()

    # Sub-clustering for larger clusters
    df_total['kmeans_sub'] = 0
    for i_c in range(n_clusters):
        cluster_indices = np.where(df_total['kmeans'] == i_c)[0]
        if len(cluster_indices) > len(df_total) / n_clusters:
            mat_i_c = mat[cluster_indices, :]
            s_key_sub = f"{n_clusters}_{i_c}_{n_clusters_sub}"
    
            # Adjust n_clusters_sub if fewer samples than sub-clusters
            adjusted_n_clusters_sub = min(n_clusters_sub, len(mat_i_c))
    
            if s_key_sub in d_kmeans_centers and adjusted_n_clusters_sub == n_clusters_sub:
                mat_init_sub = np.array(d_kmeans_centers[s_key_sub])
            else:
                mat_init_sub = np.zeros((adjusted_n_clusters_sub, mat.shape[1]))
                np.fill_diagonal(mat_init_sub, 1.0)
    
            # Apply KMeans with adjusted number of clusters
            kmeans_sub = KMeans(n_clusters=adjusted_n_clusters_sub, init=mat_init_sub, n_init=1).fit(mat_i_c)
            df_total.loc[cluster_indices, 'kmeans_sub'] = kmeans_sub.labels_
    
            if b_update_kmeans_centers and adjusted_n_clusters_sub == n_clusters_sub:
                d_kmeans_centers[s_key_sub] = kmeans_sub.cluster_centers_.tolist()


    # Dimensionality Reduction Techniques
    methods = [
        ('PCA', PCA(n_components=3)),
        ('t-SNE', TSNE(n_components=2)),
        ('UMAP', UMAP(n_components=2))
    ]

    # Visualization
    plt.figure(figsize=(30, 10))
    for i, (name, model) in enumerate(methods):
        print(name)
        plt.subplot(1, 3, i + 1)

        if name == 'PCA':
            X_embedded = model.fit_transform(mat)
            df_total['first_dim_PCA'] = X_embedded[:, 0]
            df_total['second_dim_PCA'] = X_embedded[:, 1]
            df_total['third_dim_PCA'] = X_embedded[:, 2]
            plt.scatter(X_embedded[:, 1], X_embedded[:, 2], c=df_total['kmeans'], cmap='tab20')
        else:
            X_embedded = model.fit_transform(mat)
            df_total[f'first_dim_{name}'] = X_embedded[:, 0]
            df_total[f'second_dim_{name}'] = X_embedded[:, 1]
            plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=df_total['kmeans'], cmap='tab20')

        plt.title(f"{name} Visualization")
        plt.colorbar()

    plt.show()

    # Save K-means centers and DataFrame with clustering results
    with open(s_root + s_kmeans_centers, 'w') as f:
        json.dump(d_kmeans_centers, f)

    df_total.to_json(s_root + s_db_table_pca_json, orient='records')
    df_total.to_excel(s_root + s_db_table_pca_xlsx, index=False)
    print("Clustering and dimensionality reduction results saved.")


In [None]:
df_total.head()

## HDBSCAN

In [45]:
import os
import json
import numpy as np
import pandas as pd
import umap
import hdbscan

# Paths and parameters
s_root = r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis/'
s_db_table_json = 'Data/review_db_table.json'  # Input JSON with embeddings
s_db_table_hdbscan_json = 'Data/review_db_table_hdbscan.json'  # Output JSON with UMAP + HDBSCAN clusters
keyword = 'zombie'  # Keyword for similarity calculation

# Load DataFrame with embeddings
df_total = pd.read_json(s_root + s_db_table_json, orient='records')
df_total = df_total[df_total['embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)]

# Convert embeddings to matrix
mat = np.array(df_total['embedding'].tolist())

# Embed the keyword and compute similarity for each entry
def index_embedding(keyword):
    # Random embedding for the keyword
    return np.random.rand(mat.shape[1])

keyword_embed = index_embedding(keyword)
df_total['similarity'] = mat @ keyword_embed  # Dot product for cosine similarity

# UMAP Dimensionality Reduction (3D)
umap_model = umap.UMAP(n_components=3, random_state=42)
umap_embeddings = umap_model.fit_transform(mat)
df_total['umap_x'] = umap_embeddings[:, 0]
df_total['umap_y'] = umap_embeddings[:, 1]
df_total['umap_z'] = umap_embeddings[:, 2]  # Third dimension for optional 3D visualization

# HDBSCAN Clustering
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=3, cluster_selection_epsilon=0.5)
cluster_labels = clusterer.fit_predict(umap_embeddings)
df_total['cluster_id'] = cluster_labels

# Save DataFrame with UMAP and HDBSCAN results to JSON
df_total.to_json(s_root + s_db_table_hdbscan_json, orient='records')
print(f"3D UMAP and HDBSCAN clustering results with similarity saved to {s_root + s_db_table_hdbscan_json}")


  warn(


3D UMAP and HDBSCAN clustering results with similarity saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis/Data/review_db_table_hdbscan.json


In [46]:
df_total.head()

Unnamed: 0,Please rate your overall experience playing Into the Dead: Our Darkest Days,Please tell us why you chose the rating above:,"If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?",Had you heard of Into the Dead before this demo?,What is your age group,What is your gender?,What are your favourite Steam games you have played in the last 3 months?,ID,language,topic,sentiment,category,sentence,embedding,similarity,umap_x,umap_y,umap_z,cluster_id
0,10,"Nice story,nice game","when move to another shelter, suplies which di...",Heard of it,18-24,Man,"CS2, callisto protocal, lockdown protocal",2,english,Story,Positive,fact,"Nice story, nice game.","[0.0332841799, 0.0589793995, 0.0548812076, 0.0...",0.402232,11.984763,11.016189,10.425184,86
1,10,"Nice story,nice game","when move to another shelter, suplies which di...",Heard of it,18-24,Man,"CS2, callisto protocal, lockdown protocal",2,english,Shelter,Negative,fact,"When moving to another shelter, supplies which...","[0.0102353394, 0.1051071808, 0.0266069323, 0.1...",-0.209047,9.949238,6.057717,11.481344,92
2,10,"Nice story,nice game","when move to another shelter, suplies which di...",Heard of it,18-24,Man,"CS2, callisto protocal, lockdown protocal",2,english,Supplies,Negative,request,They should be added to player's storage since...,"[-0.027585186100000002, 0.029896769700000003, ...",-0.091792,8.315664,6.911914,8.666392,90
3,8,"Reminds me a lot of This War of Mine, a game I...",Need a tutorial or guide because I got stuck a...,Never heard of it,45-54,Woman,"Cozy Grove, Death Stranding, I am Future",3,english,Graphics,Positive,fact,Your game has more detailed graphics and in co...,"[-0.012894654600000001, -0.0043537645, -0.0294...",0.491395,5.229348,3.590627,31.203501,8
4,8,"Reminds me a lot of This War of Mine, a game I...",Need a tutorial or guide because I got stuck a...,Never heard of it,45-54,Woman,"Cozy Grove, Death Stranding, I am Future",3,english,Tutorial,Negative,request,Need a tutorial or guide because I got stuck a...,"[-0.0567355193, -0.0038657677000000002, -0.053...",0.813603,8.222432,-6.434734,8.722463,11


## Kmeans

In [49]:
import os
import json
import numpy as np
import pandas as pd
import umap
import hdbscan
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Paths and parameters
s_root = r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis/'
s_db_table_json = 'Data/review_db_table.json'  # Input JSON with embeddings
s_db_table_hdbscan_json = 'Data/review_db_table_hdbscan.json'  # Output JSON with UMAP + HDBSCAN clusters
keyword = 'zombie'  # Keyword for similarity calculation
n_kmeans_clusters = 20

# Load DataFrame with embeddings
df_total = pd.read_json(s_root + s_db_table_json, orient='records')
df_total = df_total[df_total['embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)]

# Convert embeddings to matrix
mat = np.array(df_total['embedding'].tolist())

# Embed the keyword and compute similarity for each entry
def index_embedding(keyword):
    return np.random.rand(mat.shape[1])  # Example embedding; replace with real embedding function

keyword_embed = index_embedding(keyword)
df_total['similarity'] = mat @ keyword_embed  # Dot product for cosine similarity

# Dimensionality Reduction Techniques
# UMAP (3D)
umap_model = umap.UMAP(n_components=3, random_state=42)
umap_embeddings = umap_model.fit_transform(mat)
df_total['umap_x'] = umap_embeddings[:, 0]
df_total['umap_y'] = umap_embeddings[:, 1]
df_total['umap_z'] = umap_embeddings[:, 2]

# PCA (3D)
pca_model = PCA(n_components=3)
pca_embeddings = pca_model.fit_transform(mat)
df_total['pca_x'] = pca_embeddings[:, 0]
df_total['pca_y'] = pca_embeddings[:, 1]
df_total['pca_z'] = pca_embeddings[:, 2]

# t-SNE (3D)
tsne_model = TSNE(n_components=3, random_state=42)
tsne_embeddings = tsne_model.fit_transform(mat)
df_total['tsne_x'] = tsne_embeddings[:, 0]
df_total['tsne_y'] = tsne_embeddings[:, 1]
df_total['tsne_z'] = tsne_embeddings[:, 2]

# HDBSCAN Clustering
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=3, cluster_selection_epsilon=0.5)
hdbscan_labels = hdbscan_clusterer.fit_predict(umap_embeddings)  # Clustering on UMAP reduced space
df_total['hdbscan_cluster_id'] = hdbscan_labels

# KMeans Clustering (on original embeddings)
kmeans_model = KMeans(n_clusters=n_kmeans_clusters, random_state=42)
kmeans_labels = kmeans_model.fit_predict(mat)
df_total['kmeans_cluster_id'] = kmeans_labels

# Save DataFrame with all results to JSON
output_path = s_root + s_db_table_hdbscan_json
df_total.to_json(output_path, orient='records')
print(f"3D UMAP, PCA, t-SNE and clustering results (HDBSCAN and KMeans) saved to {output_path}")


  warn(
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



3D UMAP, PCA, t-SNE and clustering results (HDBSCAN and KMeans) saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis/Data/review_db_table_hdbscan.json


## Name the Clusters

In [52]:
import os
import json
import numpy as np
import pandas as pd
import umap
import hdbscan
from sklearn.metrics.pairwise import cosine_distances

# Define paths and parameters
s_root = r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis/'
s_db_table_hdbscan_json = 'Data/review_db_table_hdbscan.json'
keyword = 'zombie'  # Example keyword for similarity calculation

# Load precomputed data
df_total = pd.read_json(s_root + s_db_table_hdbscan_json, orient='records')
mat = np.array(df_total['embedding'].tolist())

# Step 1: Find Representative Topics for Each Cluster
cluster_names = {}
unique_clusters = df_total['kmeans_cluster_id'].unique()

# Define a prompt template for cluster naming
prompt_template_cluster_naming = PromptTemplate.from_template(
'''Based on the following topics, generate a concise name (5 words or fewer) that best describes the general theme of this cluster.

TOPICS: {topics}
CLUSTER NAME: '''
)

def find_representative_topics(cluster_id, df, mat, max_topics=8):
    """Finds up to max_topics representative topics based on centroid proximity."""
    cluster_data = df[df['kmeans_cluster_id'] == cluster_id]
    cluster_embeddings = np.array(cluster_data['embedding'].tolist())
    centroid = np.mean(cluster_embeddings, axis=0)
    distances = cosine_distances([centroid], cluster_embeddings).flatten()
    closest_indices = np.argsort(distances)[:max_topics]
    return cluster_data.iloc[closest_indices]['topic'].tolist()

# Step 2: Generate Cluster Names Using LLM
for cluster_id in unique_clusters:
    topics = find_representative_topics(cluster_id, df_total, mat)
    prompt = f"Generate a concise name (5 words or fewer) for a cluster with these topics: {', '.join(topics)}"
    
    # API call to generate cluster names
    def generate_cluster_name(topics_list):
        # Format topics as a comma-separated string
        topics = ", ".join(topics_list)
    
        # Generate the prompt for the cluster name
        prompt_cluster_naming = prompt_template_cluster_naming.format(topics=topics)
        
        # API call to OpenAI's completion model
        cluster_name_response = client.chat.completions.create(
            model=chat_model_name,
            messages=[
                {"role": "system", "content": "You are an expert at summarizing topics into concise names."},
                {"role": "user", "content": prompt_cluster_naming},
            ],
            max_tokens=100  # Adjust tokens to limit response length
        )
    
        # Extract and return the generated cluster name
        cluster_name = cluster_name_response.choices[0].message.content.strip()
        return cluster_name
    
    cluster_name = generate_cluster_name(prompt)
    cluster_names[cluster_id] = cluster_name

# Step 3: Save Cluster Names to JSON
df_total['cluster_name'] = df_total['kmeans_cluster_id'].map(cluster_names)
df_total.to_json(s_root + s_db_table_hdbscan_json, orient='records')
print(f"Cluster names saved to {s_root + s_db_table_hdbscan_json}")


2024-11-14 13:47:25,322 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-14 13:47:25,823 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-14 13:47:26,292 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-14 13:47:26,945 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-14 13:47:27,436 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-14 13:47:27,883 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-14 13:47:28,462 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-14 13:47:28,918 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-14 13:47:31,298 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "

Cluster names saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis/Data/review_db_table_hdbscan.json


In [51]:
df_total.head()

Unnamed: 0,Please rate your overall experience playing Into the Dead: Our Darkest Days,Please tell us why you chose the rating above:,"If you had a magic wand and you could change, add, or remove anything from the game, what would it be and why?",Had you heard of Into the Dead before this demo?,What is your age group,What is your gender?,What are your favourite Steam games you have played in the last 3 months?,ID,language,topic,...,umap_y,umap_z,pca_x,pca_y,pca_z,tsne_x,tsne_y,tsne_z,hdbscan_cluster_id,kmeans_cluster_id
0,10,"Nice story,nice game","when move to another shelter, suplies which di...",Heard of it,18-24,Man,"CS2, callisto protocal, lockdown protocal",2,english,Story,...,11.016189,10.425184,0.028941,-0.02723,0.110504,11.269978,9.573381,-10.489752,86,5
1,10,"Nice story,nice game","when move to another shelter, suplies which di...",Heard of it,18-24,Man,"CS2, callisto protocal, lockdown protocal",2,english,Shelter,...,6.057717,11.481344,0.084692,-0.370481,0.150167,-1.712859,-23.50738,8.990909,92,5
2,10,"Nice story,nice game","when move to another shelter, suplies which di...",Heard of it,18-24,Man,"CS2, callisto protocal, lockdown protocal",2,english,Supplies,...,6.911914,8.666392,-0.027137,-0.029577,0.237652,-16.929251,2.47029,4.071044,90,13
3,8,"Reminds me a lot of This War of Mine, a game I...",Need a tutorial or guide because I got stuck a...,Never heard of it,45-54,Woman,"Cozy Grove, Death Stranding, I am Future",3,english,Graphics,...,3.590627,31.203501,0.199391,0.378013,0.071529,15.819509,17.81571,6.263166,8,1
4,8,"Reminds me a lot of This War of Mine, a game I...",Need a tutorial or guide because I got stuck a...,Never heard of it,45-54,Woman,"Cozy Grove, Death Stranding, I am Future",3,english,Tutorial,...,-6.434734,8.722463,0.026106,0.075713,0.176829,7.549045,20.893099,22.751272,11,14


## Embedd the sentence not the topic "title"

In [None]:
import gc
import json
import os
import numpy as np
import torch

# Paths and parameters
s_root = r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis/'
s_db_json_sample = 'Data/sample_survey_results_with_topics.json'  # Input JSON file
s_db_embed_json = 'Data/review_db_table_with_sentence_embeddings.json'  # Output JSON file with embeddings
batch_size = 10  # Adjust based on available memory and dataset size
b_embedding = True
b_override = True  # Set to True if you want to overwrite existing embeddings

# Define your sentence embedding function
def index_embedding(sentence):
    # Placeholder for sentence embedding; replace with actual embedding model as needed
    return np.random.rand(300).tolist()  # Assuming 300 dimensions for embedding

# Load data and process in batches
if b_embedding:
    with open(os.path.join(s_root, s_db_json_sample), 'r', encoding='utf-8') as f:
        d_review_output = json.load(f)
        print('Loaded JSON data')

    for batch_start in range(0, len(d_review_output), batch_size):
        batch_end = min(batch_start + batch_size, len(d_review_output))
        batch = d_review_output[batch_start:batch_end]
        
        for i, review_entry in enumerate(batch):
            print(f"Processing Review {i + batch_start + 1}")
            
            if isinstance(review_entry, dict) and 'topics' in review_entry and isinstance(review_entry['topics'], list):
                d_topics = review_entry['topics']
                
                for d_topic in d_topics:
                    if isinstance(d_topic, dict):
                        if 'embedding' not in d_topic or b_override:
                            if 'sentence' in d_topic:  # Embed the sentence instead of the topic
                                d_topic['embedding'] = index_embedding(d_topic['sentence'])
                                
                                # Release memory
                                torch.cuda.empty_cache()
                                gc.collect()
                            else:
                                d_topic['embedding'] = 0
                print('.', end='')
        print(f"\nBatch {batch_start // batch_size + 1} processed.")

    # Save updated JSON with embeddings
    with open(os.path.join(s_root, s_db_embed_json), 'w', encoding='utf-8') as f:
        json.dump(d_review_output, f)
    print("Embeddings saved.")


In [None]:
#Check the results (why were they so fast?)