In [15]:
import pandas as pd
import json

input_path = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\Backup\DRS_NextFest\db_final.json'
with open(input_path, "r") as f:
    data = json.load(f)
df = pd.DataFrame(data)
df.head()

Unnamed: 0,pp_id,document,Collector ID,Custom Data 1,Email Address,First Name,Had you heard of Into the Dead before this demo?,IP Address,Last Name,Please rank what matters most to you about Into the Dead: Our Darkest Days:,...,session_count,session_length,spending,test_group,topic,version,hdbscan_id,hdbscan_UMAP_2D_x,hdbscan_UMAP_2D_y,hdbscan_id_name
0,next_fest_1,Stealth kills from cover.,459707592,,,,Never heard of it,,,5.0,...,10.0,5880.0,,,Stealth Mechanics,v0.1.7901,13,19.438429,5.584411,Stealth Zombie Kill Mechanics
1,next_fest_2,Random events,459707592,,,,Never heard of it,,,3.0,...,4.0,4814.0,,,Random Events,,2,7.944154,0.386788,Dynamic Randomization in Gameplay
2,next_fest_3,base defense upgrades,459707592,,,,Never heard of it,,,3.0,...,4.0,4814.0,,,Base Defense Upgrades,,-1,4.001101,6.561998,Noise
3,next_fest_4,weapon upgrades on basic weapons,459707592,,,,Never heard of it,,,3.0,...,4.0,4814.0,,,Weapon Upgrades,,1,6.407705,-9.599965,Weapon Customization and Upgrades
4,next_fest_5,The option to make your own notes or marks on ...,459707592,,,,Heard of it,,,3.0,...,7.0,5572.0,,,Map Note Feature,,4,0.132544,5.202775,Loot Tracking Enhancements


In [16]:
def generate_hard_facts(df):
    """
    Summarize a DataFrame containing columns:
      - 'sentence' : the user/player statement (str)
      - 'sentiment': the sentiment label (e.g. 'positive', 'negative', 'inconclusive')
      - 'hdbscan_id_name': name of the cluster
      - 'topic' : a short "headline" or summary for each statement

    Returns a DataFrame with:
      - total_data_points
      - sentiment counts (positive, negative, inconclusive)
      - a list of all unique topics for that cluster
    """

    # 1. Count the total data points (statements) per cluster
    cluster_counts = (
        df.groupby('hdbscan_id_name', dropna=False)['sentence']
          .count()
          .reset_index(name='total_data_points')
    )

    # 2. Count how many are positive, negative, etc. in each cluster
    sentiment_counts = (
        df.groupby(['hdbscan_id_name', 'sentiment'])['sentence']
          .count()
          .reset_index(name='count')
    )

    # 3. Pivot the sentiment counts so each cluster is one row, with separate sentiment columns
    sentiment_pivot = (
        sentiment_counts
        .pivot_table(index='hdbscan_id_name',
                     columns='sentiment',
                     values='count',
                     fill_value=0)
        .reset_index()
    )

    # 4. Merge the pivot back with the total cluster counts
    cluster_summary_df = cluster_counts.merge(
        sentiment_pivot,
        on='hdbscan_id_name',
        how='left'
    )
    # 4.5. Exclude "Noise" cluster
    cluster_summary_df = cluster_summary_df[cluster_summary_df['hdbscan_id_name'] != "Noise"]

    # 5. Rename the sentiment columns for clarity (if they exist)
    rename_map = {}
    for col in ['positive', 'negative', 'inconclusive']:
        if col in cluster_summary_df.columns:
            rename_map[col] = f"{col}_count"
    cluster_summary_df.rename(columns=rename_map, inplace=True)

    # 6. Get all unique topic values per cluster
    topics_per_cluster = (
        df.groupby('hdbscan_id_name')['topic']
          .unique()
          .reset_index(name='unique_topics')
    )

    # 7. Merge unique topics into the cluster summary
    cluster_summary_df = cluster_summary_df.merge(
        topics_per_cluster,
        on='hdbscan_id_name',
        how='left'
    )

    # Sort by cluster name if you like (optional)
    cluster_summary_df.sort_values(by='hdbscan_id_name', inplace=True)

    return cluster_summary_df


In [17]:
# Generate the cluster summary
summary_df = generate_hard_facts(df)

# Print or do further processing
print(summary_df.head())

                             hdbscan_id_name  total_data_points  Inconclusive  \
0  "Enhanced Movement and Animation Quality"                 21           9.0   
1              Auto Weapon Management System                 19          14.0   
2           Backpack and Inventory Expansion                 26          21.0   
3            Character Customization Options                 21          14.0   
4   Character Interactions and Relationships                 42          27.0   

   Negative  Positive                                      unique_topics  
0       7.0       5.0  [Performance Issues, Experience Variety, Game ...  
1       4.0       1.0  [Weapon Slot, Quick Switch Functionality, Auto...  
2       4.0       1.0  [Inventory Management, Backpack Crafting, Back...  
3       4.0       3.0  [Character Customization, Character Variety, T...  
4       8.0       7.0  [Language Option, Character Interaction, Chara...  


# AI Summary

In [14]:

import openai
import os
import logging
from dotenv import dotenv_values
import json

from helper.utils import configure_api
d = dotenv_values()
for k in d.keys():
    os.environ[k] = d[k]

# General modules

# Setup API keys
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

# Specify models
chat_model_name = 'gpt-4o-mini'
openai_embedding_model = "text-embedding-3-small"
local_embedding_model = "all-MiniLM-L6-v2"

configure_api(client, chat_model_name)

# Specify paths for storing (backup) data
root_dir = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\Backup'
project = 'HRC_Survey_T3_2024'

# Setup the logger
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

logging.getLogger("httpx").setLevel(logging.ERROR)      # Supress API HTTP request logs

In [15]:
import pandas as pd
from helper.prompt_templates import *
from helper.utils import api_settings


# Initialize global token counters
prompt_tokens = 0
completion_tokens = 0

def track_tokens(response):
    """
    Updates the global token counters based on the API response.

    Args:
        response: The API response containing token usage.
    """
    global prompt_tokens, completion_tokens
    prompt_tokens += response.usage.prompt_tokens
    completion_tokens += response.usage.completion_tokens


def generate_cluster_report(
    df: pd.DataFrame
):
        # 1. Count the total data points (statements) per cluster
    cluster_counts = (
        df.groupby('hdbscan_id_name', dropna=False)['sentence']
          .count()
          .reset_index(name='number_of_statements')
    )

    # 2. Count how many are positive, negative, etc. in each cluster
    sentiment_counts = (
        df.groupby(['hdbscan_id_name', 'sentiment'])['sentence']
          .count()
          .reset_index(name='count')
    )

    # 3. Pivot the sentiment counts so each cluster is one row, with separate sentiment columns
    sentiment_pivot = (
        sentiment_counts
        .pivot_table(index='hdbscan_id_name',
                     columns='sentiment',
                     values='count',
                     fill_value=0)
        .reset_index()
    )

    # 4. Merge the pivot back with the total cluster counts
    cluster_summary_df = cluster_counts.merge(
        sentiment_pivot,
        on='hdbscan_id_name',
        how='left'
    )
    # 4.5. Exclude "Noise" cluster
    cluster_summary_df = cluster_summary_df[cluster_summary_df['hdbscan_id_name'] != "Noise"]

    # 5. Rename the sentiment columns for clarity (if they exist)
    rename_map = {}
    for col in ['positive', 'negative', 'inconclusive']:
        if col in cluster_summary_df.columns:
            rename_map[col] = f"{col}_count"
    cluster_summary_df.rename(columns=rename_map, inplace=True)


    # Sort by total_data_points DESCENDING so the largest cluster is first
    cluster_summary_df.sort_values(by='number_of_statements', ascending=False, inplace=True)


    # We'll accumulate everything into one big string of Markdown
    markdown_report = "# Cluster Report\n\n"

    # Create a dict mapping cluster -> list of statements
    cluster_to_statements = (
        df.groupby('hdbscan_id_name')['sentence']
          .apply(list)
          .to_dict()
    )

    for _, row in cluster_summary_df.iterrows():
        cluster_name = row['hdbscan_id_name']

        total_points = row['number_of_statements']
        positive_count = int(row.get('Positive', 0))
        negative_count = int(row.get('Negative', 0))
        inconclusive_count = int(row.get('Inconclusive', 0))

        # ---------------------------------------------------
        # 2a) Write "hard facts" as a small Markdown table
        # ---------------------------------------------------
        markdown_report += f"## Cluster: {cluster_name}\n\n"
        markdown_report += "### Hard Facts\n\n"
        markdown_report += "| Metric              | Value |\n"
        markdown_report += "|---------------------|-------|\n"
        markdown_report += f"| **Total Statements**       | {total_points} |\n"
        markdown_report += f"| **Positive Count**          | {positive_count} |\n"
        markdown_report += f"| **Negative Count**          | {negative_count} |\n"
        markdown_report += f"| **Inconclusive Count**      | {inconclusive_count} |\n\n"

        # ---------------------------------------------------
        # 2b) Get statements and call the OpenAI API for summary
        # ---------------------------------------------------
        statements_for_this_cluster = cluster_to_statements[cluster_name]
        # If many statements, consider chunking or sampling
        statements_text = "\n".join(f"- {s}" for s in statements_for_this_cluster)

#######################
        prompt_topic = prompt_template_summary_short.format(cluster_name=cluster_name, statements =statements_text, video_game = "Rival Stars Horse Racing")
        logger.info(f"Generate AI summary for cluster {cluster_name}")

        try:
            response = api_settings["client"].chat.completions.create(
                model=api_settings["model"],
                messages=[
                    {"role": "system", "content": "You are an expert summarizing user statements for a video game."},
                    {"role": "user", "content": prompt_topic},
                ]
            )
            track_tokens(response)
            summary_text = response.choices[0].message.content.strip()
            logger.info(f"Total tokens used: {prompt_tokens + completion_tokens}")

        except Exception as e:
            logger.error(f"Error summarizing cluster {cluster_name}: {e}")
            return {"error": str(e)}



        # ---------------------------------------------------
        # 2c) Append the summary to the Markdown
        # ---------------------------------------------------
        markdown_report += "### Key Insights\n\n"
        markdown_report += summary_text + "\n\n"
        markdown_report += "---\n\n"

    # ---------------------------------------------------
    # STEP 3: Write the final Markdown to file
    # ---------------------------------------------------
    logger.info("Markdown Report has been written.")

    return markdown_report


In [16]:
input_path = os.path.join(root_dir, project, "db_final.json")
with open(input_path, "r") as f:
    data = json.load(f)
df = pd.DataFrame(data)
df.head()

Unnamed: 0,pp_id,document,"10_On a scale from 1-5, how has the Crossbreeding and Traits affected your OVERALL experience?",10_Optional] Please explain why you rated your experience with Crossbreeding & Traits this way.,"11_In the past 3 months, about how much money have you spent on mobile games?",12_In App Purchases (paying for exclusive content),"12_None of the above_In the past 3 months, have you spent real money to purchase the following in mobile games? (Select all that apply)",12_Premium (pay upfront for the entire game),12_Remove ads (pay once to remove in-game ads),"12_Subscriptions (paying weekly, monthly, or annually for content)",...,spending,standard_team_races,test_group,topic,hdbscan_id,hdbscan_UMAP_2D_x,hdbscan_UMAP_2D_y,hdbscan_tSNE_2D_x,hdbscan_tSNE_2D_y,hdbscan_id_name
0,HRC_survey_t3_2024_1,"Takes too long without star club, it's just wa...",(5) Made the game much better,I like any customization and strategy,$50+ USD,In App Purchases (paying for exclusive content),,,,,...,245.355994,505.0,,Star Club Duration,11,6.391001,16.935848,-21.588978,63.965237,Equal Access for All Players
1,HRC_survey_t3_2024_2,"In-app photo contests, voting like in Design g...",(5) Made the game much better,I like any customization and strategy,$50+ USD,In App Purchases (paying for exclusive content),,,,,...,245.355994,505.0,,Photo Contests and Voting,-1,10.835783,-1.045431,-8.794202,25.302008,Noise
2,HRC_survey_t3_2024_3,Ways to earn prizes for breeding horses for ae...,(5) Made the game much better,I like any customization and strategy,$50+ USD,In App Purchases (paying for exclusive content),,,,,...,245.355994,505.0,,Breeding Aesthetic Rewards,-1,8.322358,0.338659,7.157442,-8.243853,Noise
3,HRC_survey_t3_2024_4,More cross country.,(5) Made the game much better,I like any customization and strategy,$50+ USD,In App Purchases (paying for exclusive content),,,,,...,245.355994,505.0,,Cross Country Events,19,15.967313,4.1564,7.11719,-39.889538,Race Preferences: Flat vs. Cross-Country
4,HRC_survey_t3_2024_5,Sell a horse (because stable is maxed): view a...,(5) Made the game much better,I like any customization and strategy,$50+ USD,In App Purchases (paying for exclusive content),,,,,...,245.355994,505.0,,Horse Selling and Stats View,-1,1.321768,3.327364,26.394068,21.343136,Noise


In [17]:
markdown_report = generate_cluster_report(df)

# save the report as markdown
output_path = os.path.join(root_dir, project, "AI_report.md")

2025-03-18 14:21:15,584 - INFO - Generate AI summary for cluster Horse Training Time Frustration
2025-03-18 14:21:20,133 - INFO - Total tokens used: 3581
2025-03-18 14:21:20,133 - INFO - Generate AI summary for cluster Fun Factor and Enjoyment
2025-03-18 14:21:23,011 - INFO - Total tokens used: 6166
2025-03-18 14:21:23,013 - INFO - Generate AI summary for cluster Gold Earning Opportunities and Costs
2025-03-18 14:21:25,053 - INFO - Total tokens used: 8793
2025-03-18 14:21:25,053 - INFO - Generate AI summary for cluster Equestrian Sports and Disciplines
2025-03-18 14:21:26,558 - INFO - Total tokens used: 11226
2025-03-18 14:21:26,558 - INFO - Generate AI summary for cluster Enhanced Free Roam Experience
2025-03-18 14:21:28,561 - INFO - Total tokens used: 13491
2025-03-18 14:21:28,564 - INFO - Generate AI summary for cluster Equal Access for All Players
2025-03-18 14:21:30,496 - INFO - Total tokens used: 15717
2025-03-18 14:21:30,496 - INFO - Generate AI summary for cluster Horse Bonding

In [18]:
# save markdown report
with open(output_path, "w", encoding="utf-8") as f:
    f.write(markdown_report)


# AI Summary
### Indivuaidual subPrompts

In [6]:
import pandas as pd
from helper.prompt_templates import *
from helper.utils import api_settings


# Initialize global token counters
prompt_tokens = 0
completion_tokens = 0

def track_tokens(response):
    """
    Updates the global token counters based on the API response.

    Args:
        response: The API response containing token usage.
    """
    global prompt_tokens, completion_tokens
    prompt_tokens += response.usage.prompt_tokens
    completion_tokens += response.usage.completion_tokens


# Initialize logger
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Initialize global token counters
prompt_tokens = 0
completion_tokens = 0

def track_tokens(response):
    """
    Updates the global token counters based on the API response.
    """
    global prompt_tokens, completion_tokens
    prompt_tokens += response.usage.prompt_tokens
    completion_tokens += response.usage.completion_tokens


def generate_cluster_report(df: pd.DataFrame):

    # ---------------------------------------------------
    # STEP 1: Build the "hard facts" table for each cluster
    # ---------------------------------------------------
    cluster_counts = (
        df.groupby('hdbscan_id_name', dropna=False)['sentence']
          .count()
          .reset_index(name='total_data_points')
    )

    sentiment_counts = (
        df.groupby(['hdbscan_id_name', 'sentiment'])['sentence']
          .count()
          .reset_index(name='count')
    )

    sentiment_pivot = (
        sentiment_counts
        .pivot_table(index='hdbscan_id_name',
                     columns='sentiment',
                     values='count',
                     fill_value=0)
        .reset_index()
    )

    cluster_summary_df = cluster_counts.merge(
        sentiment_pivot,
        on='hdbscan_id_name',
        how='left'
    )

    # Exclude "Noise" if desired
    cluster_summary_df = cluster_summary_df[cluster_summary_df['hdbscan_id_name'] != "Noise"]

    rename_map = {}
    for col in ['positive', 'negative', 'inconclusive']:
        if col in cluster_summary_df.columns:
            rename_map[col] = f"{col}_count"
    cluster_summary_df.rename(columns=rename_map, inplace=True)

    # Sort by total_data_points DESCENDING so the largest cluster is first
    cluster_summary_df.sort_values(by='total_data_points', ascending=False, inplace=True)

    logger.info("Data is structured, summarizing clusters...")

    markdown_report = "# Cluster Report\n\n"

    cluster_to_statements = (
        df.groupby('hdbscan_id_name')['sentence']
          .apply(list)
          .to_dict()
    )

    for _, row in cluster_summary_df.iterrows():
        cluster_name = row['hdbscan_id_name']

        total_points = int(row.get('total_data_points', 0))
        positive_count = int(row.get('Positive', 0))
        negative_count = int(row.get('Negative', 0))
        inconclusive_count = int(row.get('Inconclusive', 0))

        # ---------------------------------------------------
        # Hard facts table
        # ---------------------------------------------------
        markdown_report += f"## Cluster: {cluster_name}\n\n"
        markdown_report += "### Hard Facts\n\n"
        sentiment_distribution = (f"| Metric              | Value |\n"
                                  f"|---------------------|-------|\n"
                                  f"| **Total player statements**       | {total_points} |\n"
                                  f"| **Positive statements**          | {positive_count} |\n"
                                  f"| **Negative statements**          | {negative_count} |\n"
                                  f"| **Inconclusive statements**      | {inconclusive_count} |\n\n")

        markdown_report += sentiment_distribution


        # ---------------------------------------------------
        # Prepare statements text & define sub-prompts
        # ---------------------------------------------------
        statements_for_this_cluster = cluster_to_statements[cluster_name]
        statements_text = "\n".join(f"- {s}" for s in statements_for_this_cluster)

        # Create each sub-prompt
        prompt_short_summary = prompt_template_summary_short.format(
            video_game="Into the Dead",
            cluster_name=cluster_name,
            statements=statements_text
        )

        prompt_pain_points = prompt_template_pain_points.format(
            video_game="Into the Dead",
            cluster_name=cluster_name,
            statements=statements_text
        )

        prompt_highlights = prompt_template_highlights.format(
            video_game="Into the Dead",
            cluster_name=cluster_name,
            statements=statements_text
        )

        prompt_summary = prompt_template_summary.format(
            video_game="Into the Dead",
            cluster_name=cluster_name,
            statements=statements_text,
            sentiment_distribution = sentiment_distribution
        )

        # ---------------------------------------------------
        # Make the LLM calls
        # ---------------------------------------------------
        try:
            # logger.info(f"Generate short summary for cluster {cluster_name}")
            # response_short = api_settings["client"].chat.completions.create(
            #     model=api_settings["model"],
            #     messages=[
            #         {"role": "system", "content": "You are an expert summarizing user statements for a video game."},
            #         {"role": "user", "content": prompt_short_summary},
            #     ]
            # )
            # track_tokens(response_short)
            # summary_short_text = response_short.choices[0].message.content.strip()

            logger.info(f"Generate AI summary for cluster {cluster_name}")
            response = api_settings["client"].chat.completions.create(
                model=api_settings["model"],
                messages=[
                    {"role": "system", "content": "You are an expert summarizing user statements for a video game."},
                    {"role": "user", "content": prompt_summary},
                ]
            )
            track_tokens(response)
            summary_text = response.choices[0].message.content.strip()
            logger.info(f"Total tokens used: {prompt_tokens + completion_tokens}")

            # logger.info(f"Generate pain points for cluster {cluster_name}")
            # response_pain = api_settings["client"].chat.completions.create(
            #     model=api_settings["model"],
            #     messages=[
            #         {"role": "system", "content": "You are an expert summarizing user statements for a video game."},
            #         {"role": "user", "content": prompt_pain_points},
            #     ]
            # )
            # track_tokens(response_pain)
            # pain_points_text = response_pain.choices[0].message.content.strip()
            #
            # logger.info(f"Generate positive highlights for cluster {cluster_name}")
            # response_highlights = api_settings["client"].chat.completions.create(
            #     model=api_settings["model"],
            #     messages=[
            #         {"role": "system", "content": "You are an expert summarizing user statements for a video game."},
            #         {"role": "user", "content": prompt_highlights},
            #     ]
            # )
            # track_tokens(response_highlights)
            # highlights_text = response_highlights.choices[0].message.content.strip()

        except Exception as e:
            logger.error(f"Error summarizing cluster {cluster_name}: {e}")
            return {"error": str(e)}

        # ---------------------------------------------------
        # Append sub-sections to the Markdown
        # ---------------------------------------------------
        # markdown_report += "### Brief\n\n"
        # markdown_report += summary_short_text + "\n\n"

        markdown_report += "### Summary\n\n"
        markdown_report += summary_text + "\n\n"

        markdown_report += "---\n\n"

    logger.info("Markdown Report has been generated.")
    return markdown_report



In [7]:
input_path = os.path.join(root_dir, project, "db_final.json")
with open(input_path, "r") as f:
    data = json.load(f)
df = pd.DataFrame(data)
markdown_report = generate_cluster_report(df)

# save the report as markdown
output_path = os.path.join(root_dir, project, "AI_report.md")

2025-03-17 11:50:24,032 - INFO - Data is structured, summarizing clusters...
2025-03-17 11:50:24,032 - INFO - Generate AI summary for cluster Character Interactions and Relationships
2025-03-17 11:50:29,253 - INFO - Total tokens used: 1116
2025-03-17 11:50:29,261 - INFO - Generate AI summary for cluster Weapon Customization and Upgrades
2025-03-17 11:50:36,481 - INFO - Total tokens used: 2021
2025-03-17 11:50:36,481 - INFO - Generate AI summary for cluster Combat Evasion and Defense Mechanics
2025-03-17 11:50:40,017 - INFO - Total tokens used: 2823
2025-03-17 11:50:40,033 - INFO - Generate AI summary for cluster Loot Tracking Enhancements
2025-03-17 11:50:46,568 - INFO - Total tokens used: 3910
2025-03-17 11:50:46,568 - INFO - Generate AI summary for cluster Shelter Management and Expansion
2025-03-17 11:50:52,913 - INFO - Total tokens used: 4929
2025-03-17 11:50:52,913 - INFO - Generate AI summary for cluster Dynamic Randomization in Gameplay
2025-03-17 11:50:57,237 - INFO - Total tok

In [8]:
# save markdown report
with open(output_path, "w", encoding="utf-8") as f:
    f.write(markdown_report)

# Big Picture Summary

In [112]:
def generate_big_picture_summary(df: pd.DataFrame) -> str:
    """
    Produces a Markdown string with overall stats (time range, top clusters, etc.).
    - Calculates top negative/positive clusters by **percentage**.
    - Ensures total size is calculated correctly (excluding noise).
    - Formats percentages to **one decimal place** with a `%` sign.
    """

    # Ensure timestamps are datetime (if not already)
    if not pd.api.types.is_datetime64_any_dtype(df['pp_timestamp']):
        df['pp_timestamp'] = pd.to_datetime(df['pp_timestamp'], errors='coerce')

    # Aggregate cluster data
    cluster_counts = df.groupby('hdbscan_id_name', dropna=False)['sentence'].count().reset_index(name='total_data_points')

    # Summaries of sentiment distribution
    sentiment_counts = df.groupby(['hdbscan_id_name', 'sentiment'])['sentence'].count().reset_index(name='count')
    pivot_sent = (
        sentiment_counts
        .pivot_table(index='hdbscan_id_name', columns='sentiment', values='count', fill_value=0)
        .reset_index()
    )

    cluster_summary_df = cluster_counts.merge(pivot_sent, on='hdbscan_id_name', how='left')


    # Convert numeric columns to int
    for col in ['total_data_points', 'Positive', 'Negative', 'Inconclusive']:
        if col in cluster_summary_df.columns:
            cluster_summary_df[col] = cluster_summary_df[col].astype(int, errors='ignore')



    # Compute sentiment percentages, rounded to one decimal place
    cluster_summary_df['negative_percentage'] = (
        cluster_summary_df['Negative'] / cluster_summary_df['total_data_points'] * 100
    ).fillna(0).round(1)

    cluster_summary_df['positive_percentage'] = (
        cluster_summary_df['Positive'] / cluster_summary_df['total_data_points'] * 100
    ).fillna(0).round(1)

    # 2) Count requests
    request_df = (
        df[df['category'] == 'request']
        .groupby('hdbscan_id_name')['sentence']
        .count()
        .reset_index(name='request_count')
    )
    cluster_summary_df = cluster_summary_df.merge(request_df, on='hdbscan_id_name', how='left')
    cluster_summary_df['request_count'] = cluster_summary_df['request_count'].fillna(0).astype(int)

    # Exclude Noise for ranking
    non_noise_df = cluster_summary_df[cluster_summary_df['hdbscan_id_name'] != "Noise"]

    # Identify Noise Cluster Size
    noise_count = cluster_summary_df.loc[cluster_summary_df['hdbscan_id_name'] == "Noise", 'total_data_points'].values[0] if "Noise" in cluster_summary_df['hdbscan_id_name'].values else 0

    # Top clusters sorted by **percentage** sentiment
    top_3_neg = non_noise_df.sort_values(by='negative_percentage', ascending=False).head(5)
    top_3_pos = non_noise_df.sort_values(by='positive_percentage', ascending=False).head(5)

    top_3_req = non_noise_df.sort_values(by='request_count', ascending=False).head(5)
    top_5_overall = non_noise_df.sort_values(by='total_data_points', ascending=False).head(5)

    # Convert tables to Markdown string format
    def table_to_md(table_df, columns, percentage_cols=None):
        """
        Converts a DataFrame into a Markdown table.
        - Rounds and adds a `%` sign to specified percentage columns.
        """
        md_table = "| " + " | ".join(columns) + " |\n"
        md_table += "|-" + "-|-".join(["-" * len(col) for col in columns]) + "-|\n"
        for _, row in table_df.iterrows():
            row_values = []
            for col in columns:
                if percentage_cols and col in percentage_cols:
                    row_values.append(f"{row[col]:.1f}%")  # Format percentage with one decimal
                else:
                    row_values.append(f"{row[col]}")
            md_table += "| " + " | ".join(row_values) + " |\n"
        return md_table

    # Markdown generation
    markdown_report = "# Big Picture Report\n\n"

    # General Stats
    markdown_report += f" **Data Source:** {project}\n\n"
    markdown_report += f" **Time Range:** {df['pp_timestamp'].min()} **-** {df['pp_timestamp'].max()}\n\n"
    markdown_report += f" **Total Statements:** {len(df)}, Noise: {noise_count}\n\n"

    # Top 5 Negative Clusters**
    markdown_report += "### Top 5 Negative Clusters \n\n"
    markdown_report += table_to_md(top_3_neg, ['hdbscan_id_name', 'negative_percentage'], percentage_cols=['negative_percentage']) + "\n\n"

    # Top 5 Positive Clusters**
    markdown_report += "### Top 5 Positive Clusters \n\n"
    markdown_report += table_to_md(top_3_pos, ['hdbscan_id_name', 'positive_percentage'], percentage_cols=['positive_percentage']) + "\n\n"

    # Top 5 request clusters
    markdown_report += "### Top 5 Request Clusters \n\n"
    markdown_report += table_to_md(top_3_req, ['hdbscan_id_name', 'request_count']) + "\n\n"

    # Top 5 clusters overall
    markdown_report += "### Top 5 Clusters\n\n"
    markdown_report += table_to_md(top_5_overall, ['hdbscan_id_name', 'total_data_points']) + "\n\n"

    logger.info("Big Picture has been generated.")

    return markdown_report


In [113]:
test = generate_big_picture_summary(df)

2025-03-17 16:41:26,837 - INFO - Big Picture has been generated.


In [114]:
output_path = os.path.join(root_dir, project, "BigPicture_report.md")
# save markdown report
with open(output_path, "w", encoding="utf-8") as f:
    f.write(test)

# Big Picture with LLM

In [10]:

import openai
import os
import logging
from dotenv import dotenv_values
import json

from helper.utils import configure_api
d = dotenv_values()
for k in d.keys():
    os.environ[k] = d[k]

# General modules

# Setup API keys
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

# Specify models
chat_model_name = 'gpt-4o-mini'
openai_embedding_model = "text-embedding-3-small"
local_embedding_model = "all-MiniLM-L6-v2"

configure_api(client, chat_model_name)

# Specify paths for storing (backup) data
root_dir = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\Backup'
data_source = 'DRS_Next_Fest'

# Setup the logger
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

logging.getLogger("httpx").setLevel(logging.ERROR)      # Supress API HTTP request logs

In [11]:
import pandas as pd
import logging
import random

logger = logging.getLogger(__name__)

# Global token counters for demonstration
prompt_tokens = 0
completion_tokens = 0

def track_tokens(response):
    """
    Updates the global token counters based on the API response.
    """
    global prompt_tokens, completion_tokens
    prompt_tokens += response.usage.prompt_tokens
    completion_tokens += response.usage.completion_tokens

def generate_big_picture_summary_with_llm(df: pd.DataFrame) -> str:
    """
    Produces a Markdown string with overall stats tables AND
    a single LLM-based summary for the top N clusters in each category,
    using your existing prompt framework and api_settings for the OpenAI client.

    Unlike before, we do *not* summarize each cluster individually.
    Instead, we gather all statements from the top 5 clusters of each category,
    then produce a single summary per category.
    """

    from helper.prompt_templates import prompt_template_top5
    from helper.utils import api_settings

    # Ensure timestamps are datetime (if not already)
    if not pd.api.types.is_datetime64_any_dtype(df['pp_timestamp']):
        df['pp_timestamp'] = pd.to_datetime(df['pp_timestamp'], errors='coerce')

    # --- 1) Aggregate cluster data ---
    cluster_counts = (
        df.groupby('hdbscan_id_name', dropna=False)['sentence']
          .count()
          .reset_index(name='total_data_points')
    )

    # --- 2) Summaries of sentiment distribution ---
    sentiment_counts = (
        df.groupby(['hdbscan_id_name', 'sentiment'])['sentence']
          .count()
          .reset_index(name='count')
    )
    pivot_sent = (
        sentiment_counts
        .pivot_table(index='hdbscan_id_name', columns='sentiment',
                     values='count', fill_value=0)
        .reset_index()
    )

    cluster_summary_df = cluster_counts.merge(pivot_sent, on='hdbscan_id_name', how='left')

    # Convert numeric columns to int
    for col in ['total_data_points', 'Positive', 'Negative', 'Inconclusive']:
        if col in cluster_summary_df.columns:
            cluster_summary_df[col] = cluster_summary_df[col].fillna(0).astype(int)

    # Compute sentiment percentages
    cluster_summary_df['negative_percentage'] = (
        cluster_summary_df['Negative'] / cluster_summary_df['total_data_points'] * 100
    ).fillna(0).round(1)

    cluster_summary_df['positive_percentage'] = (
        cluster_summary_df['Positive'] / cluster_summary_df['total_data_points'] * 100
    ).fillna(0).round(1)

    # Count requests
    request_df = (
        df[df['category'] == 'request']
        .groupby('hdbscan_id_name')['sentence']
        .count()
        .reset_index(name='request_count')
    )
    cluster_summary_df = cluster_summary_df.merge(request_df, on='hdbscan_id_name', how='left')
    cluster_summary_df['request_count'] = cluster_summary_df['request_count'].fillna(0).astype(int)

    # Exclude Noise for ranking
    non_noise_df = cluster_summary_df[cluster_summary_df['hdbscan_id_name'] != "Noise"]

    # Identify Noise Cluster Size
    noise_count = 0
    if "Noise" in cluster_summary_df['hdbscan_id_name'].values:
        noise_count = cluster_summary_df.loc[
            cluster_summary_df['hdbscan_id_name'] == "Noise", 'total_data_points'
        ].values[0]

    # --- 3) Select top clusters to display in tables ---
    top_5_neg = non_noise_df.sort_values(by='negative_percentage', ascending=False).head(5)
    top_5_pos = non_noise_df.sort_values(by='positive_percentage', ascending=False).head(5)
    top_5_req = non_noise_df.sort_values(by='request_count', ascending=False).head(5)
    top_5_overall = non_noise_df.sort_values(by='total_data_points', ascending=False).head(5)

    # --- Helper: Convert DataFrame to Markdown ---
    def table_to_md(table_df, columns, percentage_cols=None):
        """
        Converts a DataFrame into a Markdown table.
        - Rounds and adds a '%' sign to specified percentage columns.
        """
        md_table = "| " + " | ".join(columns) + " |\n"
        md_table += "|-" + "-|-".join(["-" * len(col) for col in columns]) + "-|\n"
        for _, row in table_df.iterrows():
            row_values = []
            for col in columns:
                if percentage_cols and col in percentage_cols:
                    row_values.append(f"{row[col]:.1f}%")
                else:
                    row_values.append(f"{row[col]}")
            md_table += "| " + " | ".join(row_values) + " |\n"
        return md_table

    # --- 4) Summarize top clusters in a single prompt ---

    def summarize_topN_clusters(
        df: pd.DataFrame,
        top_df: pd.DataFrame,
        cluster_group: str,
        sentiment: str
    ) -> str:
        """
        Gathers ALL statements from the top 'N' clusters in top_df.
        If there are more than 150 statements, randomly sample 150.
        Then calls the LLM *once* to summarize them all together.
        """

        from helper.prompt_templates import prompt_template_top5
        from helper.utils import api_settings

        # 1) Collect all cluster names
        top_cluster_names = top_df['hdbscan_id_name'].unique().tolist()

        # 2) Gather all statements from these clusters
        mask = df['hdbscan_id_name'].isin(top_cluster_names)
        selected_statements = df.loc[mask, 'sentence'].tolist()

        # 3) If more than 150, randomly sample
        if len(selected_statements) > 150:
            logger.info(f"More than 150 statements in the cluster group {cluster_group}. Sampling 150.")
            selected_statements = random.sample(selected_statements, 150)
        else:
            logger.info(f"{cluster_group} has {len(selected_statements)} player statements. Continuing...")

        # 4) Build the prompt text
        statements_text = "\n".join(f"- {s}" for s in selected_statements)

        # 5) Format the prompt
        prompt_topic = prompt_template_top5.format(
            video_game="Into the Dead",
            cluster_group=cluster_group,
            sentiment=sentiment,
            statements=statements_text
        )

        logger.info(f"Generate AI summary for top 5 {cluster_group} clusters.")
        try:
            response = api_settings["client"].chat.completions.create(
                model=api_settings["model"],
                messages=[
                    {
                        "role": "system",
                        "content": "You are an expert summarizing user statements for a video game."
                    },
                    {
                        "role": "user",
                        "content": prompt_topic
                    }
                ]
            )
            track_tokens(response)
            summary_text = response.choices[0].message.content.strip()
            logger.info(f"Tokens used so far: {prompt_tokens + completion_tokens}")
            return summary_text

        except Exception as e:
            logger.error(f"Error summarizing top {cluster_group}: {e}")
            return f"Error summarizing top {cluster_group} clusters: {e}"

    # --- 5) Construct the Markdown Report ---
    markdown_report = "# Big Picture Report\n\n"
    markdown_report += f"**Data Source:** {data_source}\n\n"
    markdown_report += f"**Time Range:** {df['pp_timestamp'].min()} - {df['pp_timestamp'].max()}\n\n"
    markdown_report += f"**Total Statements:** {len(df)}, Noise: {noise_count}\n\n"

    # -- 5a) Top 5 Negative Clusters
    markdown_report += "### Top 5 Negative Clusters\n\n"
    markdown_report += table_to_md(
        top_5_neg,
        ['hdbscan_id_name', 'negative_percentage'],
        percentage_cols=['negative_percentage']
    ) + "\n\n"

    neg_summary = summarize_topN_clusters(
        df,
        top_5_neg,
        cluster_group="Negative",
        sentiment="negative"
    )
    markdown_report += f"**Summary for Top 5 Negative Clusters:**\n{neg_summary}\n\n"

    # -- 5b) Top 5 Positive Clusters
    markdown_report += "### Top 5 Positive Clusters\n\n"
    markdown_report += table_to_md(
        top_5_pos,
        ['hdbscan_id_name', 'positive_percentage'],
        percentage_cols=['positive_percentage']
    ) + "\n\n"

    pos_summary = summarize_topN_clusters(
        df,
        top_5_pos,
        cluster_group="Positive",
        sentiment="positive"
    )
    markdown_report += f"**Summary for Top 5 Positive Clusters:**\n{pos_summary}\n\n"

    # -- 5c) Top 5 Request Clusters
    markdown_report += "### Top 5 Request Clusters\n\n"
    markdown_report += table_to_md(
        top_5_req,
        ['hdbscan_id_name', 'request_count']
    ) + "\n\n"

    req_summary = summarize_topN_clusters(
        df,
        top_5_req,
        cluster_group="Request",
        sentiment="wishes or requests"
    )
    markdown_report += f"**Summary for Top 5 Request Clusters:**\n{req_summary}\n\n"

    # -- 5d) Top 5 Clusters Overall
    markdown_report += "### Top 5 Clusters (Overall)\n\n"
    markdown_report += table_to_md(top_5_overall, ['hdbscan_id_name', 'total_data_points']) + "\n\n"

    overall_summary = summarize_topN_clusters(
        df,
        top_5_overall,
        cluster_group="biggest",
        sentiment="largest"
    )
    markdown_report += f"**Summary for Top 5 Overall Largest Clusters:**\n{overall_summary}\n\n"

    logger.info("Big Picture + LLM Summaries have been generated.")
    return markdown_report


In [12]:
input_path = os.path.join(root_dir, data_source, "db_final.json")
with open(input_path, "r") as f:
    data = json.load(f)
df = pd.DataFrame(data)
markdown_report = generate_big_picture_summary_with_llm(df)

2025-03-18 13:30:22,966 - INFO - Negative has 135 player statements. Continuing...
2025-03-18 13:30:22,966 - INFO - Generate AI summary for top 5 Negative clusters.
2025-03-18 13:30:24,477 - INFO - Tokens used so far: 2439
2025-03-18 13:30:24,477 - INFO - Positive has 143 player statements. Continuing...
2025-03-18 13:30:24,479 - INFO - Generate AI summary for top 5 Positive clusters.
2025-03-18 13:30:25,641 - INFO - Tokens used so far: 4470
2025-03-18 13:30:25,647 - INFO - More than 150 statements in the cluster group Request. Sampling 150.
2025-03-18 13:30:25,647 - INFO - Generate AI summary for top 5 Request clusters.
2025-03-18 13:30:27,121 - INFO - Tokens used so far: 7214
2025-03-18 13:30:27,136 - INFO - More than 150 statements in the cluster group biggest. Sampling 150.
2025-03-18 13:30:27,141 - INFO - Generate AI summary for top 5 biggest clusters.
2025-03-18 13:30:28,712 - INFO - Tokens used so far: 10020
2025-03-18 13:30:28,712 - INFO - Big Picture + LLM Summaries have been 

In [13]:
# save the report as markdown
output_path = os.path.join(root_dir, data_source, "Big_picture_summary.md")

with open(output_path, "w", encoding="utf-8") as f:
    f.write(markdown_report)