In [1]:
# Once the user has generated the data they want and stored it in the database, we can use that data to generate an AI summary.
import pandas as pd
import json
from helper.utils import configure_api
import os

# Model identifiers used by this notebook: one chat model and two embedding
# models (named for an OpenAI embedding model and a local one).
chat_model_name = "gpt-4o-mini"
openai_embedding_model = "text-embedding-3-small"
local_embedding_model = "all-MiniLM-L6-v2"

# Pass the chat model name to the project helper from helper.utils
# (presumably it sets up API access for that model - confirm in helper/utils.py).
configure_api(chat_model_name)

# Specify paths for storing (backup) data.
# Both values are Windows paths, so both are raw strings: backslashes stay literal.
# Without the r-prefix, '\Z' is an invalid escape sequence (SyntaxWarning on
# Python 3.12+, planned to become an error), and a name starting with e.g. 't' or
# 'n' after the backslash would silently turn into a tab/newline.
root_dir = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data'
# Sub-folder of root_dir that the reports of this run are written to.
project = r'HRC\Zendesk_test'

In [2]:
# Load the data that should be analysed. It has to be the JSON file that was
# downloaded from the Streamlit app 'streamlit_chroma.py'.
# NOTE(review): this path points at the 'HRC_Survey_T3_2024' folder, while `project`
# (where the reports are written) is 'HRC\Zendesk_test' - confirm the input belongs
# to the project being reported on.
input_path = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\HRC_Survey_T3_2024\db_final.json'
# Read explicitly as UTF-8: without `encoding`, open() uses the locale's preferred
# encoding (a legacy code page on Windows), which can corrupt or fail on non-ASCII
# text. The reports below are written as UTF-8 as well.
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)
# Turn the parsed JSON into the DataFrame that the report generators consume.
df = pd.DataFrame(data)

In [4]:
# Generate the cluster report from the loaded DataFrame.
# The log output of this cell shows one "Generate AI summary for cluster ..." entry
# per cluster together with a running token count, so re-running the cell issues
# the model requests (and spends the tokens) again.
# The result is written to disk as Markdown text in a later cell.
from helper.ai_summary import generate_cluster_report
cluster_report = generate_cluster_report(df)

2025-03-24 08:48:16,599 - INFO - Generate AI summary for cluster Horse Training Time Frustration
2025-03-24 08:48:18,465 - INFO - Total tokens used: 14648
2025-03-24 08:48:18,465 - INFO - Generate AI summary for cluster Fun Factor and Enjoyment
2025-03-24 08:48:20,186 - INFO - Total tokens used: 17234
2025-03-24 08:48:20,186 - INFO - Generate AI summary for cluster Gold Earning Opportunities and Costs
2025-03-24 08:48:21,720 - INFO - Total tokens used: 19879
2025-03-24 08:48:21,720 - INFO - Generate AI summary for cluster Equestrian Sports and Disciplines
2025-03-24 08:48:23,413 - INFO - Total tokens used: 22308
2025-03-24 08:48:23,413 - INFO - Generate AI summary for cluster Enhanced Free Roam Experience
2025-03-24 08:48:25,476 - INFO - Total tokens used: 24584
2025-03-24 08:48:25,476 - INFO - Generate AI summary for cluster Equal Access for All Players
2025-03-24 08:48:27,239 - INFO - Total tokens used: 26812
2025-03-24 08:48:27,239 - INFO - Generate AI summary for cluster Horse Bond

In [6]:
# Save the cluster report as a Markdown file in the project folder.
output_path = os.path.join(root_dir, project, "Cluster_report.md")
# open(..., "w") does not create missing parent folders, so make sure the project
# folder exists first; exist_ok=True makes this a no-op when it is already there.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    f.write(cluster_report)

In [3]:
# Generate the Big Picture report from the loaded DataFrame and the project name.
# According to the log output of this cell, summaries are generated for the top 5
# clusters of each of the groups "Negative", "Positive", "Request" and "biggest",
# and groups with more than 150 statements are sampled down to 150.
# Re-running the cell issues the model requests (and spends the tokens) again.
# The result is written to disk as Markdown text in a later cell.
from helper.ai_summary import generate_big_picture_summary
big_picture_report = generate_big_picture_summary(df, project)

2025-03-24 08:48:00,530 - INFO - More than 150 statements in the cluster group Negative. Sampling 150.
2025-03-24 08:48:00,530 - INFO - Generate AI summary for top 5 Negative clusters.
2025-03-24 08:48:04,305 - INFO - Tokens used so far: 2966
2025-03-24 08:48:04,305 - INFO - More than 150 statements in the cluster group Positive. Sampling 150.
2025-03-24 08:48:04,305 - INFO - Generate AI summary for top 5 Positive clusters.
2025-03-24 08:48:06,666 - INFO - Tokens used so far: 5533
2025-03-24 08:48:06,685 - INFO - More than 150 statements in the cluster group Request. Sampling 150.
2025-03-24 08:48:06,685 - INFO - Generate AI summary for top 5 Request clusters.
2025-03-24 08:48:08,486 - INFO - Tokens used so far: 8497
2025-03-24 08:48:08,490 - INFO - More than 150 statements in the cluster group biggest. Sampling 150.
2025-03-24 08:48:08,490 - INFO - Generate AI summary for top 5 biggest clusters.
2025-03-24 08:48:10,905 - INFO - Tokens used so far: 11072
2025-03-24 08:48:10,905 - INFO 

In [5]:
# Save the Big Picture report as a Markdown file in the project folder.
output_path = os.path.join(root_dir, project, "Big_Picture.md")
# open(..., "w") does not create missing parent folders, so make sure the project
# folder exists first; exist_ok=True makes this a no-op when it is already there.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    f.write(big_picture_report)

In [9]:
# Store the combined Markdown report: the Big Picture summary first, then the
# per-cluster report, separated by a blank line.
combined_report = big_picture_report + "\n\n" + cluster_report
# Plain string literal: the former f-prefix had no placeholders to fill in.
output_path = os.path.join(root_dir, project, "HRC_Zendesk_full_report.md")
# open(..., "w") does not create missing parent folders, so make sure the project
# folder exists first; exist_ok=True makes this a no-op when it is already there.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    f.write(combined_report)