In [1]:
import numpy as np
import pandas as pd
import openai
import os
import logging
from dotenv import load_dotenv, dotenv_values
import json

#from Example_steam_reviews import path_db_embedded
from sqlalchemy.util import counter

# from helper.utils import *
# personally would not recommend this importing style, 
# it's recommend to use "from package import foo1, foo2", or "import package as p"
from helper.utils import configure_api
from helper.data_pipeline import gather_data, translate_data, analyse_data, embed_data


# load_dotenv()  # 
d = dotenv_values()
for k in d.keys():
    os.environ[k] = d[k]

# General modules

# Setup API keys
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

# Specify models
chat_model_name = 'gpt-4o-mini'
openai_embedding_model = "text-embedding-3-small"
local_embedding_model = "all-MiniLM-L6-v2"

configure_api(client, chat_model_name)

# Specify paths for storing (backup) data
root_dir = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\Backup'

# Setup the logger
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

logging.getLogger("httpx").setLevel(logging.ERROR)      # Supress API HTTP request logs

# path_db_prepared = os.path.join(root_dir, data_source, "db_prepared.json")          #backup
# path_db_translated = os.path.join(root_dir, data_source, "db_translated.json")      #backup
# path_db_analysed = os.path.join(root_dir, data_source, "db_analysed.json")          #backup
# path_db_embedded = os.path.join(root_dir, data_source, "db_embedded.json")          #backup


# path_db_clustered = os.path.join(root_dir, data_source, "db_clustered.json")        #backup
# path_db_final = os.path.join(root_dir, data_source, "db_final.json")                #final file

In [2]:
# Steam Reviews
from helper.redshift_conector_standalone import fetch_query_results


data_source = 'Steam'
longname = 'com.pikpok.hrc'
id_column = 'recommendationid'
text_column = 'review_text'
timestamp_column = 'timestamp_updated'
language_column = 'language'
embed_key = "sentence"  # "topic" or "sentence"


def steam_query_function():
    # SQL Query Redshift
    sql_query = """
    SELECT *
    FROM steam_review
    where app_id_name = '1166860_Rival_Stars_Horse_Racing_Desktop_Edition'
    limit 5
    """
    logger.info(f"Query Redshift with: {sql_query}")

    try:
        results_json, results_df = fetch_query_results(sql_query)
        # Print the first row of the DataFrame
        logger.info("Successfully fetched query results, with shape: %s", results_df.shape)
    except Exception as e:
        logger.error(f"Error fetching query results: {e}")
        raise
    
    return results_json


path_dir = os.path.join(root_dir, data_source)
if not os.path.exists(path_dir):
    os.mkdir(path_dir)
    
gather_data(root_dir, data_source,  
            query_function=steam_query_function, 
            id_column=id_column, 
            text_column=text_column, 
            timestamp_column=timestamp_column,
            longname=longname)
translate_data(root_dir, data_source, language_column)
analyse_data(root_dir, data_source, client, chat_model_name)
embed_data(root_dir, data_source, client, embed_key)

2025-02-18 15:43:49,502 - INFO - Query Redshift with: 
    SELECT *
    FROM steam_review
    where app_id_name = '1166860_Rival_Stars_Horse_Racing_Desktop_Edition'
    limit 5
    
2025-02-18 15:43:50,643 - INFO - Successfully fetched query results, with shape: (5, 14)
2025-02-18 15:43:50,649 - INFO - Data successfully saved to C:\Users\mshen\Documents\Msheng_Domestic\sentiment_analysis\Steam\db_prepared.json
2025-02-18 15:43:50,656 - INFO - 'language' column already exists. Skipping language detection.
2025-02-18 15:43:50,657 - INFO - Loading existing reviews from: C:\Users\mshen\Documents\Msheng_Domestic\sentiment_analysis\Steam\db_translated.json
2025-02-18 15:43:50,659 - INFO - Found 0 new reviews to process.
2025-02-18 15:43:50,660 - INFO - No new reviews to add. All IDs already exist.
2025-02-18 15:43:50,661 - INFO - Translation completed. Total reviews translated: 0
2025-02-18 15:43:50,666 - INFO - Skipping entry 0 (ID: 182030680) - already processed.
2025-02-18 15:43:50,667 - 

In [None]:
# Google play review
import datetime
import google_play_scraper as gps

data_source = 'Google Play'
id_column = 'reviewId'
text_column = 'content'
timestamp_column = 'at'
language_column = 'language'
embed_key = "sentence"  # "topic" or "sentence"


def google_play_query_function():
    app_id = "com.pikpok.hrc.play"
    longname = '.'.join(app_id.split('.')[:3])
    result, continuation_token = gps.reviews(
        app_id,
#         lang="en",  # Language (English)
#         country="us",  # Country (United States)
        count=2000,  # Number of reviews to fetch
        sort=gps.Sort.NEWEST
    )
    for e in result:
        e['longname'] = longname
        for k in e.keys():
            if isinstance(e[k], datetime.datetime):
                e[k] = int(e[k].timestamp())
    return json.dumps(result)


path_dir = os.path.join(root_dir, data_source)
if not os.path.exists(path_dir):
    os.mkdir(path_dir)
    
gather_data(root_dir, data_source, 
            query_function=google_play_query_function, 
            id_column=id_column, 
            text_column=text_column, 
            timestamp_column=timestamp_column)
translate_data(root_dir, data_source, language_column)
analyse_data(root_dir, data_source, client, chat_model_name)
embed_data(root_dir, data_source, client, embed_key)

In [6]:
# Zendesk CS tickets (temporarily using subcategory as the context)
import psycopg2


data_source = 'Zendesk'
id_column = 'ticket_id'
text_column = 'content'
timestamp_column = 'created_at'
language_column = 'language'
embed_key = "sentence"  # "topic" or "sentence"


def zendesk_query_function():
    conn = psycopg2.connect(host=os.getenv("POSTGRESQL_HOST"), 
                            database=os.getenv("POSTGRESQL_DATABASE"),
                            user=os.getenv("POSTGRESQL_USER"), 
                            password=os.getenv("POSTGRESQL_PASSWORD"), 
                            port=os.getenv("POSTGRESQL_PORT"))
    curr = conn.cursor()  # TODO: please fill in the query and replace the date filter
    s_query = """
    select coalesce(intent_subcategory2, coalesce(intent_subcategory1, intent_primary)) as content,
           * 
    from zendesk 
    where product = 'HRC' 
    and coalesce(intent_subcategory2, coalesce(intent_subcategory1, intent_primary)) is not null
    order by created_at desc
    limit 5
    """
    s_query = s_query.replace('>>', '>').replace('<<', '<')
    curr.execute(s_query)
    df = pd.DataFrame(curr.fetchall(), columns=[i[0] for i in curr.description])
    df['created_at'] = df['created_at'].apply(lambda x: int(x.timestamp()))
    df['longname'] = ['com.pikpok.' + str(x).lower() if x is not None else None for x in df['product']]
    return df.to_json(orient='records')


path_dir = os.path.join(root_dir, data_source)
if not os.path.exists(path_dir):
    os.mkdir(path_dir)
    
gather_data(root_dir, data_source, 
            query_function=zendesk_query_function, 
            id_column=id_column, 
            text_column=text_column, 
            timestamp_column=timestamp_column)
translate_data(root_dir, data_source, language_column)
analyse_data(root_dir, data_source, client, chat_model_name)
embed_data(root_dir, data_source, client, embed_key)

2025-02-18 15:45:50,314 - INFO - Data successfully saved to C:\Users\mshen\Documents\Msheng_Domestic\sentiment_analysis\Zendesk\db_prepared.json
2025-02-18 15:45:50,326 - INFO - Starting language detection for column: 'pp_review'
2025-02-18 15:45:50,328 - INFO - Language detection completed. Added column 'language'.
2025-02-18 15:45:50,328 - INFO - No existing file found. Starting fresh.
2025-02-18 15:45:50,331 - INFO - Found 5 new reviews to process.
2025-02-18 15:45:50,336 - INFO - Updated file saved to: C:\Users\mshen\Documents\Msheng_Domestic\sentiment_analysis\Zendesk\db_translated.json
2025-02-18 15:45:50,336 - INFO - Translation completed. Total reviews translated: 0
2025-02-18 15:45:50,353 - INFO - Tokens used so far: Prompt Tokens: 4432, Completion Tokens: 384
2025-02-18 15:45:50,354 - INFO - Extracting topics for entry ID 101047
2025-02-18 15:45:51,303 - INFO - Analyzing sentiment for topic 'Store Accessibility' (Entry ID 101047)
2025-02-18 15:45:51,983 - INFO - Tokens used s

In [32]:
# Surveys
# concat different cols to one

import pandas as pd

# Replace 'input.json' with the path to your JSON file
file = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\DRS_NextFest_Data_Fixed.csv'
df = pd.read_csv(file)

# Specify the columns you want to combine
cols_to_concat = ['Add,', 'Change,', 'Remove,']

# Create a new column "combined_text" that concatenates the values of the given columns
# If a column is empty or NaN, it will be skipped
df['combined_text'] = df[cols_to_concat].apply(
    lambda row: ' '.join(str(x) for x in row if pd.notnull(x) and str(x).strip() != ''),
    axis=1
)

# Mutate the col "session_length" to minutes by dividing the entries by 60
df['session_length'] = df['session_length'] / 60

# rename session_length to playtime_at_review_minutes
df.rename(columns={'session_length':'playtime_at_review_minutes'}, inplace=True)

# Save the DataFrame to an Excel file
df.to_excel('output.xlsx', index=False)


In [5]:
# Surveys
data_source = 'Survey'
file = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\DRS\DRS new\output.xlsx'
longname = 'com.pikpok.drs'
id_column = 'main_id'
text_column = "combined_text"   #["If you had a magic wand and you could add, change or remove anything from the game, what would it be and why?", "Unnamed: 19", "Unnamed: 20"]
timestamp_column = 'End Date'
language_column = 'language'
embed_key = "sentence"  # "topic" or "sentence"


def survey_query_function(file):
    file_name, file_extension = os.path.splitext(file)
    if '.csv' == file_extension:
        df = pd.read_csv(file)
        js = json.loads(df.to_json(orient='records'))
    elif '.xls' == file_extension or '.xlsx' == file_extension:
        df = pd.read_excel(file)
        js = json.loads(df.to_json(orient='records'))
    elif '.txt' == file_extension:
        with open(file, 'r') as f:
            js = json.load(f)
    for e in js:
        if 'Respondent ID' in e.keys() and 'pcubed_id' in e.keys() and 'review_id' not in e.keys():
            e['review_id'] = str(e['Respondent ID']) + ':' + str(e['pcubed_id'])
    return json.dumps(js)


path_dir = os.path.join(root_dir, data_source)
if not os.path.exists(path_dir):
    os.mkdir(path_dir)
    
gather_data(root_dir, data_source, 
            query_function=survey_query_function,
            query_function_args=[file,],
            id_column=id_column, 
            text_column=text_column, 
            timestamp_column=timestamp_column)
translate_data(root_dir, data_source, language_column)
analyse_data(root_dir, data_source, client, chat_model_name)
embed_data(root_dir, data_source, client, embed_key)

2025-03-06 14:14:41,771 - INFO - Data successfully saved to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\Backup\Survey\db_prepared.json
2025-03-06 14:14:41,801 - INFO - Starting language detection for column: 'pp_review'
2025-03-06 14:14:42,238 - INFO - Language detection completed. Added column 'language'.
2025-03-06 14:14:42,239 - INFO - Loading existing reviews from: S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\Backup\Survey\db_translated.json
2025-03-06 14:14:42,273 - INFO - Found 68 new reviews to process.
2025-03-06 14:14:42,280 - INFO - No new reviews to add. All IDs already exist.
2025-03-06 14:14:42,281 - INFO - Translation completed. Total reviews translated: 0
2025-03-06 14:14:42,321 - INFO - Skipping entry 0 (ID: 267) - already processed.
2025-03-06 14:14:42,322 - INFO - Skipping entry 1 (ID: 306) - already processed.
2025-03-06 14:14:42,323 - INFO - Skipping entry 2 (ID: 193) - 

KeyboardInterrupt: 

#### Finalise data

In [19]:
# Reduce dimensions a priori
# The clustering does not perform to good. Some datapoints that clearly should be in a cluster based on eyeballing and their topic name but they are not. Rather often some points in a dense cluster are categorized as noise.
# I will try to improve this by first perform a dimension reduction and then perform clustering. Reason being, that in high dimensions the data might be too sparse for the clustering algorithm to work properly.


import umap

path_db_embedded = os.path.join(root_dir, data_source, "db_embedded.json")

with open(path_db_embedded, "r", encoding="utf-8") as f:
    data = json.load(f)


In [20]:
counter = 0
for entry in data:
    print(counter)
    counter += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [21]:
# Gather all embeddings
embeddings = [entry['embedding'] for entry in data]

# Convert to numpy array
X = np.array(embeddings)

# Perform UMAP
X_embedded = umap.UMAP(n_components=10).fit_transform(X) # 40 dimensions

# Store the updated embeddings in the data
for i, entry in enumerate(data):
    entry['embedding'] = X_embedded[i].tolist()



In [22]:
from helper.cluster_analysis import *
from helper.utils import *

# Adjustable parameters
dimensionality_methods = ['UMAP', 'PCA', 'tSNE']
hdbscan_params = {"min_cluster_size": 10, "min_samples": 1, "cluster_selection_epsilon": 0.15, "cluster_selection_method": "leaf"}           #, "min_samples": 2, "cluster_selection_epsilon": 0.15

data = read_json(path_db_embedded)
df = pd.DataFrame(data)
df = df[df['embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)]
print(f"Loaded {len(df)} valid entries with embeddings.")

# Extract embeddings
mat = np.array(df['embedding'].tolist())

hdbscan_clusterer = hdbscan.HDBSCAN(**hdbscan_params)
cluster_labels = hdbscan_clusterer.fit_predict(mat)

reduction_results = {}

for method in dimensionality_methods:
    coords_2d = dimensionality_reduction(mat, method, n_components=2)
    reduction_results[f'hdbscan_{method}_2D'] = {
        'x': coords_2d[:, 0],
        'y': coords_2d[:, 1]
    }

# 3D Reduction
    coords_3d = dimensionality_reduction(mat, method, n_components=3)
    reduction_results[f'hdbscan_{method}_3D'] = {
        'x': coords_3d[:, 0],
        'y': coords_3d[:, 1],
        'z': coords_3d[:, 2]
    }

# Add dimensional coordinates to DataFrame
for method_dim, coords in reduction_results.items():
    for axis, values in coords.items():
        df[f'{method_dim}_{axis}'] = values

# Add the cluster labels to the DataFrame
df['hdbscan_id'] = cluster_labels


Loaded 998 valid entries with embeddings.


2025-03-05 17:15:16,852 - INFO - Applying UMAP with 2 components.
2025-03-05 17:15:17,385 - INFO - Applying UMAP with 3 components.
2025-03-05 17:15:17,972 - INFO - Applying PCA with 2 components.
2025-03-05 17:15:18,003 - INFO - Applying PCA with 3 components.
2025-03-05 17:15:18,035 - INFO - Applying tSNE with 2 components.
2025-03-05 17:15:18,036 - INFO - Perplexity not provided, setting to 30 based on sample size.
2025-03-05 17:15:19,207 - INFO - Applying tSNE with 3 components.
2025-03-05 17:15:19,208 - INFO - Perplexity not provided, setting to 30 based on sample size.


In [23]:
# print unique cluster labels

df['hdbscan_id'].unique()

array([-1,  2,  8,  9, 11,  5, 13, 10,  3,  1, 12, 15, 14, 16,  4,  6,  7,
        0], dtype=int64)

In [7]:
path_db_clustered = os.path.join(root_dir, data_source, "db_clustered.json")

save_df_as_json(df, path_db_clustered)

In [8]:
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

from helper.cluster_naming import *

api_settings = {"client": client, "model": chat_model_name}

def name_clusters(
    df,
    cluster_columns,
    embedding_col="embedding",
    text_col="sentence",
    top_k=25,
    skip_noise_label=-1
):
    for col in cluster_columns:
        # Prepare a dict to store {cluster_id -> cluster_name}
        cluster_id_to_name = {}
        logger.info((f'Preparing to name clusters in column "{col}"'))

        # Get unique cluster IDs from this column
        cluster_ids = df[col].unique()

        for cluster_id in cluster_ids:

            if skip_noise_label is not None and cluster_id == skip_noise_label:
                continue

            # Select rows belonging to this cluster
            cluster_data = df[df[col] == cluster_id]
            if cluster_data.empty:
                continue

            # Compute centroid of embeddings
            embeddings = np.array(cluster_data[embedding_col].tolist())
            centroid = embeddings.mean(axis=0, dtype=np.float32, keepdims=True)

            # Find top_k closest points to centroid
            distances = cosine_distances(centroid, embeddings).flatten()
            closest_indices = np.argsort(distances)[:top_k]
            representative_texts = cluster_data.iloc[closest_indices][text_col].tolist()

            # Call your naming function
            cluster_name = generate_cluster_name(representative_texts, api_settings)
            cluster_id_to_name[cluster_id] = cluster_name

        # Create a new column with the cluster name for each row
        name_col = f"{col}_name"
        df[name_col] = df[col].apply(lambda cid: cluster_id_to_name.get(cid, "Noise"))

    return df


data = read_json(path_db_clustered)  # data is probably a list of dicts
df = pd.DataFrame(data)              # Convert to DataFrame

cluster_columns = ['hdbscan_id'] #, 'kmeans_7_id', 'kmeans_10_id', 'kmeans_12_id', 'kmeans_15_id'

df_named = name_clusters(
    df,
    cluster_columns,
    embedding_col="embedding",
    text_col="sentence",
    top_k=10,
    skip_noise_label=-1  # for HDBSCAN noise
)


2025-03-06 14:15:36,367 - INFO - Preparing to name clusters in column "hdbscan_id"
2025-03-06 14:15:37,025 - INFO - Generated cluster name: Enhanced Tutorial for Better Gameplay
2025-03-06 14:15:37,026 - INFO - Tokens used so far: Prompt Tokens: 249, Completion Tokens: 6
2025-03-06 14:15:37,581 - INFO - Generated cluster name: Enhanced Survivor Action Options
2025-03-06 14:15:37,584 - INFO - Tokens used so far: Prompt Tokens: 499, Completion Tokens: 11
2025-03-06 14:15:38,019 - INFO - Generated cluster name: Character Interaction and Depth
2025-03-06 14:15:38,019 - INFO - Tokens used so far: Prompt Tokens: 742, Completion Tokens: 16
2025-03-06 14:15:38,456 - INFO - Generated cluster name: Automatic Weapon Management
2025-03-06 14:15:38,457 - INFO - Tokens used so far: Prompt Tokens: 985, Completion Tokens: 20
2025-03-06 14:15:38,983 - INFO - Generated cluster name: Game Difficulty Adjustment Suggestions
2025-03-06 14:15:38,985 - INFO - Tokens used so far: Prompt Tokens: 1135, Completio

In [9]:
# Save the named clusters
from helper.utils import save_data_for_streamlit

path_db_final = os.path.join(root_dir, data_source, "db_final.json")

#save_data_for_streamlit(df_named, path_db_final)

In [27]:
# rename timestamp col
df_named.rename(columns={'pp_timestamp':'timestamp_updated'}, inplace=True)
#save_data_for_streamlit(df_named, path_db_final)

In [32]:
import pandas as pd
from datetime import datetime

def convert_timestamp_string_to_unix_ms(date_str):
    """
    Parse a date string of the form 'MM/DD/YYYY HH:MM:SS AM/PM'
    and convert it to a Unix timestamp in milliseconds.
    """
    dt = datetime.strptime(date_str, "%m/%d/%Y %I:%M:%S %p")
    return 1741101588

# We assume 'timestamp_updated' is the column to convert
def parse_or_preserve(val):
    """
    If the value is a string, try converting it.
    If it's already numeric (maybe you have mixed data?), leave it as is.
    """
    return 1741101588

# Apply the function to each row in the 'timestamp_updated' column
df_named["timestamp_updated"] = df_named["timestamp_updated"].apply(parse_or_preserve)

# Save back to JSON
# orient="records" creates a list of JSON objects (arrays of dictionaries


In [33]:
save_data_for_streamlit(df_named, path_db_final)

2025-03-05 17:21:17,103 - INFO - Saving updated data to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\Backup\Survey\db_final.json
2025-03-05 17:21:17,756 - INFO - Data saved successfully.


In [38]:
data = read_json("S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\Backup\Survey\db_final.json")

In [43]:
import json



for entry in data:
    if "session_length" in entry:
        # Only convert if not None
        if entry["session_length"] is not None:
            entry["session_length"] = entry["session_length"] / 60.0  # seconds -> minutes

        # Now rename the key
        entry["playtime_at_review_minutes"] = entry.pop("session_length")




In [44]:
data[0]

{'Respondent ID': 118808000000,
 'Collector ID': 459707592,
 'Start Date': '02/26/2025 04:20:00 AM',
 'IP Address': None,
 'Email Address': None,
 'First Name': None,
 'Last Name': None,
 'Custom Data 1': None,
 'Please rate your overall experience playing Into the Dead: Our Darkest Days': 9,
 'Please rate the overall difficulty playing Into the Dead: Our Darkest Days': 9,
 'Weapon Durability,': 3,
 'Combat,': 4.0,
 'Stealth, ': 1.0,
 'Tutorial,': 1.0,
 'Managing Survivors in Shelter,': 4.0,
 'Game Difficulty,': 4.0,
 'Zombie Behaviour,': 4.0,
 'Add,': ' the game needs to have replay value to be successful, I believe that adding different starts depending on the chosen duo would be a good way to add this value. Everyone starting from the same base can be monotonous over time.',
 'Change,': ' Increase the variety of bases and weapons available depending on the duo, to further differentiate the gameplay between them. I also think there should be three shifts of gameplay, morning, afterno

In [41]:
for entry in data:
    entry.pop("embedding")

In [42]:
data[0]

{'Respondent ID': 118808000000,
 'Collector ID': 459707592,
 'Start Date': '02/26/2025 04:20:00 AM',
 'IP Address': None,
 'Email Address': None,
 'First Name': None,
 'Last Name': None,
 'Custom Data 1': None,
 'Please rate your overall experience playing Into the Dead: Our Darkest Days': 9,
 'Please rate the overall difficulty playing Into the Dead: Our Darkest Days': 9,
 'Weapon Durability,': 3,
 'Combat,': 4.0,
 'Stealth, ': 1.0,
 'Tutorial,': 1.0,
 'Managing Survivors in Shelter,': 4.0,
 'Game Difficulty,': 4.0,
 'Zombie Behaviour,': 4.0,
 'Add,': ' the game needs to have replay value to be successful, I believe that adding different starts depending on the chosen duo would be a good way to add this value. Everyone starting from the same base can be monotonous over time.',
 'Change,': ' Increase the variety of bases and weapons available depending on the duo, to further differentiate the gameplay between them. I also think there should be three shifts of gameplay, morning, afterno

In [47]:
# convert json to df
df = pd.DataFrame(data)
df.head(10)

Unnamed: 0,Respondent ID,Collector ID,Start Date,IP Address,Email Address,First Name,Last Name,Custom Data 1,Please rate your overall experience playing Into the Dead: Our Darkest Days,Please rate the overall difficulty playing Into the Dead: Our Darkest Days,...,hdbscan_PCA_3D_y,hdbscan_PCA_3D_z,hdbscan_tSNE_2D_x,hdbscan_tSNE_2D_y,hdbscan_tSNE_3D_x,hdbscan_tSNE_3D_y,hdbscan_tSNE_3D_z,hdbscan_id,hdbscan_id_name,playtime_at_review_minutes
0,118808000000,459707592,02/26/2025 04:20:00 AM,,,,,,9,9,...,-0.036199,0.047932,9.702198,-14.83631,12.054599,-11.692336,11.198197,-1,Noise,605.633333
1,118808000000,459707592,02/26/2025 04:20:00 AM,,,,,,9,9,...,0.19463,-0.202012,-4.958783,28.009903,0.704799,18.252508,4.860467,-1,Noise,605.633333
2,118808000000,459707592,02/26/2025 04:20:00 AM,,,,,,9,9,...,-0.063228,0.068984,-15.168633,-11.463284,-14.410099,-13.88093,-13.493813,-1,Noise,605.633333
3,118808000000,459707592,02/26/2025 04:20:00 AM,,,,,,9,9,...,-0.129826,-0.023552,9.655229,-16.443623,30.611814,1.451048,-13.610722,-1,Noise,605.633333
4,118808000000,459707592,02/26/2025 04:20:00 AM,,,,,,9,9,...,-0.172843,0.028235,12.192753,-23.781454,43.830521,-7.462946,-19.306606,-1,Noise,605.633333
5,118808000000,459707592,02/26/2025 04:20:00 AM,,,,,,9,9,...,0.104338,0.099757,23.097298,0.36716,10.457623,8.907899,22.650236,2,Enhanced Tutorial Improvements Needed,605.633333
6,118807000000,459707592,02/24/2025 02:20:29 PM,,,,,,10,7,...,-0.361507,-0.15922,-3.968874,-22.280281,-0.087085,-12.603081,-36.12447,8,Enhanced Survivor Engagement Options,555.083333
7,118807000000,459707592,02/24/2025 02:20:29 PM,,,,,,10,7,...,-0.267633,-0.218185,-16.112844,-21.151455,20.211393,4.539205,-18.108576,9,Character Interaction and Relationship Depth,555.083333
8,118807000000,459707592,02/24/2025 02:20:29 PM,,,,,,10,7,...,-0.133891,0.067835,-34.61808,-4.546374,-30.011858,0.678308,-17.694992,-1,Noise,555.083333
9,118810000000,459707592,02/28/2025 04:26:23 AM,,,,,,8,8,...,0.26691,-0.060933,-30.268156,22.613737,-17.750408,11.560346,26.78602,-1,Noise,522.916667


In [48]:
save_data_for_streamlit(df, "S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\db_final.json")

2025-03-06 14:44:56,708 - INFO - Saving updated data to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\db_final.json
2025-03-06 14:44:56,779 - INFO - Data saved successfully.
