In [1]:
import json
import pandas as pd
import numpy as np
import umap
import os

def prepare_dataframe(
    input_json_path: str,
    n_components: int = 25,
    random_state: int = 42,
) -> pd.DataFrame:
    """
    Load embedded records from a JSON file and add a reduced embedding and an ID.

    Steps:
    1) Load JSON data into a Pandas DataFrame.
    2) Rename 'embedding' -> 'embedding_long'.
    3) Reduce the embedding to `n_components` dimensions (default 25) via UMAP
       and store the result as 'embedding_short'.
    4) Generate ID: 'steam_1', 'steam_2', ... in row order.

    Nothing is written to disk here; persisting the result is up to the caller.

    Parameters
    ----------
    input_json_path : str
        Path to a UTF-8 encoded JSON file that is expected to hold a list of
        dicts, each carrying an 'embedding' entry.
    n_components : int, default 25
        Number of dimensions of the reduced embedding.
    random_state : int, default 42
        Seed passed to UMAP.

    Returns
    -------
    pd.DataFrame
        The loaded data with the columns 'embedding_long', 'embedding_short'
        (one Python list per row) and 'id'.

    Raises
    ------
    KeyError
        If the loaded data has no embedding column; this includes an empty
        JSON list.
    """
    # 1) Load JSON data
    with open(input_json_path, "r", encoding="utf-8") as f:
        records = json.load(f)  # expected to be a list of dicts

    df = pd.DataFrame(records)

    # 2) Rename column 'embedding' -> 'embedding_long'
    df = df.rename(columns={"embedding": "embedding_long"})

    # Fail early with a readable message instead of a bare KeyError further down.
    if "embedding_long" not in df.columns:
        raise KeyError(
            f"no 'embedding' column found in {input_json_path!r}; "
            f"available columns: {list(df.columns)}"
        )

    # 3) Use UMAP to reduce the embedding dimensions
    #    - Stack the per-row lists into a matrix of shape [num_rows, original_dim]
    embeddings_matrix = np.vstack(df["embedding_long"].tolist())
    reducer = umap.UMAP(n_components=n_components, random_state=random_state)
    embedding_reduced = reducer.fit_transform(embeddings_matrix)

    # Store each reduced row as a plain Python list, like the long embedding.
    df["embedding_short"] = embedding_reduced.tolist()

    # 4) Generate an incremental ID: steam_1, steam_2, ...
    df["id"] = [f"steam_{i}" for i in range(1, len(df) + 1)]

    return df



In [2]:

# Source JSON with the embedded records.
input_path = r"S:\SID\Analytics\Working Files\Individual\Florian\Projects\semantic_search\Data\db_embedded.json"
# NOTE(review): the file name ends in ".csv", but the frame is stored with
# to_pickle below, so the file content is a pickle. Confirm the intended extension.
output_path = r"S:\SID\Analytics\Working Files\Individual\Florian\Projects\semantic_search\Data\db_embedded_prepared.csv"

# Build the prepared frame and persist it.
df_prepared = prepare_dataframe(input_path)
df_prepared.to_pickle(output_path)

# Plain-text summary: first rows and total row count.
preview = df_prepared.head()
row_count = len(df_prepared)
print("Preview of the resulting DataFrame:", preview, sep="\n")
print("\nNumber of rows:", row_count)


  warn(


Preview of the resulting DataFrame:
                                        app_id_name  recommendationid  \
0  1166860_Rival_Stars_Horse_Racing_Desktop_Edition         179387177   
1  1166860_Rival_Stars_Horse_Racing_Desktop_Edition         178743676   
2  1166860_Rival_Stars_Horse_Racing_Desktop_Edition         178743676   
3  1166860_Rival_Stars_Horse_Racing_Desktop_Edition         178350425   
4  1166860_Rival_Stars_Horse_Racing_Desktop_Edition         177351085   

   playtime_at_review_minutes  last_played  \
0                        1775   1735228376   
1                        1016   1731228389   
2                        1016   1731228389   
3                        2492   1736827844   
4                        3678   1729362177   

                                         review_text  timestamp_updated  \
0              Very detailed and it has lot's to do.         1732047413   
1  definitely a favourite go-to game of mine, I c...         1731227648   
2  definitely a favouri

In [3]:
# Show the first rows of the prepared frame.
df_prepared.head()

Unnamed: 0,app_id_name,recommendationid,playtime_at_review_minutes,last_played,review_text,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,...,received_for_free,written_during_early_access,language,topic,sentiment,category,sentence,embedding_long,embedding_short,id
0,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,179387177,1775,1735228376,Very detailed and it has lot's to do.,1732047413,True,0,0,0.0,...,False,False,english,Game Detail,Positive,fact,Very detailed and it has lot's to do.,"[-0.04577818512916565, 0.03463919833302498, -0...","[6.846414089202881, 3.336561918258667, 6.70872...",steam_1
1,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,False,False,english,Horse Care Mechanics,Negative,request,I DO wish you could care for your horses (e.g....,"[0.010292216204106808, 0.007472959812730551, -...","[2.2799088954925537, 3.364882469177246, 4.4340...",steam_2
2,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,False,False,english,Overall Enjoyment,Positive,fact,"definitely a favourite go-to game of mine, I c...","[-0.018795032054185867, 0.004901772830635309, ...","[7.930993556976318, 2.9434573650360107, 8.1327...",steam_3
3,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178350425,2492,1736827844,i love this game one phone and pc. i just wish...,1730675154,True,0,0,0.0,...,False,False,english,Cross-Platform Updates,Negative,request,I just wish it would update at the same time!,"[-0.05961838737130165, -0.033980824053287506, ...","[5.842147350311279, 5.372000217437744, 6.28370...",steam_4
4,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,177351085,3678,1729362177,"""It's wonderful, simply one of the best horse ...",1729361303,True,0,0,0.0,...,False,False,french,Overall Quality,Positive,fact,"It's wonderful, simply one of the best horse g...","[-0.05994325876235962, -0.017245933413505554, ...","[1.037112832069397, 1.2291319370269775, 4.2774...",steam_5


In [4]:
# Reload the frame from `output_path`, the pickle written by the preparation cell.
# NOTE(review): `output_path` is reassigned further down (clustered query results),
# so this cell reads a different file if it is run after that assignment.
df = pd.read_pickle(output_path)

In [5]:
# Show the first rows of the frame currently bound to `df`.
df.head()

Unnamed: 0,app_id_name,recommendationid,playtime_at_review_minutes,last_played,review_text,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,...,received_for_free,written_during_early_access,language,topic,sentiment,category,sentence,embedding_long,embedding_short,id
0,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,179387177,1775,1735228376,Very detailed and it has lot's to do.,1732047413,True,0,0,0.0,...,False,False,english,Game Detail,Positive,fact,Very detailed and it has lot's to do.,"[-0.04577818512916565, 0.03463919833302498, -0...","[6.846414089202881, 3.336561918258667, 6.70872...",steam_1
1,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,False,False,english,Horse Care Mechanics,Negative,request,I DO wish you could care for your horses (e.g....,"[0.010292216204106808, 0.007472959812730551, -...","[2.2799088954925537, 3.364882469177246, 4.4340...",steam_2
2,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178743676,1016,1731228389,"definitely a favourite go-to game of mine, I c...",1731227648,True,9,0,0.652082,...,False,False,english,Overall Enjoyment,Positive,fact,"definitely a favourite go-to game of mine, I c...","[-0.018795032054185867, 0.004901772830635309, ...","[7.930993556976318, 2.9434573650360107, 8.1327...",steam_3
3,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,178350425,2492,1736827844,i love this game one phone and pc. i just wish...,1730675154,True,0,0,0.0,...,False,False,english,Cross-Platform Updates,Negative,request,I just wish it would update at the same time!,"[-0.05961838737130165, -0.033980824053287506, ...","[5.842147350311279, 5.372000217437744, 6.28370...",steam_4
4,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,177351085,3678,1729362177,"""It's wonderful, simply one of the best horse ...",1729361303,True,0,0,0.0,...,False,False,french,Overall Quality,Positive,fact,"It's wonderful, simply one of the best horse g...","[-0.05994325876235962, -0.017245933413505554, ...","[1.037112832069397, 1.2291319370269775, 4.2774...",steam_5


In [9]:
# Load the stored query results (pickle) into `df`; this replaces whatever `df` held before.
# The cluster-analysis cell further down loads the same file again.
query_data = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\semantic_search\query_results.pkl'
df = pd.read_pickle(query_data)

In [10]:
# Show the first rows of the frame currently bound to `df`.
df.head()

Unnamed: 0,id,distance,document,app_id_name,category,embedding,language,last_played,playtime_at_review_minutes,received_for_free,...,sentence,sentiment,steam_purchase,timestamp_updated,topic,voted_up,votes_funny,votes_up,weighted_vote_score,written_during_early_access
0,steam_3931,0.757495,The gameplay is fun.,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,fact,"[7.42410945892334, 2.8956964015960693, 6.80429...",spanish,1728681752,1790,False,...,The gameplay is fun.,Positive,True,1712682932,Gameplay Enjoyment,True,0,0,0.0,False
1,steam_4419,0.76334,easy gameplay,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,fact,"[7.400673866271973, 2.7735776901245117, 6.6358...",english,1736136067,1747,True,...,easy gameplay,Positive,True,1736135330,Gameplay Accessibility,True,0,0,0.0,False
2,steam_5585,0.764409,The gameplay is nice.,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,fact,"[7.46079683303833, 2.6682136058807373, 6.86503...",english,1736536103,4820,False,...,The gameplay is nice.,Positive,True,1736698840,Gameplay Experience,True,0,0,0.0,False
3,steam_4684,0.794787,"Gameplay is great, graphics are good.",1166860_Rival_Stars_Horse_Racing_Desktop_Edition,fact,"[6.983322620391846, 1.5380569696426392, 7.5259...",english,1705736149,911,False,...,"Gameplay is great, graphics are good.",Positive,True,1666557300,Gameplay Quality,True,0,0,0.0,False
4,steam_625,0.821912,Amazing gameplay.,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,fact,"[7.462830066680908, 2.661752462387085, 6.84729...",english,1733426372,811,False,...,Amazing gameplay.,Positive,True,1703285730,Gameplay Experience,True,0,0,0.0,False


The regular code from cluster_analysis should now be usable here again.
Possible problems:
- Embedding might still be a string
- input file is not a json but a pickle


# Cluster Analysis

In [14]:
import hdbscan

# Adjustable parameters
# NOTE(review): the label 'tSNE' only feeds the output column names
# ('hdbscan_tSNE_2D_x' / 'hdbscan_tSNE_2D_y'); the projection below is computed
# with UMAP. The label is kept so existing column names stay stable. Confirm
# whether it should be renamed.
dimensionality_methods = ['tSNE']
hdbscan_params = {"min_cluster_size": 10, "min_samples": 2, "cluster_selection_epsilon": 0.15}
umap_random_state = 42  # fixed seed so the 2D layout is reproducible across runs

# Load the query results. Only unpickle files from a trusted location.
query_data = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\semantic_search\query_results.pkl'
df = pd.read_pickle(query_data)

# Keep rows whose embedding is a non-empty list. `.copy()` makes the filtered
# frame independent, so the column assignments below do not write into a slice
# of the loaded frame (SettingWithCopyWarning).
has_embedding = df['embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
df = df[has_embedding].copy()
print(f"Loaded {len(df)} valid entries with embeddings.")

if df.empty:
    raise ValueError(f"No rows with a non-empty list embedding found in {query_data!r}.")

# Extract embeddings as a 2D array, one row per entry
mat = np.array(df['embedding'].tolist())

# Cluster on the stored embeddings
hdbscan_clusterer = hdbscan.HDBSCAN(**hdbscan_params)
cluster_labels = hdbscan_clusterer.fit_predict(mat)

# Project to 2D (for plotting); one entry per configured method label
reduction_results = {}

for method in dimensionality_methods:
    model = umap.UMAP(n_components=2, random_state=umap_random_state)
    coords_2d = model.fit_transform(mat)
    reduction_results[f'hdbscan_{method}_2D'] = {
        'x': coords_2d[:, 0],
        'y': coords_2d[:, 1]
    }

# Add dimensional coordinates to DataFrame
for method_dim, coords in reduction_results.items():
    for axis, values in coords.items():
        df[f'{method_dim}_{axis}'] = values

# Add the cluster labels to the DataFrame
df['hdbscan_id'] = cluster_labels

Loaded 200 valid entries with embeddings.




In [15]:
# Show the first rows of the frame currently bound to `df`.
df.head()

Unnamed: 0,id,distance,document,app_id_name,category,embedding,language,last_played,playtime_at_review_minutes,received_for_free,...,timestamp_updated,topic,voted_up,votes_funny,votes_up,weighted_vote_score,written_during_early_access,hdbscan_tSNE_2D_x,hdbscan_tSNE_2D_y,hdbscan_id
0,steam_3931,0.757495,The gameplay is fun.,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,fact,"[7.42410945892334, 2.8956964015960693, 6.80429...",spanish,1728681752,1790,False,...,1712682932,Gameplay Enjoyment,True,0,0,0.0,False,11.279026,7.251918,5
1,steam_4419,0.76334,easy gameplay,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,fact,"[7.400673866271973, 2.7735776901245117, 6.6358...",english,1736136067,1747,True,...,1736135330,Gameplay Accessibility,True,0,0,0.0,False,11.460023,7.407729,5
2,steam_5585,0.764409,The gameplay is nice.,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,fact,"[7.46079683303833, 2.6682136058807373, 6.86503...",english,1736536103,4820,False,...,1736698840,Gameplay Experience,True,0,0,0.0,False,11.138933,7.09866,5
3,steam_4684,0.794787,"Gameplay is great, graphics are good.",1166860_Rival_Stars_Horse_Racing_Desktop_Edition,fact,"[6.983322620391846, 1.5380569696426392, 7.5259...",english,1705736149,911,False,...,1666557300,Gameplay Quality,True,0,0,0.0,False,-3.263226,4.317858,2
4,steam_625,0.821912,Amazing gameplay.,1166860_Rival_Stars_Horse_Racing_Desktop_Edition,fact,"[7.462830066680908, 2.661752462387085, 6.84729...",english,1733426372,811,False,...,1703285730,Gameplay Experience,True,0,0,0.0,False,11.157708,7.140869,5


In [16]:
# Distinct cluster labels returned by the HDBSCAN fit.
# NOTE(review): a label of -1 presumably marks points left unclustered (noise); confirm in the hdbscan docs.
np.unique(cluster_labels)

array([-1,  0,  1,  2,  3,  4,  5,  6], dtype=int64)

In [17]:
# Save the DataFrame as a pickle next to the query results.
# NOTE(review): this rebinds `output_path`, which an earlier cell passes to
# pd.read_pickle to reload the prepared data. Re-running that cell after this
# one would load the clustered file instead; consider a distinct variable name.
output_path = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\semantic_search\query_results_clustered.pkl'
df.to_pickle(output_path)


# Test with distances

In [36]:
# NOTE(review): `query_chroma` and `logger` are not defined in this notebook, so
# they presumably come from this wildcard import. Consider importing them by
# name once that is confirmed.
from main import *

# Adjust path and collection name as needed
# NOTE(review): `csv_path` is not used in this cell.
csv_path = r"S:\SID\Analytics\Working Files\Individual\Florian\Projects\semantic_search\Data\db_embedded_prepared.csv"
collection_name = "my_collection_1536"
# Raw string so the backslashes of the Windows path are taken literally instead
# of being parsed as (invalid) escape sequences.
persist_dir = r"S:\SID\Analytics\Working Files\Individual\Florian\Projects\semantic_search\Data\ChromaDB"  # folder name for storing the persistent DB

# Text to search for
query_text = "Clear communication"

# Define a similarity threshold (adjust as needed)
similarity_threshold = 0.7

# Query the database with the similarity threshold instead of top_n
results_df = query_chroma(
    query_text,
    collection_name,
    similarity_threshold=similarity_threshold,
    persist_path=persist_dir,
)

logger.info("Done.")


2025-02-17 15:48:02,591 - INFO - Embedding query text: Clear communication
2025-02-17 15:48:03,432 - INFO - Connecting to Chroma DB.
2025-02-17 15:48:03,441 - INFO - Retrieving collection 'my_collection_1536'.
2025-02-17 15:48:03,443 - INFO - Querying top 5500 results from 'my_collection_1536'.
2025-02-17 15:48:03,701 - INFO - Received 5500 results. Filtering based on similarity threshold.
2025-02-17 15:48:03,701 - INFO - Filtered results count: 14
2025-02-17 15:48:03,711 - INFO - Done.


In [37]:
# The ten rows with the largest distance, listed in ascending order.
(
    results_df
    .sort_values("distance")
    .loc[:, ["distance", "document"]]
    .tail(10)
)

Unnamed: 0,distance,document
4,0.672173,An easy system to use.
5,0.676721,A smooth running game.
6,0.679294,"Very realistic, there are no mistakes, and it ..."
7,0.682824,It's easy to understand with simple buttons th...
8,0.684504,hard to understand
9,0.693084,no ads
10,0.694278,Easy.
11,0.694898,Beautiful Graphics
12,0.695854,Calm gameplay.
13,0.6975,easy gameplay


In [6]:
# The five rows with the smallest distance, listed in ascending order.
(
    results_df
    .sort_values("distance")
    .loc[:, ["distance", "document"]]
    .head()
)

Unnamed: 0,distance,document
0,0.305044,"Training, breeding, and racing horses."
1,0.353095,with there custom horses you can make breeding.
2,0.356503,It lets you breed horses.
3,0.358581,Breeding is terrible. Even if you use only you...
4,0.368992,"Breed, race, and bond with your horses."
