In [25]:
import re
def str_to_array(s):
    """Parse a serialized numeric vector into a 1-D float NumPy array.

    Handles both whitespace-separated numbers (possibly wrapped over several
    lines) and comma-separated numbers, with or without the surrounding
    square brackets.

    Parameters
    ----------
    s : str or None
        Text to parse. ``None`` and float NaN (what pandas yields for an
        empty CSV cell) are treated as missing.

    Returns
    -------
    numpy.ndarray or None
        Float array of the parsed values (empty when the text holds no
        numbers), or ``None`` when the input is missing.

    Raises
    ------
    ValueError
        If any token is not a valid float literal.
    """
    # NaN is the only float that differs from itself.
    if s is None or (isinstance(s, float) and s != s):
        return None
    text = s.strip()
    if text.startswith('[') and text.endswith(']'):
        text = text[1:-1]
    # Split on commas as well as whitespace: np.fromstring(..., sep=' ') stops
    # at the first comma, yielding a truncated array (with only a
    # DeprecationWarning) or an error depending on the NumPy version.
    tokens = [tok for tok in re.split(r'[,\s]+', text) if tok]
    # float() rejects malformed tokens loudly instead of dropping data.
    return np.array([float(tok) for tok in tokens], dtype=float)

# Load the profile table from CSV, then replace each serialized embedding
# string with the value returned by str_to_array.
# NOTE(review): neither `pd` nor `np` is imported in this cell — presumably an
# earlier cell provides them; confirm the notebook survives Restart & Run All.
p = pd.read_csv("./data/profile_embedding.csv")
p['embedding'] = p['embedding'].apply(str_to_array)

In [26]:
# Show the frame as the cell output (the rendering is truncated to the first
# and last rows).
p

Unnamed: 0,id,embedding
0,ssse1024,"[-0.0339566134, 0.0346230232, -0.00970953177, ..."
1,d8334dc0,"[-0.00219186791, -0.0128878951, 0.0567540489, ..."
2,309e9139,"[0.030787738, 0.0438391864, 0.0544221997, -0.0..."
3,7899f9e2,"[0.0632467642, 0.0684221983, 0.0797635466, 0.0..."
4,fde9c9f3,"[-0.00119526187, 0.00390328514, 0.0205088446, ..."
...,...,...
9996,c25d98dc,"[-0.061014995, -0.0113675073, 0.00387942628, 0..."
9997,1fab6d6a,"[-0.0379287191, -0.022427015, -0.032286942, 0...."
9998,c2ae6461,"[-0.0503764041, -0.040498063, 0.0233088415, 0...."
9999,acaeab0e,"[-0.0025407786, -0.0498635843, -0.0346738808, ..."


In [34]:
import ast, numpy as np

def clean_vec(v):
    """Return ``v`` as a plain list of finite Python floats.

    A string argument is first decoded with ``ast.literal_eval``; any other
    value is handed to NumPy as-is and coerced to a float array.

    Raises
    ------
    ValueError
        With the message ``"non-finite"`` when the vector contains NaN or
        an infinity.
    """
    parsed = ast.literal_eval(v) if isinstance(v, str) else v
    values = np.array(parsed, dtype=float)
    if np.isfinite(values).all():
        return values.tolist()
    raise ValueError("non-finite")

# Run clean_vec over every embedding and store the result back in the same
# column; a ValueError from clean_vec aborts the whole cell.
p['embedding'] = p['embedding'].apply(clean_vec)

In [42]:
import numpy as np
# Round-trip each embedding through a float32 array and back to a Python list,
# so the stored values carry float32 precision.
p['embedding'] = p['embedding'].apply(lambda v: np.asarray(v, dtype=np.float32).tolist())

In [45]:
import numpy as np
import chromadb

# Step 1: coerce every embedding to float32 and keep it as a plain Python list.
p["embedding"] = p["embedding"].apply(lambda x: np.asarray(x, dtype=np.float32).tolist())

# Step 2: open a persistent Chroma client rooted at the given directory.
client = chromadb.PersistentClient(path="./data/chroma_db_final")

# Step 3: fetch the "profiles" collection, creating it when it is absent.
col = client.get_or_create_collection("profiles")

# Step 4: upsert in fixed-size slices so that no single call carries more
# than BATCH records.
BATCH = 4000
for start in range(0, len(p), BATCH):
    batch = p.iloc[start:start + BATCH]
    batch_ids = batch["id"].astype(str).tolist()
    batch_vectors = batch["embedding"].tolist()
    col.upsert(ids=batch_ids, embeddings=batch_vectors)

print("✅ Vector DB created successfully with", len(p), "rows.")

# Step 5: smoke test — nearest neighbours of the first profile, which should
# match itself at distance 0.
query_vec = p["embedding"].iloc[0]
results = col.query(query_embeddings=[query_vec], n_results=5)

print("\nTop similar profiles:")
neighbours = zip(results["ids"][0], results["distances"][0])
for match_id, match_dist in neighbours:
    print(f"{match_id}  (distance={match_dist:.4f})")

✅ Vector DB created successfully with 10001 rows.

Top similar profiles:
ssse1024  (distance=0.0000)
9e96ff8e  (distance=0.1676)
2ff49f07  (distance=0.2044)
c3bde312  (distance=0.2117)
272ebb96  (distance=0.2122)


In [46]:
# Repeat the smoke test: five nearest neighbours of the first profile.
query_vec = p["embedding"].iloc[0]
results = col.query(query_embeddings=[query_vec], n_results=5)

print("\nTop similar profiles:")
hit_ids = results["ids"][0]
hit_distances = results["distances"][0]
for hit_id, hit_distance in zip(hit_ids, hit_distances):
    print(f"{hit_id}  (distance={hit_distance:.4f})")


Top similar profiles:
ssse1024  (distance=0.0000)
9e96ff8e  (distance=0.1676)
2ff49f07  (distance=0.2044)
c3bde312  (distance=0.2117)
272ebb96  (distance=0.2122)


In [47]:
# Reconnect to the on-disk database and fetch the existing collection;
# get_collection is used here, so the collection must already exist.
client = chromadb.PersistentClient(path="./data/chroma_db_final")
col = client.get_collection("profiles")

# Ad-hoc lookup: three nearest neighbours of the profile at position 10.
query_vec = p["embedding"].iloc[10]
res = col.query(n_results=3, query_embeddings=[query_vec])
matched_ids, matched_distances = res["ids"], res["distances"]
print(matched_ids, matched_distances)

[['ef18b439', 'e0982574', '35f0397a']] [[0.0, 0.20386037230491638, 0.20867332816123962]]
