In [None]:
import pandas as pd
import chromadb
import os

# Location handed to the ChromaDB client as its storage path.
CHROMA_PATH = "../chroma_db"

# Load the pickled train and test frames in a single statement.
df_train, df_test = (
    pd.read_pickle('../Data/processed/data_train.pkl'),
    pd.read_pickle('../Data/processed/data_test.pkl'),
)

 Convert the NumPy embedding arrays to plain Python lists for ChromaDB compatibility.

In [2]:


def _as_plain_list(vector):
    """Return ``vector`` as a plain Python list.

    Objects exposing ``tolist()`` are converted with it; anything else
    (e.g. a value that is already a list) is copied with ``list()``.
    This keeps the cell safe to re-run: the original lambda raised
    AttributeError on the second run because lists have no ``tolist``.
    """
    return vector.tolist() if hasattr(vector, "tolist") else list(vector)


df_train['embeddings'] = df_train['embeddings'].apply(_as_plain_list)
df_test['embeddings'] = df_test['embeddings'].apply(_as_plain_list)

In [3]:
# Open the persistent ChromaDB client rooted at CHROMA_PATH.
clients = chromadb.PersistentClient(path=CHROMA_PATH)

# get_or_create_collection keeps this cell re-runnable: create_collection raises
# once a collection with the same name already exists in the persisted store,
# which breaks Restart & Run All on every run after the first.
# The "hnsw:space" metadata key requests cosine distance for the index.
collection_train = clients.get_or_create_collection(
    name="train_data",
    metadata={"hnsw:space": "cosine"},
)
collection_test = clients.get_or_create_collection(
    name="test_data",
    metadata={"hnsw:space": "cosine"},
)


In [5]:
from tqdm import tqdm

def add_to_collection(collection, df, batch_size=2000):
    """Add every row of ``df`` to ``collection`` in consecutive batches.

    Args:
        collection: Target collection; must expose a ``name`` attribute and an
            ``add(ids=..., embeddings=..., metadatas=...)`` method.
        df: DataFrame with the columns "id", "embeddings", "label",
            "label_text" and "text".
        batch_size: Maximum number of rows sent per ``add`` call; must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Without this check a
            negative value yields an empty range and silently ingests nothing.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    for start in tqdm(range(0, len(df), batch_size), desc=f"→ {collection.name}"):
        # .iloc slices by row position, independent of the index labels.
        batch = df.iloc[start : start + batch_size]

        collection.add(
            ids=batch["id"].tolist(),
            embeddings=batch["embeddings"].tolist(),
            # One metadata dict per row, holding the label fields and the raw text.
            metadatas=batch[["label", "label_text", "text"]].to_dict("records"),
        )

# Ingest each split into its collection, announcing the split first.
for banner, target_collection, source_frame in (
    ("Ingestion train...", collection_train, df_train),
    ("Ingestion test...", collection_test, df_test),
):
    print(banner)
    add_to_collection(target_collection, source_frame)


Ingestion train...


→ train_data: 100%|██████████| 60/60 [00:46<00:00,  1.28it/s]


Ingestion test...


→ test_data: 100%|██████████| 4/4 [00:02<00:00,  1.64it/s]


In [6]:
# Ask each collection how many records it currently holds.
count_train, count_test = collection_train.count(), collection_test.count()


In [7]:
# Build the two report lines first, then emit them in order.
report_lines = [
    f"Total items in Training Collection: {count_train}",
    f"Total items in Test Collection:     {count_test}",
]
for report_line in report_lines:
    print(report_line)


Total items in Training Collection: 120000
Total items in Test Collection:     7600


In [8]:
# Pull a single record from the training collection for a spot check.
first_item = collection_train.peek(1)


In [9]:
# Format the first returned id, then display it.
id_line = f"ID: {first_item['ids'][0]}"
print(id_line)


ID: train0


In [11]:
# Show at most the first 100 characters of the stored text, followed by an ellipsis.
snippet_line = f"Text Snippet: {first_item['metadatas'][0]['text'][:100]}..."
print(snippet_line)


Text Snippet: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\b...
