In [3]:
# Import Chroma and open a *persistent* client. Unlike Chroma's default
# ephemeral client, PersistentClient saves the database to disk under `path`,
# so the stored collection is still there after a kernel restart.
import chromadb

# https://docs.trychroma.com/usage-guide
client = chromadb.PersistentClient(path="./vector_db")
# Open the Chroma collection that stores the supporting evidence, creating it
# on first use. No embedding function is specified, so the default is used.
collection = client.get_or_create_collection(name="clinical_trials")

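For each clinical trial we add to the collection, we need the following:
- ids: a unique identifier for the record (the trial's NCT ID)
- documents: the text that gets embedded (the formatted trial summary)
- metadata: the trial's fields as a dictionary (passed as `metadatas`)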

In [4]:
# Directory of per-study JSON files. Despite the ".json" suffix in its name,
# it is used as a folder: the bulk-load cell lists it with os.listdir.
folder = "data/ctg-studies.json"
# A single study file, loaded on its own before the bulk load.
file_path = "data/NCT04414150.json"

In [5]:
# NOTE(review): the wildcard import is kept (and kept first) because other
# cells may rely on names it provides, and so that the explicit imports below
# still win if module.helpers happens to export `os` or `tqdm`. This cell uses
# load_json_file, process_clinical_trial_data, format_clinical_trial_summary
# and turn_json_to_valid_metadata, presumably from module.helpers; prefer
# importing them by name once that is confirmed.
from module.helpers import *

import os

from tqdm import tqdm

# Run the helpers on a single study first, so a failure surfaces before the
# long bulk load below starts.
json_data = load_json_file(file_path)
clinical_trial_summary = process_clinical_trial_data(json_data)

# Go through all the clinical trials in the folder and add them to the collection.
# The listing is filtered before it reaches tqdm so the progress total counts
# only JSON files, and sorted so the processing order is reproducible.
json_files = sorted(name for name in os.listdir(folder) if name.endswith(".json"))

for file in tqdm(json_files):
    json_data = load_json_file(os.path.join(folder, file))
    clinical_trial_summary = process_clinical_trial_data(json_data)
    summary_to_embed = format_clinical_trial_summary(clinical_trial_summary)
    metadata = turn_json_to_valid_metadata(clinical_trial_summary)
    # upsert instead of add: the collection lives in a persistent store, so a
    # re-run of this cell meets IDs that are already present. upsert creates
    # missing records and overwrites existing ones with the freshly computed
    # summary and metadata, keeping the store in sync with the helpers.
    collection.upsert(
        ids=[clinical_trial_summary["NCT ID"]],
        documents=[summary_to_embed],
        metadatas=[metadata],
    )

100%|██████████| 13424/13424 [08:21<00:00, 26.75it/s]


In [9]:
# Semantic search: fetch the 10 stored trials closest to the query text.
# The search can optionally be narrowed with two further keyword arguments:
#   where={"metadata_field": "is_equal_to_this"}    -> filter on metadata
#   where_document={"$contains": "search_string"}   -> filter on document text
collection.query(query_texts=["Cancer"], n_results=10)

{'ids': [['NCT03476070',
   'NCT00003329',
   'NCT03207594',
   'NCT00544336',
   'NCT00601406',
   'NCT00948337',
   'NCT00962494',
   'NCT03740503',
   'NCT01415089',
   'NCT05708703']],
 'distances': [[1.0686001777648926,
   1.072503924369812,
   1.0777868032455444,
   1.0817314386367798,
   1.0903290510177612,
   1.094771146774292,
   1.1004599332809448,
   1.1009414196014404,
   1.1017135381698608,
   1.1025917530059814]],
 'metadatas': [[{'Brief Summary': 'This study aims to evaluate the prevalence, biological mechanism and survivorship impact of cognitive toxicity among adolescent and young adult (AYA) patients diagnosed with curable cancers. The hypothesis is that cognitive impairment is clinically significant among AYA cancer patients treated with chemotherapy and that there will be detectable structural and functional changes in the brain for this patient group.',
    'Brief Title': 'Adolescent and Young Adult Cancer Patients: Cognitive Toxicity on Survivorship (ACTS)',
    '