In [13]:
# Import Chroma and instantiate a client. Chroma's default client is ephemeral (it does not save to disk),
# so a PersistentClient is used here to keep the data under ./vector_db.
import chromadb

# https://docs.trychroma.com/usage-guide
client = chromadb.PersistentClient(path="./vector_db")
# Fetch the "clinical_trials" collection, creating it if it does not exist yet.
# We don't need to specify an embedding function, and the default will be used.
collection = client.get_or_create_collection("clinical_trials")

Each record added to the collection needs the following:
- ids
- documents
- metadata

In [14]:
# Single study file, loaded on its own before the bulk indexing loop.
file_path = "data/NCT04414150.json"
# Directory of study JSON files to index; despite the ".json" suffix it is listed with os.listdir().
folder = "data/ctg-studies.json"

In [15]:
# NOTE(review): wildcard import. Presumably it supplies load_json_file,
# process_clinical_trial_data, format_clinical_trial_summary and
# turn_json_to_valid_metadata -- confirm, then import those names explicitly.
from module.helpers import *

# Try the processing helpers on a single study before indexing the whole folder.
json_data = load_json_file(file_path)
clinical_trial_summary = process_clinical_trial_data(json_data)

# Go through all the clinical trials in the folder and add them to the collection
import os
from tqdm import tqdm

# Filter before iterating so the progress bar counts only the studies that are
# actually indexed, and sort so the processing order is the same on every run.
json_files = sorted(name for name in os.listdir(folder) if name.endswith(".json"))

for file in tqdm(json_files):
    json_data = load_json_file(os.path.join(folder, file))
    clinical_trial_summary = process_clinical_trial_data(json_data)
    # Text that gets embedded, plus the metadata stored alongside it.
    summary_to_embed = format_clinical_trial_summary(clinical_trial_summary)
    metadata = turn_json_to_valid_metadata(clinical_trial_summary)
    # upsert instead of add: the collection is backed by a PersistentClient, so
    # on a re-run the ids already exist on disk. add() would not overwrite them
    # (stale documents would be kept); upsert() inserts new ids and refreshes
    # existing ones, which makes this cell safe to run repeatedly.
    collection.upsert(
        ids=[clinical_trial_summary["NCT ID"]],
        documents=[summary_to_embed],
        metadatas=[metadata],
    )

100%|██████████| 3226/3226 [01:47<00:00, 29.90it/s]


In [17]:
# Free-text patient profile used as the semantic search query.
patient_profile = """                            
The patient is a 60-year-old pre-menopausal woman presenting with a history of grade-3 locally advanced, triple-negative breast cancer on her left side. Despite neoadjuvant chemotherapy and subsequent mastectomy, the cancer has recurred, indicating progressive disease. Her current ECOG score is 1, signifying she is active but limited in strenuous physical activity. 

The breast tumor is triple-negative as denoted by ER negativity (IHC<1%), PR negativity (IHC<1%), and HER2 negativity (IHC-/+ or IHC++ but FISH/CISH negative). The invasive tumor measures 4 cm with one objectively measurable lesion via RECIST 1.1 standard. She has no prior treatment experience with PD-1/PD-L1 antibody, CTLA-4 antibody, or anti-vascular targeted therapy. 

Recent blood reports show Hemoglobin of 95 g/L, Neutrophil count of 2.5×109/L, Platelet count of 120×109/L. Biochemical tests results reveal Total Bilirubin of 1.2×ULN, Alanine aminotransferase and Aspartate aminotransferase both at 1.5×ULN, and Serum creatinine of 1.3 ULN with a creatinine clearance of 70mL/min.

She has a history of smoking but no family history of breast cancer. The patient reports no known autoimmune diseases, serious infections, interstitial lung disease, or non-infectious pneumonia, or other harmful medical conditions. There have been no major surgical treatments or open biopsies conducted within the past 4 weeks.                 
"""

# Ask the collection for the 10 closest trial summaries to the patient profile.
# Optional filters (not used here) can be passed to query():
#   where={"metadata_field": "is_equal_to_this"}
#       filters the results THEN ranks them with the embeddings.
#   where_document={"$contains": "search_string"}
search_results = collection.query(
    query_texts=[patient_profile],
    n_results=10,
)

# Only the best match for the single query text is printed.
top_document = search_results["documents"][0][0]
print(top_document)

Clinical Trial Summary:

NCT ID: NCT04502680
Brief Title: Maintenance Treatment With Eribulin Mesylate Versus Observation in Triple
Negative Breast Cancer Patients
Brief Summary: This clinical trial is a multicenter, randomized, open-label, phase-II study to
evaluate the efficacy and safety of maintenance treatment with eribulin mesylate
following standard adjuvant chemotherapy in triple negative breast cancer
patients.
Official Title: A Multicenter, Randomized, Open-Label, Phase II Study to Evaluate the Efficacy
and Safety of Maintenance Treatment With Eribulin Mesylate Following Standard
Adjuvant Chemotherapy in Triple Negative Breast Cancer Patients
Conditions:
• Triple Negative Breast Cancer

Interventions Description:
• Eribulin mesylate 1.4mg/m2, administered intravenously on Days 1 and 8 of each 21 day cycle.

Eligibility Criteria:
Healthy Volunteers: False
Sex: FEMALE
Minimum Age: 18 Years
Standard Ages: ['ADULT', 'OLDER_ADULT']
Inclusion Criteria:  * The patient volunteers and

{'ids': [['NCT04502680',
   'NCT05447702',
   'NCT05402722',
   'NCT05205200',
   'NCT05909332',
   'NCT04567420',
   'NCT03077776',
   'NCT05749588',
   'NCT05722795',
   'NCT02926196']],
 'distances': [[0.5424139499664307,
   0.5643917918205261,
   0.5773933529853821,
   0.5781291127204895,
   0.589830756187439,
   0.6017122268676758,
   0.6144187450408936,
   0.6267913579940796,
   0.6281962394714355,
   0.6422033309936523]],
 'metadatas': [[{'Brief Summary': 'This clinical trial is a multicenter, randomized, open-label, phase-II study to evaluate the efficacy and safety of maintenance treatment with eribulin mesylate following standard adjuvant chemotherapy in triple negative breast cancer patients.',
    'Brief Title': 'Maintenance Treatment With Eribulin Mesylate Versus Observation in Triple Negative Breast Cancer Patients',
    'Conditions': 'Triple Negative Breast Cancer',
    'Eligibility': "Eligibility Criteria:\nHealthy Volunteers: False\nSex: FEMALE\nMinimum Age: 18 Years\n