In [1]:
import sqlite3

import numpy as np
import pandas as pd

from litreview import ClinicalTrials

In [2]:
# Connect to the SQLite file and hand the connection, together with the schema
# path, to the ClinicalTrials wrapper.
db_connection = sqlite3.connect("../clinical_trials_lean.db")
trials = ClinicalTrials(
    connection=db_connection,
    schema_directory="../files/schema.json",
)

# Query just the two text fields and drop every row that has a missing value in
# either of them. The index is not reset, so labels keep gaps where rows were
# removed.
columns = ["BriefTitle", "BriefSummary"]
data = pd.DataFrame(trials.query(*columns), columns=columns).dropna()
data

Unnamed: 0,BriefTitle,BriefSummary
0,Kinesiotape for Edema After Bilateral Total Kn...,The purpose of this study is to determine if k...
1,An Open-labeled Trial of Ramipril in Patients ...,Physiology of migraine involving renin-angiote...
2,OCTA in Epivascular Glia After Dex Implant,The aim of this prospective study was for the ...
3,Brain-imaging and Adolescent Neuroscience Cons...,This is a multi-site study of adolescents 12-2...
4,Leverage Noninvasive Transcutaneous Vagus Nerv...,"Suicidal thoughts, suicide attempts, and suici..."
...,...,...
995,Treatment of Non Union of Long Bone Fractures ...,"Treatment of nonunion, delayed union and malun..."
996,"Century Trial, a Randomized Lifestyle Modifica...",The Century Trial is a single center Phase III...
997,Reduced Craniospinal Radiation Therapy and Che...,This phase II trial studies how well reduced d...
998,Prospective Two-arm Study of Fertility in Men ...,COVID-19 infection is hypothesized to have a p...


In [33]:
from chromadb import PersistentClient
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import time
import uuid

In [23]:
# Create (or reopen) the vector store. A persistent on-disk client is used
# because the in-memory client hit OOM issues (w/ 32 GB RAM).
client = PersistentClient(path="../chromadb")

# The store lives on disk, so the "default" collection survives kernel restarts.
# create_collection would raise on every run after the first because the name is
# already taken; get_or_create_collection keeps this cell re-runnable.
collection = client.get_or_create_collection(
    "default",
    embedding_function=SentenceTransformerEmbeddingFunction(
        model_name="neuml/pubmedbert-base-embeddings"
    ),
)

In [51]:
start = time.time()
# Index data: one document per summary, keyed by its row position rendered as a
# string.
summaries = data['BriefSummary'].tolist()
row_ids = list(map(str, range(len(summaries))))
collection.add(ids=row_ids, documents=summaries)

print(f"ELAPSED = {time.time() - start:.2f}s")

ELAPSED = 31.66s


In [53]:
# Upper bound on the number of hits; passed to collection.query as n_results.
max_results: int = 5
# Search text; passed to collection.query as query_texts.
query: str = 'suicide'

In [54]:
# Ask the collection for the closest matches to the search text. Only distances
# are requested, so documents, metadatas and embeddings come back as None.
results = collection.query(
    query_texts=query,
    n_results=max_results,
    include=['distances'],
)

In [55]:
# Show the raw response returned by collection.query (ids plus distances).
results

{'ids': [['476', '864', '4', '745', '634']],
 'distances': [[138.98170471191406,
   192.12664794921875,
   203.3760528564453,
   214.7658233642578,
   273.4443664550781]],
 'metadatas': None,
 'embeddings': None,
 'documents': None,
 'uris': None,
 'data': None,
 'included': ['distances']}

In [56]:
# The ids were generated as str(position) when the summaries were indexed, so
# they are row positions in `data`, not index labels (labels have gaps after
# dropna). They come back as strings, and .iloc only accepts integer positions,
# so convert them before selecting the matching rows.
hit_positions = [int(doc_id) for doc_id in results['ids'][0]]
data.iloc[hit_positions]

Unnamed: 0,BriefTitle,BriefSummary
477,Pain Perception in Suicidal Behavior Vulnerabi...,"In France, almost 1 death on 50 is a suicide. ..."
865,Brief Intervention for Suicide Risk Reduction ...,Adolescent suicide is the 2nd leading cause of...
4,Leverage Noninvasive Transcutaneous Vagus Nerv...,"Suicidal thoughts, suicide attempts, and suici..."
746,"Randomized, Placebo-controlled Multicenter Tri...",The primary hypothesis of this confirmatory st...
635,Using D-cycloserine to Enhance the Benefits of...,This study will examine whether pretreatment w...
