In [1]:
import json
import snomed_ct as sc
import yaml
from neo4j import GraphDatabase
import shutil
from pathlib import Path
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import pandas as pd

import openai
import os
import umls

In [2]:
# Load API keys and connection settings from config.yaml into PARAM.
with open("config.yaml", "r", encoding="utf-8") as stream:
    try:
        PARAM = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Later cells index PARAM directly; stopping here points at the real
        # cause instead of a NameError on PARAM further down.
        raise RuntimeError("config.yaml could not be parsed as YAML") from exc

In [3]:
# Load the exported study records (a JSON list with one dict per study).
# For a quick run, open 'ctg-studies_test.json' instead.
# The context manager closes the file even if json.load raises.
with open('ctg-studies(5).json', encoding="utf-8") as f:
    df = json.load(f)
df[0]

{'protocolSection': {'identificationModule': {'nctId': 'NCT05925803',
   'orgStudyIdInfo': {'id': 'D3460C00002'},
   'secondaryIdInfos': [{'id': '2023-505976-31',
     'type': 'OTHER',
     'domain': 'EuCTR'}],
   'organization': {'fullName': 'AstraZeneca', 'class': 'INDUSTRY'},
   'briefTitle': 'Determine Effectiveness of Anifrolumab In SYstemic Sclerosis (DAISY)',
   'officialTitle': 'A Multicenter, Randomized, Parallel-group, Double-blind,Two-arm Phase III Study to Evaluate the Safety and Efficacy of Anifrolumab Compared With Placebo in Male and Female Participants 18 to 70 Years of Age Inclusive With Systemic Sclerosis',
   'acronym': 'DAISY'},
  'statusModule': {'statusVerifiedDate': '2023-06',
   'overallStatus': 'NOT_YET_RECRUITING',
   'expandedAccessInfo': {'hasExpandedAccess': False},
   'startDateStruct': {'date': '2023-07-13', 'type': 'ESTIMATED'},
   'primaryCompletionDateStruct': {'date': '2027-12-31', 'type': 'ESTIMATED'},
   'completionDateStruct': {'date': '2027-12-31'

In [4]:
# Graph containers filled by the extraction loop:
#   dicts hold nodes keyed by id, sets hold de-duplicated tuples.
node_trial, node_condition = {}, {}
node_institution = set()

edge_trial_condition, edge_trial_institution = set(), set()



In [5]:
# Build trial, institution and condition nodes (plus their edges) from the raw
# study records. A trial is kept only when at least one of its conditions
# resolves to a SNOMED CT concept id.
condition_snomed = {}  # condition term -> SNOMED CT id or None; caches lookups

for row in df:
    protocol = row["protocolSection"]
    identification = protocol["identificationModule"]
    eligibility = protocol["eligibilityModule"]

    nct_number = identification["nctId"]
    title = identification["officialTitle"]
    status = protocol["statusModule"]["overallStatus"]
    start_date = protocol["statusModule"]["startDateStruct"]["date"]
    study_results = row["hasResults"]
    condition_terms = protocol["conditionsModule"]["conditions"]

    # Age limits are optional; the "Years" unit is stripped when present.
    min_age = None
    if "minimumAge" in eligibility:
        min_age = eligibility["minimumAge"].replace("Years", "").strip()

    max_age = None
    if "maximumAge" in eligibility:
        max_age = eligibility["maximumAge"].replace("Years", "").strip()

    criteria = eligibility["eligibilityCriteria"]
    healthy_volunteers = eligibility["healthyVolunteers"]
    sampling_method = eligibility.get("samplingMethod", "")
    gender = eligibility["sex"]

    # Map each condition term to SNOMED CT, reusing earlier lookups.
    snomed_exists = False
    for term in condition_terms:
        if term.endswith("."):
            term = term[:-1]

        if term not in condition_snomed:
            condition_snomed[term] = sc.getConceptIdByTerm(term)
        snomed_id = condition_snomed[term]

        if snomed_id is not None:
            snomed_exists = True
            node_condition[snomed_id] = term
            edge_trial_condition.add((nct_number, snomed_id))

    if not snomed_exists:
        continue

    design = protocol["designModule"]
    sponsors = protocol["sponsorCollaboratorsModule"]

    arms = protocol.get("armsInterventionsModule", {})
    interventions = "|".join(x["name"] for x in arms.get("interventions", []))

    study_type = design["studyType"]
    study_designs = json.dumps(design["designInfo"])

    # Lead sponsor and collaborators all become institution nodes linked to the trial.
    if "leadSponsor" in sponsors:
        lead_sponsor = sponsors["leadSponsor"]
        node_institution.add((lead_sponsor["name"], lead_sponsor["class"]))
        edge_trial_institution.add((nct_number, lead_sponsor["name"]))

    for collaborator in sponsors.get("collaborators", []):
        node_institution.add((collaborator["name"], collaborator["class"]))
        edge_trial_institution.add((nct_number, collaborator["name"]))

    description = protocol["descriptionModule"]["briefSummary"]

    # Distinct countries, sorted so repeated runs give the same string.
    locations = "|".join(sorted(
        {str(x.get("country")) for x in protocol["contactsLocationsModule"]["locations"]}
    ))

    phases = "|".join(design.get("phases", []))
    enrollment = design["enrollmentInfo"]["count"]

    url = f"https://ClinicalTrials.gov/show/{nct_number}"

    # Primary outcomes first, then secondary ones, each as "measure; timeFrame".
    # Secondary outcomes are read from their own list, and joining one combined
    # list avoids a leading "|" when a trial has no primary outcomes.
    outcomes_module = protocol["outcomesModule"]
    outcome_parts = []
    for outcome_key in ("primaryOutcomes", "secondaryOutcomes"):
        outcome_parts.extend(
            f'{x["measure"]}; {x["timeFrame"]}' for x in outcomes_module.get(outcome_key, [])
        )
    # dict.fromkeys drops duplicates while keeping the original order.
    outcome_measures = "|".join(dict.fromkeys(outcome_parts))

    # "interventions" and "study_designs" are kept on the record; they are not
    # among the columns currently written to node_trial.tsv.
    node_trial[nct_number] = {
        "title": title, "status": status, "study_results": study_results, "study_type": study_type,
        "start_date": start_date, "min_age": min_age, "max_age": max_age, "gender": gender,
        "criteria": criteria, "healthy_volunteers": healthy_volunteers,
        "sampling_method": sampling_method, "locations": locations, "phases": phases,
        "enrollment": enrollment, "description": description, "url": url,
        "outcome_measures": outcome_measures,
        "interventions": interventions, "study_designs": study_designs,
    }


    


In [6]:
# Sanity check: number of trials kept (those with at least one SNOMED-mapped condition).
len(node_trial)

24

In [7]:
# Folder that receives every TSV written by the cells below. It is created up
# front so those cells do not fail with FileNotFoundError on a fresh checkout.
output_folder = "for_neo4j"
Path(output_folder).mkdir(parents=True, exist_ok=True)

In [8]:
# Export the trial nodes to node_trial.tsv: one header line, then one row per trial.
# Column names double as the keys read from each node_trial record.
TRIAL_COLUMNS = ['title',
                 'status', 'study_results', 'study_type',
                 'start_date', 'min_age', 'max_age', 'gender', "criteria",
                 "healthy_volunteers", "sampling_method", 'locations',
                 'phases', 'enrollment', 'description', 'url', 'outcome_measures']


def _tsv_field(value):
    """Return ``value`` as text that fits in a single TSV cell.

    Tabs, carriage returns and newlines are each replaced by a space so that
    free text cannot shift columns or split a row.
    """
    return str(value).replace("\r", " ").replace("\n", " ").replace("\t", " ")


# Explicit UTF-8: the trial text contains non-ASCII characters, and the
# platform default encoding is not guaranteed to represent them.
with open(f'{output_folder}/node_trial.tsv', 'w', encoding='utf-8') as outfile:
    outfile.write("\t".join(["NCT"] + TRIAL_COLUMNS) + "\n")
    for nct, node in node_trial.items():
        fields = [nct] + [_tsv_field(node[column]) for column in TRIAL_COLUMNS]
        outfile.write("\t".join(fields) + "\n")

In [9]:
# Export the institution nodes as (name, type) rows.
# UTF-8 is explicit so names with non-ASCII characters do not depend on the
# platform default encoding.
with open(f'{output_folder}/node_institution.tsv', 'w', encoding='utf-8') as outfile:
    outfile.write("\t".join(["name", 'type']) + "\n")
    # Set iteration order is arbitrary; sorting makes the file reproducible.
    for institution_name, institution_class in sorted(node_institution):
        outfile.write("\t".join([institution_name, institution_class]) + "\n")

In [10]:
# Inspect the SNOMED CT concept id -> condition term mapping collected above.
node_condition

{'89155008': 'Systemic Sclerosis',
 '422801000': 'Scleroderma',
 '13644009': 'Hypercholesterolemia',
 '128822004': 'Acute Lymphocytic Leukemia',
 '2704003': 'Acute Disease',
 '24079001': 'Atopic Dermatitis',
 '22352007': 'Acute Pyelonephritis',
 '57019003': 'Common Wart',
 '840533007': 'SARS-CoV-2',
 '1162767002': 'Squamous Cell Carcinoma',
 '1162576007': 'Multiple Myeloma',
 '399326009': 'Bladder Cancer',
 '28293008': 'Haemophilia A',
 '414916001': 'Obesity',
 '238131007': 'Overweight',
 '128833001': 'Aggressive NK Cell Leukemia',
 '37796009': 'Migraine',
 '55464009': 'Systemic Lupus Erythematosus',
 '405152002': 'Quality of Life',
 '59393003': 'Hidradenitis Suppurativa',
 '11399002': 'Pulmonary Arterial Hypertension',
 '735676003': 'Narcolepsy Type 1',
 '442685003': 'Nonalcoholic Steatohepatitis'}

In [11]:
# Export the condition nodes. The UMLS column is written empty here and is
# filled in later, when this file is read back with pandas.
# pandas.read_csv decodes as UTF-8 by default, so the file is written as UTF-8.
with open(f'{output_folder}/node_condition.tsv', 'w', encoding='utf-8') as outfile:
    outfile.write("\t".join(['SNOMEDCT', "name", 'UMLS']) + "\n")
    for snomed_id, condition_name in node_condition.items():
        outfile.write("\t".join([snomed_id, condition_name, ""]) + "\n")

In [12]:
# Export trial -> condition edges: a "from"/"to" header, then one pair per line.
with open(f'{output_folder}/edge_trial_condition.tsv', 'w') as outfile:
    outfile.write("from\tto\n")
    for trial_id, snomed_id in edge_trial_condition:
        outfile.write(trial_id + "\t" + snomed_id + "\n")

In [13]:
# Export trial -> institution edges. The "to" column holds the institution
# name, so the file uses the same explicit UTF-8 encoding as the node files
# rather than the platform default.
with open(f'{output_folder}/edge_trial_institution.tsv', 'w', encoding='utf-8') as outfile:
    outfile.write("\t".join(["from", 'to']) + "\n")
    # Sorted for a reproducible file; set iteration order is arbitrary.
    for trial_id, institution_name in sorted(edge_trial_institution):
        outfile.write("\t".join([trial_id, institution_name]) + "\n")

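Read the condition file back in and enrich each condition:

1. Add its SNOMED CT ontology relationships (categories, finding sites, morphologies).
2. Add its UMLS concepts.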

In [14]:


# UMLS API key read from the loaded config; passed to umls.get_UMLS below.
UMLS_API_KEY = PARAM["UMLS_API_KEY"]

In [15]:
# Read the condition nodes back in as text.
#   dtype=str             keeps SNOMED CT ids as strings and lets the (still
#                         empty) UMLS column accept string values later.
#   keep_default_na=False stops empty cells, or names such as "NA", from being
#                         turned into NaN.
conditions = pd.read_csv(
    "for_neo4j/node_condition.tsv", sep="\t", dtype=str, keep_default_na=False
)

conditions.head()

Unnamed: 0,SNOMEDCT,name,UMLS
0,89155008,Systemic Sclerosis,
1,422801000,Scleroderma,
2,13644009,Hypercholesterolemia,
3,128822004,Acute Lymphocytic Leukemia,
4,2704003,Acute Disease,


In [16]:
# NOTE(review): repeats the preview shown by the previous cell; this cell can be deleted.
conditions.head()

Unnamed: 0,SNOMEDCT,name,UMLS
0,89155008,Systemic Sclerosis,
1,422801000,Scleroderma,
2,13644009,Hypercholesterolemia,
3,128822004,Acute Lymphocytic Leukemia,
4,2704003,Acute Disease,


In [17]:
# Expand every condition into its SNOMED CT relationships and attach its UMLS
# concepts, then write the enriched condition table back to disk.
node_category = set()
node_site = set()
node_morphology = set()

edge_condition_category = set()
edge_category_category = set()
edge_condition_site = set()
edge_site_site = set()
edge_condition_morphology = set()

# NOTE(review): `done` and `son_father` are not used anywhere in the visible
# notebook; kept in case another cell relies on them.
done = set()

son_father = {}

umls_column = []  # one comma-joined UMLS string per row of `conditions`

for _, row in conditions.iterrows():
    # `condition_id` rather than `id`, so the builtin is not overwritten.
    condition_id = str(row["SNOMEDCT"])

    relationships = sc.getOnotologyById(condition_id)
    print (condition_id, relationships)

    # Keys are (source id, target id, relation name); values describe the target.
    for key, value in relationships.items():
        source, target, relation = key[0], key[1], key[2]
        target_node = (target, value["target_fsn"], value["target_pt"])

        if relation == "Is a (attribute)":
            if value["target_fsn"].endswith("(body structure)"):
                # Hierarchy between body structures: these are site nodes only.
                edge_site_site.add((source, target))
                node_site.add(target_node)
            else:
                if str(source) == condition_id:
                    edge_condition_category.add((source, target))
                else:
                    edge_category_category.add((source, target))
                # Only non-body-structure parents are categories. Previously
                # body structures were also added here and ended up as
                # category nodes with no category edge.
                node_category.add(target_node)

        elif relation == "Finding site (attribute)":
            edge_condition_site.add((source, target))
            node_site.add(target_node)

        elif relation == "Associated morphology (attribute)":
            node_morphology.add(target_node)
            edge_condition_morphology.add((source, target))

    umls_concepts = umls.get_UMLS(row["name"], UMLS_API_KEY)
    umls_column.append(",".join(umls_concepts))

# Assign the column once instead of writing strings cell by cell, which does
# not depend on the dtype pandas inferred for the empty UMLS column.
conditions["UMLS"] = umls_column

conditions.to_csv("for_neo4j/node_condition.tsv", sep="\t", index=False)

https://browser.ihtsdotools.org/snowstorm/snomed-ct/browser/MAIN/concepts/89155008


267874003
getTaxonomy 267874003 362992004
getTaxonomy 362992004 85828009
getTaxonomy 85828009 414029004
getTaxonomy 414029004 64572001
getTaxonomy 85828009 64572001
getTaxonomy 362992004 105969002
getTaxonomy 105969002 64572001
getTaxonomy 362992004 64572001
getTaxonomy 267874003 64572001
56019007
getTaxonomy 56019007 64572001
21793004
getTaxonomy 21793004 85756007
getTaxonomy 85756007 91723000
getTaxonomy 91723000 442083009
getTaxonomy 442083009 123037004
56019007
21793004
89155008 {('89155008', '267874003', 'Is a (attribute)'): {'fsn': 'Is a (attribute)', 'pt': 'Is a', 'conceptId': '267874003', 'target_fsn': 'Scleroderma (disorder)', 'target_pt': 'Scleroderma'}, ('267874003', '362992004', 'Is a (attribute)'): {'fsn': 'Is a (attribute)', 'pt': 'Is a', 'conceptId': '362992004', 'target_fsn': 'Autoimmune connective tissue disorder (disorder)', 'target_pt': 'Autoimmune connective tissue disorder'}, ('362992004', '85828009', 'Is a (attribute)'): {'fsn': 'Is a (attribute)', 'pt': 'Is a', '

In [18]:
# Export the category nodes as (SNOMED CT id, fully specified name, preferred term).
# UTF-8 is explicit so terms with non-ASCII characters do not depend on the
# platform default encoding.
with open(f'{output_folder}/node_category.tsv', 'w', encoding='utf-8') as outfile:
    outfile.write("\t".join(["SNOMEDCT", 'fsn', 'name']) + "\n")
    # Sorted for a reproducible file; set iteration order is arbitrary.
    for concept_id, fsn, preferred_term in sorted(node_category):
        outfile.write("\t".join([concept_id, fsn, preferred_term]) + "\n")

In [19]:
# Export the site nodes as (SNOMED CT id, fully specified name, preferred term).
# UTF-8 is explicit so terms with non-ASCII characters do not depend on the
# platform default encoding.
with open(f'{output_folder}/node_site.tsv', 'w', encoding='utf-8') as outfile:
    outfile.write("\t".join(["SNOMEDCT", 'fsn', 'name']) + "\n")
    # Sorted for a reproducible file; set iteration order is arbitrary.
    for concept_id, fsn, preferred_term in sorted(node_site):
        outfile.write("\t".join([concept_id, fsn, preferred_term]) + "\n")

In [20]:
# Export the morphology nodes as (SNOMED CT id, fully specified name, preferred term).
# UTF-8 is explicit so terms with non-ASCII characters do not depend on the
# platform default encoding.
with open(f'{output_folder}/node_morphology.tsv', 'w', encoding='utf-8') as outfile:
    outfile.write("\t".join(["SNOMEDCT", 'fsn', 'name']) + "\n")
    # Sorted for a reproducible file; set iteration order is arbitrary.
    for concept_id, fsn, preferred_term in sorted(node_morphology):
        outfile.write("\t".join([concept_id, fsn, preferred_term]) + "\n")

In [21]:
# Export condition -> category edges: header line, then one id pair per line.
with open(f'{output_folder}/edge_condition_category.tsv', 'w') as outfile:
    edge_lines = ["from\tto"]
    edge_lines.extend(
        str(source) + "\t" + str(target) for source, target in edge_condition_category
    )
    outfile.write("\n".join(edge_lines) + "\n")

In [22]:
# Export category -> category edges: header line, then one id pair per line.
with open(f'{output_folder}/edge_category_category.tsv', 'w') as outfile:
    outfile.write("from\tto\n")
    outfile.writelines(
        f"{str(source)}\t{str(target)}\n" for source, target in edge_category_category
    )

In [23]:
# Export condition -> finding-site edges: header line, then one id pair per line.
with open(f'{output_folder}/edge_condition_site.tsv', 'w') as outfile:
    outfile.write("\t".join(("from", "to")) + "\n")
    for source, target in edge_condition_site:
        outfile.write("\t".join(map(str, (source, target))) + "\n")

In [24]:
# Export site -> site edges: header line, then one id pair per line.
with open(f'{output_folder}/edge_site_site.tsv', 'w') as outfile:
    print("from", "to", sep="\t", file=outfile)
    for source, target in edge_site_site:
        print(str(source), str(target), sep="\t", file=outfile)

In [25]:
# Export condition -> morphology edges: header line, then one id pair per line.
with open(f'{output_folder}/edge_condition_morphology.tsv', 'w') as outfile:
    header_line = "\t".join(["from", 'to'])
    outfile.write(header_line + "\n")
    for source, target in edge_condition_morphology:
        edge_line = "\t".join([str(source), str(target)])
        outfile.write(edge_line + "\n")

In [6]:


# Connect to Neo4j. The Bolt address may be overridden with an optional
# "neo4j_url" entry in the loaded config; without it the local default is used.
url = PARAM.get("neo4j_url", "bolt://localhost:7687")
driver = GraphDatabase.driver(url, auth=(PARAM["neo4j_username"], PARAM["neo4j_password"]))



In [7]:


# Reset the database: drop every constraint, then delete all nodes and their
# relationships. This is destructive.
with driver.session() as session:
    # Read all names before issuing DROP statements in the same session, and
    # look the column up by name instead of by position.
    constraint_names = [record["name"] for record in session.run("SHOW CONSTRAINTS")]

    for constraint_name in constraint_names:
        # Backtick-quote the name so characters that are special in Cypher
        # cannot break the statement.
        quoted_name = "`" + constraint_name.replace("`", "``") + "`"
        session.run("DROP CONSTRAINT " + quoted_name + " IF EXISTS")

    # delete all nodes
    session.run("MATCH (n) DETACH DELETE n")



In [3]:
# Default model name used by get_embedding.
EMBEDDING_MODEL = "text-embedding-ada-002"
# The OpenAI key is taken from the loaded config rather than hard-coded here.
openai.api_key = PARAM["OPENAI_API_KEY"]

In [4]:
def get_embedding(text, model=EMBEDDING_MODEL):
    """Return the embedding of ``text`` produced by ``model``.

    Newlines in ``text`` are replaced by spaces before the request is made.
    The value returned is the ``embedding`` entry of the first item in the
    response's ``data`` list.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']

In [5]:
# Qdrant connection; URL and API key both come from the loaded config.
qdrant_client = QdrantClient(url=PARAM["qdrant_URL"], api_key=PARAM["qdrant_API_KEY"])

In [31]:
# (Re)create the "description" collection with cosine distance.
# The vector size must match the length of the vectors stored in it.
description_vectors = VectorParams(size=1536, distance=Distance.COSINE)
qdrant_client.recreate_collection(collection_name="description", vectors_config=description_vectors)

True

In [32]:
# NOTE(review): this displays the whole node_trial dict, including every
# trial's full criteria text; consider previewing a single entry instead.
node_trial

{'NCT05925803': {'title': 'A Multicenter, Randomized, Parallel-group, Double-blind,Two-arm Phase III Study to Evaluate the Safety and Efficacy of Anifrolumab Compared With Placebo in Male and Female Participants 18 to 70 Years of Age Inclusive With Systemic Sclerosis',
  'status': 'NOT_YET_RECRUITING',
  'study_results': False,
  'study_type': 'INTERVENTIONAL',
  'start_date': '2023-07-13',
  'min_age': '18',
  'max_age': '70',
  'gender': 'ALL',
  'criteria': "Key Inclusion Criteria:\n\n1. Adult patients from 18 to 70 years of age inclusive\n2. Systemic sclerosis according to 2013 ACR/EULAR classification criteria\n3. Limited or diffuse cutaneous subsets\n4. Systemic sclerosis disease duration within 6 years from first non-Raynaud's phenomenon manifestation\n5. Either HAQ-DI score ≥ 0.25 points or PtGA score ≥ 3 points\n6. mRSS \\> 10 with early disease or rapid progression as defined by the protocol\n7. mRSS ≥ 15 with disease duration ≥ 18 months and active disease as defined by the 

In [33]:
# Embed each trial description and store it in the "description" collection.
# Point ids are the position of the trial in node_trial; the payload keeps the
# NCT id and the description text.
description_points = []
for point_id, nct_id in enumerate(node_trial):
    trial_description = node_trial[nct_id]["description"]
    description_points.append(
        PointStruct(
            id=point_id,
            vector=get_embedding(trial_description),
            payload={"NCT": nct_id, "description": trial_description},
        )
    )

qdrant_client.upsert(collection_name="description", points=description_points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [6]:
def query_vector_db(text, collection_name, top_k=5):
    """Embed ``text`` and search ``collection_name`` with that vector.

    Returns whatever ``qdrant_client.search`` returns, limited to ``top_k`` hits.
    """
    query_vector = get_embedding(text)
    return qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k,
    )

In [35]:
# Example search: print NCT id and description for each hit.
query_question = "Tolerability of AZD2693"
results = query_vector_db(query_question, "description")

for hit in results:
    payload = hit.payload
    print(payload["NCT"], payload["description"])

NCT05809934 A Study to Evaluate the Efficacy, Safety and Tolerability of AZD2693 given by subcutaneous injection in adult participants with non-cirrhotic non-alcoholic steatohepatitis with fibrosis and who are carriers of the PNPLA3 148M Risk Allele
NCT05911841 The main purpose of this study is to describe the efficacy and safety of LY3454738 in adult participants with moderate-to-severe atopic dermatitis (AD).
NCT05863234 This is Phase I/II Dose-Escalation Study to evaluate the tolerability, safety, efficacy and pharmacokinetics of PPMX-T003 in aggressive NK-cell leukemia.
NCT05816382 The main aim is to evaluate the safety and tolerability of TAK-861 on participants with type 1 and type 2 narcolepsy from previous parent studies, TAK-861-2001 (NCT05687903) and TAK-861-2002 (NCT05687916).
NCT05923281 To investigate the efficacy and safety of K-877 Extended Release 0.2 mg/day or 0.4 mg/day for 12 weeks in patients with Statin Intolerant\* Hypercholesterolemia,using placebo as a controll.

In [36]:
# Example search: print score, NCT id and description for each hit.
query_question = "A comprehensive evaluation of PPMX-T003"
results = query_vector_db(query_question, collection_name="description")

for hit in results:
    print(hit.score, *(hit.payload[key] for key in ("NCT", "description")))

0.8487437 NCT05863234 This is Phase I/II Dose-Escalation Study to evaluate the tolerability, safety, efficacy and pharmacokinetics of PPMX-T003 in aggressive NK-cell leukemia.
0.77726406 NCT05816382 The main aim is to evaluate the safety and tolerability of TAK-861 on participants with type 1 and type 2 narcolepsy from previous parent studies, TAK-861-2001 (NCT05687903) and TAK-861-2002 (NCT05687916).
0.77372235 NCT05818137 This local Phase 3 study is planned to confirm the efficacy and safety in Japanese PAH participants. The primary population of this study is Japanese PAH participants with World Health Organization Functional Class (WHO FC) II or III while the study includes PAH participants with WHO FC I or IV as other populations. There are no hypotheses for this study.
0.7698162 NCT05827016 The purpose of this study is to compare the effectiveness of iberdomide maintenance to lenalidomide maintenance therapy after autologous stem cell transplantation (ASCT) in participants with n

In [7]:
# Example search: print score, NCT id and description for each hit.
query_question = "trial that compares iberdomide and lenalidomide"
results = query_vector_db(query_question, "description")

for match in results:
    nct_id, summary = match.payload["NCT"], match.payload["description"]
    print(match.score, nct_id, summary)

0.8816393 NCT05827016 The purpose of this study is to compare the effectiveness of iberdomide maintenance to lenalidomide maintenance therapy after autologous stem cell transplantation (ASCT) in participants with newly diagnosed multiple myeloma (NDMM).
0.8135842 NCT05882279 This is a survey among pharmacists who have instructed NINLARO therapy in ixazomib, lenalidomide and dexamethasone (IRD) dosing to patients with relapsed/refractory multiple myeloma (rrMM).

The main aims of the study are:

* To assess the frequency of pharmacists who have provided patients with the contents of the RMP material for patients.
* To assess the frequency of pharmacists who have obtained the RMP material for patients.
* To evaluate the depth of understanding of proper usage of NINLARO among pharmacists.
0.8124607 NCT05863234 This is Phase I/II Dose-Escalation Study to evaluate the tolerability, safety, efficacy and pharmacokinetics of PPMX-T003 in aggressive NK-cell leukemia.
0.79700524 NCT05925803 The 