In [45]:
!pip install pandas



In [46]:
import json
import snomed_ct as sc
import yaml
from neo4j import GraphDatabase
import shutil
from pathlib import Path

import pandas as pd

import os
import umls

In [47]:
with open("config.yaml", "r") as stream:
    try:
        PARAM = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

In [48]:
f = open('ctg-studies.json')
#f = open('ctg-studies_test.json')
df = json.load(f)
f.close()
df[0]

{'protocolSection': {'identificationModule': {'nctId': 'NCT05756322',
   'orgStudyIdInfo': {'id': 'LBS-007-CT01'},
   'organization': {'fullName': 'Lin BioScience, Inc', 'class': 'INDUSTRY'},
   'briefTitle': 'The Safety and Tolerability of LBS-007 in Patients With Relapsed or Resistant Acute Leukaemias',
   'officialTitle': 'A Phase 1/2, Open-label, Dose Escalation and Expansion Study to Evaluate the Safety and Tolerability of LBS-007 in Patients With Relapsed or Resistant Acute Leukaemias'},
  'statusModule': {'statusVerifiedDate': '2023-10',
   'overallStatus': 'RECRUITING',
   'expandedAccessInfo': {'hasExpandedAccess': False},
   'startDateStruct': {'date': '2023-07-20', 'type': 'ACTUAL'},
   'primaryCompletionDateStruct': {'date': '2025-12-15', 'type': 'ESTIMATED'},
   'completionDateStruct': {'date': '2025-12-15', 'type': 'ESTIMATED'},
   'studyFirstSubmitDate': '2022-10-20',
   'studyFirstSubmitQcDate': '2023-02-23',
   'studyFirstPostDateStruct': {'date': '2023-03-06', 'type':

In [49]:
node_trial = {}
node_institution = set()
node_condition = {}

edge_trial_condition = set()
edge_trial_institution = set()



In [50]:
condition_snomed = {}

for index, row in enumerate(df):
    NCT_mumber = row["protocolSection"]["identificationModule"]["nctId"]
    title = "none"
    if "officialTitle" in row["protocolSection"]["identificationModule"]:
        title = row["protocolSection"]["identificationModule"]["officialTitle"]
    status = row["protocolSection"]["statusModule"]["overallStatus"]
    start_date = row["protocolSection"]["statusModule"]["startDateStruct"]["date"]
    study_results = row["hasResults"]
    conditions = row["protocolSection"]["conditionsModule"]["conditions"]
    
    min_age = ""
    if "minimumAge" in row["protocolSection"]["eligibilityModule"]:
        min_age = row["protocolSection"]["eligibilityModule"]["minimumAge"].replace("Years", "").replace("Year", "").strip()

        if "Month" in min_age:
            min_age = min_age.replace("Months", "").replace("Month", "").strip()
            try:
                min_age = float(min_age)/12
            except ValueError:
                min_age = 0

        elif "Days" in min_age:
            min_age = min_age.replace("Days", "").replace("Day", "").strip()

            try:
                min_age = float(min_age)/365
            except ValueError:
                min_age = 0
        
        else:
            try:
                min_age = float(min_age)
            except ValueError:
                min_age = 0

    

    max_age = ""
    if "maximumAge" in row["protocolSection"]["eligibilityModule"]:
        max_age = row["protocolSection"]["eligibilityModule"]["maximumAge"].replace("Years", "").replace("Year", "").strip()

        if "Month" in max_age:
            max_age = max_age.replace("Months", "").replace("Month", "").strip()

            try:
                max_age = float(max_age)/12
            except ValueError:
                max_age = 0
        elif "Days" in max_age:
            max_age = max_age.replace("Days", "").replace("Day", "").strip()

            try:
                max_age = float(max_age)/365
            except ValueError:
                max_age = 0
        else:
            try:
                max_age = float(max_age)
            except ValueError:
                max_age = 0

    try:
        max_age = float(max_age)
    except ValueError:
        max_age = 0


    criteria = row["protocolSection"]["eligibilityModule"]["eligibilityCriteria"]
    healthy_volunteers = ""
    if "healthyVolunteers" in row["protocolSection"]["eligibilityModule"]:
        healthy_volunteers = row["protocolSection"]["eligibilityModule"]["healthyVolunteers"]

    sampling_method = ""
    if "samplingMethod" in row["protocolSection"]["eligibilityModule"]:
        sampling_method = row["protocolSection"]["eligibilityModule"]["samplingMethod"]
    gender = row["protocolSection"]["eligibilityModule"]["sex"]

    

    snomed_exists = False
    for c in conditions:
        if c.endswith("."):
            c = c[:-1]
        
        snomed_id = None
        if c not in condition_snomed:
            snomed_id = sc.getConceptIdByTerm(c)
            condition_snomed[c] = snomed_id

        snomed_id = condition_snomed[c]

        if snomed_id is not None:
            
            snomed_exists = True
            node_condition[snomed_id] = c
            edge_trial_condition.add((NCT_mumber, snomed_id))

    interventions = []
    if snomed_exists == True:
        if "armsInterventionsModule" in row["protocolSection"] and "interventions" in row["protocolSection"]["armsInterventionsModule"]:
            interventions = "|".join([x["name"] for x in row["protocolSection"]["armsInterventionsModule"]["interventions"]])

            

        study_type = row["protocolSection"]["designModule"]["studyType"]
        
       
        study_designs = json.dumps(row["protocolSection"]["designModule"]["designInfo"])

        institutions = []

        if "leadSponsor" in row["protocolSection"]["sponsorCollaboratorsModule"]:
            name = row["protocolSection"]["sponsorCollaboratorsModule"]["leadSponsor"]["name"]
            type = row["protocolSection"]["sponsorCollaboratorsModule"]["leadSponsor"]["class"]
            node_institution.add((name, type))
            edge_trial_institution.add((NCT_mumber, name))
        
        if "collaborators" in row["protocolSection"]["sponsorCollaboratorsModule"]:
            for c in row["protocolSection"]["sponsorCollaboratorsModule"]["collaborators"]:
                name = c["name"]
                type = c["class"]
                node_institution.add((name, type))
                edge_trial_institution.add((NCT_mumber, name))

        description = row["protocolSection"]["descriptionModule"]["briefSummary"]
        
        locations = ""
        if "locations" in row["protocolSection"]["contactsLocationsModule"]:
            locations = "|".join(set([f'{x.get("country")}' for x in row["protocolSection"]["contactsLocationsModule"]["locations"]]))

        phases = "NA"
        if "phases" in row["protocolSection"]["designModule"]:
            phases = "|".join(row["protocolSection"]["designModule"]["phases"])
        enrollment = row["protocolSection"]["designModule"]["enrollmentInfo"]["count"]

        url = f"https://ClinicalTrials.gov/show/{NCT_mumber}"

        outcome_measures = ""

        if "primaryOutcomes" in row["protocolSection"]["outcomesModule"]:
            outcome_measures += "|".join([f'{x["measure"]}; {x["timeFrame"]}' for x in row["protocolSection"]["outcomesModule"]["primaryOutcomes"]])

        if "secondaryOutcomes" in row["protocolSection"]["outcomesModule"]:
            outcome_measures += "|"
            outcome_measures += "|".join(set([f'{x["measure"]}; {x["timeFrame"]}' for x in row["protocolSection"]["outcomesModule"]["primaryOutcomes"]]))


        node_trial[NCT_mumber] = {"title": title, "status": status, "study_results": study_results, "study_type": study_type, 
                                "start_date": start_date, "min_age": min_age, "max_age": max_age, "gender": gender, "criteria": criteria,
                                "healthy_volunteers": healthy_volunteers, "sampling_method": sampling_method,
                                "locations": locations, "phases": phases, "enrollment": enrollment, "description": description,
                                "url": url, "outcome_measures": outcome_measures}


    


In [51]:
len(node_trial)

73

In [52]:
output_folder = "for_neo4j"

In [53]:
with open(f'{output_folder}/node_trial.tsv', 'w') as outfile:
    out_str = "\t".join(["name", 'title', 
                             'status', 'study_results', 'study_type',
                             'start_date', 'min_age', 'max_age', 'gender', "criteria",
                                "healthy_volunteers", "sampling_method",'locations', 
                             'phases', 'enrollment', 'description', 'url', 'outcome_measures'])
    outfile.write(f"{out_str}\n")
    for n in node_trial:
        node = node_trial[n]
        out_str = "\t".join([n, node['title'], 
                             node['status'], str(node['study_results']), node['study_type'],
                             node['start_date'], str(node['min_age']), str(node['max_age']), node['gender'],  node['criteria'].replace("\n", " "),
                             str(node['healthy_volunteers']), node['sampling_method'], node['locations'], node['phases'],
                             str(node['enrollment']), node['description'].replace("\n", " "), node['url'], node['outcome_measures']])
        outfile.write(f"{out_str}\n")

In [54]:
with open(f'{output_folder}/node_institution.tsv', 'w') as outfile:
    out_str = "\t".join(["name", 'type'])
    outfile.write(f"{out_str}\n")
    for n in node_institution:
        out_str = "\t".join([n[0], n[1]])
        outfile.write(f"{out_str}\n")

In [55]:
node_condition

{'1162768007': 'Leukaemia',
 '91861009': 'Acute Myeloid Leukaemia',
 '51092000': 'Chronic Lymphocytic Leukaemia',
 '92818009': 'Chronic Myeloid Leukaemia',
 '128822004': 'Acute Lymphoblastic Leukemia',
 '1162928000': 'Acute Myelogenous Leukemia',
 '93143009': 'Leukemia',
 '52168008': 'Monocytic Leukemia',
 '721308005': 'Acute Leukemia of Ambiguous Lineage',
 '1163439000': 'Myeloid Leukemia',
 '77430005': 'Adult T-Cell Leukemia/Lymphoma',
 '128923008': 'Prolymphocytic Leukemia',
 '91855006': 'Acute Leukemia',
 '278453007': 'Acute Biphenotypic Leukemia',
 '1264458000': 'Acute Undifferentiated Leukemia',
 '425688002': 'ph+ Acute Lymphoblastic Leukemia',
 '28950004': 'Acute Promyelocytic Leukemia',
 '92812005': 'Chronic Leukemia',
 '118600007': 'Lymphoma',
 '188725004': 'Lymphoid Leukemia',
 '12291000132102': 'Refractory Acute Myeloid Leukemia',
 '450913003': 'Mixed Phenotype Acute Leukemia',
 '127225006': 'Chronic Myelomonocytic Leukemia'}

In [56]:
with open(f'{output_folder}/node_condition.tsv', 'w') as outfile:
    out_str = "\t".join(['SNOMEDCT', "name", 'UMLS'])
    outfile.write(f"{out_str}\n")
    for n in node_condition:
        out_str = "\t".join([n, node_condition[n], ""])
        outfile.write(f"{out_str}\n")

In [57]:
with open(f'{output_folder}/edge_trial_condition.tsv', 'w') as outfile:
    out_str = "\t".join(["from", 'to'])
    outfile.write(f"{out_str}\n")
    for n in edge_trial_condition:
        out_str = "\t".join([n[0], n[1]])
        outfile.write(f"{out_str}\n")

In [58]:
with open(f'{output_folder}/edge_trial_institution.tsv', 'w') as outfile:
    out_str = "\t".join(["from", 'to'])
    outfile.write(f"{out_str}\n")
    for n in edge_trial_institution:
        out_str = "\t".join([n[0], n[1]])
        outfile.write(f"{out_str}\n")

Read the condition file

1. add the snomed ontology
2. add UMLS

In [59]:
import yaml

with open("config.yaml", "r") as stream:
    try:
        PARAM = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

UMLS_API_KEY = PARAM["UMLS_API_KEY"]

In [60]:
conditions = pd.read_csv("for_neo4j/node_condition.tsv", sep="\t")

conditions.head()

Unnamed: 0,SNOMEDCT,name,UMLS
0,1162768007,Leukaemia,
1,91861009,Acute Myeloid Leukaemia,
2,51092000,Chronic Lymphocytic Leukaemia,
3,92818009,Chronic Myeloid Leukaemia,
4,128822004,Acute Lymphoblastic Leukemia,


In [61]:
conditions.head()

Unnamed: 0,SNOMEDCT,name,UMLS
0,1162768007,Leukaemia,
1,91861009,Acute Myeloid Leukaemia,
2,51092000,Chronic Lymphocytic Leukaemia,
3,92818009,Chronic Myeloid Leukaemia,
4,128822004,Acute Lymphoblastic Leukemia,


In [62]:
node_category = set()
node_site = set()
node_morphology = set()

edge_condition_category = set()
edge_category_category = set()
edge_condition_site = set()
edge_site_site = set()
edge_condition_morphology = set()

done = set()

son_father = {}

for i, row in conditions.iterrows():
    id = row["SNOMEDCT"]
    name = row["name"]

    relationships = sc.getOnotologyById(str(id))
    print (str(id), relationships)

    for r in relationships:
        source = r[0]
        target = r[1]
        relation = r[2]
        value = relationships[r]
        #print (r)
        if relation == "Is a (attribute)":
            #print (value)
            if value["target_fsn"].endswith("(body structure)"):
                edge_site_site.add((source, target))
                node_site.add((target, value["target_fsn"], value["target_pt"]))
            else:
                if str(source) == str(id):
                    edge_condition_category.add((source, target))
                else:
                    edge_category_category.add((source, target))
            node_category.add((target, value["target_fsn"], value["target_pt"]))
            

        elif relation == "Finding site (attribute)":
            edge_condition_site.add((source, target))
            node_site.add((target, value["target_fsn"], value["target_pt"]))

        elif relation == "Associated morphology (attribute)":
            node_morphology.add((target, value["target_fsn"], value["target_pt"]))
            edge_condition_morphology.add((source, target))


    umls_concepts = umls.get_UMLS(name, UMLS_API_KEY)
    conditions.loc[i, "UMLS"] = ",".join(umls_concepts)

conditions.to_csv("for_neo4j/node_condition.tsv", sep="\t", index=False)

https://browser.ihtsdotools.org/snowstorm/snomed-ct/browser/MAIN/concepts/1162768007


414644002
414644002
1162768007 {('1162768007', '414644002', 'Is a (attribute)'): {'fsn': 'Is a (attribute)', 'pt': 'Is a', 'conceptId': '414644002', 'target_fsn': 'Malignant hematopoietic neoplasm (morphologic abnormality)', 'target_pt': 'Malignant hematopoietic neoplasm'}}


  conditions.loc[i, "UMLS"] = ",".join(umls_concepts)


https://browser.ihtsdotools.org/snowstorm/snomed-ct/browser/MAIN/concepts/91861009
91855006
188732008
14016003
14016003
91861009 {('91861009', '91855006', 'Is a (attribute)'): {'fsn': 'Is a (attribute)', 'pt': 'Is a', 'conceptId': '91855006', 'target_fsn': 'Acute leukemia, disease (disorder)', 'target_pt': 'Acute leukemia'}, ('91861009', '188732008', 'Is a (attribute)'): {'fsn': 'Is a (attribute)', 'pt': 'Is a', 'conceptId': '188732008', 'target_fsn': 'Myeloid leukemia (disorder)', 'target_pt': 'Myeloid leukemia'}, ('91861009', '14016003', 'Finding site (attribute)'): {'fsn': 'Finding site (attribute)', 'pt': 'Finding site', 'conceptId': '14016003', 'target_fsn': 'Bone marrow structure (body structure)', 'target_pt': 'Bone marrow structure'}, ('91861009', '1162928000', 'Associated morphology (attribute)'): {'fsn': 'Associated morphology (attribute)', 'pt': 'Associated morphology', 'conceptId': '1162928000', 'target_fsn': 'Acute myeloid leukemia (morphologic abnormality)', 'target_pt': 

In [63]:
with open(f'{output_folder}/node_category.tsv', 'w') as outfile:
    out_str = "\t".join(["SNOMEDCT", 'fsn', 'name'])
    outfile.write(f"{out_str}\n")
    for n in node_category:
        out_str = "\t".join([n[0], n[1], n[2]])
        outfile.write(f"{out_str}\n")

In [64]:
with open(f'{output_folder}/node_site.tsv', 'w') as outfile:
    out_str = "\t".join(["SNOMEDCT", 'fsn', 'name'])
    outfile.write(f"{out_str}\n")
    for n in node_site:
        out_str = "\t".join([n[0], n[1], n[2]])
        outfile.write(f"{out_str}\n")

In [65]:
with open(f'{output_folder}/node_morphology.tsv', 'w') as outfile:
    out_str = "\t".join(["SNOMEDCT", 'fsn', 'name'])
    outfile.write(f"{out_str}\n")
    for n in node_morphology:
        out_str = "\t".join([n[0], n[1], n[2]])
        outfile.write(f"{out_str}\n")

In [66]:
with open(f'{output_folder}/edge_condition_category.tsv', 'w') as outfile:
    out_str = "\t".join(["from", 'to'])
    outfile.write(f"{out_str}\n")
    for n in edge_condition_category:
        out_str = "\t".join([str(n[0]), str(n[1])])
        outfile.write(f"{out_str}\n")

In [67]:
with open(f'{output_folder}/edge_category_category.tsv', 'w') as outfile:
    out_str = "\t".join(["from", 'to'])
    outfile.write(f"{out_str}\n")
    for n in edge_category_category:
        out_str = "\t".join([str(n[0]), str(n[1])])
        outfile.write(f"{out_str}\n")

In [68]:
with open(f'{output_folder}/edge_condition_site.tsv', 'w') as outfile:
    out_str = "\t".join(["from", 'to'])
    outfile.write(f"{out_str}\n")
    for n in edge_condition_site:
        out_str = "\t".join([str(n[0]), str(n[1])])
        outfile.write(f"{out_str}\n")

In [69]:
with open(f'{output_folder}/edge_site_site.tsv', 'w') as outfile:
    out_str = "\t".join(["from", 'to'])
    outfile.write(f"{out_str}\n")
    for n in edge_site_site:
        out_str = "\t".join([str(n[0]), str(n[1])])
        outfile.write(f"{out_str}\n")

In [70]:
with open(f'{output_folder}/edge_condition_morphology.tsv', 'w') as outfile:
    out_str = "\t".join(["from", 'to'])
    outfile.write(f"{out_str}\n")
    for n in edge_condition_morphology:
        out_str = "\t".join([str(n[0]), str(n[1])])
        outfile.write(f"{out_str}\n")