Install the dependencies. There are only a few:
- Neo4j Python driver
- Requests, a library for making API calls

In [145]:
!pip install neo4j
!pip install requests



Do the imports etc.

In [146]:
from neo4j import GraphDatabase
import requests
import yaml
import json
import urllib.request

Some general purpose methods

In [147]:
# Methods
def clear(tx):
    """Run the batched delete that empties the graph.

    Every node is detach-deleted in batches of 1000 through
    ``apoc.periodic.iterate``, except nodes carrying the
    ``_Neodash_Dashboard`` label.

    Args:
        tx: object exposing ``run(query)``; the notebook passes this function
            to ``session.write_transaction``.
    """
    wipe_statement = (
        "CALL apoc.periodic.iterate("
        "'MATCH (n) WHERE NOT n:`_Neodash_Dashboard` RETURN n', "
        "'DETACH DELETE n', "
        "{batchSize:1000})"
    )
    tx.run(wipe_statement)

def file_load(load_files):
  """Import a set of CSV files with a single ``apoc.import.csv`` call.

  Args:
    load_files: iterable of dicts. Every dict has a "filename" key, which is
      appended to the module-level ``load_file_dir``. A dict with a "label"
      key is passed as a node file with that label; any other dict must have
      a "type" key and is passed as a relationship file of that type.

  Prints the generated Cypher and then every record the procedure returns.

  Raises:
    KeyError: if an entry lacks "filename", or has neither "label" nor "type".
  """
  nodes = []
  relationships = []
  for file_item in load_files:
    filename = "%s%s" % (load_file_dir, file_item["filename"])
    if "label" in file_item:
      nodes.append("{ fileName: '%s', labels: ['%s'] }" % (filename, file_item["label"]))
    else:
      relationships.append("{ fileName: '%s', type: '%s' }" % (filename, file_item["type"]))
  query = """CALL apoc.import.csv( [%s], [%s], {stringIds: false})""" % (", ".join(nodes), ", ".join(relationships))
  print(query)
  with driver.session() as session:
    for record in session.run(query):
      print(record)
  # The shared driver is deliberately NOT closed here: every later cell reuses
  # it, and closing it would shut down the connection pool they rely on.

def load_cl(ct_type, cl):
  """Load the items of one code list from its CSV file and attach them to it.

  The CSV location is ``load_file_dir`` + ``cdisc_ct_<ct_type>_nodes_<cl>.csv``.
  Each CSV row becomes a new SKOS_CONCEPT node linked by SKOS_NARROWER from
  the SKOS_CONCEPT whose identifier equals ``cl``. Returned records are printed.

  Args:
    ct_type: terminology type used in the file name.
    cl: code list identifier, used both in the file name and in the match.
  """
  source_csv = "%scdisc_ct_%s_nodes_%s.csv" % (load_file_dir, ct_type, cl)
  statement_template = (
    "MATCH (p:SKOS_CONCEPT) where p.identifier = '%s' \n"
    "      WITH p\n"
    "      LOAD CSV WITH HEADERS FROM '%s' AS row\n"
    "      CREATE (p)-[:SKOS_NARROWER]->(c:SKOS_CONCEPT {id: toInteger(row.id), alt_label: row.alt_label, definition: row.definition, identifier: row.identifier, notation: row.notation, pref_label: row.pref_label, uri: row.uri})"
  )
  statement = statement_template % (cl, source_csv)
  with driver.session() as db_session:
    for row in db_session.run(statement):
      print(row)

def dump_cl(cl):
  """Print every item of one code list together with its scheme version.

  Matches SKOS_CONCEPT_SCHEME -> SKOS_CONCEPT (identifier == ``cl``) ->
  SKOS_CONCEPT and prints one line per distinct row in the form
  ``version: [cl_identifier, cl_notation], [item_identifier, item_notation]``.

  Args:
    cl: identifier of the code list concept to dump.
  """
  # The identifier is sent as a query parameter rather than spliced into the
  # Cypher text, so quoting characters in ``cl`` cannot break the statement.
  query = """MATCH (cs:SKOS_CONCEPT_SCHEME)-[]->(c1:SKOS_CONCEPT)-[]->(c2:SKOS_CONCEPT)
    WHERE c1.identifier = $cl
    RETURN DISTINCT cs.version as version, c1.identifier as cl_identifier, c1.notation as cl_sub,
      c2.identifier as cli_identifier, c2.notation as cli_sub"""
  with driver.session() as session:
    for record in session.run(query, cl=cl):
      print("%s: [%s, %s], [%s, %s]" % (record["version"], record["cl_identifier"], record["cl_sub"], record["cli_identifier"], record["cli_sub"]))
  # The shared driver stays open; other cells and helpers keep using it.

def set_version(version, update, previous_version=""):
  """Create a VERSION node and optionally chain it to an earlier one.

  Args:
    version: value stored in the node's ``version`` property.
    update: free-text description stored in the node's ``updates`` property.
    previous_version: when not empty, the new node gets a PREVIOUS_VERSION
      relationship to the VERSION node whose ``version`` equals this value.
  """
  # All values are sent as query parameters, so free text containing quotes
  # (e.g. "It's ...") cannot break or alter the Cypher statement.
  query = """CREATE (v:VERSION)
    SET v.version = $version, v.updates = $update"""
  params = {"version": version, "update": update}
  if previous_version != "":
    query += """
    WITH v
    MATCH (p:VERSION {version: $previous_version})
    MERGE (v)-[:PREVIOUS_VERSION]->(p)"""
    params["previous_version"] = previous_version
  with driver.session() as session:
    # consume() drains the result so that any server-side error is raised here.
    session.run(query, **params).consume()
  # The shared driver stays open; this helper is called several times in a row
  # and later cells keep using the same driver.

The first thing we need to do is connect to the database. To get set up:
- Connect to the DB
- Clear the DB
- Insert some version management nodes so we can keep track of the changes


In [148]:
# Load file directory
load_file_dir = "https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/"

# Connect to the Neo4j DB. You need a line like one of these examples ...
# For Neo4j Sandbox use: driver = GraphDatabase.driver("neo4j://n.n.n.n:7687", auth=("username", "password"))
# For Neo4j Aura use:    driver = GraphDatabase.driver("neo4j+s://url_for_aura", auth=("username", "password"))


# Empty the database; clear() leaves nodes labelled _Neodash_Dashboard in place.
with driver.session() as session:
    session.write_transaction(clear)
# The driver is intentionally not closed here: set_version() below and every
# later cell reuse it. Close it once, after the last cell that needs it.

# Useful query: MATCH path=(a:ACTIVITY)-[r:HAS_PREVIOUS_ACTIVITY]->(b:ACTIVITY) RETURN b.description as desc ORDER BY LENGTH(path) ASC;
set_version("0.1", "First version, basic SDTM domain ganeration.")
set_version("0.2", "Add more BCs for DM domain.", "0.1")
set_version("0.3", "Set up DM from the study.", "0.2")
set_version("0.4", "Associate BCs with domains.", "0.3")
print("Ready ...")


Ready ...


First create the FHIR data types. Need this before anything else.


In [149]:
# Stage 1: FHIR data types. Node files come first, relationship files after,
# in the dict shapes file_load() expects ("label" for nodes, "type" for rels).
stage_1_files = [
  {"label": node_label, "filename": csv_name}
  for node_label, csv_name in (
    ("FHIR", "stage_1_fhir_nodes.csv"),
    ("WEB_SOURCE", "stage_1_web_source_nodes.csv"),
    ("FHIR_DATA_TYPE", "stage_1_fhir_data_type_nodes.csv"),
    ("FHIR_DATA_TYPE_PROPERTY", "stage_1_fhir_data_type_property_nodes.csv"),
  )
] + [
  {"type": rel_type, "filename": csv_name}
  for rel_type, csv_name in (
    ("HAS_DATA_TYPE_PROPERTY", "stage_1_has_data_type_property_relationships.csv"),
    ("FROM_SOURCE", "stage_1_from_source_relationships.csv"),
    ("HAS_DATA_TYPE", "stage_1_has_data_type_relationships.csv"),
  )
]

file_load(stage_1_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_fhir_nodes.csv', labels: ['FHIR'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_web_source_nodes.csv', labels: ['WEB_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_fhir_data_type_nodes.csv', labels: ['FHIR_DATA_TYPE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_fhir_data_type_property_nodes.csv', labels: ['FHIR_DATA_TYPE_PROPERTY'] }], [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_has_data_type_property_relationships.csv', type: 'HAS_DATA_TYPE_PROPERTY' }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_from_source_relationships.

Now check we have loaded ok. Check the FHIR version, should be 4.0.1

View the dashboard to see the types loaded.

In [150]:
# Read back the version property of the FHIR node(s) as a quick load check.
with driver.session() as session:
  query = """MATCH (n:FHIR) Return n.version as version"""
  for record in session.run(query):
    print("Version: ", record["version"])
# The shared driver is not closed here because the following cells reuse it.

Version:  4.0.1


Now load the canonical model.

In [151]:
# Stage 2: canonical model. Node files first, then relationship files.
stage_2_files = [
  {"label": node_label, "filename": csv_name}
  for node_label, csv_name in (
    ("CANONICAL_MODEL", "stage_2_canonical_model_nodes.csv"),
    ("CANONICAL_NODE", "stage_2_canonical_node_nodes.csv"),
    ("CANONICAL_DATA_TYPE", "stage_2_canonical_data_type_nodes.csv"),
    ("OTHER_SOURCE", "stage_2_other_source_nodes.csv"),
  )
] + [
  {"type": rel_type, "filename": csv_name}
  for rel_type, csv_name in (
    ("CONSISTS_OF", "stage_2_consists_of_relationships.csv"),
    ("FROM_SOURCE", "stage_2_from_source_relationships.csv"),
    ("HAS_SUB_MODEL", "stage_2_has_sub_model_relationships.csv"),
    ("HAS_DATA_TYPE", "stage_2_has_data_type_relationships.csv"),
  )
]

file_load(stage_2_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_canonical_model_nodes.csv', labels: ['CANONICAL_MODEL'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_canonical_node_nodes.csv', labels: ['CANONICAL_NODE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_canonical_data_type_nodes.csv', labels: ['CANONICAL_DATA_TYPE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_other_source_nodes.csv', labels: ['OTHER_SOURCE'] }], [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_consists_of_relationships.csv', type: 'CONSISTS_OF' }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_from_source_relationships.csv'

Check a few relationships exist in the canonical model. Should get three results.

* THERAPEUTIC INTERVENTION
* OBSERVATION
* ADVERSE EVENT



In [152]:
# List the two-step paths that end at the CANONICAL_NODE named LOCATION.
with driver.session() as session:
  query = """MATCH (r)-[]->(t)-[]->(n:CANONICAL_NODE) WHERE n.name="LOCATION" RETURN r.name as root_name, t.name as name"""
  for record in session.run(query):
    print("%s -> %s -> Location: " % (record["root_name"], record["name"]))
# The shared driver is not closed here because the following cells reuse it.

CANONICAL MODEL -> THERAPEUTIC INTERVENTION -> Location: 
CANONICAL MODEL -> OBSERVATION -> Location: 
CANONICAL MODEL -> ADVERSE EVENT -> Location: 


Now link the canonical nodes and the data types using the name as the key. For each "leaf" Canonical node we have the meaningful nodes (LOCATION, METHOD etc) and a child node defining the type of data that can be collected. This is keyed by the data type name so we can match them up. Note that there may be more than one data type for each meaningful node.

In [153]:
def link_data_types(parent_node_label, data_type_label):
  """Link data type nodes to the FHIR data type that has the same name.

  For every ``data_type_label`` node that has no outgoing IS_A relationship
  yet and is the target of a relationship from a ``parent_node_label`` node,
  an IS_A relationship is created to the FHIR_DATA_TYPE node with the same
  ``name``. One line is printed per created link.

  Args:
    parent_node_label: label of the nodes that own the data type nodes.
    data_type_label: label of the data type nodes to link.
  """
  # Labels cannot be passed as Cypher parameters, hence the string formatting.
  query = """MATCH (dt:%s) WHERE NOT (dt)-[:IS_A]->()
    WITH dt
    MATCH (pn:%s)-[]->(dt)
    WITH pn, dt
    MATCH (n:FHIR_DATA_TYPE) WHERE dt.name = n.name
    CREATE (dt)-[:IS_A]->(n)
    RETURN pn.uri as uri, pn.name as parent_name, n.name as fdt_name, dt.name as dt_name;
  """ % (data_type_label, parent_node_label)
  with driver.session() as session:
    for record in session.run(query):
      print("[%s, %s] -> [FHIR, %s]" % (record['parent_name'], record["dt_name"], record["fdt_name"]))
  # The shared driver stays open: this helper is called again later and is
  # typically followed straight away by duplicate_data_type_nodes().

# Link the canonical data type nodes to their FHIR counterparts.
link_data_types(parent_node_label="CANONICAL_NODE", data_type_label="CANONICAL_DATA_TYPE")

[PORTION, coding] -> [FHIR, coding]
[DIRECTIONALITY, coding] -> [FHIR, coding]
[LATERALITY, coding] -> [FHIR, coding]
[TEST, coding] -> [FHIR, coding]
[RESULT, coding] -> [FHIR, coding]
[RESULT, quantity] -> [FHIR, quantity]
[DATE TIME, date_time] -> [FHIR, date_time]


Now create the data type nodes for the canonical model. For each canonical node that references a data type, copy the properties of that data type to the canonical node. Give each canonical leaf a unique id (a URI). Also change the label of the nodes so they are Canonical Data Type nodes rather than FHIR ones.

In the future these nodes should also have a C code reference providing a definition for the data item.


In [154]:
def duplicate_data_type_nodes(data_type_label, data_type_property_label):
  """Clone the FHIR data type properties onto each linked data type node.

  Step 1: for every ``data_type_label`` node that has no
  HAS_DATA_TYPE_PROPERTY relationship yet, the HAS_DATA_TYPE_PROPERTY
  subgraph of the FHIR_DATA_TYPE it IS_A is cloned with
  ``apoc.refactor.cloneSubgraph``, using the data type node as the stand-in
  for the FHIR data type. The ids of the cloned nodes are collected.

  Step 2: each cloned node gets ``uri = <parent uri>/<name>``, loses the
  FHIR_DATA_TYPE_PROPERTY label and gains ``data_type_property_label``.
  One line is printed per updated node.

  Args:
    data_type_label: label of the data type nodes to populate.
    data_type_property_label: label given to the cloned property nodes.
  """
  # Labels cannot be passed as Cypher parameters, hence the string formatting.
  clone_query = """MATCH (n:%s) WHERE NOT (n)-[:HAS_DATA_TYPE_PROPERTY]->()
    WITH n
    MATCH (n)-[:IS_A]->(dt:FHIR_DATA_TYPE)
    WITH n, dt
    CALL apoc.path.subgraphAll(dt, {relationshipFilter:'HAS_DATA_TYPE_PROPERTY>'})
    YIELD nodes, relationships
    CALL apoc.refactor.cloneSubgraph(
      nodes,
      [rel in relationships WHERE type(rel) = 'HAS_DATA_TYPE_PROPERTY'],
      { standinNodes:[[dt, n]] })
    YIELD input, output, error
    RETURN output;
  """ % (data_type_label)
  relabel_query = """UNWIND $uri_data AS d
    MATCH (p)-[]->(n) WHERE ID(n)=d.id
    SET n.uri = p.uri + '/' + n.name
    REMOVE n:FHIR_DATA_TYPE_PROPERTY
    SET n:%s RETURN n.uri as uri, n.name as name""" % (data_type_property_label)

  with driver.session() as session:
    # Collect the internal ids of the cloned nodes; they key the second query.
    uri_data = [{"id": record["output"].id} for record in session.run(clone_query)]
    for record in session.run(relabel_query, uri_data=uri_data):
      print("Node duplicated: %s, %s" % (record["uri"], record["name"]))
  # The shared driver stays open; later cells call this helper again.

# Copy the FHIR data type properties onto the canonical data type nodes.
duplicate_data_type_nodes(
  data_type_label="CANONICAL_DATA_TYPE",
  data_type_property_label="CANONICAL_DATA_TYPE_PROPERTY",
)


Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/portion/coding/user_selected, user_selected
Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/portion/coding/display, display
Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/portion/coding/code, code
Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/portion/coding/version, version
Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/portion/coding/system, system
Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/directionality/coding/user_selected, user_selected
Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/directionality/coding/display, display
Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/directionality/coding/code, code
Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/directionality/coding/version, version
Node duplicated: http://id.d4k.dk/dataset/canonical/common/loc

Now load some terminology. We load just the SDTM terms (it is all we need for the moment). Only the code list definitions are loaded, not every item within every code list, because that would be too big a load in one go.

In [155]:
# Stage 3: SDTM controlled terminology. Node files first, then relationships.
stage_3_files = [
  {"label": node_label, "filename": csv_name}
  for node_label, csv_name in (
    ("API_SOURCE", "stage_3_api_source_nodes.csv"),
    ("SKOS_CONCEPT", "stage_3_skos_concept_nodes.csv"),
    ("SKOS_CONCEPT_SCHEME", "stage_3_skos_concept_scheme_nodes.csv"),
  )
] + [
  {"type": rel_type, "filename": csv_name}
  for rel_type, csv_name in (
    ("FROM_SOURCE", "stage_3_from_source_relationships.csv"),
    ("SKOS_HAS_TOP_CONCEPT", "stage_3_skos_has_top_concept_relationships.csv"),
  )
]

file_load(stage_3_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_3_api_source_nodes.csv', labels: ['API_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_3_skos_concept_nodes.csv', labels: ['SKOS_CONCEPT'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_3_skos_concept_scheme_nodes.csv', labels: ['SKOS_CONCEPT_SCHEME'] }], [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_3_from_source_relationships.csv', type: 'FROM_SOURCE' }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_3_skos_has_top_concept_relationships.csv', type: 'SKOS_HAS_TOP_CONCEPT' }], {stringIds: false})
<Record file='progress.csv' source='file' format='csv' nodes=986 relationships=985 properties=8868 time=1057 row

Do a quick test on the CT. Check AGEU and its links

In [156]:
# Look up the AGEU code list and the version of the node that points at it.
with driver.session() as session:
  query = """MATCH (cs)-[]->(c1:SKOS_CONCEPT) WHERE c1.notation = 'AGEU' RETURN DISTINCT cs.version as version, c1.identifier as cl_identifier"""
  for record in session.run(query):
    print ("%s: %s" % (record["version"], record["cl_identifier"]))
# The shared driver is not closed here because the following cells reuse it.

2021-12-17: C66781


Stage 4 to 9 are the CT files for the other areas, ADaM, Protocol, CDASH etc. Not loaded at the moment. Needed to split due to size and limited RAM on the Neo4j server.

Now load BC Templates

In [157]:
# Stage 10: BC templates. Node files first, then relationship files.
stage_10_files = [
  {"label": node_label, "filename": csv_name}
  for node_label, csv_name in (
    ("OTHER_SOURCE", "stage_10_other_source_nodes.csv"),
    ("BC_DATA_TYPE", "stage_10_bc_data_type_nodes.csv"),
    ("BC_ITEM", "stage_10_bc_item_nodes.csv"),
    ("BC_TEMPLATE", "stage_10_bc_template_nodes.csv"),
  )
] + [
  {"type": rel_type, "filename": csv_name}
  for rel_type, csv_name in (
    ("FROM_SOURCE", "stage_10_from_source_relationships.csv"),
    ("HAS_DATA_TYPE", "stage_10_has_data_type_relationships.csv"),
    ("HAS_IDENTIFIER", "stage_10_has_identifier_relationships.csv"),
    ("HAS_ITEM", "stage_10_has_item_relationships.csv"),
  )
]

file_load(stage_10_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_other_source_nodes.csv', labels: ['OTHER_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_bc_data_type_nodes.csv', labels: ['BC_DATA_TYPE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_bc_item_nodes.csv', labels: ['BC_ITEM'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_bc_template_nodes.csv', labels: ['BC_TEMPLATE'] }], [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_from_source_relationships.csv', type: 'FROM_SOURCE' }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_has_data_type_relationships.csv', type: 'HAS_DATA_TYPE' }, {

Now add in the data types nodes for the templates. Copy from the data types to the templates 

In [158]:
# Link the BC data type nodes to FHIR, then clone the FHIR properties onto them.
link_data_types(parent_node_label="BC_ITEM", data_type_label="BC_DATA_TYPE")
duplicate_data_type_nodes(
  data_type_label="BC_DATA_TYPE",
  data_type_property_label="BC_DATA_TYPE_PROPERTY",
)

[Test, coding] -> [FHIR, coding]
[Position, coding] -> [FHIR, coding]
[Site of Administration, coding] -> [FHIR, coding]
[Laterality, coding] -> [FHIR, coding]
[Method, coding] -> [FHIR, coding]
[Result, coding] -> [FHIR, coding]
[Test, coding] -> [FHIR, coding]
[Position, coding] -> [FHIR, coding]
[Site of Administration, coding] -> [FHIR, coding]
[Laterality, coding] -> [FHIR, coding]
[Directionality, coding] -> [FHIR, coding]
[Method, coding] -> [FHIR, coding]
[Result, coding] -> [FHIR, coding]
[Specimen, coding] -> [FHIR, coding]
[Result, quantity] -> [FHIR, quantity]
[Result, quantity] -> [FHIR, quantity]
[Date Time, date_time] -> [FHIR, date_time]
[Date Time, date_time] -> [FHIR, date_time]
Node duplicated: http://id.d4k.dk/dataset/bc_template/base_observation/test/coding/user_selected, user_selected
Node duplicated: http://id.d4k.dk/dataset/bc_template/base_observation/test/coding/display, display
Node duplicated: http://id.d4k.dk/dataset/bc_template/base_observation/test/coding

Now link the BCs to the canonical model. 

In [159]:
# For every BC_ITEM with a non-empty `canonical` property, find the
# CANONICAL_NODE of that name and connect each BC data type property to the
# canonical data type property with the same data type name and property name.
with driver.session() as session:
  # MERGE (not CREATE) so that re-running the cell does not add duplicate
  # IS_CANONICAL_REF relationships; this matches the MERGE used by the other
  # linking cells in this notebook.
  query = """MATCH (n:BC_ITEM) WHERE n.canonical <> ""
    WITH n
    MATCH (m:CANONICAL_NODE {name: n.canonical})
    WITH n, m
    MATCH (n)-[]->(bdt:BC_DATA_TYPE)-[]->(bdtp:BC_DATA_TYPE_PROPERTY)
    WITH bdt, bdtp, n, m
    MATCH (m)-[]->(cdt:CANONICAL_DATA_TYPE {name: bdt.name})-[]->(cdtp:CANONICAL_DATA_TYPE_PROPERTY {name: bdtp.name})
    MERGE (bdtp)-[:IS_CANONICAL_REF]->(cdtp)
    RETURN n.name as b_name, bdt.name as bdt_name, bdtp.name as bdtp_name, m.name as c_name, cdt.name as cdt_name, cdtp.name as cdtp_name
  """
  for record in session.run(query):
    print ("[%s, %s, %s] -> [%s, %s, %s]" % (record["b_name"], record["bdt_name"], record["bdtp_name"], record["c_name"], record["cdt_name"], record["cdtp_name"]))
# The shared driver is not closed here because the following cells reuse it.

[Test, coding, display] -> [TEST, coding, display]
[Test, coding, code] -> [TEST, coding, code]
[Test, coding, version] -> [TEST, coding, version]
[Test, coding, user_selected] -> [TEST, coding, user_selected]
[Test, coding, system] -> [TEST, coding, system]
[Laterality, coding, code] -> [LATERALITY, coding, code]
[Laterality, coding, version] -> [LATERALITY, coding, version]
[Laterality, coding, system] -> [LATERALITY, coding, system]
[Laterality, coding, user_selected] -> [LATERALITY, coding, user_selected]
[Laterality, coding, display] -> [LATERALITY, coding, display]
[Date Time, date_time, value] -> [DATE TIME, date_time, value]
[Result, quantity, comparator] -> [RESULT, quantity, comparator]
[Result, quantity, code] -> [RESULT, quantity, code]
[Result, quantity, unit] -> [RESULT, quantity, unit]
[Result, quantity, value] -> [RESULT, quantity, value]
[Result, quantity, system] -> [RESULT, quantity, system]
[Result, coding, display] -> [RESULT, coding, display]
[Result, coding, vers

Now load the BC instances

In [160]:
# Stage 11: BC instances. Node files first, then relationship files.
stage_11_files = [
  {"label": node_label, "filename": csv_name}
  for node_label, csv_name in (
    ("OTHER_SOURCE", "stage_11_other_source_nodes.csv"),
    ("BC_VALUE_SET", "stage_11_bc_value_set_nodes.csv"),
    ("BC_DATA_TYPE", "stage_11_bc_data_type_nodes.csv"),
    ("BC_ITEM", "stage_11_bc_item_nodes.csv"),
    ("BC_INSTANCE", "stage_11_bc_instance_nodes.csv"),
  )
] + [
  {"type": rel_type, "filename": csv_name}
  for rel_type, csv_name in (
    ("FROM_SOURCE", "stage_11_from_source_relationships.csv"),
    ("HAS_RESPONSE", "stage_11_has_response_relationships.csv"),
    ("HAS_DATA_TYPE", "stage_11_has_data_type_relationships.csv"),
    ("HAS_IDENTIFIER", "stage_11_has_identifier_relationships.csv"),
    ("HAS_ITEM", "stage_11_has_item_relationships.csv"),
  )
]

file_load(stage_11_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_other_source_nodes.csv', labels: ['OTHER_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_bc_value_set_nodes.csv', labels: ['BC_VALUE_SET'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_bc_data_type_nodes.csv', labels: ['BC_DATA_TYPE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_bc_item_nodes.csv', labels: ['BC_ITEM'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_bc_instance_nodes.csv', labels: ['BC_INSTANCE'] }], [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_from_source_relationships.csv', type: 'FROM_SOURCE' }, { fileN

Now link the BC instances to the data types

In [161]:
# Same two steps as for the templates, now picking up the data type nodes
# that were added with the BC instances.
link_data_types(parent_node_label="BC_ITEM", data_type_label="BC_DATA_TYPE")
duplicate_data_type_nodes(
  data_type_label="BC_DATA_TYPE",
  data_type_property_label="BC_DATA_TYPE_PROPERTY",
)

[Test, coding] -> [FHIR, coding]
[Test, coding] -> [FHIR, coding]
[Result, coding] -> [FHIR, coding]
[Test, coding] -> [FHIR, coding]
[Result, coding] -> [FHIR, coding]
[Test, coding] -> [FHIR, coding]
[Test, coding] -> [FHIR, coding]
[Result, coding] -> [FHIR, coding]
[Result, quantity] -> [FHIR, quantity]
[Result, quantity] -> [FHIR, quantity]
[Date Time, date_time] -> [FHIR, date_time]
[Date Time, date_time] -> [FHIR, date_time]
[Date Time, date_time] -> [FHIR, date_time]
[Date Time, date_time] -> [FHIR, date_time]
[Date Time, date_time] -> [FHIR, date_time]
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/test/coding/user_selected, user_selected
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/test/coding/display, display
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/test/coding/code, code
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/test/coding/version, version
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/test/

Now link each BC Instance to the BC Template that it is based on.

In [162]:
# Connect every BC_INSTANCE to the BC_TEMPLATE whose name equals the
# instance's `based_on` property.
with driver.session() as session:
  # MERGE (not CREATE) so that re-running the cell does not add duplicate
  # BASED_ON relationships.
  query = """MATCH (n:BC_INSTANCE)
    WITH n
    MATCH (m:BC_TEMPLATE {name: n.based_on})
    WITH n, m
    MERGE (n)-[:BASED_ON]->(m)
    RETURN n.name as i_name, m.name as t_name
  """
  for record in session.run(query):
    print ("%s -> %s" % (record["i_name"], record["t_name"]))
# The shared driver is not closed here because the following cells reuse it.

Weight -> Base Observation
Race -> Base Observation
Sex -> Base Observation
Age -> Base Observation
Ethnicity -> Base Observation


Now load the SDTM IG so we can link the SDTM to the canonical model.

In [163]:
# Stage 13: SDTM model and implementation guide. Node files first, then
# relationship files.
stage_13_files = [
  {"label": node_label, "filename": csv_name}
  for node_label, csv_name in (
    ("API_SOURCE", "stage_13_api_source_nodes.csv"),
    ("OTHER_SOURCE", "stage_13_other_source_nodes.csv"),
    ("SDTM_MODEL", "stage_13_sdtm_model_nodes.csv"),
    ("SDTM_CLASS", "stage_13_sdtm_class_nodes.csv"),
    ("SDTM_MODEL_VARIABLE", "stage_13_sdtm_model_variable_nodes.csv"),
    ("SDTM_IG", "stage_13_sdtm_ig_nodes.csv"),
    ("SDTM_DATASET", "stage_13_sdtm_dataset_nodes.csv"),
    ("SDTM_VARIABLE", "stage_13_sdtm_variable_nodes.csv"),
    ("CANONICAL_REF", "stage_13_canonical_ref_nodes.csv"),
  )
] + [
  {"type": rel_type, "filename": csv_name}
  for rel_type, csv_name in (
    ("FROM_SOURCE", "stage_13_from_source_relationships.csv"),
    ("HAS_CANONICAL_REF", "stage_13_has_canonical_ref_relationships.csv"),
    ("HAS_CLASS", "stage_13_has_class_relationships.csv"),
    ("HAS_DATASET", "stage_13_has_dataset_relationships.csv"),
    ("HAS_VARIABLE", "stage_13_has_variable_relationships.csv"),
  )
]

file_load(stage_13_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_api_source_nodes.csv', labels: ['API_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_other_source_nodes.csv', labels: ['OTHER_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_sdtm_model_nodes.csv', labels: ['SDTM_MODEL'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_sdtm_class_nodes.csv', labels: ['SDTM_CLASS'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_sdtm_model_variable_nodes.csv', labels: ['SDTM_MODEL_VARIABLE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_sdtm_ig_nodes.csv', labels: ['SDTM_IG'] }, { fileN

Check that SDTM Model and IG loaded

In [164]:
# Spot-check the SDTM load. Both checks share one session; the original closed
# the shared driver between them and again afterwards, although the second
# check and every later cell still need it.
with driver.session() as session:
  # Up to 20 dataset/variable pairs from the implementation guide.
  query = """MATCH (ds:SDTM_DATASET)-[]->(v:SDTM_VARIABLE) RETURN DISTINCT ds.name as ds_name, v.name as v_name LIMIT 20"""
  for record in session.run(query):
    print ("[%s, %s] loaded" % (record["ds_name"], record["v_name"]))

  # Every model/class/variable triple from the SDTM model.
  query = """MATCH (m:SDTM_MODEL)-[]->(c:SDTM_CLASS)-[]->(v:SDTM_MODEL_VARIABLE) RETURN DISTINCT m.name as m_name, c.name as c_name, v.name as v_name"""
  for record in session.run(query):
    print ("[%s, %s, %s] loaded" % (record["m_name"], record["c_name"], record["v_name"]))

[AG, AGENDTC] loaded
[AG, AGOCCUR] loaded
[AG, AGDECOD] loaded
[AG, AGSTRTPT] loaded
[AG, STUDYID] loaded
[AG, AGSTTPT] loaded
[AG, AGCLAS] loaded
[AG, AGREASND] loaded
[AG, AGDOSTXT] loaded
[AG, VISITNUM] loaded
[AG, AGPRESP] loaded
[AG, DOMAIN] loaded
[AG, VISIT] loaded
[AG, TAETORD] loaded
[AG, AGDOSU] loaded
[AG, AGSTRF] loaded
[AG, AGSCAT] loaded
[AG, AGLNKID] loaded
[AG, AGENRF] loaded
[AG, AGDOSFRM] loaded
[SDTM Model, demographics, AGEU] loaded
[SDTM Model, demographics, AGE] loaded
[SDTM Model, demographics, SEX] loaded
[SDTM Model, demographics, ETHNIC] loaded
[SDTM Model, demographics, RACE] loaded
[SDTM Model, findings, --ORRES] loaded
[SDTM Model, findings, --DTC] loaded
[SDTM Model, findings, --ORRESU] loaded


Now link up the SDTM Model and IG variables. We need to 
- Do the vertical domains, the "--" cases
- Things like DM

In [165]:
# Link implementation guide variables to the model variables they are based
# on. Both statements return the same columns, so one loop prints both.
with driver.session() as session:
  for query in (
    # Vertical: model variables whose name starts with "--" match IG variables
    # that have the same name after the first two characters.
    """MATCH (smv:SDTM_MODEL_VARIABLE)
      MATCH (igv:SDTM_VARIABLE) WHERE substring(smv.name, 2) = substring(igv.name, 2) AND left(smv.name, 2) = "--"
      MERGE (igv)-[:BASED_ON]->(smv)
      RETURN igv.name as ig_name, smv.name as m_name
    """,
    # DM type, no "--" prefix involved: names must match exactly.
    """MATCH (smv:SDTM_MODEL_VARIABLE)
      MATCH (igv:SDTM_VARIABLE) WHERE smv.name = igv.name
      MERGE (igv)-[:BASED_ON]->(smv)
      RETURN igv.name as ig_name, smv.name as m_name
    """,
  ):
    for record in session.run(query):
      print ("%s -> %s linked" % (record["ig_name"], record["m_name"]))
# The shared driver is not closed here because the following cells reuse it.

BSORRES -> --ORRES linked
CPORRES -> --ORRES linked
CVORRES -> --ORRES linked
DAORRES -> --ORRES linked
DDORRES -> --ORRES linked
EGORRES -> --ORRES linked
FTORRES -> --ORRES linked
GFORRES -> --ORRES linked
IEORRES -> --ORRES linked
ISORRES -> --ORRES linked
LBORRES -> --ORRES linked
MBORRES -> --ORRES linked
MIORRES -> --ORRES linked
BSORRESU -> --ORRESU linked
CPORRESU -> --ORRESU linked
CVORRESU -> --ORRESU linked
DAORRESU -> --ORRESU linked
EGORRESU -> --ORRESU linked
FTORRESU -> --ORRESU linked
GFORRESU -> --ORRESU linked
ISORRESU -> --ORRESU linked
LBORRESU -> --ORRESU linked
MBORRESU -> --ORRESU linked
MIORRESU -> --ORRESU linked
MLDTC -> --DTC linked
BEDTC -> --DTC linked
CEDTC -> --DTC linked
DSDTC -> --DTC linked
HODTC -> --DTC linked
MHDTC -> --DTC linked
BSDTC -> --DTC linked
CPDTC -> --DTC linked
CVDTC -> --DTC linked
DADTC -> --DTC linked
DDDTC -> --DTC linked
EGDTC -> --DTC linked
FTDTC -> --DTC linked
GFDTC -> --DTC linked
IEDTC -> --DTC linked
ISDTC -> --DTC linked
LB

And link the model variables to the canonical model.

In [166]:
# Resolve each CANONICAL_REF attached to an SDTM model variable to the
# canonical data type property it names (node / data type / property) and
# link the model variable to that property.
with driver.session() as session:
  # ":CANONICAL_NODE" and ":CANONICAL_DATA_TYPE" need the leading colon to be
  # labels. Without it Cypher reads them as variable names, and the pattern
  # matches nodes of any label that happen to have the right name.
  query = """MATCH (smv:SDTM_MODEL_VARIABLE)-[]->(cr:CANONICAL_REF)
    MATCH (:CANONICAL_NODE {name: cr.node})-[]->(:CANONICAL_DATA_TYPE {name: cr.data_type})-[]->(cdtp:CANONICAL_DATA_TYPE_PROPERTY {name: cr.property})
    MERGE (smv)-[:IS_CANONICAL_REF]->(cdtp)
    RETURN smv.name as v_name, cr.node as c_node, cr.data_type as c_dt, cr.property as c_property, cdtp.uri as uri
  """
  for record in session.run(query):
    print ("%s -> [%s, %s, %s] -> %s linked" % (record["v_name"], record["c_node"], record["c_dt"], record["c_property"], record["uri"]))
# The shared driver is not closed here because the following cells reuse it.

--DTC -> [DATE TIME, date_time, value] -> http://id.d4k.dk/dataset/canonical/common/date_time/date_time/value linked
SEX -> [RESULT, coding, code] -> http://id.d4k.dk/dataset/canonical/observation/observation_result/result/coding/code linked
RACE -> [RESULT, coding, code] -> http://id.d4k.dk/dataset/canonical/observation/observation_result/result/coding/code linked
ETHNIC -> [RESULT, coding, code] -> http://id.d4k.dk/dataset/canonical/observation/observation_result/result/coding/code linked
--ORRES -> [RESULT, coding, code] -> http://id.d4k.dk/dataset/canonical/observation/observation_result/result/coding/code linked
AGEU -> [RESULT, quantity, unit] -> http://id.d4k.dk/dataset/canonical/observation/observation_result/result/quantity/unit linked
--ORRESU -> [RESULT, quantity, unit] -> http://id.d4k.dk/dataset/canonical/observation/observation_result/result/quantity/unit linked
AGE -> [RESULT, quantity, value] -> http://id.d4k.dk/dataset/canonical/observation/observation_result/result/qu

So we have now linked the SDTM IG and the SDTM model to the canonical model. The last thing we want to do is link the domains to the BCs targeted at each domain. Do this from a configuration file. This would normally be part of the metadata of the standards.

In [167]:
file_dir = "https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/cdisc_sdtm/"

# Fetch the domain -> BC mapping. safe_load is used because the file is plain
# data fetched over the network; a bare yaml.load(stream) needs an explicit
# Loader on current PyYAML and can construct arbitrary objects on older ones.
# The `with` block closes the HTTP response once it has been parsed.
with urllib.request.urlopen("%scdisc_sdtm_bc.yaml" % (file_dir)) as response:
  mapping = yaml.safe_load(response)
print(mapping)

with driver.session() as session:
  # Link each configured domain to its BC instances. The domain name and the
  # BC list are sent as query parameters rather than formatted into the text.
  query = """MATCH (d:SDTM_DATASET {name: $domain_name})
    WITH d
    UNWIND $bcs AS bc
    MATCH (bci:BC_INSTANCE {name: bc.name})
    MERGE (d)-[:CAN_USE_BC]->(bci)
    RETURN d.name as domain, bci.name as bc_name"""
  for domain in mapping["root"]["domains"]:
    for record in session.run(query, domain_name=domain["name"], bcs=domain["bcs"]):
      print ("Set %s -> %s" % (record["domain"], record["bc_name"]))

  # Read the links back as a check.
  query = """MATCH (d:SDTM_DATASET)-[:CAN_USE_BC]->(bc:BC_INSTANCE)
    RETURN d.name as domain, bc.name as bc"""
  for record in session.run(query):
    print ("Checked %s -> %s" % (record["domain"], record["bc"]))
# The shared driver is not closed here (the original closed it on every loop
# iteration) because the following cells reuse it.


{'root': {'domains': [{'name': 'VS', 'bcs': [{'name': 'Weight'}]}, {'name': 'DM', 'bcs': [{'name': 'Age'}, {'name': 'Sex'}, {'name': 'Race'}, {'name': 'Ethnicity'}]}]}}
Set VS -> Weight
Set DM -> Age
Set DM -> Sex
Set DM -> Race
Set DM -> Ethnicity
Checked VS -> Weight
Checked DM -> Race
Checked DM -> Sex
Checked DM -> Age
Checked DM -> Ethnicity


Now load the DDF study example data. It provides an example of an "industry standard" design.

In [168]:
# Stage 12 (DDF study example) node files as (label, CSV file name) pairs.
stage_12_node_files = [
  ("ENDPOINT", "stage_12_endpoint_nodes.csv"),
  ("STUDY_DATA", "stage_12_study_data_nodes.csv"),
  ("PROCEDURE", "stage_12_procedure_nodes.csv"),
  ("ACTIVITY", "stage_12_activity_nodes.csv"),
  ("WORKFLOW_ITEM", "stage_12_workflow_item_nodes.csv"),
  ("VISIT", "stage_12_visit_nodes.csv"),
  ("STUDY_CELL", "stage_12_study_cell_nodes.csv"),
  ("RULE", "stage_12_rule_nodes.csv"),
  ("STUDY_ELEMENT", "stage_12_study_element_nodes.csv"),
  ("EPOCH", "stage_12_study_epoch_nodes.csv"),
  ("STUDY_ARM", "stage_12_study_arm_nodes.csv"),
  ("OBJECTIVE", "stage_12_objective_nodes.csv"),
  ("CODE", "stage_12_code_nodes.csv"),
  ("INVESTIGATIONAL_INTERVENTIONS", "stage_12_investigational_interventions_nodes.csv"),
  ("POPULATION", "stage_12_population_nodes.csv"),
  ("STUDY_DESIGN", "stage_12_study_design_nodes.csv"),
  ("INDICATION", "stage_12_indication_nodes.csv"),
  ("STUDY_PROTOCOL", "stage_12_study_protocol_nodes.csv"),
  ("STUDY_PHASE", "stage_12_study_phase_nodes.csv"),
  ("STUDY_TYPE", "stage_12_study_type_nodes.csv"),
  ("STUDY_IDENTIFIER", "stage_12_study_identifier_nodes.csv"),
  ("STUDY", "stage_12_study_nodes.csv"),
]

# Stage 12 relationship files as (relationship type, CSV file name) pairs.
stage_12_relationship_files = [
  ("HAS_CODED", "stage_12_has_coded_relationships.csv"),
  ("HAS_ENDPOINT", "stage_12_has_endpoint_relationships.csv"),
  ("HAS_STUDY_DATA", "stage_12_has_study_data_relationships.csv"),
  ("HAS_PROCEDURE", "stage_12_has_procedure_relationships.csv"),
  ("HAS_PREVIOUS_ACTIVITY", "stage_12_has_previous_activity_relationships.csv"),
  ("USED_IN_VISIT", "stage_12_used_in_visit_relationships.csv"),
  ("HAS_ACTIVITY", "stage_12_has_activity_relationships.csv"),
  ("HAS_VISIT", "stage_12_has_visit_relationships.csv"),
  ("HAS_END_RULE", "stage_12_has_end_rule_relationships.csv"),
  ("HAS_START_RULE", "stage_12_has_start_rule_relationships.csv"),
  ("HAS_ELEMENT", "stage_12_has_element_relationships.csv"),
  ("HAS_EPOCH", "stage_12_has_epoch_relationships.csv"),
  ("HAS_ARM", "stage_12_has_arm_relationships.csv"),
  ("HAS_CELL", "stage_12_has_cell_relationships.csv"),
  ("HAS_OBJECTIVE", "stage_12_has_objective_relationships.csv"),
  ("HAS_INDICATION", "stage_12_has_indication_relationships.csv"),
  ("HAS_INVESTIGATIONAL_INTERVENTION", "stage_12_has_investigational_intervention_relationships.csv"),
  ("HAS_POPULATION", "stage_12_has_population_relationships.csv"),
  ("HAS_STUDY_DESIGN", "stage_12_has_study_design_relationships.csv"),
  ("HAS_PROTOCOL", "stage_12_has_protocol_relationships.csv"),
  ("HAS_STUDY_PHASE", "stage_12_has_study_phase_relationships.csv"),
  ("HAS_STUDY_TYPE", "stage_12_has_study_type_relationships.csv"),
  ("HAS_IDENTIFIER", "stage_12_has_identifier_relationships.csv"),
]

# file_load() tells node entries from relationship entries by the presence of
# the "label" key, so the two kinds are rebuilt as differently keyed dicts.
stage_12_files = (
  [{ "label": label, "filename": filename } for label, filename in stage_12_node_files]
  + [{ "type": rel_type, "filename": filename } for rel_type, filename in stage_12_relationship_files]
)

file_load(stage_12_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_endpoint_nodes.csv', labels: ['ENDPOINT'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_study_data_nodes.csv', labels: ['STUDY_DATA'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_procedure_nodes.csv', labels: ['PROCEDURE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_activity_nodes.csv', labels: ['ACTIVITY'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_workflow_item_nodes.csv', labels: ['WORKFLOW_ITEM'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_visit_nodes.csv', labels: ['VISIT'] }, { fileName: 'https://raw.githubuserco

Add in a study URI for the study

In [169]:
def set_study_uri(brief_title, uri):
  """Set the `uri` property on the STUDY node(s) linked to a protocol.

  Args:
    brief_title: value of STUDY_PROTOCOL.brief_title used to find the study.
    uri: the URI to store on every matching STUDY node.

  Both values are sent as query parameters rather than being formatted into
  the Cypher text, so quotes in either value cannot break the statement.
  The shared module-level driver is left open because later calls reuse it.
  """
  with driver.session() as session:
    query = """MATCH (p:STUDY_PROTOCOL {brief_title: $brief_title})<-[]-(s:STUDY)
      SET s.uri = $uri
      """
    session.run(query, brief_title=brief_title, uri=uri)

def get_study_uri(brief_title):
  """Return the `uri` property of the STUDY linked to a protocol.

  Args:
    brief_title: value of STUDY_PROTOCOL.brief_title used to find the study.

  Returns:
    The `uri` of the last matching STUDY row, or None when no study matches
    (the original raised UnboundLocalError in that case).

  The title is sent as a query parameter instead of being formatted into the
  Cypher text. The shared module-level driver is left open because later
  calls reuse it.
  """
  the_result = None
  with driver.session() as session:
    query = """MATCH (p:STUDY_PROTOCOL {brief_title: $brief_title})<-[]-(s:STUDY)
      RETURN s.uri as uri;
      """
    result = session.run(query, brief_title=brief_title)
    for record in result:
      the_result = record["uri"]
  return the_result

# Store the URI on the DDR study, then read it back as a check.
set_study_uri("DDR", "http://id.d4k.dk/dataset/study/ddr")
ddr_study_uri = get_study_uri("DDR")
print("URI for study DDR is %s" % (ddr_study_uri))

URI for study DDR is http://id.d4k.dk/dataset/study/ddr


Duplicate the BC. The study activity's Study Data node uses WGHT as its code; it really needs to be updated to "Weight" so that we can match all BCs by name. Just work around this for the moment.

In [None]:
def duplicate_bc(study_data_name, bc_name, study_uri):
  """Create a study-level copy of a library BC instance.

  For every ACTIVITY linked to a STUDY_DATA node named `study_data_name`, a
  new STUDY_BC_INSTANCE node is created with the properties of the
  BC_INSTANCE named `bc_name`, given the URI "<study_uri>/bc/<bc_name>",
  linked to the activity with HAS_BC and to the library BC with BASED_ON.
  The subgraph reachable from the library BC is then cloned with APOC, using
  the new node as the stand-in for the library BC.

  Args:
    study_data_name: name of the STUDY_DATA node identifying the activity.
    bc_name: name of the library BC_INSTANCE to copy.
    study_uri: base URI of the study.

  Returns:
    The URI assigned to the new STUDY_BC_INSTANCE.

  All three values are sent as query parameters rather than being formatted
  into the Cypher text. The shared module-level driver is left open because
  this function is called several times in a row.
  """
  bc_uri = "%s/bc/%s" % (study_uri, bc_name)
  with driver.session() as session:
    query = """MATCH (a:ACTIVITY)-[]->(sd:STUDY_DATA) WHERE sd.name = $study_data_name
      WITH a, sd
      MATCH (bc:BC_INSTANCE) WHERE bc.name = $bc_name
      WITH a, sd, bc
      CREATE (a)-[:HAS_BC]->(new:STUDY_BC_INSTANCE)
      SET new = bc
      SET new.uri = $bc_uri
      CREATE (new)-[:BASED_ON]->(bc)
      WITH bc, new
      CALL apoc.path.subgraphAll(bc, {relationshipFilter:'HAS_ITEM>|HAS_DATA_TYPE>|HAS_RESPONSE>'})
      YIELD nodes, relationships
      CALL apoc.refactor.cloneSubgraph(
        nodes,
        [rel in relationships WHERE type(rel) = 'HAS_DATA_TYPE_PROPERTY'],
        { standinNodes:[[bc, new]] })
      YIELD input, output, error
      RETURN output;
      """
    result = session.run(query, study_data_name=study_data_name, bc_name=bc_name, bc_uri=bc_uri)
    # One row is returned per cloned node, so the row count is the node count.
    print("%i nodes duplicated for BC %s" % (len(result.values()), bc_name))
  return bc_uri

def update_bc_uris(bc_uri):
  """Assign URIs to the nodes below a study BC instance.

  The nodes reachable from the STUDY_BC_INSTANCE with URI `bc_uri` are
  collected first; each of them except the BC itself is then given the URI
  "<parent uri>/<lower-cased name with spaces replaced by underscores>".

  Args:
    bc_uri: URI of the STUDY_BC_INSTANCE whose subgraph is updated.

  The URI is sent as a query parameter rather than being formatted into the
  Cypher text. The shared module-level driver is left open because this
  function is called several times in a row.
  """
  with driver.session() as session:
    ids = []
    query = """MATCH (bc:STUDY_BC_INSTANCE) WHERE bc.uri = $bc_uri
      WITH bc
      CALL apoc.path.subgraphAll(bc, {relationshipFilter:'HAS_ITEM>|HAS_DATA_TYPE>|HAS_DATA_TYPE_PROPERTY>|HAS_RESPONSE>'})
      YIELD nodes, relationships
      RETURN nodes;
    """
    result = session.run(query, bc_uri=bc_uri)
    for record in result:
      for node in record["nodes"]:
        # The BC instance itself already carries its final URI.
        if node['uri'] == bc_uri:
          continue
        ids.append({"id": node.id})

    # Each child URI is derived from its parent's URI, so the nodes are
    # processed in the order in which the subgraph query returned them.
    # NOTE(review): this relies on parents being listed before their
    # children -- confirm against apoc.path.subgraphAll.
    query = """UNWIND $id_data AS d
      MATCH (p)-[]->(n) WHERE ID(n)=d.id
      SET n.uri = p.uri + '/' + replace(toLower(n.name), " ", "_")
      RETURN n.uri as uri"""
    result = session.run(query, id_data=ids)
    for record in result:
      print("Node uri updated: %s" % (record["uri"]))

def add_bc_canonical_refs(bc_uri):
  """Link a study BC's data type properties to the canonical model.

  Starting at the STUDY_BC_INSTANCE with URI `bc_uri`, two BASED_ON links are
  followed and, from the node reached, every item / data type / property path
  that ends in a CANONICAL_DATA_TYPE_PROPERTY is collected. The property with
  the same item, data type and property names under the study BC is then
  linked to that canonical node with IS_CANONICAL_REF.

  Args:
    bc_uri: URI of the STUDY_BC_INSTANCE to process.

  Differences from the original query:
    * the URI is a query parameter instead of being formatted into the text;
    * `(:BC_ITEM ...)` and `(:BC_DATA_TYPE ...)` now carry the label colon --
      without it the names were plain variables and no label was checked;
    * the intermediate node is called `inst` instead of the keyword `in`;
    * the link is MERGEd, so running the cell again does not add duplicates.
  The shared module-level driver is left open because this function is called
  several times in a row.
  """
  with driver.session() as session:
    query = """MATCH (n:STUDY_BC_INSTANCE {uri: $bc_uri})-[:BASED_ON]->(inst)-[:BASED_ON]->(t)-[]->(i:BC_ITEM)
      -[]->(dt:BC_DATA_TYPE)
      -[]->(p:BC_DATA_TYPE_PROPERTY)
      -[]->(cr:CANONICAL_DATA_TYPE_PROPERTY)
      WITH n, i.name as i_name, dt.name as dt_name, p.name as p_name, cr
      MATCH (n:STUDY_BC_INSTANCE {uri: $bc_uri})-[]->(:BC_ITEM {name: i_name})
      -[]->(:BC_DATA_TYPE {name: dt_name})
      -[]->(sp:BC_DATA_TYPE_PROPERTY {name: p_name})
      WITH sp, cr
      MERGE (sp)-[:IS_CANONICAL_REF]->(cr)
      RETURN sp.name as name, cr.uri as uri
    """
    result = session.run(query, bc_uri=bc_uri)
    for record in result:
      print("Canonical reference set: %s -> %s" % (record["name"], record["uri"]))

# The study URI is the base for every study BC URI.
study_uri = get_study_uri("DDR")

# One study-level copy per BC; the first argument is the study data name used
# to find the activity, the second the name of the library BC.
weight_bc_uri = duplicate_bc("WGHT", "Weight", study_uri)
age_bc_uri = duplicate_bc("AGE", "Age", study_uri)
race_bc_uri = duplicate_bc("RACE", "Race", study_uri)
sex_bc_uri = duplicate_bc("SEX", "Sex", study_uri)
ethnicity_bc_uri = duplicate_bc("ETHNICITY", "Ethnicity", study_uri)

# Add the data type nodes for all BCs.
link_data_types("BC_ITEM", "BC_DATA_TYPE")
duplicate_data_type_nodes("BC_DATA_TYPE", "BC_DATA_TYPE_PROPERTY")

# Give every node below each study BC its URI, then link the study BCs to the
# canonical nodes. Both passes visit the BCs in the same order.
study_bc_uris = [age_bc_uri, race_bc_uri, sex_bc_uri, ethnicity_bc_uri, weight_bc_uri]
for study_bc_uri in study_bc_uris:
  update_bc_uris(study_bc_uri)
for study_bc_uri in study_bc_uris:
  add_bc_canonical_refs(study_bc_uri)

13 nodes duplicated for BC Weight
16 nodes duplicated for BC Age
18 nodes duplicated for BC Race
15 nodes duplicated for BC Sex
15 nodes duplicated for BC Ethnicity
[Test, coding] -> [FHIR, coding]
[Test, coding] -> [FHIR, coding]
[Test, coding] -> [FHIR, coding]
[Result, coding] -> [FHIR, coding]
[Result, coding] -> [FHIR, coding]
[Test, coding] -> [FHIR, coding]
[Result, coding] -> [FHIR, coding]
[Test, coding] -> [FHIR, coding]
[Result, quantity] -> [FHIR, quantity]
[Result, quantity] -> [FHIR, quantity]
[Date Time, date_time] -> [FHIR, date_time]
[Date Time, date_time] -> [FHIR, date_time]
[Date Time, date_time] -> [FHIR, date_time]
[Date Time, date_time] -> [FHIR, date_time]
[Date Time, date_time] -> [FHIR, date_time]
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/result/quantity/unit, unit
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/result/quantity/code, code
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/result/quantity/comparator, co

And the final configuration needed is to link Domains to the Study BC Instances. The BC Instances are already linked, so we are inheriting the links

In [None]:
# For every domain that can use a library BC and has no USE_BC link yet, link
# the domain to the study copies of that BC with an enabled USE_BC relationship.
use_bc_query = """MATCH (n:SDTM_DATASET)-[:CAN_USE_BC]->(bc:BC_INSTANCE) WHERE NOT (n)-[:USE_BC]->()
    WITH n, bc
    MATCH (sbc:STUDY_BC_INSTANCE)-[:BASED_ON]->(bc)
    MERGE (n)-[:USE_BC {enabled: true}]->(sbc)
    RETURN sbc.name as bc_name, n.name as domain
  """

with driver.session() as session:
  for row in session.run(use_bc_query):
    print ("%s -> %s" % (row["domain"], row["bc_name"]))


Now load any Code Lists used by the BCs. Check loaded OK.

- C66741 - VSTESTCD
- C66770 - VSRESU

In [None]:
# Load the SDTM code lists used by the BCs ...
for code_list in ("C66741", "C66770", "C74457", "C66731", "C66781", "C66790"):
  load_cl("sdtm", code_list)

# ... and print the first three as a check that they loaded.
for code_list in ("C66741", "C66770", "C74457"):
  dump_cl(code_list)



Load the sponsor CT.

In [None]:
# Stage 14 (sponsor CT) node files as (label, CSV file name) pairs.
stage_14_node_files = [
  ("OTHER_SOURCE", "stage_14_other_source_nodes.csv"),
  ("SKOS_CONCEPT", "stage_14_skos_concept_nodes.csv"),
  ("SKOS_CONCEPT_SCHEME", "stage_14_skos_concept_scheme_nodes.csv"),
]

# Stage 14 relationship files as (relationship type, CSV file name) pairs.
stage_14_relationship_files = [
  ("FROM_SOURCE", "stage_14_from_source_relationships.csv"),
  ("SKOS_HAS_TOP_CONCEPT", "stage_14_skos_has_top_concept_relationships.csv"),
  ("SKOS_NARROWER", "stage_14_skos_narrower_relationships.csv"),
]

stage_14_files = (
  [{ "label": label, "filename": filename } for label, filename in stage_14_node_files]
  + [{ "type": rel_type, "filename": filename } for rel_type, filename in stage_14_relationship_files]
)

file_load(stage_14_files)

# Print the sponsor code list as a check.
dump_cl("D000001")

Now link up to the CT for the library BCs and the Study versions

In [None]:
# Replace each placeholder response node (carrying `cl` / `cli` identifiers)
# with a HAS_RESPONSE link to the matching code list item, then delete the
# placeholder.
response_link_query = """MATCH (n:BC_DATA_TYPE)-[:HAS_RESPONSE]->(d) 
    WITH n,d
    MATCH (cl:SKOS_CONCEPT {identifier: d.cl})-[]->(cli:SKOS_CONCEPT {identifier: d.cli})
    MERGE (n)-[:HAS_RESPONSE]->(cli)
    DETACH DELETE d
    RETURN n.name as name, cli.uri as uri
  """

with driver.session() as session:
  for row in session.run(response_link_query):
    print ("%s -> %s" % (row["name"], row["uri"]))
driver.close()

Now add in some data points. A data point will point at the BC and the visit to which it applies (it will point to the Workflow Item). This is the nature of the DDF model. I think we can make it better.

In [None]:
# List which study BC is collected by which activity at which visit.
visit_bc_query = """MATCH (v:VISIT)<-[]-(w:WORKFLOW_ITEM)-[]->(a:ACTIVITY)-[]->(bc:STUDY_BC_INSTANCE) RETURN v.name as visit, w.id as wfi , a.description as activity, bc.name as bc"""

with driver.session() as session:
  for row in session.run(visit_bc_query):
    print ("%s -> %s -> %s -> %s" % (row["visit"], row["wfi"], row["activity"], row["bc"]))
driver.close()

A query just to print out the URIs of the Study BC Instances so we can find them easily to set up the data. In a real system this would be automatic. This is an expansion of the above query.

In [None]:
# Print the URI of every data type property below each study BC instance,
# together with the visit, workflow item and activity it is reached from.
with driver.session() as session:
  # The relationship types need a leading colon. The original wrote
  # -[HAS_ITEM]-> etc., which binds a *variable* called HAS_ITEM and matches a
  # relationship of any type.
  query = """MATCH (v:VISIT)<-[]-(w:WORKFLOW_ITEM)-[]->(a:ACTIVITY)-[]->(bc:STUDY_BC_INSTANCE)-[:HAS_ITEM]->()
    -[:HAS_DATA_TYPE]->()-[:HAS_DATA_TYPE_PROPERTY]->(bdtp:BC_DATA_TYPE_PROPERTY)
    RETURN DISTINCT v.name as visit, w.id as wfi, a.description as activity, bc.name as bc, bdtp.uri as uri"""
  result = session.run(query)
  for record in result:
    print ("%s -> %s -> %s -> %s -> %s" % (record["visit"], record["wfi"], record["activity"], record["bc"], record["uri"]))
# The shared driver is deliberately not closed here: the following cells keep
# using it.

Create some simple data. We need the actual data point plus just enough information to attach it to the study graph. So 

- Create subjects
- Create data points and link them into the study design and to the BC they are attached to (the Study instance, not the definition of the BC that we copied earlier)

In [None]:
# URIs of the study BC data type properties that the data points attach to,
# keyed by BC and then by the kind of value held.
uri_refs = {
  "weight": {
    "value": "http://id.d4k.dk/dataset/study/ddr/bc/Weight/result/quantity/value",
    "unit": "http://id.d4k.dk/dataset/study/ddr/bc/Weight/result/quantity/unit",
    "date_time": "http://id.d4k.dk/dataset/study/ddr/bc/Weight/date_time/date_time/value",
  },
  "age": {
    "value": "http://id.d4k.dk/dataset/study/ddr/bc/Age/result/quantity/value",
    "unit": "http://id.d4k.dk/dataset/study/ddr/bc/Age/result/quantity/unit",
    "date_time": "http://id.d4k.dk/dataset/study/ddr/bc/Age/date_time/date_time/value",
  },
  "race": {
    "value": "http://id.d4k.dk/dataset/study/ddr/bc/Race/result/coding/code",
    "date_time": "http://id.d4k.dk/dataset/study/ddr/bc/Race/date_time/date_time/value",
  },
  "sex": {
    "value": "http://id.d4k.dk/dataset/study/ddr/bc/Sex/result/coding/code",
    "date_time": "http://id.d4k.dk/dataset/study/ddr/bc/Sex/date_time/date_time/value",
  },
  "ethnicity": {
    "value": "http://id.d4k.dk/dataset/study/ddr/bc/Ethnicity/result/coding/code",
    "date_time": "http://id.d4k.dk/dataset/study/ddr/bc/Ethnicity/date_time/date_time/value",
  },
}

subjects = [{ "subject_id": subject_id } for subject_id in ("1234", "1235", "1236", "1237")]

# Screening date/time per subject.
time_1234 = "2022-02-11T09:57:00"
time_1235 = "2022-02-12T10:17:00"
time_1236 = "2022-02-13T13:27:00"
time_1237 = "2022-02-15T11:17:00"


def _data_point(subject_id, visit, data_point, value):
  """Build one subject data record."""
  return { "subject_id": subject_id, "visit": visit, "data_point": data_point, "value": value }


def _screening_rows(subject_id, collected_at, age, race):
  """Build the nine screening-visit records (age, race, sex, ethnicity) for one subject."""
  visit = "SCREENING VISIT"
  return [
    _data_point(subject_id, visit, uri_refs["age"]["value"], age),
    _data_point(subject_id, visit, uri_refs["age"]["unit"], "YEARS"),
    _data_point(subject_id, visit, uri_refs["age"]["date_time"], collected_at),
    _data_point(subject_id, visit, uri_refs["race"]["value"], race),
    _data_point(subject_id, visit, uri_refs["race"]["date_time"], collected_at),
    _data_point(subject_id, visit, uri_refs["sex"]["value"], "M"),
    _data_point(subject_id, visit, uri_refs["sex"]["date_time"], collected_at),
    _data_point(subject_id, visit, uri_refs["ethnicity"]["value"], "NOT HISPANIC OR LATINO"),
    _data_point(subject_id, visit, uri_refs["ethnicity"]["date_time"], collected_at),
  ]


def _weight_rows(subject_id, visit, weight, collected_at):
  """Build the three weight records (value, unit, date/time) for one subject visit."""
  return [
    _data_point(subject_id, visit, uri_refs["weight"]["value"], weight),
    _data_point(subject_id, visit, uri_refs["weight"]["unit"], "kg"),
    _data_point(subject_id, visit, uri_refs["weight"]["date_time"], collected_at),
  ]


# Screening data for all four subjects, followed by the weights of subjects
# 1234 and 1235 at the two treatment visits and the first follow-up.
subject_data = (
  _screening_rows("1234", time_1234, "22", "WHITE")
  + _screening_rows("1235", time_1235, "62", "ASIAN")
  + _screening_rows("1236", time_1236, "37", "ASIAN")
  + _screening_rows("1237", time_1237, "28", "WHITE")
  + _weight_rows("1234", "CYCLE 1, TREATMENT DAY 1", "76", "2022-03-01T13:57:00")
  + _weight_rows("1234", "CYCLE 2, TREATMENT DAY 1", "78", "2022-03-10T13:57:00")
  + _weight_rows("1234", "FU 1", "77", "2022-03-19T13:57:00")
  + _weight_rows("1235", "CYCLE 1, TREATMENT DAY 1", "42", "2022-03-03T13:57:00")
  + _weight_rows("1235", "CYCLE 2, TREATMENT DAY 1", "44", "2022-03-13T13:57:00")
  + _weight_rows("1235", "FU 1", "43", "2022-03-22T13:57:00")
)

print(subject_data)

Create the subjects.

In [None]:
# Enrol every subject in the DDR study, then list the subjects as a check.
with driver.session() as session:
  # The identifier is passed as a query parameter, and MERGE is used so that
  # running this cell again does not create a second copy of each subject.
  query = """MATCH (p:STUDY_PROTOCOL {brief_title:'DDR'})<-[]-(s:STUDY)
      WITH s
      MERGE (s)<-[:ENROLLED_IN]-(ss:STUDY_SUBJECT {identifier: $subject_id})
    """
  for subject in subjects:
    result = session.run(query, subject_id=subject["subject_id"])
  query = """MATCH (s:STUDY_SUBJECT) RETURN s.identifier as subject"""
  result = session.run(query)
  for record in result:
    print ("Subject %s created" % (record["subject"]))
# The shared driver is deliberately not closed here: the following cells keep
# using it.

Now add in the data

In [None]:
# Create one STUDY_DATA_POINT per entry of subject_data, linked to the subject,
# to the workflow item of the visit and to the BC data type property it holds
# a value for. Then list the created data points as a check.
with driver.session() as session:
  # Subject, visit, property URI and value are passed as query parameters
  # rather than being formatted into the Cypher text, so values containing
  # quotes cannot break the statement.
  query = """MATCH (s:STUDY_SUBJECT {identifier: $subject_id})
      WITH s
      MATCH (v:VISIT {name: $visit})<-[]-(wi:WORKFLOW_ITEM)-[:HAS_ACTIVITY]->(a)-[:HAS_BC]->(bc)-[:HAS_ITEM]->
        (i)-[:HAS_DATA_TYPE]->(dt)-[:HAS_DATA_TYPE_PROPERTY]->(d:BC_DATA_TYPE_PROPERTY {uri: $data_point})
      WITH s, wi, d
      CREATE (s)<-[:FOR_SUBJECT]-(sdp:STUDY_DATA_POINT)-[:FOR_WORKFLOW_ITEM]->(wi)
      WITH sdp, d
      CREATE (sdp)-[:FOR_VALUE]->(d)
      SET sdp.value = $value
      SET sdp.uuid = apoc.create.uuid()
    """
  for item in subject_data:
    result = session.run(
      query,
      subject_id=item["subject_id"],
      visit=item["visit"],
      data_point=item["data_point"],
      value=item["value"])
  query = """MATCH (s:STUDY_DATA_POINT) RETURN DISTINCT s.value as value, s.uuid as uuid"""
  result = session.run(query)
  for record in result:
    print ("Value %s, %s created" % (record["value"], record["uuid"]))
# The shared driver is deliberately not closed here: the following cells keep
# using it.

So having linked the very basic data we have all the pieces in place. We can now start getting data out, simple at the moment but ...

Query the VS domain and, though we should not, the DM domain.

In [181]:
def get_domain_data(domain):
  """Print the study data points that populate the variables of one SDTM domain.

  The query starts at the SDTM_DATASET named `domain`, follows its enabled
  USE_BC links to the study BC instances, and joins the domain's variables to
  the study data points through the canonical data type properties they both
  reference. One line is printed per row: domain, variable, test code,
  subject, workflow item id, value and [visit -> epoch].

  Args:
    domain: name of the SDTM dataset, e.g. 'VS' or 'DM'.

  The domain name is sent as a query parameter rather than being formatted
  into the Cypher text. The shared module-level driver is left open because
  this function is called more than once.
  """
  with driver.session() as session:
    query = """MATCH (bc:STUDY_BC_INSTANCE)<-[:USE_BC {enabled: true}]-(sd:SDTM_DATASET {name: $domain})-[]->(sv:SDTM_VARIABLE)-[]->(cv:SDTM_MODEL_VARIABLE)-[:IS_CANONICAL_REF]->(fdt:CANONICAL_DATA_TYPE_PROPERTY)
      <-[:IS_CANONICAL_REF]-(bdt:BC_DATA_TYPE_PROPERTY)<-[:FOR_VALUE]-(sdp:STUDY_DATA_POINT)-[]->(wfi:WORKFLOW_ITEM) WHERE bc.name = cv.bc OR cv.bc = ""
      WITH bc, sd, sv, cv, fdt, bdt, sdp, wfi
      MATCH (wfi)-[:USED_IN_VISIT]->(v:VISIT)<-[]-(e:EPOCH), 
      (sdp)-[:FOR_SUBJECT]->(subj:STUDY_SUBJECT),
      (ct)<-[:HAS_RESPONSE]-()<-[:HAS_DATA_TYPE]-()<-[:HAS_IDENTIFIER]-(bc:STUDY_BC_INSTANCE)-[*]->(bdt)
    RETURN sd.name as domain, sv.name as variable, sdp.value as data, wfi.id as uuid, v.name as visit, e.study_epoch_name as epoch, subj.identifier as subject, ct.notation as test_code
  """
    result = session.run(query, domain=domain)
    for record in result:
      print ("%s, %s, %s, %s, %s, %s, [%s -> %s]" % (record["domain"], record["variable"], record["test_code"], record["subject"], record["uuid"], record["data"], record["visit"], record["epoch"]))

# Print each domain under a small underlined heading, with a blank line
# between consecutive domains.
for position, domain_name in enumerate(("VS", "DM")):
  if position > 0:
    print("")
  print(domain_name)
  print("==")
  print("")
  get_domain_data(domain_name)

VS
==

VS, VSDTC, WEIGHT, 1235, 1558, 2022-03-22T13:57:00, [FU 1 -> FOLLOW-UP]
VS, VSDTC, WEIGHT, 1235, 1557, 2022-03-13T13:57:00, [CYCLE 2, TREATMENT DAY 1 -> TREATMENT]
VS, VSDTC, WEIGHT, 1235, 1556, 2022-03-03T13:57:00, [CYCLE 1, TREATMENT DAY 1 -> TREATMENT]
VS, VSDTC, WEIGHT, 1234, 1558, 2022-03-19T13:57:00, [FU 1 -> FOLLOW-UP]
VS, VSDTC, WEIGHT, 1234, 1557, 2022-03-10T13:57:00, [CYCLE 2, TREATMENT DAY 1 -> TREATMENT]
VS, VSDTC, WEIGHT, 1234, 1556, 2022-03-01T13:57:00, [CYCLE 1, TREATMENT DAY 1 -> TREATMENT]
VS, VSORRES, WEIGHT, 1235, 1558, 43, [FU 1 -> FOLLOW-UP]
VS, VSORRES, WEIGHT, 1235, 1557, 44, [CYCLE 2, TREATMENT DAY 1 -> TREATMENT]
VS, VSORRES, WEIGHT, 1235, 1556, 42, [CYCLE 1, TREATMENT DAY 1 -> TREATMENT]
VS, VSORRES, WEIGHT, 1234, 1558, 77, [FU 1 -> FOLLOW-UP]
VS, VSORRES, WEIGHT, 1234, 1557, 78, [CYCLE 2, TREATMENT DAY 1 -> TREATMENT]
VS, VSORRES, WEIGHT, 1234, 1556, 76, [CYCLE 1, TREATMENT DAY 1 -> TREATMENT]
VS, VSORRESU, WEIGHT, 1235, 1558, kg, [FU 1 -> FOLLOW-UP]
V