Install dependencies

In [115]:
!pip install neo4j
!pip install requests



First thing we need to do is connect to the database. We also define some general purpose methods in this section.

In [116]:
from neo4j import GraphDatabase
import requests
import json
import urllib.request

# Load file directory
load_file_dir = "https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/"

# Connect to the Neo4j DB. You need a line like one of these examples:
# driver = GraphDatabase.driver("neo4j://n.n.n.n:7687", auth=("username", "password"))
# driver = GraphDatabase.driver("neo4j+s://url_for_aura", auth=("username", "password"))


# Methods
def clear(tx):
    tx.run("CALL apoc.periodic.iterate('MATCH (n) RETURN n', 'DETACH DELETE n', {batchSize:1000})")

def file_load(load_files):
  with driver.session() as session:
    nodes = []
    relationships = []
    for file_item in load_files:
      filename = "%s%s" % (load_file_dir, file_item["filename"])
      if "label" in file_item:
        nodes.append("{ fileName: '%s', labels: ['%s'] }" % (filename, file_item["label"]) )
      else:
        relationships.append("{ fileName: '%s', type: '%s' }" % (filename, file_item["type"]) )
    query = """CALL apoc.import.csv( [%s], [%s], {stringIds: false})""" % (", ".join(nodes), ", ".join(relationships))
    print(query)
    result = session.run(query)
    #print(result)
    for record in result:
      print(record)
  driver.close()

def load_cl(ct_type, cl):
  filename = "%scdisc_ct_%s_nodes_%s.csv" % (load_file_dir, ct_type, cl)
  with driver.session() as session:
    query = """MATCH (p:SKOS_CONCEPT) where p.identifier = '%s' 
      WITH p
      LOAD CSV WITH HEADERS FROM '%s' AS row
      CREATE (p)-[:SKOS_NARROWER]->(c:SKOS_CONCEPT {id: toInteger(row.id), alt_label: row.alt_label, definition: row.definition, identifier: row.identifier, notation: row.notation, pref_label: row.pref_label, uri: row.uri})""" % (cl, filename)
    result = session.run(query)
    for record in result:
      print(record)

def dump_cl(cl):
  with driver.session() as session:
    query = """MATCH (cs:SKOS_CONCEPT_SCHEME)-[]->(c1:SKOS_CONCEPT)-[]->(c2:SKOS_CONCEPT) WHERE c1.identifier = '%s' RETURN DISTINCT cs.version as version, c1.identifier as cl_identifier, c1.notation as cl_sub, c2.identifier as cli_identifier, c2.notation as cli_sub""" % (cl) 
    result = session.run(query, uri_data=uri_data)
    for record in result:
      print("%s: [%s, %s], [%s, %s]" % (record["version"], record["cl_identifier"], record["cl_sub"], record["cli_identifier"], record["cli_sub"]))
  driver.close()

with driver.session() as session:
    session.write_transaction(clear)
driver.close()

print("Ready ...")


Ready ...


First create the FHIR data types. Need this before anything else.


In [117]:
stage_1_files = [ 
    { "label": "FHIR", "filename": "stage_1_fhir_nodes.csv" },
    { "label": "WEB_SOURCE", "filename": "stage_1_web_source_nodes.csv" },
    { "label": "FHIR_DATA_TYPE", "filename": "stage_1_fhir_data_type_nodes.csv" },
    { "label": "FHIR_DATA_TYPE_PROPERTY", "filename": "stage_1_fhir_data_type_property_nodes.csv" },
    { "type": "HAS_DATA_TYPE_PROPERTY", "filename": "stage_1_has_data_type_property_relationships.csv" },
    { "type": "FROM_SOURCE", "filename": "stage_1_from_source_relationships.csv" },
    { "type": "HAS_DATA_TYPE", "filename": "stage_1_has_data_type_relationships.csv" }
  ]

file_load(stage_1_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_fhir_nodes.csv', labels: ['FHIR'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_web_source_nodes.csv', labels: ['WEB_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_fhir_data_type_nodes.csv', labels: ['FHIR_DATA_TYPE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_fhir_data_type_property_nodes.csv', labels: ['FHIR_DATA_TYPE_PROPERTY'] }], [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_has_data_type_property_relationships.csv', type: 'HAS_DATA_TYPE_PROPERTY' }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_from_source_relationships.

Now check we have loaded ok. Check the FHIR version, should be 4.0.1

In [118]:
with driver.session() as session:
  query = """MATCH (n:FHIR) Return n.version as version""" 
  result = session.run(query)
  for record in result:
    print("Version: ", record["version"])

driver.close()

Version:  4.0.1


Now load the canonical model.

In [119]:
stage_2_files = [ 
    { "label": "CANONICAL_MODEL", "filename": "stage_2_canonical_model_nodes.csv" },
    { "label": "CANONICAL_NODE", "filename": "stage_2_canonical_node_nodes.csv" },
    { "label": "CANONICAL_DATA_TYPE", "filename": "stage_2_canonical_data_type_nodes.csv" },
    { "label": "OTHER_SOURCE", "filename": "stage_2_other_source_nodes.csv" },
    { "type": "CONSISTS_OF", "filename": "stage_2_consists_of_relationships.csv" },
    { "type": "FROM_SOURCE", "filename": "stage_2_from_source_relationships.csv" },
    { "type": "HAS_SUB_MODEL", "filename": "stage_2_has_sub_model_relationships.csv" },
    { "type": "HAS_DATA_TYPE", "filename": "stage_2_has_data_type_relationships.csv" }
]

file_load(stage_2_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_canonical_model_nodes.csv', labels: ['CANONICAL_MODEL'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_canonical_node_nodes.csv', labels: ['CANONICAL_NODE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_canonical_data_type_nodes.csv', labels: ['CANONICAL_DATA_TYPE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_other_source_nodes.csv', labels: ['OTHER_SOURCE'] }], [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_consists_of_relationships.csv', type: 'CONSISTS_OF' }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_from_source_relationships.csv'

Check a few relationships exist in the canonical model. Should get three results.

* THERAPEUTIC INTERVENTION
* OBSERVATION
* ADVERSE EVENT



In [120]:
with driver.session() as session:
  query = """MATCH (r)-[]->(t)-[]->(n:CANONICAL_NODE) WHERE n.name="LOCATION" RETURN r.name as root_name, t.name as name""" 
  result = session.run(query)
  for record in result:
    print("%s -> %s -> Location: " % (record["root_name"], record["name"]))

driver.close()

CANONICAL MODEL -> ADVERSE EVENT -> Location: 
CANONICAL MODEL -> OBSERVATION -> Location: 
CANONICAL MODEL -> THERAPEUTIC INTERVENTION -> Location: 


Now link the canonical nodes and the data types using the name as the key. For each "leaf" Canonical node we have the meaningful nodes (LOCATION, METHOD etc) and a child node defining the type of data that can be collected. This is keyed by the data type name so we can match them up. Note that there may be more than one data type for each meaningful node.

In [121]:
with driver.session() as session:
  query = """MATCH (n:CANONICAL_NODE)-[]->(m:CANONICAL_DATA_TYPE) RETURN n.name as name, m.uri as uri""" 
  result = session.run(query)
  for record in result:
    query = """MATCH (n:CANONICAL_DATA_TYPE{uri:'%s'})
      WITH n
      MATCH (dt:FHIR_DATA_TYPE {name: n.data_type})
      CREATE (n)-[:IS_A]->(dt)
      RETURN n.data_type as cn_name, dt.name as dt_name;
    """ % (record["uri"])
    inner_result = session.run(query)
    for inner_record in inner_result:
      print("[%s, %s, %s] -> FHIR %s" % (record['name'], record['uri'], inner_record["cn_name"], inner_record["dt_name"]))

driver.close()

[TEST, http://id.d4k.dk/dataset/canonical/n42/coding, coding] -> FHIR coding
[LATERALITY, http://id.d4k.dk/dataset/canonical/n8/coding, coding] -> FHIR coding
[DIRECTIONALITY, http://id.d4k.dk/dataset/canonical/n9/coding, coding] -> FHIR coding
[PORTION, http://id.d4k.dk/dataset/canonical/n18/coding, coding] -> FHIR coding
[RESULT, http://id.d4k.dk/dataset/canonical/n141/coding, coding] -> FHIR coding
[RESULT, http://id.d4k.dk/dataset/canonical/n141/quantity, quantity] -> FHIR quantity
[DATE & TIME, http://id.d4k.dk/dataset/canonical/n2/date_time, date_time] -> FHIR date_time


Now create the data type nodes for the canonical mode. For each canonical node that references a data type copy the properties of that data type to the canonical node. Give each canonical leaf a unique id (a uri). Also change the lable of the node so they are Canonical Data Type nodes rather than FHIR ones.

In the future these nodes should also have a C code reference providing a definition for the data item.


In [122]:
uri_data = []
with driver.session() as session:
  query = """MATCH (n:CANONICAL_DATA_TYPE)-[:IS_A]->(dt:FHIR_DATA_TYPE) RETURN n.uri as canonical_uri, dt.uri as data_type""" 
  result = session.run(query)
  for record in result:
    query = """MATCH (n:CANONICAL_DATA_TYPE{uri:'%s'}), (dt:FHIR_DATA_TYPE{uri:'%s'})
      CALL apoc.path.subgraphAll(dt, {relationshipFilter:'HAS_DATA_TYPE_PROPERTY>'})
      YIELD nodes, relationships
      CALL apoc.refactor.cloneSubgraph(
        nodes,
        [rel in relationships WHERE type(rel) = 'HAS_DATA_TYPE_PROPERTY'],
        { standinNodes:[[dt, n]] })
      YIELD input, output, error
      RETURN output;
    """ % (record["canonical_uri"], record["data_type"])
    inner_result = session.run(query)
    for inner_record in inner_result:
      node = inner_record["output"]
      uri_data.append({"id": node.id, "uri": "%s#%s" % (record["canonical_uri"], node["name"]) })
    print ("Duplicated for: ", record["canonical_uri"])

driver.close()

with driver.session() as session:
  query = """UNWIND $uri_data AS d
    MATCH (n) WHERE ID(n)=d.id
    SET n.uri = d.uri
    REMOVE n:FHIR_DATA_TYPE_PROPERTY
    SET n:CANONICAL_DATA_TYPE_PROPERTY"""
  result = session.run(query, uri_data=uri_data)
driver.close()
print ("URIs set.")



Duplicated for:  http://id.d4k.dk/dataset/canonical/n141/coding
Duplicated for:  http://id.d4k.dk/dataset/canonical/n18/coding
Duplicated for:  http://id.d4k.dk/dataset/canonical/n9/coding
Duplicated for:  http://id.d4k.dk/dataset/canonical/n8/coding
Duplicated for:  http://id.d4k.dk/dataset/canonical/n42/coding
Duplicated for:  http://id.d4k.dk/dataset/canonical/n141/quantity
Duplicated for:  http://id.d4k.dk/dataset/canonical/n2/date_time
URIs set.


Now load some terminology. We load just the SDTM terms (it is all we need for the moment). Just load the Code List definitions not every single item with every code list, just too big a load in one go.

In [123]:
stage_3_files = [ 
    { "label": "API_SOURCE", "filename": "stage_3_api_source_nodes.csv" },
    { "label": "SKOS_CONCEPT", "filename": "stage_3_skos_concept_nodes.csv" },
    { "label": "SKOS_CONCEPT_SCHEME", "filename": "stage_3_skos_concept_scheme_nodes.csv" },
    { "type": "FROM_SOURCE", "filename": "stage_3_from_source_relationships.csv" },
    { "type": "SKOS_HAS_TOP_CONCEPT", "filename": "stage_3_skos_has_top_concept_relationships.csv" },
    { "type": "SKOS_NARROWER", "filename": "stage_3_skos_narrower_relationships.csv" } # Should be empty
  ]

file_load(stage_3_files)


CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_3_api_source_nodes.csv', labels: ['API_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_3_skos_concept_nodes.csv', labels: ['SKOS_CONCEPT'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_3_skos_concept_scheme_nodes.csv', labels: ['SKOS_CONCEPT_SCHEME'] }], [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_3_from_source_relationships.csv', type: 'FROM_SOURCE' }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_3_skos_has_top_concept_relationships.csv', type: 'SKOS_HAS_TOP_CONCEPT' }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_3_skos_narrower_relationsh

Do a quick test on the CT. Check AGEU and its links

In [124]:
with driver.session() as session:
  query = """MATCH (cs)-[]->(c1:SKOS_CONCEPT) WHERE c1.notation = 'AGEU' RETURN DISTINCT cs.version as version, c1.identifier as cl_identifier""" 
  result = session.run(query, uri_data=uri_data)
  for record in result:
    print ("%s: %s" % (record["version"], record["cl_identifier"]))
driver.close()


2021-12-17: C66781


Stage 4 to 9 are the CT files for the other areas, ADaM, Protocol, CDASH etc. Not loaded at the moment. Needed to split due to size and limited RAM on the Neo4j server.

Now load BC Templates

In [125]:
stage_10_files = [ 
    { "label": "OTHER_SOURCE", "filename": "stage_10_other_source_nodes.csv" },
    { "label": "BC_DATA_TYPE", "filename": "stage_10_bc_data_type_nodes.csv" },
    { "label": "BC_ITEM", "filename": "stage_10_bc_item_nodes.csv" },
    { "label": "BC_TEMPLATE", "filename": "stage_10_bc_template_nodes.csv" },
    { "type": "FROM_SOURCE", "filename": "stage_10_from_source_relationships.csv" },
    { "type": "HAS_DATA_TYPE", "filename": "stage_10_has_data_type_relationships.csv" },
    { "type": "HAS_IDENTIFIER", "filename": "stage_10_has_identifier_relationships.csv" },
    { "type": "HAS_ITEM", "filename": "stage_10_has_item_relationships.csv" }
  ]

file_load(stage_10_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_other_source_nodes.csv', labels: ['OTHER_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_bc_data_type_nodes.csv', labels: ['BC_DATA_TYPE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_bc_item_nodes.csv', labels: ['BC_ITEM'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_bc_template_nodes.csv', labels: ['BC_TEMPLATE'] }], [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_from_source_relationships.csv', type: 'FROM_SOURCE' }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_has_data_type_relationships.csv', type: 'HAS_DATA_TYPE' }, {

Now add in the data types nodes for the templates. Copy from the data types to the templates 

In [126]:
with driver.session() as session:
  query = """MATCH (m:BC_DATA_TYPE) RETURN m.uri as uri""" 
  result = session.run(query)
  for record in result:
    print(record["uri"])
    query = """MATCH (n:BC_DATA_TYPE{uri:'%s'})
      WITH n
      MATCH (dt:FHIR_DATA_TYPE{name: n.name})
      CREATE (n)-[:IS_A]->(dt)
      RETURN n.name as bc_name, dt.name as dt_name;
    """ % (record["uri"])
    inner_result = session.run(query)
    for inner_record in inner_result:
      print("%s, %s linked" % (inner_record["bc_name"], inner_record["dt_name"]))

driver.close()

uri_data = []
with driver.session() as session:
  query = """MATCH (n:BC_DATA_TYPE)-[:IS_A]->(dt:FHIR_DATA_TYPE) RETURN n.uri as bc_data_type, dt.uri as data_type""" 
  result = session.run(query)
  for record in result:
    query = """MATCH (n:BC_DATA_TYPE{uri:'%s'}), (dt:FHIR_DATA_TYPE{uri:'%s'})
      CALL apoc.path.subgraphAll(dt, {relationshipFilter:'HAS_DATA_TYPE_PROPERTY>'})
      YIELD nodes, relationships
      CALL apoc.refactor.cloneSubgraph(
        nodes,
        [rel in relationships WHERE type(rel) = 'HAS_DATA_TYPE_PROPERTY'],
        { standinNodes:[[dt, n]] })
      YIELD input, output, error
      RETURN output;
    """ % (record["bc_data_type"], record["data_type"])
    inner_result = session.run(query)
    for inner_record in inner_result:
      node = inner_record["output"]
      #print(node)
      uri_data.append({"id": node.id, "uri": "%s#%s" % (record["bc_data_type"], node["name"]) })
    print ("Duplicated for: ", record["bc_data_type"])

driver.close()

with driver.session() as session:
  query = """UNWIND $uri_data AS d
    MATCH (n) WHERE ID(n)=d.id
    SET n.uri = d.uri;""" 
  result = session.run(query, uri_data=uri_data)
driver.close()
print ("URIs set.")

http://id.d4k.dk/dataset/bc_template/base_observation/test/coding
coding, coding linked
http://id.d4k.dk/dataset/bc_template/base_observation/position/coding
coding, coding linked
http://id.d4k.dk/dataset/bc_template/base_observation/site_of_administration/coding
coding, coding linked
http://id.d4k.dk/dataset/bc_template/base_observation/laterality/coding
coding, coding linked
http://id.d4k.dk/dataset/bc_template/base_observation/method/coding
coding, coding linked
http://id.d4k.dk/dataset/bc_template/base_observation/date_time/datetime
http://id.d4k.dk/dataset/bc_template/base_observation/result/quantity
quantity, quantity linked
http://id.d4k.dk/dataset/bc_template/base_observation/result/coding
coding, coding linked
http://id.d4k.dk/dataset/bc_template/base_laboratory/test/coding
coding, coding linked
http://id.d4k.dk/dataset/bc_template/base_laboratory/position/coding
coding, coding linked
http://id.d4k.dk/dataset/bc_template/base_laboratory/site_of_administration/coding
coding, co

In [127]:
stage_11_files = [ 
    { "label": "OTHER_SOURCE", "filename": "stage_11_other_source_nodes.csv" },
    { "label": "BC_VALUE_SET", "filename": "stage_11_bc_value_set_nodes.csv" },
    { "label": "BC_DATA_PROPERTY", "filename": "stage_11_bc_data_property_nodes.csv" },
    { "label": "BC_DATA_TYPE", "filename": "stage_11_bc_data_type_nodes.csv" },
    { "label": "BC_ITEM", "filename": "stage_11_bc_item_nodes.csv" },
    { "label": "BC_INSTANCE", "filename": "stage_11_bc_instance_nodes.csv" },
    { "type": "FROM_SOURCE", "filename": "stage_11_from_source_relationships.csv" },
    { "type": "HAS_RESPONSE", "filename": "stage_11_has_response_relationships.csv" },
    { "type": "HAS_DATA_PROPERTY", "filename": "stage_11_has_data_property_relationships.csv" },
    { "type": "HAS_DATA_TYPE", "filename": "stage_11_has_data_type_relationships.csv" },
    { "type": "HAS_IDENTIFIER", "filename": "stage_11_has_identifier_relationships.csv" },
    { "type": "HAS_ITEM", "filename": "stage_11_has_item_relationships.csv" }
  ]

file_load(stage_11_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_other_source_nodes.csv', labels: ['OTHER_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_bc_value_set_nodes.csv', labels: ['BC_VALUE_SET'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_bc_data_property_nodes.csv', labels: ['BC_DATA_PROPERTY'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_bc_data_type_nodes.csv', labels: ['BC_DATA_TYPE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_bc_item_nodes.csv', labels: ['BC_ITEM'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_bc_instance_nodes.csv', labels: ['BC_INSTANCE'] }], [{

Check that we can see the coded values for the BC loaded. Should get three results

- [C66741, C25208]
- [C66770, C28252]
- [C66770, C48531]

In [128]:
with driver.session() as session:
  query = """MATCH (n:BC_DATA_PROPERTY)-[:HAS_RESPONSE]->(d) RETURN n.name as name, d.cl as cl, d.cli as cli, d.uri as uri""" 
  result = session.run(query)
  for record in result:
    print ("%s: [%s, %s], uri=%s" % (record["name"], record["cl"], record["cli"], record["uri"]))
driver.close()

code: [C66741, C25208], uri=http://id.d4k.dk/dataset/bc_instance/weight/test/coding/code/c66741-c25208
code: [C66770, C28252], uri=http://id.d4k.dk/dataset/bc_instance/weight/result/quantity/code/c66770-c28252
code: [C66770, C48531], uri=http://id.d4k.dk/dataset/bc_instance/weight/result/quantity/code/c66770-c48531


Now load any Code Lists used by the BCs. Check loaded OK.

- C66741 - VSTESTCD
- C66770 - VSRESU

In [129]:
load_cl("sdtm", "C66741")
load_cl("sdtm", "C66770")
dump_cl("C66741")
dump_cl("C66770")


2021-12-17: [C66741, VSTESTCD], [C100947, HIPCIR]
2021-12-17: [C66741, VSTESTCD], [C181553, ARMSPAN]
2021-12-17: [C66741, VSTESTCD], [C156606, CHESTCIR]
2021-12-17: [C66741, VSTESTCD], [C25208, WEIGHT]
2021-12-17: [C66741, VSTESTCD], [C174311, SAO2FIO2]
2021-12-17: [C66741, VSTESTCD], [C25347, HEIGHT]
2021-12-17: [C66741, VSTESTCD], [C163569, WTAPCTL]
2021-12-17: [C66741, VSTESTCD], [C178060, CALFCIR]
2021-12-17: [C66741, VSTESTCD], [C98785, SSSKNF]
2021-12-17: [C66741, VSTESTCD], [C163567, BMIAPCTL]
2021-12-17: [C66741, VSTESTCD], [C170639, CRWNHEEL]
2021-12-17: [C66741, VSTESTCD], [C174371, TEMPPB]
2021-12-17: [C66741, VSTESTCD], [C100945, PULSEPR]
2021-12-17: [C66741, VSTESTCD], [C71258, LBM]
2021-12-17: [C66741, VSTESTCD], [C132482, EWEIGHT]
2021-12-17: [C66741, VSTESTCD], [C17651, WAISTHIP]
2021-12-17: [C66741, VSTESTCD], [C163570, WTHTPCTL]
2021-12-17: [C66741, VSTESTCD], [C174373, FTSAD]
2021-12-17: [C66741, VSTESTCD], [C174370, TEMPCB]
2021-12-17: [C66741, VSTESTCD], [C41255, I

Now load the DDF study example data. Provides an example of a "industry standard" design.

In [130]:
stage_12_files = [    
    { "label": "ENDPOINT", "filename": "stage_12_endpoint_nodes.csv" },
    { "label": "STUDY_DATA", "filename": "stage_12_study_data_nodes.csv" },
    { "label": "PROCEDURE", "filename": "stage_12_procedure_nodes.csv" },
    { "label": "ACTIVITY", "filename": "stage_12_activity_nodes.csv" },
    { "label": "WORKFLOW_ITEM", "filename": "stage_12_workflow_item_nodes.csv" },
    { "label": "VISIT", "filename": "stage_12_visit_nodes.csv" },
    { "label": "STUDY_CELL", "filename": "stage_12_study_cell_nodes.csv" },
    { "label": "RULE", "filename": "stage_12_rule_nodes.csv" },
    { "label": "STUDY_ELEMENT", "filename": "stage_12_study_element_nodes.csv" },
    { "label": "EPOCH", "filename": "stage_12_study_epoch_nodes.csv" },
    { "label": "STUDY_ARM", "filename": "stage_12_study_arm_nodes.csv" },
    { "label": "OBJECTIVE", "filename": "stage_12_objective_nodes.csv" },
    { "label": "CODE", "filename": "stage_12_code_nodes.csv" },
    { "label": "INVESTIGATIONAL_INTERVENTIONS", "filename": "stage_12_investigational_interventions_nodes.csv" },
    { "label": "POPULATION", "filename": "stage_12_population_nodes.csv" },
    { "label": "STUDY_DESIGN", "filename": "stage_12_study_design_nodes.csv" },
    { "label": "INDICATION", "filename": "stage_12_indication_nodes.csv" },
    { "label": "STUDY_PROTOCOL", "filename": "stage_12_study_protocol_nodes.csv" },
    { "label": "STUDY_PHASE", "filename": "stage_12_study_phase_nodes.csv" },
    { "label": "STUDY_TYPE", "filename": "stage_12_study_type_nodes.csv" },
    { "label": "STUDY_IDENTIFIER", "filename": "stage_12_study_identifier_nodes.csv" },
    { "label": "STUDY", "filename": "stage_12_study_nodes.csv" },
    { "type": "HAS_CODED", "filename": "stage_12_has_coded_relationships.csv" },
    { "type": "HAS_ENDPOINT", "filename": "stage_12_has_endpoint_relationships.csv" },
    { "type": "HAS_STUDY_DATA", "filename": "stage_12_has_study_data_relationships.csv" },
    { "type": "HAS_PROCEDURE", "filename": "stage_12_has_procedure_relationships.csv" },
    { "type": "HAS_PREVIOUS_ACTIVITY", "filename": "stage_12_has_previous_activity_relationships.csv" },
    { "type": "HAS_PREVIOUS_WORKFLOW", "filename": "stage_12_has_previous_workflow_relationships.csv" },
    { "type": "USED_IN_VISIT", "filename": "stage_12_used_in_visit_relationships.csv" },
    { "type": "HAS_ACTIVITY", "filename": "stage_12_has_activity_relationships.csv" },
    { "type": "HAS_VISIT", "filename": "stage_12_has_visit_relationships.csv" },
    { "type": "HAS_END_RULE", "filename": "stage_12_has_end_rule_relationships.csv" },
    { "type": "HAS_START_RULE", "filename": "stage_12_has_start_rule_relationships.csv" },
    { "type": "HAS_ELEMENT", "filename": "stage_12_has_element_relationships.csv" },
    { "type": "HAS_EPOCH", "filename": "stage_12_has_epoch_relationships.csv" },
    { "type": "HAS_ARM", "filename": "stage_12_has_arm_relationships.csv" },
    { "type": "HAS_CELL", "filename": "stage_12_has_cell_relationships.csv" },
    { "type": "HAS_OBJECTIVE", "filename": "stage_12_has_objective_relationships.csv" },
    { "type": "HAS_INDICATION", "filename": "stage_12_has_indication_relationships.csv" },
    { "type": "HAS_INVESTIGATIONAL_INTERVENTION", "filename": "stage_12_has_investigational_intervention_relationships.csv" },
    { "type": "HAS_POPULATION", "filename": "stage_12_has_population_relationships.csv" },
    { "type": "HAS_STUDY_DESIGN", "filename": "stage_12_has_study_design_relationships.csv" },
    { "type": "HAS_PROTOCOL", "filename": "stage_12_has_protocol_relationships.csv" },
    { "type": "HAS_STUDY_PHASE", "filename": "stage_12_has_study_phase_relationships.csv" },
    { "type": "HAS_STUDY_TYPE", "filename": "stage_12_has_study_type_relationships.csv" },
    { "type": "HAS_IDENTIFIER", "filename": "stage_12_has_identifier_relationships.csv" }
  ]

file_load(stage_12_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_endpoint_nodes.csv', labels: ['ENDPOINT'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_study_data_nodes.csv', labels: ['STUDY_DATA'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_procedure_nodes.csv', labels: ['PROCEDURE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_activity_nodes.csv', labels: ['ACTIVITY'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_workflow_item_nodes.csv', labels: ['WORKFLOW_ITEM'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_visit_nodes.csv', labels: ['VISIT'] }, { fileName: 'https://raw.githubuserco

Duplicate the BC. The study activity Study data node uses WGHT as a code, really need to update it to be "Weight" so we can name match all BCs. Just work round this for the moment.

In [131]:
with driver.session() as session:
  query = """MATCH (a:ACTIVITY)-[]->(sd:STUDY_DATA) WHERE sd.name = "WGHT"
    WITH a, sd
    MATCH (bc:BC_INSTANCE) WHERE bc.name = "Weight"
    WITH a, sd, bc
    CREATE (a)-[:HAS_BC]->(new:STUDY_BC_INSTANCE)
    SET new = bc
    WITH bc, new
    CALL apoc.path.subgraphAll(bc, {relationshipFilter:'HAS_ITEM>|HAS_DATA_TYPE>|HAS_DATA_PROPERTY>|HAS_RESPONSE>'})
    YIELD nodes, relationships
    CALL apoc.refactor.cloneSubgraph(
      nodes,
      [rel in relationships WHERE type(rel) = 'HAS_DATA_TYPE_PROPERTY'],
      { standinNodes:[[bc, new]] })
    YIELD input, output, error
    RETURN output;
    """
  result = session.run(query)
  for record in result:
    node = inner_record["output"]
    print(node)

driver.close()



<Node id=1237 labels=frozenset({'FHIR_DATA_TYPE_PROPERTY'}) properties={'name': 'value', 'simple_data_type': 'float', 'id': 8, 'uri': 'http://id.d4k.dk/dataset/hl7/fhir/datatypes#quantity_value'}>
<Node id=1237 labels=frozenset({'FHIR_DATA_TYPE_PROPERTY'}) properties={'name': 'value', 'simple_data_type': 'float', 'id': 8, 'uri': 'http://id.d4k.dk/dataset/hl7/fhir/datatypes#quantity_value'}>
<Node id=1237 labels=frozenset({'FHIR_DATA_TYPE_PROPERTY'}) properties={'name': 'value', 'simple_data_type': 'float', 'id': 8, 'uri': 'http://id.d4k.dk/dataset/hl7/fhir/datatypes#quantity_value'}>
<Node id=1237 labels=frozenset({'FHIR_DATA_TYPE_PROPERTY'}) properties={'name': 'value', 'simple_data_type': 'float', 'id': 8, 'uri': 'http://id.d4k.dk/dataset/hl7/fhir/datatypes#quantity_value'}>
<Node id=1237 labels=frozenset({'FHIR_DATA_TYPE_PROPERTY'}) properties={'name': 'value', 'simple_data_type': 'float', 'id': 8, 'uri': 'http://id.d4k.dk/dataset/hl7/fhir/datatypes#quantity_value'}>
<Node id=1237 l

Now create a unique UUID (should really be a URI) for each leaf value node. This is a simple example of FAIR, each data point in a study has a unique id and can be addressed.

Also, the DDF model does not allow us to easily get a unique BC instance for each actual BC, that is the WorkFlow Item. Keep it simple for the moment by going via the Activity and Study Data nodes.

In [132]:
save_for_later = {}

with driver.session() as session:
  query = """MATCH (bc:STUDY_BC_INSTANCE)-[*]->(v:BC_DATA_PROPERTY) WHERE v.name='value' OR v.name='unit'
    WITH collect(v) as nodes, apoc.create.uuid() as uuid
    FOREACH (n in nodes | SET n.uuid = uuid)
    """
  result = session.run(query)
  query = """MATCH (bc:STUDY_BC_INSTANCE)-[]->(i:BC_ITEM)-[*]->(v:BC_DATA_PROPERTY) WHERE i.name='Result' AND v.name='value' OR v.name='unit'
    RETURN DISTINCT bc.name as bc_name, v.name as name, v.uuid as uuid"""
  result = session.run(query)
  for record in result:
    print("BC %s property '%s' has uuid: '%s'" % (record["bc_name"], record["name"], record["uuid"]))
    save_for_later[record["name"]] = record["uuid"]

driver.close()
print(save_for_later)

BC Weight property 'unit' has uuid: '3fe3d51d-0bb4-4df0-9ba0-235eea259e49'
BC Weight property 'value' has uuid: '344569d6-aa3c-480a-8c6e-f2aa57de3932'
{'unit': '3fe3d51d-0bb4-4df0-9ba0-235eea259e49', 'value': '344569d6-aa3c-480a-8c6e-f2aa57de3932'}


Now link up to the CT for the library BCs and the Study versions

In [133]:
with driver.session() as session:
  query = """MATCH (n:BC_DATA_PROPERTY)-[:HAS_RESPONSE]->(d) 
    WITH n,d
    MATCH (cl:SKOS_CONCEPT {identifier: d.cl})-[]->(cli:SKOS_CONCEPT {identifier: d.cli})
    MERGE (n)-[:HAS_RESPONSE]->(cli)
    DETACH DELETE d
    RETURN n.name as name, cli.uri as uri
  """ 
  result = session.run(query)
  for record in result:
    print ("%s -> %s" % (record["name"], record["uri"]))
driver.close()

code -> http://id.d4k.dk/dataset/cdisc/ct/v48/sdtm/C66770-C28252
code -> http://id.d4k.dk/dataset/cdisc/ct/v48/sdtm/C66770-C28252
code -> http://id.d4k.dk/dataset/cdisc/ct/v48/sdtm/C66770-C48531
code -> http://id.d4k.dk/dataset/cdisc/ct/v48/sdtm/C66770-C48531
code -> http://id.d4k.dk/dataset/cdisc/ct/v48/sdtm/C66741-C25208
code -> http://id.d4k.dk/dataset/cdisc/ct/v48/sdtm/C66741-C25208


Now add in some data points. A data point will point at the BC and the visit to which is applies (of the WorkFlow Item). This is the nature of the DDF model. I think we can make it better.

In [134]:
with driver.session() as session:
  query = """MATCH (v:VISIT)<-[]-(w:WORKFLOW_ITEM)-[]->(a:ACTIVITY)-[]->(bc:STUDY_BC_INSTANCE) RETURN v.name as visit, w.id as wfi , a.description as activity, bc.name as bc""" 
  result = session.run(query)
  for record in result:
    print ("%s -> %s -> %s -> %s" % (record["visit"], record["wfi"], record["activity"], record["bc"]))
driver.close()

CYCLE 1, TREATMENT DAY 1 -> 1512 -> Weight -> Weight
FU 1 -> 1514 -> Weight -> Weight
CYCLE 2, TREATMENT DAY 1 -> 1513 -> Weight -> Weight


Create some simple data. We need the actual data point plus just enough information to attach it to the study graph. So 

- Create subjects
- Create data points and link into the study design and the BC attached to (the Study instance, not the definition of the BC that we copied earlier)

In [135]:
subjects = [ 
  { "subject_id": "1234"}, 
  { "subject_id": "1235" }, 
  { "subject_id": "1236" }, 
  { "subject_id": "1237" }
]
subject_data = [
  { "subject_id": "1234", "visit": "CYCLE 1, TREATMENT DAY 1", "data_point": save_for_later["value"], "value": "76" },
  { "subject_id": "1234", "visit": "CYCLE 1, TREATMENT DAY 1", "data_point": save_for_later["unit"], "value": "kg" },
  { "subject_id": "1234", "visit": "CYCLE 2, TREATMENT DAY 1", "data_point": save_for_later["value"], "value": "78" },
  { "subject_id": "1234", "visit": "CYCLE 2, TREATMENT DAY 1", "data_point": save_for_later["unit"], "value": "kg" },
  { "subject_id": "1234", "visit": "FU 1", "data_point": save_for_later["value"], "value": "77" },
  { "subject_id": "1234", "visit": "FU 1", "data_point": save_for_later["unit"], "value": "kg" },
  { "subject_id": "1235", "visit": "CYCLE 1, TREATMENT DAY 1", "data_point": save_for_later["value"], "value": "42" },
  { "subject_id": "1235", "visit": "CYCLE 1, TREATMENT DAY 1", "data_point": save_for_later["unit"], "value": "kg" },
  { "subject_id": "1235", "visit": "CYCLE 2, TREATMENT DAY 1", "data_point": save_for_later["value"], "value": "44" },
  { "subject_id": "1235", "visit": "CYCLE 2, TREATMENT DAY 1", "data_point": save_for_later["unit"], "value": "kg" },
  { "subject_id": "1235", "visit": "FU 1", "data_point": save_for_later["value"], "value": "43" },
  { "subject_id": "1235", "visit": "FU 1", "data_point": save_for_later["unit"], "value": "kg" },
]

print(subject_data)

[{'subject_id': '1234', 'visit': 'CYCLE 1, TREATMENT DAY 1', 'data_point': '344569d6-aa3c-480a-8c6e-f2aa57de3932', 'value': '76'}, {'subject_id': '1234', 'visit': 'CYCLE 1, TREATMENT DAY 1', 'data_point': '3fe3d51d-0bb4-4df0-9ba0-235eea259e49', 'value': 'kg'}, {'subject_id': '1234', 'visit': 'CYCLE 2, TREATMENT DAY 1', 'data_point': '344569d6-aa3c-480a-8c6e-f2aa57de3932', 'value': '78'}, {'subject_id': '1234', 'visit': 'CYCLE 2, TREATMENT DAY 1', 'data_point': '3fe3d51d-0bb4-4df0-9ba0-235eea259e49', 'value': 'kg'}, {'subject_id': '1234', 'visit': 'FU 1', 'data_point': '344569d6-aa3c-480a-8c6e-f2aa57de3932', 'value': '77'}, {'subject_id': '1234', 'visit': 'FU 1', 'data_point': '3fe3d51d-0bb4-4df0-9ba0-235eea259e49', 'value': 'kg'}, {'subject_id': '1235', 'visit': 'CYCLE 1, TREATMENT DAY 1', 'data_point': '344569d6-aa3c-480a-8c6e-f2aa57de3932', 'value': '42'}, {'subject_id': '1235', 'visit': 'CYCLE 1, TREATMENT DAY 1', 'data_point': '3fe3d51d-0bb4-4df0-9ba0-235eea259e49', 'value': 'kg'},

Create the subjects.

In [136]:
with driver.session() as session:
  for subject in subjects:
    query = """MATCH (p:STUDY_PROTOCOL {brief_title:'DDR'})<-[]-(s:STUDY)
      WITH s
      CREATE (s)<-[:ENROLLED_IN]-(ss:STUDY_SUBJECT)
      SET ss.identifier = '%s'
    """ % (subject["subject_id"]) 
    result = session.run(query)
  query = """MATCH (s:STUDY_SUBJECT) RETURN s.identifier as subject"""
  result = session.run(query)
  for record in result:
    print ("Subject %s created" % (record["subject"]))
driver.close()

Subject 1234 created
Subject 1235 created
Subject 1236 created
Subject 1237 created


Now add in the data

In [137]:
with driver.session() as session:
  for item in subject_data:
    query = """MATCH (s:STUDY_SUBJECT {identifier: '%s'})
      WITH s
      MATCH (v:VISIT {name: '%s'})<-[]-(wi:WORKFLOW_ITEM)-[:HAS_ACTIVITY]->(a)-[:HAS_BC]->(bc)-[:HAS_ITEM]->
        (i)-[:HAS_DATA_TYPE]->(dt)-[:HAS_DATA_PROPERTY]->(d:BC_DATA_PROPERTY {uuid: '%s'})
      WITH s, wi, d
      CREATE (s)<-[:FOR_SUBJECT]-(sdp:STUDY_DATA_POINT)-[:FOR_WORKFLOW_ITEM]->(wi)
      WITH sdp, d
      CREATE (sdp)-[:FOR_VALUE]->(d)
      SET sdp.value = '%s'
      SET sdp.uuid = apoc.create.uuid()
    """ % (item["subject_id"], item["visit"], item["data_point"], item["value"]) 
    result = session.run(query)
  query = """MATCH (s:STUDY_DATA_POINT) RETURN DISTINCT s.value as value, s.uuid as uuid"""
  result = session.run(query)
  for record in result:
    print ("Value %s, %s created" % (record["value"], record["uuid"]))
driver.close()

Value 76, c4147952-bbd0-4f10-ad19-f68e2ed728ae created
Value kg, 7a90d8b4-d638-4beb-8196-645803da3d68 created
Value 78, bcefabc6-f013-4c4c-b858-7f8e34e3f559 created
Value kg, 7571c33c-c8b1-4fda-989d-181484540da2 created
Value 77, f3626c3d-db8d-4604-86fd-ee96f4ce8e51 created
Value kg, 30a32840-6936-4e90-83b7-e50ed33a8c00 created
Value 42, 4cb2f176-29db-4039-a658-3064ffc721cc created
Value kg, 892e2725-0675-4bbd-8123-9cc227886de7 created
Value 44, 72a4014c-f6c3-4055-80ea-447248685519 created
Value kg, bf2e4234-0a04-48b7-bbfa-5a5e128425ed created
Value 43, 8616fe7d-a54a-4fec-bb3c-48221d3b8048 created
Value kg, 3ebe0718-2d61-4f01-970c-4cd39f190503 created


Now load the SDTM IG so we can link the SDTM to the canonical model.

In [138]:
stage_13_files = [ 
    { "label": "API_SOURCE", "filename": "stage_13_api_source_nodes.csv" },
    { "label": "SDTM_IG", "filename": "stage_13_sdtm_ig_nodes.csv" },
    { "label": "SDTM_DATASET", "filename": "stage_13_sdtm_dataset_nodes.csv" },
    { "label": "SDTM_VARIABLE", "filename": "stage_13_sdtm_variable_nodes.csv" },
    { "type": "FROM_SOURCE", "filename": "stage_13_from_source_relationships.csv" },
    { "type": "HAS_DATASET", "filename": "stage_13_has_dataset_relationships.csv" },
    { "type": "HAS_VARIABLE", "filename": "stage_13_has_variable_relationships.csv" }
  ]

file_load(stage_13_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_api_source_nodes.csv', labels: ['API_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_sdtm_ig_nodes.csv', labels: ['SDTM_IG'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_sdtm_dataset_nodes.csv', labels: ['SDTM_DATASET'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_sdtm_variable_nodes.csv', labels: ['SDTM_VARIABLE'] }], [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_from_source_relationships.csv', type: 'FROM_SOURCE' }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_has_dataset_relationships.csv', type: 'HAS_DATASET' }, { fil

Check SDTM loaded

In [139]:
with driver.session() as session:
  query = """MATCH (ds:SDTM_DATASET) RETURN DISTINCT ds.name as name"""
  result = session.run(query)
  for record in result:
    print ("Dataset %s loaded" % (record["name"]))
driver.close()

Dataset AG loaded
Dataset CM loaded
Dataset EC loaded
Dataset EX loaded
Dataset ML loaded
Dataset PR loaded
Dataset SU loaded
Dataset AE loaded
Dataset BE loaded
Dataset CE loaded
Dataset DS loaded
Dataset DV loaded
Dataset HO loaded
Dataset MH loaded
Dataset BS loaded
Dataset CP loaded
Dataset CV loaded
Dataset DA loaded
Dataset DD loaded
Dataset EG loaded
Dataset FT loaded
Dataset GF loaded
Dataset IE loaded
Dataset IS loaded
Dataset LB loaded
Dataset MB loaded
Dataset MI loaded
Dataset MK loaded
Dataset MS loaded
Dataset NV loaded
Dataset OE loaded
Dataset PC loaded
Dataset PE loaded
Dataset PP loaded
Dataset QS loaded
Dataset RE loaded
Dataset RP loaded
Dataset RS loaded
Dataset SC loaded
Dataset SS loaded
Dataset TR loaded
Dataset TU loaded
Dataset UR loaded
Dataset VS loaded
Dataset FA loaded
Dataset SR loaded
Dataset CO loaded
Dataset DM loaded
Dataset SE loaded
Dataset SM loaded
Dataset SV loaded
Dataset TA loaded
Dataset TD loaded
Dataset TE loaded
Dataset TI loaded
Dataset TM

Now link the VS domain to the canonical model. Do it simply for the moment by hand. This should be automatic via SDTM class definitions which should be loaded.

In [140]:
# The Canonical Model References
#
# Quantity
# http://id.d4k.dk/dataset/canonical/n141/quantity#value
# http://id.d4k.dk/dataset/canonical/n141/quantity#unit

# Coded
# http://id.d4k.dk/dataset/canonical/n141/coding#code

with driver.session() as session:
  query = """MATCH (s:SDTM_DATASET {name: 'VS'}), 
    (s)-[]->(v1:SDTM_VARIABLE {name: 'VSORRES'}), 
    (s)-[]->(v2:SDTM_VARIABLE {name: 'VSORRESU'}), 
    (c1:CANONICAL_DATA_TYPE_PROPERTY {uri: "http://id.d4k.dk/dataset/canonical/n141/quantity#value"}), 
    (c2:CANONICAL_DATA_TYPE_PROPERTY {uri: "http://id.d4k.dk/dataset/canonical/n141/quantity#unit"}),
    (c3:CANONICAL_DATA_TYPE_PROPERTY {uri: "http://id.d4k.dk/dataset/canonical/n141/coding#code"})
    WITH s, v1, v2, c1, c2, c3
    CREATE (v1)-[:IS_A]->(cv1:SDTM_MODEL_VARIABLE {name: '--ORRES'})-[:IS_CANONICAL_REF]->(c1)
    CREATE (v2)-[:IS_A]->(cv2:SDTM_MODEL_VARIABLE {name: '--ORRESU'})-[:IS_CANONICAL_REF]->(c2)
    CREATE (v1)-[:IS_CANONICAL_REF]->(c3)
  """
  result = session.run(query)
  query = """MATCH (s:SDTM_MODEL_VARIABLE)-[]->(d:CANONICAL_DATA_TYPE_PROPERTY) RETURN DISTINCT s.name as name, d.uri as uri"""
  result = session.run(query)
  for record in result:
    print ("Value %s, %s created" % (record["name"], record["uri"]))
driver.close()

Value --ORRES, http://id.d4k.dk/dataset/canonical/n141/quantity#value created
Value --ORRESU, http://id.d4k.dk/dataset/canonical/n141/quantity#unit created


Now link in the Study BC Instance to the canonical model. Again by hand but this should be automated.

In [141]:
# http://id.d4k.dk/dataset/bc_instance/weight/result/quantity/value = save_for_later['value']
# http://id.d4k.dk/dataset/canonical/n141/quantity#unit = save_for_later['unit']

print(save_for_later)

with driver.session() as session:
  query = """MATCH (bc1:BC_DATA_PROPERTY {uuid: '%s'}), (bc2:BC_DATA_PROPERTY {uuid: '%s'}),
    (c1:CANONICAL_DATA_TYPE_PROPERTY {uri: "http://id.d4k.dk/dataset/canonical/n141/quantity#value"}),
    (c2:CANONICAL_DATA_TYPE_PROPERTY {uri: "http://id.d4k.dk/dataset/canonical/n141/quantity#unit"})
    WITH bc1, bc2, c1, c2
    CREATE (bc1)-[:IS_CANONICAL_REF]->(c1)
    CREATE (bc2)-[:IS_CANONICAL_REF]->(c2)
  """ % (save_for_later['value'], save_for_later['unit'])
  result = session.run(query)
  query = """MATCH (s:BC_DATA_PROPERTY)-[]->(d:FHIR_DATA_TYPE_PROPERTY) RETURN DISTINCT s.name as name, d.uri as uri"""
  result = session.run(query)
  for record in result:
    print ("Value %s, %s created" % (record["name"], record["uri"]))
driver.close()

{'unit': '3fe3d51d-0bb4-4df0-9ba0-235eea259e49', 'value': '344569d6-aa3c-480a-8c6e-f2aa57de3932'}


So having linked the very basic data we have all the pieces in place. We can now start getting data out, simple at the moment but ...

Query the VS domain

In [142]:
with driver.session() as session:
  query = """MATCH (sd:SDTM_DATASET {name: 'VS'})-[]->(sv:SDTM_VARIABLE)-[]->(cv:SDTM_MODEL_VARIABLE)-[:IS_CANONICAL_REF]->(fdt:CANONICAL_DATA_TYPE_PROPERTY)
  <-[:IS_CANONICAL_REF]-(bdt:BC_DATA_PROPERTY)<-[:FOR_VALUE]-(sdp:STUDY_DATA_POINT)-[]->(wfi:WORKFLOW_ITEM), 
  (wfi)-[:USED_IN_VISIT]->(v:VISIT), 
  (sdp)-[:FOR_SUBJECT]->(subj:STUDY_SUBJECT),
  (e:EPOCH)-[]->(v)
RETURN sd.name as domain, sv.name as variable, sdp.value as data, v.name as visit, e.study_epoch_name as epoch, subj.identifier as subject
  """
  result = session.run(query)
  for record in result:
    print ("%s, %s, %s, %s, %s %s" % (record["domain"], record["variable"], record["data"], record["visit"], record["epoch"], record["subject"]))
driver.close()

VS, VSORRES, 43, FU 1, FOLLOW-UP 1235
VS, VSORRES, 44, CYCLE 2, TREATMENT DAY 1, TREATMENT 1235
VS, VSORRES, 42, CYCLE 1, TREATMENT DAY 1, TREATMENT 1235
VS, VSORRES, 77, FU 1, FOLLOW-UP 1234
VS, VSORRES, 78, CYCLE 2, TREATMENT DAY 1, TREATMENT 1234
VS, VSORRES, 76, CYCLE 1, TREATMENT DAY 1, TREATMENT 1234
VS, VSORRESU, kg, FU 1, FOLLOW-UP 1235
VS, VSORRESU, kg, CYCLE 2, TREATMENT DAY 1, TREATMENT 1235
VS, VSORRESU, kg, CYCLE 1, TREATMENT DAY 1, TREATMENT 1235
VS, VSORRESU, kg, FU 1, FOLLOW-UP 1234
VS, VSORRESU, kg, CYCLE 2, TREATMENT DAY 1, TREATMENT 1234
VS, VSORRESU, kg, CYCLE 1, TREATMENT DAY 1, TREATMENT 1234


So, next ...
- Expand, adding more BCs, data etc
- Generate a CRF for the study
- Generate an aCRF
- Generate a define.xml
- Expand, adding more BCs, data etc