Install dependencies

In [736]:
!pip install neo4j
!pip install requests



First thing we need to do is connect to the database. We also define some general purpose methods in this section.

In [737]:
from neo4j import GraphDatabase
import requests
import json
import urllib.request

# Load file directory
load_file_dir = "https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/"

# Connect to the Neo4j DB. You need a line like one of these examples:
# driver = GraphDatabase.driver("neo4j://n.n.n.n:7687", auth=("username", "password"))
# driver = GraphDatabase.driver("neo4j+s://url_for_aura", auth=("username", "password"))


# Methods
def clear(tx):
    """Delete every node, together with its relationships, from the database.

    Args:
        tx: transaction-like object; only its ``run`` method is used.

    The deletion is delegated to ``apoc.periodic.iterate`` with a batch size
    of 1000, so the APOC plugin must be available on the server.
    """
    wipe_all_nodes = "CALL apoc.periodic.iterate('MATCH (n) RETURN n', 'DETACH DELETE n', {batchSize:1000})"
    tx.run(wipe_all_nodes)

def file_load(load_files):
  """Import a set of CSV files into Neo4j with ``apoc.import.csv``.

  Args:
    load_files: list of dicts. Each dict has a "filename" key (appended to
      the module-level ``load_file_dir``) plus either a "label" key (the file
      is imported as nodes with that label) or a "type" key (the file is
      imported as relationships of that type).

  Raises:
    KeyError: if an entry has neither a "label" nor a "type" key, or has no
      "filename" key.

  The generated query and every returned record are printed. The shared
  module-level ``driver`` is deliberately left open so that later cells can
  keep using it.
  """
  nodes = []
  relationships = []
  for file_item in load_files:
    filename = "%s%s" % (load_file_dir, file_item["filename"])
    if "label" in file_item:
      nodes.append("{ fileName: '%s', labels: ['%s'] }" % (filename, file_item["label"]) )
    else:
      relationships.append("{ fileName: '%s', type: '%s' }" % (filename, file_item["type"]) )
  query = """CALL apoc.import.csv( [%s], [%s], {stringIds: false})""" % (", ".join(nodes), ", ".join(relationships))
  print(query)
  with driver.session() as session:
    result = session.run(query)
    for record in result:
      print(record)

def load_cl(ct_type, cl):
  """Load the items of one code list from CSV and attach them to the code list.

  Args:
    ct_type: terminology type, used to build the CSV file name
      ``cdisc_ct_<ct_type>_nodes_<cl>.csv`` under ``load_file_dir``.
    cl: identifier of the parent SKOS_CONCEPT (the code list).

  For every CSV row a new SKOS_CONCEPT node is created and linked from the
  parent with a SKOS_NARROWER relationship. Returned records are printed.
  """
  filename = "%scdisc_ct_%s_nodes_%s.csv" % (load_file_dir, ct_type, cl)
  # Values are passed as query parameters instead of being formatted into the
  # Cypher text, so quotes in a value cannot break (or alter) the query.
  query = """MATCH (p:SKOS_CONCEPT) WHERE p.identifier = $identifier
    WITH p
    LOAD CSV WITH HEADERS FROM $filename AS row
    CREATE (p)-[:SKOS_NARROWER]->(c:SKOS_CONCEPT {id: toInteger(row.id), alt_label: row.alt_label, definition: row.definition, identifier: row.identifier, notation: row.notation, pref_label: row.pref_label, uri: row.uri})"""
  with driver.session() as session:
    # str() keeps the previous behaviour, where the identifier was always
    # compared as a string literal.
    result = session.run(query, identifier=str(cl), filename=filename)
    for record in result:
      print(record)

def dump_cl(cl):
  """Print a code list and its items for every concept scheme that holds it.

  Args:
    cl: identifier of the code list (a SKOS_CONCEPT directly below a
      SKOS_CONCEPT_SCHEME).

  One line is printed per distinct (scheme version, code list, item)
  combination. The shared ``driver`` is left open for later cells.
  """
  # The identifier is passed as a query parameter rather than formatted into
  # the Cypher text.
  query = """MATCH (cs:SKOS_CONCEPT_SCHEME)-[]->(c1:SKOS_CONCEPT)-[]->(c2:SKOS_CONCEPT) WHERE c1.identifier = $identifier RETURN DISTINCT cs.version as version, c1.identifier as cl_identifier, c1.notation as cl_sub, c2.identifier as cli_identifier, c2.notation as cli_sub"""
  with driver.session() as session:
    result = session.run(query, identifier=str(cl))
    for record in result:
      print("%s: [%s, %s], [%s, %s]" % (record["version"], record["cl_identifier"], record["cl_sub"], record["cli_identifier"], record["cli_sub"]))

def set_version(version, update):
  """Create a VERSION node recording the data version and a description.

  Args:
    version: version identifier; stored as its string form.
    update: free-text description of what changed; stored as its string form.

  The shared ``driver`` is left open for later cells.
  """
  # Parameters are used so that free text containing quotes cannot break the
  # query. str() keeps the stored values as strings, as before.
  query = """CREATE (v:VERSION)
    SET v.version = $version, v.updates = $updates
  """
  with driver.session() as session:
    session.run(query, version=str(version), updates=str(update))

# Start from an empty database, then record the data version.
# The shared driver is not closed here: set_version() and all later cells
# still need it.
with driver.session() as session:
    # execute_write is the current name for a managed write transaction
    # (neo4j driver 5+); older drivers only provide write_transaction.
    run_write = getattr(session, "execute_write", None) or session.write_transaction
    run_write(clear)

set_version(0.1, "First version, basic SDTM domain generation.")
print("Ready ...")


Ready ...


First create the FHIR data types. Need this before anything else.


In [738]:
# Stage 1: FHIR data types. Entries with a "label" key are node files,
# entries with a "type" key are relationship files; file names are relative
# to load_file_dir.
stage_1_files = [ 
    { "label": "FHIR", "filename": "stage_1_fhir_nodes.csv" },
    { "label": "WEB_SOURCE", "filename": "stage_1_web_source_nodes.csv" },
    { "label": "FHIR_DATA_TYPE", "filename": "stage_1_fhir_data_type_nodes.csv" },
    { "label": "FHIR_DATA_TYPE_PROPERTY", "filename": "stage_1_fhir_data_type_property_nodes.csv" },
    { "type": "HAS_DATA_TYPE_PROPERTY", "filename": "stage_1_has_data_type_property_relationships.csv" },
    { "type": "FROM_SOURCE", "filename": "stage_1_from_source_relationships.csv" },
    { "type": "HAS_DATA_TYPE", "filename": "stage_1_has_data_type_relationships.csv" }
  ]

file_load(stage_1_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_fhir_nodes.csv', labels: ['FHIR'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_web_source_nodes.csv', labels: ['WEB_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_fhir_data_type_nodes.csv', labels: ['FHIR_DATA_TYPE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_fhir_data_type_property_nodes.csv', labels: ['FHIR_DATA_TYPE_PROPERTY'] }], [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_has_data_type_property_relationships.csv', type: 'HAS_DATA_TYPE_PROPERTY' }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_1_from_source_relationships.

Now check we have loaded ok. Check the FHIR version, should be 4.0.1

View the dashboard to see the types loaded.

In [739]:
# Print the version held on each FHIR node.
# The shared driver is left open because later cells keep using it.
with driver.session() as session:
  query = """MATCH (n:FHIR) Return n.version as version""" 
  result = session.run(query)
  for record in result:
    print("Version: ", record["version"])

Version:  4.0.1


Now load the canonical model.

In [740]:
# Stage 2: canonical model. Entries with a "label" key are node files,
# entries with a "type" key are relationship files.
stage_2_files = [ 
    { "label": "CANONICAL_MODEL", "filename": "stage_2_canonical_model_nodes.csv" },
    { "label": "CANONICAL_NODE", "filename": "stage_2_canonical_node_nodes.csv" },
    { "label": "CANONICAL_DATA_TYPE", "filename": "stage_2_canonical_data_type_nodes.csv" },
    { "label": "OTHER_SOURCE", "filename": "stage_2_other_source_nodes.csv" },
    { "type": "CONSISTS_OF", "filename": "stage_2_consists_of_relationships.csv" },
    { "type": "FROM_SOURCE", "filename": "stage_2_from_source_relationships.csv" },
    { "type": "HAS_SUB_MODEL", "filename": "stage_2_has_sub_model_relationships.csv" },
    { "type": "HAS_DATA_TYPE", "filename": "stage_2_has_data_type_relationships.csv" }
]

file_load(stage_2_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_canonical_model_nodes.csv', labels: ['CANONICAL_MODEL'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_canonical_node_nodes.csv', labels: ['CANONICAL_NODE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_canonical_data_type_nodes.csv', labels: ['CANONICAL_DATA_TYPE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_other_source_nodes.csv', labels: ['OTHER_SOURCE'] }], [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_consists_of_relationships.csv', type: 'CONSISTS_OF' }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_2_from_source_relationships.csv'

Check a few relationships exist in the canonical model. Should get three results.

* THERAPEUTIC INTERVENTION
* OBSERVATION
* ADVERSE EVENT



In [741]:
# List the two-step paths that end at a CANONICAL_NODE named LOCATION.
# The shared driver is left open because later cells keep using it.
with driver.session() as session:
  query = """MATCH (r)-[]->(t)-[]->(n:CANONICAL_NODE) WHERE n.name="LOCATION" RETURN r.name as root_name, t.name as name""" 
  result = session.run(query)
  for record in result:
    print("%s -> %s -> Location: " % (record["root_name"], record["name"]))

CANONICAL MODEL -> THERAPEUTIC INTERVENTION -> Location: 
CANONICAL MODEL -> OBSERVATION -> Location: 
CANONICAL MODEL -> ADVERSE EVENT -> Location: 


Now link the canonical nodes and the data types using the name as the key. For each "leaf" Canonical node we have the meaningful nodes (LOCATION, METHOD etc) and a child node defining the type of data that can be collected. This is keyed by the data type name so we can match them up. Note that there may be more than one data type for each meaningful node.

In [742]:
def link_data_types(parent_node_label, data_type_label):
  """Link data type nodes to the FHIR data type that has the same name.

  Args:
    parent_node_label: label of the nodes that point at the data type nodes.
    data_type_label: label of the data type nodes to link.

  Only data type nodes that have no outgoing IS_A relationship yet are
  processed, so nodes linked by an earlier call are skipped. For each match
  an IS_A relationship to the FHIR_DATA_TYPE node is created and a line is
  printed. The shared ``driver`` is left open for later cells.
  """
  # The labels are formatted into the query text (they are not passed as
  # query parameters), so callers must pass trusted label names.
  query = """MATCH (dt:%s) WHERE NOT (dt)-[:IS_A]->()
      WITH dt
      MATCH (pn:%s)-[]->(dt)
      WITH pn, dt
      MATCH (n:FHIR_DATA_TYPE) WHERE dt.name = n.name
      CREATE (dt)-[:IS_A]->(n)
      RETURN pn.uri as uri, pn.name as parent_name, n.name as fdt_name, dt.name as dt_name;
    """ % (data_type_label, parent_node_label)
  with driver.session() as session:
    result = session.run(query)
    for record in result:
      print("[%s, %s] -> [FHIR, %s]" % (record['parent_name'], record["dt_name"], record["fdt_name"]))

# Link each CANONICAL_DATA_TYPE that has no IS_A relationship yet to the
# FHIR_DATA_TYPE node with the same name.
link_data_types("CANONICAL_NODE", "CANONICAL_DATA_TYPE")

[PORTION, coding] -> [FHIR, coding]
[DIRECTIONALITY, coding] -> [FHIR, coding]
[LATERALITY, coding] -> [FHIR, coding]
[TEST, coding] -> [FHIR, coding]
[RESULT, coding] -> [FHIR, coding]
[RESULT, quantity] -> [FHIR, quantity]
[DATE TIME, date_time] -> [FHIR, date_time]


Now create the data type nodes for the canonical model. For each canonical node that references a data type, copy the properties of that data type to the canonical node. Give each canonical leaf a unique id (a uri). Also change the label of the nodes so they are Canonical Data Type nodes rather than FHIR ones.

In the future these nodes should also have a C code reference providing a definition for the data item.


In [743]:
def duplicate_data_type_nodes(data_type_label, data_type_property_label):
  """Clone the FHIR data type properties onto each data type node.

  Args:
    data_type_label: label of the data type nodes to process. Only nodes
      without an outgoing HAS_DATA_TYPE_PROPERTY relationship are handled,
      so nodes processed by an earlier call are skipped.
    data_type_property_label: label given to the cloned property nodes in
      place of FHIR_DATA_TYPE_PROPERTY.

  Step 1 clones the HAS_DATA_TYPE_PROPERTY subgraph of the FHIR data type
  that each node IS_A, using the node itself as stand-in for the FHIR data
  type. Step 2 sets each clone's uri to "<parent uri>/<clone name>" and swaps
  its label. One line is printed per relabelled node. The shared ``driver``
  is left open for later cells.
  """
  uri_data = []
  with driver.session() as session:

    # Step 1: clone the property nodes. The label is formatted into the query
    # text, so callers must pass a trusted label name.
    query = """MATCH (n:%s) WHERE NOT (n)-[:HAS_DATA_TYPE_PROPERTY]->()
      WITH n
      MATCH (n)-[:IS_A]->(dt:FHIR_DATA_TYPE)
      WITH n, dt
      CALL apoc.path.subgraphAll(dt, {relationshipFilter:'HAS_DATA_TYPE_PROPERTY>'})
      YIELD nodes, relationships
      CALL apoc.refactor.cloneSubgraph(
        nodes,
        [rel in relationships WHERE type(rel) = 'HAS_DATA_TYPE_PROPERTY'],
        { standinNodes:[[dt, n]] })
      YIELD input, output, error
      RETURN output;
    """ % (data_type_label)
    result = session.run(query)
    for record in result:
      node = record["output"]
      uri_data.append({"id": node.id})

    # Step 2: give every clone a uri derived from its parent and relabel it.
    query = """UNWIND $uri_data AS d
      MATCH (p)-[]->(n) WHERE ID(n)=d.id
      SET n.uri = p.uri + '/' + n.name
      REMOVE n:FHIR_DATA_TYPE_PROPERTY
      SET n:%s RETURN n.uri as uri, n.name as name""" % (data_type_property_label)
    result = session.run(query, uri_data=uri_data)
    for record in result:
      print("Node duplicated: %s, %s" % (record["uri"], record["name"]))

# Clone the FHIR data type properties onto each canonical data type; the
# clones are relabelled CANONICAL_DATA_TYPE_PROPERTY.
duplicate_data_type_nodes("CANONICAL_DATA_TYPE", "CANONICAL_DATA_TYPE_PROPERTY")


Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/portion/coding/version, version
Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/portion/coding/system, system
Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/portion/coding/code, code
Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/portion/coding/user_selected, user_selected
Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/portion/coding/display, display
Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/directionality/coding/version, version
Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/directionality/coding/system, system
Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/directionality/coding/code, code
Node duplicated: http://id.d4k.dk/dataset/canonical/common/location/directionality/coding/user_selected, user_selected
Node duplicated: http://id.d4k.dk/dataset/canonical/common/locat

Now load some terminology. We load just the SDTM terms (it is all we need for the moment). Only the Code List definitions are loaded, not every single item within every code list; that is just too big a load in one go.

In [744]:
# Stage 3: terminology (SKOS concepts and concept schemes). Entries with a
# "label" key are node files, entries with a "type" key are relationship files.
stage_3_files = [ 
    { "label": "API_SOURCE", "filename": "stage_3_api_source_nodes.csv" },
    { "label": "SKOS_CONCEPT", "filename": "stage_3_skos_concept_nodes.csv" },
    { "label": "SKOS_CONCEPT_SCHEME", "filename": "stage_3_skos_concept_scheme_nodes.csv" },
    { "type": "FROM_SOURCE", "filename": "stage_3_from_source_relationships.csv" },
    { "type": "SKOS_HAS_TOP_CONCEPT", "filename": "stage_3_skos_has_top_concept_relationships.csv" }
  ]

file_load(stage_3_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_3_api_source_nodes.csv', labels: ['API_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_3_skos_concept_nodes.csv', labels: ['SKOS_CONCEPT'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_3_skos_concept_scheme_nodes.csv', labels: ['SKOS_CONCEPT_SCHEME'] }], [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_3_from_source_relationships.csv', type: 'FROM_SOURCE' }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_3_skos_has_top_concept_relationships.csv', type: 'SKOS_HAS_TOP_CONCEPT' }], {stringIds: false})
<Record file='progress.csv' source='file' format='csv' nodes=986 relationships=985 properties=8868 time=918 rows

Do a quick test on the CT. Check AGEU and its links

In [745]:
# Print the identifier of the AGEU concept and the version of each node
# that points at it.
# The shared driver is left open because later cells keep using it.
with driver.session() as session:
  query = """MATCH (cs)-[]->(c1:SKOS_CONCEPT) WHERE c1.notation = 'AGEU' RETURN DISTINCT cs.version as version, c1.identifier as cl_identifier""" 
  result = session.run(query)
  for record in result:
    print ("%s: %s" % (record["version"], record["cl_identifier"]))

2021-12-17: C66781


Stage 4 to 9 are the CT files for the other areas, ADaM, Protocol, CDASH etc. Not loaded at the moment. Needed to split due to size and limited RAM on the Neo4j server.

Now load BC Templates

In [746]:
# Stage 10: BC templates. Entries with a "label" key are node files,
# entries with a "type" key are relationship files.
stage_10_files = [ 
    { "label": "OTHER_SOURCE", "filename": "stage_10_other_source_nodes.csv" },
    { "label": "BC_DATA_TYPE", "filename": "stage_10_bc_data_type_nodes.csv" },
    { "label": "BC_ITEM", "filename": "stage_10_bc_item_nodes.csv" },
    { "label": "BC_TEMPLATE", "filename": "stage_10_bc_template_nodes.csv" },
    { "type": "FROM_SOURCE", "filename": "stage_10_from_source_relationships.csv" },
    { "type": "HAS_DATA_TYPE", "filename": "stage_10_has_data_type_relationships.csv" },
    { "type": "HAS_IDENTIFIER", "filename": "stage_10_has_identifier_relationships.csv" },
    { "type": "HAS_ITEM", "filename": "stage_10_has_item_relationships.csv" }
  ]

file_load(stage_10_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_other_source_nodes.csv', labels: ['OTHER_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_bc_data_type_nodes.csv', labels: ['BC_DATA_TYPE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_bc_item_nodes.csv', labels: ['BC_ITEM'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_bc_template_nodes.csv', labels: ['BC_TEMPLATE'] }], [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_from_source_relationships.csv', type: 'FROM_SOURCE' }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_10_has_data_type_relationships.csv', type: 'HAS_DATA_TYPE' }, {

Now add in the data type nodes for the templates. Copy from the data types to the templates.

In [747]:
# Link the BC_DATA_TYPE nodes to their FHIR data types, then clone the FHIR
# data type properties onto them as BC_DATA_TYPE_PROPERTY nodes.
link_data_types("BC_ITEM", "BC_DATA_TYPE")
duplicate_data_type_nodes("BC_DATA_TYPE", "BC_DATA_TYPE_PROPERTY")

[Position, coding] -> [FHIR, coding]
[Site of Administration, coding] -> [FHIR, coding]
[Laterality, coding] -> [FHIR, coding]
[Method, coding] -> [FHIR, coding]
[Result, coding] -> [FHIR, coding]
[Test, coding] -> [FHIR, coding]
[Position, coding] -> [FHIR, coding]
[Site of Administration, coding] -> [FHIR, coding]
[Laterality, coding] -> [FHIR, coding]
[Directionality, coding] -> [FHIR, coding]
[Method, coding] -> [FHIR, coding]
[Result, coding] -> [FHIR, coding]
[Specimen, coding] -> [FHIR, coding]
[Test, coding] -> [FHIR, coding]
[Result, quantity] -> [FHIR, quantity]
[Result, quantity] -> [FHIR, quantity]
[Date Time, date_time] -> [FHIR, date_time]
[Date Time, date_time] -> [FHIR, date_time]
Node duplicated: http://id.d4k.dk/dataset/bc_template/base_observation/position/coding/version, version
Node duplicated: http://id.d4k.dk/dataset/bc_template/base_observation/position/coding/system, system
Node duplicated: http://id.d4k.dk/dataset/bc_template/base_observation/position/coding/c

Now link the BCs to the canonical model. 

In [748]:
# For every BC_ITEM with a non-empty "canonical" property, link each of its
# BC data type properties to the canonical data type property that has the
# same data type name and property name.
# The shared driver is left open because later cells keep using it.
with driver.session() as session:
  query = """MATCH (n:BC_ITEM) WHERE n.canonical <> ""
    WITH n
    MATCH (m:CANONICAL_NODE {name: n.canonical})
    WITH n, m
    MATCH (n)-[]->(bdt:BC_DATA_TYPE)-[]->(bdtp:BC_DATA_TYPE_PROPERTY)
    WITH bdt, bdtp, n, m
    MATCH (m)-[]->(cdt:CANONICAL_DATA_TYPE {name: bdt.name})-[]->(cdtp:CANONICAL_DATA_TYPE_PROPERTY {name: bdtp.name})
    CREATE (bdtp)-[:IS_CANONICAL_REF]->(cdtp)
    RETURN n.name as b_name, bdt.name as bdt_name, bdtp.name as bdtp_name, m.name as c_name, cdt.name as cdt_name, cdtp.name as cdtp_name
  """ 
  result = session.run(query)
  for record in result:
    print ("[%s, %s, %s] -> [%s, %s, %s]" % (record["b_name"], record["bdt_name"], record["bdtp_name"], record["c_name"], record["cdt_name"], record["cdtp_name"]))

[Test, coding, display] -> [TEST, coding, display]
[Test, coding, user_selected] -> [TEST, coding, user_selected]
[Test, coding, version] -> [TEST, coding, version]
[Test, coding, system] -> [TEST, coding, system]
[Test, coding, code] -> [TEST, coding, code]
[Laterality, coding, version] -> [LATERALITY, coding, version]
[Laterality, coding, display] -> [LATERALITY, coding, display]
[Laterality, coding, user_selected] -> [LATERALITY, coding, user_selected]
[Laterality, coding, code] -> [LATERALITY, coding, code]
[Laterality, coding, system] -> [LATERALITY, coding, system]
[Date Time, date_time, value] -> [DATE TIME, date_time, value]
[Result, coding, user_selected] -> [RESULT, coding, user_selected]
[Result, coding, version] -> [RESULT, coding, version]
[Result, coding, system] -> [RESULT, coding, system]
[Result, coding, code] -> [RESULT, coding, code]
[Result, coding, display] -> [RESULT, coding, display]
[Result, quantity, comparator] -> [RESULT, quantity, comparator]
[Result, quanti

Now load the BC instances

In [749]:
# Stage 11: BC instances. Entries with a "label" key are node files,
# entries with a "type" key are relationship files.
stage_11_files = [ 
    { "label": "OTHER_SOURCE", "filename": "stage_11_other_source_nodes.csv" },
    { "label": "BC_VALUE_SET", "filename": "stage_11_bc_value_set_nodes.csv" },
    { "label": "BC_DATA_TYPE", "filename": "stage_11_bc_data_type_nodes.csv" },
    { "label": "BC_ITEM", "filename": "stage_11_bc_item_nodes.csv" },
    { "label": "BC_INSTANCE", "filename": "stage_11_bc_instance_nodes.csv" },
    { "type": "FROM_SOURCE", "filename": "stage_11_from_source_relationships.csv" },
    { "type": "HAS_RESPONSE", "filename": "stage_11_has_response_relationships.csv" },
    { "type": "HAS_DATA_TYPE", "filename": "stage_11_has_data_type_relationships.csv" },
    { "type": "HAS_IDENTIFIER", "filename": "stage_11_has_identifier_relationships.csv" },
    { "type": "HAS_ITEM", "filename": "stage_11_has_item_relationships.csv" }
  ]

file_load(stage_11_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_other_source_nodes.csv', labels: ['OTHER_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_bc_value_set_nodes.csv', labels: ['BC_VALUE_SET'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_bc_data_type_nodes.csv', labels: ['BC_DATA_TYPE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_bc_item_nodes.csv', labels: ['BC_ITEM'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_bc_instance_nodes.csv', labels: ['BC_INSTANCE'] }], [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_11_from_source_relationships.csv', type: 'FROM_SOURCE' }, { fileN

Now link the BC instances to the data types

In [750]:
# Same two steps as for the templates. Both queries only pick up data type
# nodes that have not been processed yet (no IS_A / no HAS_DATA_TYPE_PROPERTY
# relationship), so this handles the BC_DATA_TYPE nodes loaded in stage 11.
link_data_types("BC_ITEM", "BC_DATA_TYPE")
duplicate_data_type_nodes("BC_DATA_TYPE", "BC_DATA_TYPE_PROPERTY")

[Test, coding] -> [FHIR, coding]
[Result, quantity] -> [FHIR, quantity]
[Date Time, date_time] -> [FHIR, date_time]
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/test/coding/version, version
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/test/coding/system, system
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/test/coding/code, code
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/test/coding/user_selected, user_selected
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/test/coding/display, display
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/date_time/date_time/value, value
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/result/quantity/code, code
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/result/quantity/value, value
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/result/quantity/unit, unit
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/result/qu

Now link the BC Instances to the BC Template that each one is based on

In [751]:
# Create a BASED_ON relationship from every BC_INSTANCE to the BC_TEMPLATE
# whose name equals the instance's based_on property.
# The shared driver is left open because later cells keep using it.
with driver.session() as session:
  query = """MATCH (n:BC_INSTANCE)
    WITH n
    MATCH (m:BC_TEMPLATE {name: n.based_on})
    WITH n, m
    CREATE (n)-[:BASED_ON]->(m)
    RETURN n.name as i_name, m.name as t_name
  """ 
  result = session.run(query)
  for record in result:
    print ("%s -> %s" % (record["i_name"], record["t_name"]))

Weight -> Base Observation


Now load the SDTM IG so we can link the SDTM to the canonical model.

In [752]:
# Stage 13: SDTM model and implementation guide. Entries with a "label" key
# are node files, entries with a "type" key are relationship files.
stage_13_files = [ 
    { "label": "API_SOURCE", "filename": "stage_13_api_source_nodes.csv" },
    { "label": "OTHER_SOURCE", "filename": "stage_13_other_source_nodes.csv" },
    { "label": "SDTM_MODEL", "filename": "stage_13_sdtm_model_nodes.csv" },
    { "label": "SDTM_CLASS", "filename": "stage_13_sdtm_class_nodes.csv" },
    { "label": "SDTM_MODEL_VARIABLE", "filename": "stage_13_sdtm_model_variable_nodes.csv" },
    { "label": "SDTM_IG", "filename": "stage_13_sdtm_ig_nodes.csv" },
    { "label": "SDTM_DATASET", "filename": "stage_13_sdtm_dataset_nodes.csv" },
    { "label": "SDTM_VARIABLE", "filename": "stage_13_sdtm_variable_nodes.csv" },
    { "label": "CANONICAL_REF", "filename": "stage_13_canonical_ref_nodes.csv" },
    { "type": "FROM_SOURCE", "filename": "stage_13_from_source_relationships.csv" },
    { "type": "HAS_CANONICAL_REF", "filename": "stage_13_has_canonical_ref_relationships.csv" },
    { "type": "HAS_CLASS", "filename": "stage_13_has_class_relationships.csv" },
    { "type": "HAS_DATASET", "filename": "stage_13_has_dataset_relationships.csv" },
    { "type": "HAS_VARIABLE", "filename": "stage_13_has_variable_relationships.csv" }
  ]

file_load(stage_13_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_api_source_nodes.csv', labels: ['API_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_other_source_nodes.csv', labels: ['OTHER_SOURCE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_sdtm_model_nodes.csv', labels: ['SDTM_MODEL'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_sdtm_class_nodes.csv', labels: ['SDTM_CLASS'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_sdtm_model_variable_nodes.csv', labels: ['SDTM_MODEL_VARIABLE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_13_sdtm_ig_nodes.csv', labels: ['SDTM_IG'] }, { fileN

Check that SDTM Model and IG loaded

In [753]:
# Spot-check the SDTM load: first 20 dataset/variable pairs, then every
# model/class/model-variable combination.
# One session serves both queries, and the shared driver is left open because
# later cells keep using it.
with driver.session() as session:
  query = """MATCH (ds:SDTM_DATASET)-[]->(v:SDTM_VARIABLE) RETURN DISTINCT ds.name as ds_name, v.name as v_name LIMIT 20"""
  result = session.run(query)
  for record in result:
    print ("[%s, %s] loaded" % (record["ds_name"], record["v_name"]))

  query = """MATCH (m:SDTM_MODEL)-[]->(c:SDTM_CLASS)-[]->(v:SDTM_MODEL_VARIABLE) RETURN DISTINCT m.name as m_name, c.name as c_name, v.name as v_name"""
  result = session.run(query)
  for record in result:
    print ("[%s, %s, %s] loaded" % (record["m_name"], record["c_name"], record["v_name"]))

[AG, AGPRESP] loaded
[AG, AGSTDTC] loaded
[AG, DOMAIN] loaded
[AG, AGENRF] loaded
[AG, AGSTAT] loaded
[AG, AGSEQ] loaded
[AG, AGSTRTPT] loaded
[AG, AGSTDY] loaded
[AG, AGDOSE] loaded
[AG, AGDOSFRM] loaded
[AG, VISITNUM] loaded
[AG, TAETORD] loaded
[AG, AGDOSU] loaded
[AG, AGDOSTXT] loaded
[AG, AGLNKID] loaded
[AG, AGROUTE] loaded
[AG, AGENDTC] loaded
[AG, AGTRT] loaded
[AG, USUBJID] loaded
[AG, AGSCAT] loaded
[SDTM Model, findings, --ORRES] loaded
[SDTM Model, findings, --DTC] loaded
[SDTM Model, findings, --ORRESU] loaded


Now link up the SDTM Model and IG variables

In [754]:
# Link each IG variable to the model variable with the same name once the
# first two characters of both names are dropped (e.g. LBORRES and --ORRES).
# The shared driver is left open because later cells keep using it.
with driver.session() as session:
  query = """MATCH (smv:SDTM_MODEL_VARIABLE)
    MATCH (igv:SDTM_VARIABLE) WHERE substring(smv.name, 2) = substring(igv.name, 2)
    MERGE (igv)-[:BASED_ON]->(smv)
    RETURN igv.name as ig_name, smv.name as m_name
  """
  result = session.run(query)
  for record in result:
    print ("%s -> %s linked" % (record["ig_name"], record["m_name"]))

BSORRES -> --ORRES linked
CPORRES -> --ORRES linked
CVORRES -> --ORRES linked
DAORRES -> --ORRES linked
DDORRES -> --ORRES linked
EGORRES -> --ORRES linked
FTORRES -> --ORRES linked
GFORRES -> --ORRES linked
IEORRES -> --ORRES linked
ISORRES -> --ORRES linked
LBORRES -> --ORRES linked
MBORRES -> --ORRES linked
MIORRES -> --ORRES linked
BSORRESU -> --ORRESU linked
CPORRESU -> --ORRESU linked
CVORRESU -> --ORRESU linked
DAORRESU -> --ORRESU linked
EGORRESU -> --ORRESU linked
FTORRESU -> --ORRESU linked
GFORRESU -> --ORRESU linked
ISORRESU -> --ORRESU linked
LBORRESU -> --ORRESU linked
MBORRESU -> --ORRESU linked
MIORRESU -> --ORRESU linked
MLDTC -> --DTC linked
BEDTC -> --DTC linked
CEDTC -> --DTC linked
DSDTC -> --DTC linked
HODTC -> --DTC linked
MHDTC -> --DTC linked
BSDTC -> --DTC linked
CPDTC -> --DTC linked
CVDTC -> --DTC linked
DADTC -> --DTC linked
DDDTC -> --DTC linked
EGDTC -> --DTC linked
FTDTC -> --DTC linked
GFDTC -> --DTC linked
IEDTC -> --DTC linked
ISDTC -> --DTC linked
LB

And link the model variables to the canonical model.

In [755]:
# Link each SDTM model variable to the canonical data type property that its
# CANONICAL_REF node names (node / data type / property).
# The node and data type patterns carry a leading colon so that
# CANONICAL_NODE and CANONICAL_DATA_TYPE act as label filters; without the
# colon Cypher reads them as variable names and matches nodes of any label.
# The shared driver is left open because later cells keep using it.
with driver.session() as session:
  query = """MATCH (smv:SDTM_MODEL_VARIABLE)-[]->(cr:CANONICAL_REF)
    MATCH (:CANONICAL_NODE {name: cr.node})-[]->(:CANONICAL_DATA_TYPE {name: cr.data_type})-[]->(cdtp:CANONICAL_DATA_TYPE_PROPERTY {name: cr.property})
    MERGE (smv)-[:IS_CANONICAL_REF]->(cdtp)
    RETURN smv.name as v_name, cr.node as c_node, cr.data_type as c_dt, cr.property as c_property, cdtp.uri as uri
  """
  result = session.run(query)
  for record in result:
    print ("%s -> [%s, %s, %s] -> %s linked" % (record["v_name"], record["c_node"], record["c_dt"], record["c_property"], record["uri"]))

--DTC -> [DATE TIME, date_time, value] -> http://id.d4k.dk/dataset/canonical/common/date_time/date_time/value linked
--ORRES -> [RESULT, coding, code] -> http://id.d4k.dk/dataset/canonical/observation/observation_result/result/coding/code linked
--ORRES -> [RESULT, quantity, value] -> http://id.d4k.dk/dataset/canonical/observation/observation_result/result/quantity/value linked
--ORRESU -> [RESULT, quantity, unit] -> http://id.d4k.dk/dataset/canonical/observation/observation_result/result/quantity/unit linked


Now load the DDF study example data. Provides an example of an "industry standard" design.

In [756]:
# Stage 12: DDF study example. Entries with a "label" key are node files,
# entries with a "type" key are relationship files.
stage_12_files = [    
    { "label": "ENDPOINT", "filename": "stage_12_endpoint_nodes.csv" },
    { "label": "STUDY_DATA", "filename": "stage_12_study_data_nodes.csv" },
    { "label": "PROCEDURE", "filename": "stage_12_procedure_nodes.csv" },
    { "label": "ACTIVITY", "filename": "stage_12_activity_nodes.csv" },
    { "label": "WORKFLOW_ITEM", "filename": "stage_12_workflow_item_nodes.csv" },
    { "label": "VISIT", "filename": "stage_12_visit_nodes.csv" },
    { "label": "STUDY_CELL", "filename": "stage_12_study_cell_nodes.csv" },
    { "label": "RULE", "filename": "stage_12_rule_nodes.csv" },
    { "label": "STUDY_ELEMENT", "filename": "stage_12_study_element_nodes.csv" },
    { "label": "EPOCH", "filename": "stage_12_study_epoch_nodes.csv" },
    { "label": "STUDY_ARM", "filename": "stage_12_study_arm_nodes.csv" },
    { "label": "OBJECTIVE", "filename": "stage_12_objective_nodes.csv" },
    { "label": "CODE", "filename": "stage_12_code_nodes.csv" },
    { "label": "INVESTIGATIONAL_INTERVENTIONS", "filename": "stage_12_investigational_interventions_nodes.csv" },
    { "label": "POPULATION", "filename": "stage_12_population_nodes.csv" },
    { "label": "STUDY_DESIGN", "filename": "stage_12_study_design_nodes.csv" },
    { "label": "INDICATION", "filename": "stage_12_indication_nodes.csv" },
    { "label": "STUDY_PROTOCOL", "filename": "stage_12_study_protocol_nodes.csv" },
    { "label": "STUDY_PHASE", "filename": "stage_12_study_phase_nodes.csv" },
    { "label": "STUDY_TYPE", "filename": "stage_12_study_type_nodes.csv" },
    { "label": "STUDY_IDENTIFIER", "filename": "stage_12_study_identifier_nodes.csv" },
    { "label": "STUDY", "filename": "stage_12_study_nodes.csv" },
    { "type": "HAS_CODED", "filename": "stage_12_has_coded_relationships.csv" },
    { "type": "HAS_ENDPOINT", "filename": "stage_12_has_endpoint_relationships.csv" },
    { "type": "HAS_STUDY_DATA", "filename": "stage_12_has_study_data_relationships.csv" },
    { "type": "HAS_PROCEDURE", "filename": "stage_12_has_procedure_relationships.csv" },
    { "type": "HAS_PREVIOUS_ACTIVITY", "filename": "stage_12_has_previous_activity_relationships.csv" },
    { "type": "USED_IN_VISIT", "filename": "stage_12_used_in_visit_relationships.csv" },
    { "type": "HAS_ACTIVITY", "filename": "stage_12_has_activity_relationships.csv" },
    { "type": "HAS_VISIT", "filename": "stage_12_has_visit_relationships.csv" },
    { "type": "HAS_END_RULE", "filename": "stage_12_has_end_rule_relationships.csv" },
    { "type": "HAS_START_RULE", "filename": "stage_12_has_start_rule_relationships.csv" },
    { "type": "HAS_ELEMENT", "filename": "stage_12_has_element_relationships.csv" },
    { "type": "HAS_EPOCH", "filename": "stage_12_has_epoch_relationships.csv" },
    { "type": "HAS_ARM", "filename": "stage_12_has_arm_relationships.csv" },
    { "type": "HAS_CELL", "filename": "stage_12_has_cell_relationships.csv" },
    { "type": "HAS_OBJECTIVE", "filename": "stage_12_has_objective_relationships.csv" },
    { "type": "HAS_INDICATION", "filename": "stage_12_has_indication_relationships.csv" },
    { "type": "HAS_INVESTIGATIONAL_INTERVENTION", "filename": "stage_12_has_investigational_intervention_relationships.csv" },
    { "type": "HAS_POPULATION", "filename": "stage_12_has_population_relationships.csv" },
    { "type": "HAS_STUDY_DESIGN", "filename": "stage_12_has_study_design_relationships.csv" },
    { "type": "HAS_PROTOCOL", "filename": "stage_12_has_protocol_relationships.csv" },
    { "type": "HAS_STUDY_PHASE", "filename": "stage_12_has_study_phase_relationships.csv" },
    { "type": "HAS_STUDY_TYPE", "filename": "stage_12_has_study_type_relationships.csv" },
    { "type": "HAS_IDENTIFIER", "filename": "stage_12_has_identifier_relationships.csv" }
  ]

file_load(stage_12_files)

CALL apoc.import.csv( [{ fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_endpoint_nodes.csv', labels: ['ENDPOINT'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_study_data_nodes.csv', labels: ['STUDY_DATA'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_procedure_nodes.csv', labels: ['PROCEDURE'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_activity_nodes.csv', labels: ['ACTIVITY'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_workflow_item_nodes.csv', labels: ['WORKFLOW_ITEM'] }, { fileName: 'https://raw.githubusercontent.com/data4knowledge/biomedical_concepts/main/data/csv_load/stage_12_visit_nodes.csv', labels: ['VISIT'] }, { fileName: 'https://raw.githubuserco

Add in a study URI for the study

In [757]:
def set_study_uri(brief_title, uri):
  """Set the `uri` property on the STUDY node(s) attached to a study protocol.

  Every STUDY node that has a relationship pointing at a STUDY_PROTOCOL whose
  `brief_title` equals the given title gets `s.uri` set to `uri`.

  Args:
    brief_title: Value of STUDY_PROTOCOL.brief_title used to find the study.
    uri: URI string to store on the matched STUDY node(s).

  Returns:
    None. Nothing is reported if no study matches.
  """
  # Values travel as Cypher parameters (as the $id_data query in update_bc_uris
  # already does) so quotes inside a title or URI cannot break or alter the query.
  query = """MATCH (p:STUDY_PROTOCOL {brief_title: $brief_title})<-[]-(s:STUDY)
      SET s.uri = $uri
      """
  with driver.session() as session:
    session.run(query, brief_title=brief_title, uri=uri)
  # Kept from the original: every helper in this notebook closes the driver when done.
  driver.close()

def get_study_uri(brief_title):
  """Return the `uri` property of the STUDY attached to a study protocol.

  Looks up STUDY nodes that have a relationship pointing at a STUDY_PROTOCOL
  whose `brief_title` equals the given title.

  Args:
    brief_title: Value of STUDY_PROTOCOL.brief_title used to find the study.

  Returns:
    The `uri` of the last matching record, or None when no study matches
    (previously this case raised UnboundLocalError).
  """
  # Parameterised so quotes in the title cannot break or alter the query.
  query = """MATCH (p:STUDY_PROTOCOL {brief_title: $brief_title})<-[]-(s:STUDY)
      RETURN s.uri as uri;
      """
  the_result = None  # stays None when the query yields no rows
  with driver.session() as session:
    for record in session.run(query, brief_title=brief_title):
      # Later rows overwrite earlier ones, as in the original loop.
      the_result = record["uri"]
  # Kept from the original: every helper in this notebook closes the driver when done.
  driver.close()
  return the_result

# Store the URI on the study whose protocol brief title is "DDR", then read it
# back from the database and print it as a check that the write took effect.
set_study_uri("DDR", "http://id.d4k.dk/dataset/study/ddr")
print("URI for study DDR is %s" % (get_study_uri("DDR")))

URI for study DDR is http://id.d4k.dk/dataset/study/ddr


Duplicate the BC. The study activity's Study Data node uses WGHT as its code; this really needs to be updated to "Weight" so that we can match all BCs by name. For the moment we just work around it.

In [758]:
def duplicate_bc(study_data_name, bc_name, study_uri):
  """Copy a library BC instance into the study and attach it to an activity.

  For each ACTIVITY linked to a STUDY_DATA node named `study_data_name` and each
  BC_INSTANCE named `bc_name`, the query:
    * creates a STUDY_BC_INSTANCE under the activity (HAS_BC), copies the
      BC_INSTANCE properties onto it and overwrites its `uri`;
    * links the new node back to the source with BASED_ON;
    * clones the subgraph below the BC_INSTANCE with apoc.refactor.cloneSubgraph,
      using the new node as the stand-in for the original BC_INSTANCE.
  The number of rows returned by the clone step is printed.

  Args:
    study_data_name: `name` of the STUDY_DATA node identifying the activity.
    bc_name: `name` of the BC_INSTANCE to copy.
    study_uri: Study URI; the new BC URI is "<study_uri>/bc/<bc_name>".

  Returns:
    The URI given to the new STUDY_BC_INSTANCE.
  """
  bc_uri = "%s/bc/%s" % (study_uri, bc_name)
  # Values are passed as Cypher parameters (as the $id_data query in
  # update_bc_uris already does) rather than spliced into the query text.
  # NOTE(review): the subgraph is expanded over HAS_ITEM / HAS_DATA_TYPE /
  # HAS_RESPONSE only, so the relationship list filtered on
  # 'HAS_DATA_TYPE_PROPERTY' looks like it is always empty - confirm how
  # apoc.refactor.cloneSubgraph treats an empty relationship list.
  query = """MATCH (a:ACTIVITY)-[]->(sd:STUDY_DATA) WHERE sd.name = $study_data_name
      WITH a, sd
      MATCH (bc:BC_INSTANCE) WHERE bc.name = $bc_name
      WITH a, sd, bc
      CREATE (a)-[:HAS_BC]->(new:STUDY_BC_INSTANCE)
      SET new = bc
      SET new.uri = $bc_uri
      CREATE (new)-[:BASED_ON]->(bc)
      WITH bc, new
      CALL apoc.path.subgraphAll(bc, {relationshipFilter:'HAS_ITEM>|HAS_DATA_TYPE>|HAS_RESPONSE>'})
      YIELD nodes, relationships
      CALL apoc.refactor.cloneSubgraph(
        nodes,
        [rel in relationships WHERE type(rel) = 'HAS_DATA_TYPE_PROPERTY'],
        { standinNodes:[[bc, new]] })
      YIELD input, output, error
      RETURN output;
      """
  with driver.session() as session:
    result = session.run(
      query,
      study_data_name=study_data_name,
      bc_name=bc_name,
      bc_uri=bc_uri,
    )
    # One returned row per cloned node, so the row count is the reported total.
    print("%i nodes duplicated for BC %s" % (len(result.values()), bc_name))
  # Kept from the original: every helper in this notebook closes the driver when done.
  driver.close()
  return bc_uri

def update_bc_uris(bc_uri):
  """Rewrite the URIs of the nodes below a study BC instance.

  Step 1 collects the nodes reachable from the STUDY_BC_INSTANCE with the given
  URI over HAS_ITEM / HAS_DATA_TYPE / HAS_DATA_TYPE_PROPERTY / HAS_RESPONSE.
  Step 2 sets each collected node's `uri` to its parent's `uri` plus "/" and the
  node's lower-cased name with spaces replaced by underscores, printing each
  updated URI. Nodes whose `uri` already equals `bc_uri` are left alone.

  Args:
    bc_uri: URI of the STUDY_BC_INSTANCE whose subgraph is updated.

  Returns:
    None.
  """
  # Parameterised like the update query below, instead of splicing the URI into the text.
  find_query = """MATCH (bc:STUDY_BC_INSTANCE) WHERE bc.uri = $bc_uri
      WITH bc
      CALL apoc.path.subgraphAll(bc, {relationshipFilter:'HAS_ITEM>|HAS_DATA_TYPE>|HAS_DATA_TYPE_PROPERTY>|HAS_RESPONSE>'})
      YIELD nodes, relationships
      RETURN nodes;
    """
  update_query = """UNWIND $id_data AS d
      MATCH (p)-[]->(n) WHERE ID(n)=d.id
      SET n.uri = p.uri + '/' + replace(toLower(n.name), " ", "_")
      RETURN n.uri as uri"""
  with driver.session() as session:
    ids = []
    for record in session.run(find_query, bc_uri=bc_uri):
      # Each row carries one list of nodes in its `nodes` column.
      for node in record["nodes"]:
        if node["uri"] == bc_uri:
          # Already carries the study BC URI; it must keep it.
          continue
        ids.append({"id": node.id})

    # Node order is preserved so the ids are processed in the order they were returned.
    result = session.run(update_query, id_data=ids)
    for record in result:
      print("Node uri updated: %s" % (record["uri"]))
  # Kept from the original: every helper in this notebook closes the driver when done.
  driver.close()

def add_bc_canonical_refs(bc_uri):
  """Link a study BC's data type properties to their canonical definitions.

  Starting at the STUDY_BC_INSTANCE with the given URI, the query follows two
  BASED_ON hops and walks item -> data type -> data type property ->
  CANONICAL_DATA_TYPE_PROPERTY. It then finds the property with the same
  item / data type / property names under the study BC and creates an
  IS_CANONICAL_REF relationship from it to the canonical node. Each link made
  is printed.

  Args:
    bc_uri: URI of the STUDY_BC_INSTANCE to process.

  Returns:
    None.
  """
  # The URI is passed once as a Cypher parameter instead of being spliced into
  # the text twice.
  # NOTE(review): in the second MATCH, `(BC_ITEM {...})` and `(BC_DATA_TYPE {...})`
  # have no leading colon, so they read as variable names rather than labels -
  # confirm whether `(:BC_ITEM ...)` / `(:BC_DATA_TYPE ...)` was intended.
  query = """MATCH (n:STUDY_BC_INSTANCE {uri: $bc_uri})-[:BASED_ON]->(in)-[:BASED_ON]->(t)-[]->(i:BC_ITEM)
      -[]->(dt:BC_DATA_TYPE)
      -[]->(p:BC_DATA_TYPE_PROPERTY)
      -[]->(cr:CANONICAL_DATA_TYPE_PROPERTY)
      WITH n, i.name as i_name, dt.name as dt_name, p.name as p_name, cr
      MATCH (n:STUDY_BC_INSTANCE {uri: $bc_uri})-[]->(BC_ITEM {name: i_name})
      -[]->(BC_DATA_TYPE {name: dt_name})
      -[]->(sp:BC_DATA_TYPE_PROPERTY {name: p_name})
      WITH sp, cr
      CREATE (sp)-[:IS_CANONICAL_REF]->(cr)
      RETURN sp.name as name, cr.uri as uri
    """
  with driver.session() as session:
    for record in session.run(query, bc_uri=bc_uri):
      print("Canonical reference set: %s -> %s" % (record["name"], record["uri"]))
  # Kept from the original: every helper in this notebook closes the driver when done.
  driver.close()

# Read back the URI of the study whose protocol brief title is "DDR".
study_uri = get_study_uri("DDR")
# Copy the library BC named "Weight" into the study, attached to the activity
# whose STUDY_DATA node is named "WGHT"; bc_uri is "<study_uri>/bc/Weight".
bc_uri = duplicate_bc("WGHT", "Weight", study_uri)
# link_data_types and duplicate_data_type_nodes are defined outside this section;
# presumably they link the items to data types and copy the data type property
# nodes for the labels given - TODO confirm against their definitions.
link_data_types("BC_ITEM", "BC_DATA_TYPE")
duplicate_data_type_nodes("BC_DATA_TYPE", "BC_DATA_TYPE_PROPERTY")
# Re-derive the URIs of everything below the new study BC from their parents' URIs.
update_bc_uris(bc_uri)
# Tie the study BC's data type properties to the canonical data type properties.
add_bc_canonical_refs(bc_uri)

13 nodes duplicated for BC Weight
[Test, coding] -> [FHIR, coding]
[Result, quantity] -> [FHIR, quantity]
[Date Time, date_time] -> [FHIR, date_time]
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/test/coding/version, version
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/test/coding/system, system
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/test/coding/code, code
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/test/coding/user_selected, user_selected
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/test/coding/display, display
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/result/quantity/code, code
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/result/quantity/value, value
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/result/quantity/unit, unit
Node duplicated: http://id.d4k.dk/dataset/bc_instance/weight/result/quantity/system, system
Node duplicated: http://id.d4k.dk/data

Now load any Code Lists used by the BCs. Check loaded OK.

- C66741 - VSTESTCD
- C66770 - VSRESU

In [759]:
# load_cl (defined at the top of the notebook) reads the CSV for the given code
# list and creates one SKOS_CONCEPT per row as a SKOS_NARROWER child of the
# SKOS_CONCEPT whose identifier is that code list.
load_cl("sdtm", "C66741")
load_cl("sdtm", "C66770")
# dump_cl is defined outside this section; presumably it prints the items of the
# loaded code list so the load can be checked - TODO confirm.
dump_cl("C66741")
dump_cl("C66770")


2021-12-17: [C66741, VSTESTCD], [C81298, BODLNGTH]
2021-12-17: [C66741, VSTESTCD], [C98793, TRSKNF]
2021-12-17: [C66741, VSTESTCD], [C163569, WTAPCTL]
2021-12-17: [C66741, VSTESTCD], [C172610, DBPAPCTL]
2021-12-17: [C66741, VSTESTCD], [C168125, CPLRFLT]
2021-12-17: [C66741, VSTESTCD], [C178060, CALFCIR]
2021-12-17: [C66741, VSTESTCD], [C174371, TEMPPB]
2021-12-17: [C66741, VSTESTCD], [C81255, HDCIRC]
2021-12-17: [C66741, VSTESTCD], [C147491, ENRGEXP]
2021-12-17: [C66741, VSTESTCD], [C117976, IDEALWT]
2021-12-17: [C66741, VSTESTCD], [C98785, SSSKNF]
2021-12-17: [C66741, VSTESTCD], [C174376, ULNARL]
2021-12-17: [C66741, VSTESTCD], [C174374, FTWTGAPL]
2021-12-17: [C66741, VSTESTCD], [C174311, SAO2FIO2]
2021-12-17: [C66741, VSTESTCD], [C132482, EWEIGHT]
2021-12-17: [C66741, VSTESTCD], [C100947, HIPCIR]
2021-12-17: [C66741, VSTESTCD], [C154891, NECKCIR]
2021-12-17: [C66741, VSTESTCD], [C49676, PULSE]
2021-12-17: [C66741, VSTESTCD], [C156606, CHESTCIR]
2021-12-17: [C66741, VSTESTCD], [C18155

Now link up to the CT for the library BCs and the Study versions

In [760]:
# Swap each placeholder response node hanging off a BC_DATA_TYPE for the real
# code list item: find the SKOS_CONCEPT pair identified by the placeholder's
# `cl` / `cli` properties, link the data type to the item, delete the placeholder.
link_ct_query = """MATCH (n:BC_DATA_TYPE)-[:HAS_RESPONSE]->(d) 
    WITH n,d
    MATCH (cl:SKOS_CONCEPT {identifier: d.cl})-[]->(cli:SKOS_CONCEPT {identifier: d.cli})
    MERGE (n)-[:HAS_RESPONSE]->(cli)
    DETACH DELETE d
    RETURN n.name as name, cli.uri as uri
  """
with driver.session() as session:
  # One line per link made: data type name -> code list item URI.
  for row in session.run(link_ct_query):
    print ("%s -> %s" % tuple(row[column] for column in ("name", "uri")))
driver.close()

quantity -> http://id.d4k.dk/dataset/cdisc/ct/v48/sdtm/C66770-C48531
quantity -> http://id.d4k.dk/dataset/cdisc/ct/v48/sdtm/C66770-C48531
quantity -> http://id.d4k.dk/dataset/cdisc/ct/v48/sdtm/C66770-C28252
quantity -> http://id.d4k.dk/dataset/cdisc/ct/v48/sdtm/C66770-C28252
coding -> http://id.d4k.dk/dataset/cdisc/ct/v48/sdtm/C66741-C25208
coding -> http://id.d4k.dk/dataset/cdisc/ct/v48/sdtm/C66741-C25208


Now add in some data points. A data point will point at the BC and at the visit to which it applies (via the Workflow Item). This is the nature of the DDF model. I think we can make it better.

In [761]:
# List every visit / workflow item / activity / study BC combination so we can
# see where data points can be attached.
visit_bc_query = """MATCH (v:VISIT)<-[]-(w:WORKFLOW_ITEM)-[]->(a:ACTIVITY)-[]->(bc:STUDY_BC_INSTANCE) RETURN v.name as visit, w.id as wfi , a.description as activity, bc.name as bc"""
visit_bc_columns = ("visit", "wfi", "activity", "bc")
with driver.session() as session:
  for row in session.run(visit_bc_query):
    print ("%s -> %s -> %s -> %s" % tuple(row[column] for column in visit_bc_columns))
driver.close()

FU 1 -> 1490 -> Weight -> Weight
CYCLE 1, TREATMENT DAY 1 -> 1488 -> Weight -> Weight
CYCLE 2, TREATMENT DAY 1 -> 1489 -> Weight -> Weight


Create some simple data. We need the actual data point plus just enough information to attach it to the study graph. So 

- Create subjects
- Create data points and link them into the study design and to the BC they belong to (the study instance of the BC, not the definition of the BC that we copied earlier)

In [762]:
# URIs of the study BC data type properties that the data points refer to.
uri_refs = {
  "value": "http://id.d4k.dk/dataset/study/ddr/bc/Weight/result/quantity/value",
  "unit": "http://id.d4k.dk/dataset/study/ddr/bc/Weight/result/quantity/unit",
  "date_time": "http://id.d4k.dk/dataset/study/ddr/bc/Weight/date_time/date_time/value",
}

# Four subjects are enrolled; only the first two have weight readings below.
subjects = [{ "subject_id": subject_id } for subject_id in ("1234", "1235", "1236", "1237")]

# One weight reading per subject per visit: (subject, visit, weight, unit, date/time).
weight_readings = [
  ("1234", "CYCLE 1, TREATMENT DAY 1", "76", "kg", "2022-03-01T13:57:00"),
  ("1234", "CYCLE 2, TREATMENT DAY 1", "78", "kg", "2022-03-10T13:57:00"),
  ("1234", "FU 1", "77", "kg", "2022-03-19T13:57:00"),
  ("1235", "CYCLE 1, TREATMENT DAY 1", "42", "kg", "2022-03-03T13:57:00"),
  ("1235", "CYCLE 2, TREATMENT DAY 1", "44", "kg", "2022-03-13T13:57:00"),
  ("1235", "FU 1", "43", "kg", "2022-03-22T13:57:00"),
]

# Each reading expands to three data points (value, unit, date/time), in that order.
subject_data = []
for reading_subject, reading_visit, weight, weight_unit, taken_at in weight_readings:
  for ref_key, recorded in (("value", weight), ("unit", weight_unit), ("date_time", taken_at)):
    subject_data.append({
      "subject_id": reading_subject,
      "visit": reading_visit,
      "data_point": uri_refs[ref_key],
      "value": recorded,
    })

print(subject_data)

[{'subject_id': '1234', 'visit': 'CYCLE 1, TREATMENT DAY 1', 'data_point': 'http://id.d4k.dk/dataset/study/ddr/bc/Weight/result/quantity/value', 'value': '76'}, {'subject_id': '1234', 'visit': 'CYCLE 1, TREATMENT DAY 1', 'data_point': 'http://id.d4k.dk/dataset/study/ddr/bc/Weight/result/quantity/unit', 'value': 'kg'}, {'subject_id': '1234', 'visit': 'CYCLE 1, TREATMENT DAY 1', 'data_point': 'http://id.d4k.dk/dataset/study/ddr/bc/Weight/date_time/date_time/value', 'value': '2022-03-01T13:57:00'}, {'subject_id': '1234', 'visit': 'CYCLE 2, TREATMENT DAY 1', 'data_point': 'http://id.d4k.dk/dataset/study/ddr/bc/Weight/result/quantity/value', 'value': '78'}, {'subject_id': '1234', 'visit': 'CYCLE 2, TREATMENT DAY 1', 'data_point': 'http://id.d4k.dk/dataset/study/ddr/bc/Weight/result/quantity/unit', 'value': 'kg'}, {'subject_id': '1234', 'visit': 'CYCLE 2, TREATMENT DAY 1', 'data_point': 'http://id.d4k.dk/dataset/study/ddr/bc/Weight/date_time/date_time/value', 'value': '2022-03-10T13:57:00'},

Create the subjects.

In [763]:
# Enrol each subject in the DDR study. The identifier is passed as a Cypher
# parameter, so the query text is built once and never depends on the data.
# CREATE adds a new STUDY_SUBJECT on every run: re-running this cell without
# clearing the database first will add the subjects again.
create_subject_query = """MATCH (p:STUDY_PROTOCOL {brief_title:'DDR'})<-[]-(s:STUDY)
      WITH s
      CREATE (s)<-[:ENROLLED_IN]-(ss:STUDY_SUBJECT)
      SET ss.identifier = $subject_id
    """
with driver.session() as session:
  for subject in subjects:
    session.run(create_subject_query, subject_id=subject["subject_id"])
  # Read the subjects back from the database as confirmation.
  query = """MATCH (s:STUDY_SUBJECT) RETURN s.identifier as subject"""
  result = session.run(query)
  for record in result:
    print ("Subject %s created" % (record["subject"]))
driver.close()

Subject 1234 created
Subject 1235 created
Subject 1236 created
Subject 1237 created


Now add in the data

In [764]:
# Create one STUDY_DATA_POINT per entry in subject_data. Each data point is tied
# to the subject (FOR_SUBJECT), to the workflow item of the named visit
# (FOR_WORKFLOW_ITEM) and to the study BC data type property it records
# (FOR_VALUE), and is given the value plus a generated uuid.
# All four values are passed as Cypher parameters, so the query text is built
# once and visit names or values containing quotes cannot break it.
create_data_point_query = """MATCH (s:STUDY_SUBJECT {identifier: $subject_id})
      WITH s
      MATCH (v:VISIT {name: $visit})<-[]-(wi:WORKFLOW_ITEM)-[:HAS_ACTIVITY]->(a)-[:HAS_BC]->(bc)-[:HAS_ITEM]->
        (i)-[:HAS_DATA_TYPE]->(dt)-[:HAS_DATA_TYPE_PROPERTY]->(d:BC_DATA_TYPE_PROPERTY {uri: $data_point})
      WITH s, wi, d
      CREATE (s)<-[:FOR_SUBJECT]-(sdp:STUDY_DATA_POINT)-[:FOR_WORKFLOW_ITEM]->(wi)
      WITH sdp, d
      CREATE (sdp)-[:FOR_VALUE]->(d)
      SET sdp.value = $value
      SET sdp.uuid = apoc.create.uuid()
    """
with driver.session() as session:
  for item in subject_data:
    session.run(
      create_data_point_query,
      subject_id=item["subject_id"],
      visit=item["visit"],
      data_point=item["data_point"],
      value=item["value"],
    )
  # Read the data points back from the database as confirmation.
  query = """MATCH (s:STUDY_DATA_POINT) RETURN DISTINCT s.value as value, s.uuid as uuid"""
  result = session.run(query)
  for record in result:
    print ("Value %s, %s created" % (record["value"], record["uuid"]))
driver.close()

Value 76, a799847e-62a9-43ac-93a9-aeb3c41b71b4 created
Value kg, 0dc34279-56bc-44e6-896f-7a0156f0fc69 created
Value 2022-03-01T13:57:00, c03f385b-20ec-44c0-9488-81141a464d59 created
Value 78, 01bf974d-d420-4623-8e66-955f45cc7a23 created
Value kg, 03befc73-b97b-43b8-939f-a2ecd688ea5c created
Value 2022-03-10T13:57:00, e38e3de4-dbeb-4b67-b9d1-ad07bee1f67e created
Value 77, 65059f85-bf29-476f-a46d-8d06f134e781 created
Value kg, b81fa913-e5b9-4240-8f2c-32f4034f3c52 created
Value 2022-03-19T13:57:00, d16f6335-48d9-4652-9538-004b42e1738a created
Value 42, 33f665ab-6f96-4ae2-a498-5412ab5003a4 created
Value kg, d485b50a-c803-428a-af7b-cac252300174 created
Value 2022-03-03T13:57:00, 5231c023-5443-4d58-8e7f-8e730f78584a created
Value 44, 541e5f4b-0b5b-4e8b-9eb0-b687425a3fee created
Value kg, 6393dc6e-7034-4224-8bb6-07d4f5747804 created
Value 2022-03-13T13:57:00, ff44ce37-c2f9-4bb6-b590-501d2621948a created
Value 43, 69ad39c3-a4d9-4402-a33a-e03dba74d246 created
Value kg, 4c48906e-2c88-428b-b064-0

So having linked the very basic data we have all the pieces in place. We can now start getting data out, simple at the moment but ...

Query the VS domain

In [765]:
# Build VS domain rows by walking from the SDTM variables, through the shared
# canonical data type properties, to the study data points, and from there to
# the visit, epoch, subject and test code for each value.
vs_query = """MATCH (sd:SDTM_DATASET {name: 'VS'})-[]->(sv:SDTM_VARIABLE)-[]->(cv:SDTM_MODEL_VARIABLE)-[:IS_CANONICAL_REF]->(fdt:CANONICAL_DATA_TYPE_PROPERTY)
  <-[:IS_CANONICAL_REF]-(bdt:BC_DATA_TYPE_PROPERTY)<-[:FOR_VALUE]-(sdp:STUDY_DATA_POINT)-[]->(wfi:WORKFLOW_ITEM), 
  (wfi)-[:USED_IN_VISIT]->(v:VISIT), 
  (sdp)-[:FOR_SUBJECT]->(subj:STUDY_SUBJECT),
  (e:EPOCH)-[]->(v),
  (ct)<-[:HAS_RESPONSE]-()<-[:HAS_DATA_TYPE]-()<-[:HAS_IDENTIFIER]-(bc:STUDY_BC_INSTANCE)-[*]->(bdt)
RETURN sd.name as domain, sv.name as variable, sdp.value as data, wfi.id as uuid, v.name as visit, e.study_epoch_name as epoch, subj.identifier as subject, ct.notation as test_code
  """
# Column order used when printing; the `uuid` column holds the workflow item id.
vs_columns = ("domain", "variable", "test_code", "subject", "uuid", "data", "visit", "epoch")
with driver.session() as session:
  for row in session.run(vs_query):
    print ("%s, %s, %s, %s, %s, %s, [%s -> %s]" % tuple(row[column] for column in vs_columns))
driver.close()

VS, VSORRES, WEIGHT, 1235, 1490, 43, [FU 1 -> FOLLOW-UP]
VS, VSORRES, WEIGHT, 1235, 1489, 44, [CYCLE 2, TREATMENT DAY 1 -> TREATMENT]
VS, VSORRES, WEIGHT, 1235, 1488, 42, [CYCLE 1, TREATMENT DAY 1 -> TREATMENT]
VS, VSORRES, WEIGHT, 1234, 1490, 77, [FU 1 -> FOLLOW-UP]
VS, VSORRES, WEIGHT, 1234, 1489, 78, [CYCLE 2, TREATMENT DAY 1 -> TREATMENT]
VS, VSORRES, WEIGHT, 1234, 1488, 76, [CYCLE 1, TREATMENT DAY 1 -> TREATMENT]
VS, VSORRESU, WEIGHT, 1235, 1490, kg, [FU 1 -> FOLLOW-UP]
VS, VSORRESU, WEIGHT, 1235, 1489, kg, [CYCLE 2, TREATMENT DAY 1 -> TREATMENT]
VS, VSORRESU, WEIGHT, 1235, 1488, kg, [CYCLE 1, TREATMENT DAY 1 -> TREATMENT]
VS, VSORRESU, WEIGHT, 1234, 1490, kg, [FU 1 -> FOLLOW-UP]
VS, VSORRESU, WEIGHT, 1234, 1489, kg, [CYCLE 2, TREATMENT DAY 1 -> TREATMENT]
VS, VSORRESU, WEIGHT, 1234, 1488, kg, [CYCLE 1, TREATMENT DAY 1 -> TREATMENT]
VS, VSDTC, WEIGHT, 1235, 1490, 2022-03-22T13:57:00, [FU 1 -> FOLLOW-UP]
VS, VSDTC, WEIGHT, 1235, 1489, 2022-03-13T13:57:00, [CYCLE 2, TREATMENT DAY 1 

So, next ...
- Expand, adding more BCs, data etc
- Generate a CRF for the study
- Generate an aCRF
- Generate a define.xml
- Expand, adding more BCs, data etc