In [18]:

import yaml
import json
import os
from pathlib import Path
from neo4j import GraphDatabase




In [19]:
with open("config.yaml", "r") as stream:
    try:
        PARAM = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

In [20]:

driver = GraphDatabase.driver(PARAM["neo4j_url"], auth=(PARAM["neo4j_username"], PARAM["neo4j_password"]))

records, summary, keys = driver.execute_query(f"""
    CALL apoc.meta.schema()
    YIELD value RETURN value;
    """,
    database_="neo4j",
)
# Loop through results and do something with them
for record in records:
    schema = record.data()["value"]
    json_schema = json.dumps(record.data()["value"])
    #print (json_schema)


In [21]:
schema.keys()

dict_keys(['Condition', 'MedicineMSH', 'HPO_IS_A', 'FOCUSES_ON', 'Sponsors', 'HPO_BELONGS_TO', 'MSH_BELONGS_TO', 'Trial', 'MSH_IS_MAPPED_TO', 'ConditionHPO', 'MSH_IS_A', 'TESTS', 'ConditionMSH', 'Medicine', 'Sponsor'])

In [22]:
schema

{'Condition': {'count': 279,
  'labels': [],
  'properties': {'MSH': {'unique': False,
    'indexed': False,
    'type': 'LIST',
    'existence': False},
   'HPO': {'unique': False,
    'indexed': False,
    'type': 'LIST',
    'existence': False},
   'death_number': {'unique': False,
    'indexed': False,
    'type': 'STRING',
    'existence': False},
   'DALY_rate': {'unique': False,
    'indexed': False,
    'type': 'STRING',
    'existence': False},
   'death_percent': {'unique': False,
    'indexed': False,
    'type': 'STRING',
    'existence': False},
   'name': {'unique': False,
    'indexed': False,
    'type': 'STRING',
    'existence': False},
   'DALY_percent': {'unique': False,
    'indexed': False,
    'type': 'STRING',
    'existence': False},
   'DALY_number': {'unique': False,
    'indexed': False,
    'type': 'STRING',
    'existence': False},
   'UMLS': {'unique': True,
    'indexed': True,
    'type': 'STRING',
    'existence': False},
   'deaths_rate': {'unique': F

In [23]:
schema['Condition']['type']

'node'

In [24]:
schema['Condition']

{'count': 279,
 'labels': [],
 'properties': {'MSH': {'unique': False,
   'indexed': False,
   'type': 'LIST',
   'existence': False},
  'HPO': {'unique': False,
   'indexed': False,
   'type': 'LIST',
   'existence': False},
  'death_number': {'unique': False,
   'indexed': False,
   'type': 'STRING',
   'existence': False},
  'DALY_rate': {'unique': False,
   'indexed': False,
   'type': 'STRING',
   'existence': False},
  'death_percent': {'unique': False,
   'indexed': False,
   'type': 'STRING',
   'existence': False},
  'name': {'unique': False,
   'indexed': False,
   'type': 'STRING',
   'existence': False},
  'DALY_percent': {'unique': False,
   'indexed': False,
   'type': 'STRING',
   'existence': False},
  'DALY_number': {'unique': False,
   'indexed': False,
   'type': 'STRING',
   'existence': False},
  'UMLS': {'unique': True,
   'indexed': True,
   'type': 'STRING',
   'existence': False},
  'deaths_rate': {'unique': False,
   'indexed': False,
   'type': 'STRING',
   '

In [25]:
#Condition: {SNOMEDCT: STRING}
schema_dict = {}
node_index = {}

relation_A_B = {}

for key in schema.keys():
    if schema[key]['type'] == 'node':
        schema_dict[key] = {}

        for p in schema[key]['properties'].keys():
            schema_dict[key][p] = schema[key]['properties'][p]['type']

            if schema[key]['properties'][p]['unique'] == True:
                node_index[key] = p
        

        for r in schema[key]['relationships'].keys():
            if schema[key]['relationships'][r]["direction"] == "out":
                relation_A_B[r] = (key, schema[key]['relationships'][r]["labels"][0])

In [26]:
schema_dict

{'Condition': {'MSH': 'LIST',
  'HPO': 'LIST',
  'death_number': 'STRING',
  'DALY_rate': 'STRING',
  'death_percent': 'STRING',
  'name': 'STRING',
  'DALY_percent': 'STRING',
  'DALY_number': 'STRING',
  'UMLS': 'STRING',
  'deaths_rate': 'STRING'},
 'MedicineMSH': {'MSH': 'STRING', 'name': 'STRING'},
 'Trial': {'id': 'STRING',
  'date_added': 'STRING',
  'name': 'STRING',
  'original_condition': 'STRING',
  'phase': 'STRING',
  'analysis_ready_data': 'STRING',
  'url': 'STRING',
  'original_medicine': 'STRING'},
 'ConditionHPO': {'HPO': 'STRING', 'name': 'STRING'},
 'ConditionMSH': {'may_be_treated_by': 'LIST',
  'MSH': 'STRING',
  'name': 'STRING'},
 'Medicine': {'MSH': 'LIST',
  'HPO': 'LIST',
  'RXNORM': 'LIST',
  'name': 'STRING',
  'other_name': 'STRING',
  'UMLS': 'STRING'},
 'Sponsor': {'name': 'STRING'}}

In [27]:
node_index

{'Condition': 'UMLS',
 'MedicineMSH': 'MSH',
 'Trial': 'id',
 'ConditionHPO': 'HPO',
 'ConditionMSH': 'MSH',
 'Medicine': 'UMLS',
 'Sponsor': 'name'}

In [28]:
relation_A_B

{'HPO_IS_A': ('Condition', 'ConditionHPO'),
 'MSH_IS_A': ('Medicine', 'MedicineMSH'),
 'MSH_BELONGS_TO': ('MedicineMSH', 'MedicineMSH'),
 'MSH_IS_MAPPED_TO': ('MedicineMSH', 'MedicineMSH'),
 'FOCUSES_ON': ('Trial', 'Condition'),
 'TESTS': ('Trial', 'Medicine'),
 'HPO_BELONGS_TO': ('ConditionHPO', 'ConditionHPO'),
 'Sponsors': ('Sponsor', 'Trial')}

In [29]:
schema['Condition']['properties']

{'MSH': {'unique': False,
  'indexed': False,
  'type': 'LIST',
  'existence': False},
 'HPO': {'unique': False,
  'indexed': False,
  'type': 'LIST',
  'existence': False},
 'death_number': {'unique': False,
  'indexed': False,
  'type': 'STRING',
  'existence': False},
 'DALY_rate': {'unique': False,
  'indexed': False,
  'type': 'STRING',
  'existence': False},
 'death_percent': {'unique': False,
  'indexed': False,
  'type': 'STRING',
  'existence': False},
 'name': {'unique': False,
  'indexed': False,
  'type': 'STRING',
  'existence': False},
 'DALY_percent': {'unique': False,
  'indexed': False,
  'type': 'STRING',
  'existence': False},
 'DALY_number': {'unique': False,
  'indexed': False,
  'type': 'STRING',
  'existence': False},
 'UMLS': {'unique': True,
  'indexed': True,
  'type': 'STRING',
  'existence': False},
 'deaths_rate': {'unique': False,
  'indexed': False,
  'type': 'STRING',
  'existence': False}}

In [30]:
for label in schema.keys():
    if schema[label]['type'] == 'node':
        print (label)

Condition
MedicineMSH
Trial
ConditionHPO
ConditionMSH
Medicine
Sponsor


In [31]:
# variable_name = "t"
# output_directory = "tsv"

# for node_type in schema.keys():
#     if schema[node_type]['type'] == 'node':

#         records, summary, keys = driver.execute_query(f"""
#             MATCH ({variable_name}:{node_type})
#             RETURN {variable_name}
#             """,
#             database_="neo4j",
#         )
        
#         header = list(schema[node_type]['properties'].keys())
#         content = "\t".join(header) + "\n"
#         for record in records:
#             result = record.data()[f"{variable_name}"]
            
#             for h in header:
#                 if h in result:
#                     content += str(result[h]) + "\t"
#                 else:
#                     content += "\t"
#             content = content[:-1] + "\n"

#         with open(os.path.join(output_directory, f"{node_type}.tsv"), 'w') as f:
#             f.write(content)


In [32]:
for node_type in schema.keys():
    if schema[node_type]['type'] == 'relationship':
        print (node_type, schema[node_type])

HPO_IS_A {'count': 148, 'properties': {}, 'type': 'relationship'}
FOCUSES_ON {'count': 3713, 'properties': {}, 'type': 'relationship'}
Sponsors {'count': 3052, 'properties': {}, 'type': 'relationship'}
HPO_BELONGS_TO {'count': 478, 'properties': {}, 'type': 'relationship'}
MSH_BELONGS_TO {'count': 272, 'properties': {}, 'type': 'relationship'}
MSH_IS_MAPPED_TO {'count': 308, 'properties': {}, 'type': 'relationship'}
MSH_IS_A {'count': 547, 'properties': {}, 'type': 'relationship'}
TESTS {'count': 3450, 'properties': {}, 'type': 'relationship'}


In [33]:
variable_name = "t"
output_directory = "json"

for relation_type in relation_A_B.keys():
    A = relation_A_B[relation_type][0]
    B = relation_A_B[relation_type][1]
    
    A_index = node_index[A]
    B_index = node_index[B]


    records, summary, keys = driver.execute_query(f"""
            MATCH (A_:{A}) -[r:{relation_type}]-> (B_:{B})
            RETURN A_.{A_index}, B_.{B_index}
            """,
            database_="neo4j",
        )

    content = ""
        #header = list(schema[node_type]['properties'].keys())
        #content = "\t".join(header) + "\n"
    for record in records:
            
        from_ = record.data()[f"A_.{A_index}"]
        to_ = record.data()[f"B_.{B_index}"]

        content += json.dumps({"from": from_, "to": to_}) + "\n"
    
    with open(os.path.join(output_directory, f"{relation_type}.json"), 'w') as f:
            f.write(content)

In [34]:
output_directory = "json"

for node_type in schema.keys():
    if schema[node_type]['type'] == 'node':

        records, summary, keys = driver.execute_query(f"""
            MATCH ({variable_name}:{node_type})
            RETURN {variable_name}
            """,
            database_="neo4j",
        )
        
        content = ""
        #header = list(schema[node_type]['properties'].keys())
        #content = "\t".join(header) + "\n"
        for record in records:
            
            content += json.dumps(record.data()[variable_name]) + "\n"

        with open(os.path.join(output_directory, f"{node_type}.json"), 'w') as f:
            f.write(content)

In [35]:
from google.cloud import bigquery

In [38]:
bq_client = bigquery.Client()
bigquery_project = PARAM["bigquery_project"]
bigquery_dataset = PARAM["bigquery_dataset"]
bq_client.create_dataset(dataset=bigquery_dataset, exists_ok=True)

Dataset(DatasetReference('544977051234', 'trials_3000'))

In [39]:
#for f in bucket.list_blobs(prefix='json'):
#for f in bucket.list_blobs(prefix='tsv'):
from pathlib import Path

pathlist = Path("json").rglob('*.json')
for path in pathlist:
    full_path = str(path)
    
    print (full_path)

    filename = full_path.split("/")[1]
    nodename = filename.split(".")[0]
    # print (full_path, nodename)

    # #schema_setting = [bigquery.SchemaField(property_, schema_dict[nodename][property_]) for property_ in schema_dict[nodename].keys()]
    # schema_setting = []

    # # for property_ in schema_dict[nodename].keys():
    # #     type_ = schema_dict[nodename][property_]
    # #     if type_ != "LIST":
    # #         schema_setting.append(bigquery.SchemaField(property_, type_))
    # #     else:
 
    # #         schema_setting.append(bigquery.SchemaField(property_, "STRING", mode="REPEATED"))

    # #print (schema_setting)
    job_config = bigquery.LoadJobConfig(
    #schema=schema_setting,
    autodetect=True,
    #skip_leading_rows=1,
    #field_delimiter="\t",
    # The source format defaults to CSV, so the line below is optional.
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    )

    table_id = f"{bigquery_project}.{bigquery_dataset}.{nodename}"

    with open(full_path, "rb") as source_file:
        job = bq_client.load_table_from_file(source_file, table_id, job_config=job_config)

    job.result()  # Waits for the job to complete.

    table = bq_client.get_table(table_id)  # Make an API request.
    print(
        "Loaded {} rows and {} columns to {}".format(
            table.num_rows, len(table.schema), table_id
        )
    )

    # table_id = f"{bigquery_project}.{bigquery_dataset}.{nodename}"

    # load_job = bq_client.load_table_from_uri(
    #     full_path, table_id, job_config=job_config
    # )  # Make an API request.

    # load_job.result()  # Waits for the job to complete.

    # destination_table = bq_client.get_table(table_id)  # Make an API request.
    # print("Loaded {} rows.".format(destination_table.num_rows))

json/MSH_IS_MAPPED_TO.json
Loaded 924 rows and 2 columns to neo4j-dashboard.trials_3000.MSH_IS_MAPPED_TO
json/ConditionHPO.json
Loaded 1215 rows and 2 columns to neo4j-dashboard.trials_3000.ConditionHPO
json/HPO_IS_A.json
Loaded 444 rows and 2 columns to neo4j-dashboard.trials_3000.HPO_IS_A
json/Sponsors.json
Loaded 9156 rows and 2 columns to neo4j-dashboard.trials_3000.Sponsors
json/FOCUSES_ON.json
Loaded 11139 rows and 2 columns to neo4j-dashboard.trials_3000.FOCUSES_ON
json/MSH_IS_A.json
Loaded 888 rows and 2 columns to neo4j-dashboard.trials_3000.MSH_IS_A
json/MSH_BELONGS_TO.json
Loaded 816 rows and 2 columns to neo4j-dashboard.trials_3000.MSH_BELONGS_TO
json/TESTS.json
Loaded 10350 rows and 2 columns to neo4j-dashboard.trials_3000.TESTS
json/Condition.json
Loaded 837 rows and 10 columns to neo4j-dashboard.trials_3000.Condition
json/Trial.json
Loaded 9156 rows and 8 columns to neo4j-dashboard.trials_3000.Trial
json/ConditionMSH.json
Loaded 240 rows and 3 columns to neo4j-dashboard.