In [14]:

import json
import os
import subprocess

import yaml
from neo4j import GraphDatabase




In [27]:
# Load the notebook settings from config.yaml into PARAM.
# Keys read by later cells: neo4j_url, neo4j_username, neo4j_password, gsutil_path.
with open("config.yaml", "r", encoding="utf-8") as stream:
    try:
        PARAM = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Every later cell indexes PARAM, so stop here with a clear message
        # instead of printing the error and failing later with a NameError.
        raise RuntimeError("config.yaml could not be parsed as YAML") from exc

In [16]:

# Open a Neo4j driver with the URL and credentials loaded into PARAM.
driver = GraphDatabase.driver(
    PARAM["neo4j_url"],
    auth=(PARAM["neo4j_username"], PARAM["neo4j_password"]),
)

# Ask APOC for the database schema. The query has no placeholders, so a plain
# string literal is used rather than an f-string.
records, summary, keys = driver.execute_query(
    """
    CALL apoc.meta.schema()
    YIELD value RETURN value;
    """,
    database_="neo4j",
)

# Later cells index `schema` directly, so fail here if nothing came back
# rather than leaving the name undefined.
if not records:
    raise RuntimeError("apoc.meta.schema() returned no records")

# Keep the value of the last record (what the previous loop ended up with),
# and its JSON-serialised form.
schema = records[-1].data()["value"]
json_schema = json.dumps(schema)


In [17]:
# Display the top-level keys of the schema dictionary returned by APOC.
schema.keys()

dict_keys(['Condition', 'IS_A_SITE_OF', 'Site', 'IS_A_CATEGORY_OF', 'IS_CARRIED_OUT_BY', 'FOCUSES_ON', 'IS_FOUND_AT_SITE', 'Institution', 'BELONGS_TO', 'HAS_MORPHOLOGY', 'Category', 'Trial', 'Morphology'])

In [18]:
# Display the 'type' field of the 'Condition' entry; the cells below keep only
# entries whose type is 'node'.
schema['Condition']['type']

'node'

In [19]:
# Display the full schema entry stored under 'Condition'.
schema['Condition']

{'count': 23,
 'labels': [],
 'properties': {'SNOMEDCT': {'unique': True,
   'indexed': True,
   'type': 'STRING',
   'existence': False},
  'name': {'unique': False,
   'indexed': False,
   'type': 'STRING',
   'existence': False},
  'UMLS': {'unique': False,
   'indexed': False,
   'type': 'STRING',
   'existence': False}},
 'type': 'node',
 'relationships': {'HAS_MORPHOLOGY': {'count': 0,
   'direction': 'out',
   'labels': ['Morphology'],
   'properties': {}},
  'FOCUSES_ON': {'count': 26,
   'direction': 'in',
   'labels': ['Trial'],
   'properties': {}},
  'IS_FOUND_AT_SITE': {'count': 0,
   'direction': 'out',
   'labels': ['Site'],
   'properties': {}},
  'BELONGS_TO': {'count': 0,
   'direction': 'out',
   'labels': ['Category'],
   'properties': {}}}}

In [20]:
# Flatten the schema into {node label: {property name: property type}},
# e.g. Condition: {SNOMEDCT: STRING}. Entries whose type is not 'node' are skipped.
schema_dict = {
    label: {
        prop_name: prop_meta['type']
        for prop_name, prop_meta in entry['properties'].items()
    }
    for label, entry in schema.items()
    if entry['type'] == 'node'
}

In [10]:
# Display the label -> {property: type} mapping built in the previous cell.
schema_dict

{'Condition': {'SNOMEDCT': 'STRING', 'name': 'STRING', 'UMLS': 'STRING'},
 'Site': {'fsn': 'STRING', 'SNOMEDCT': 'STRING', 'name': 'STRING'},
 'Institution': {'name': 'STRING', 'type': 'STRING'},
 'Category': {'fsn': 'STRING', 'SNOMEDCT': 'STRING', 'name': 'STRING'},
 'Trial': {'min_age': 'STRING',
  'healthy_volunteers': 'STRING',
  'study_results': 'STRING',
  'outcome_measures': 'LIST',
  'criteria': 'STRING',
  'status': 'STRING',
  'max_age': 'STRING',
  'study_type': 'STRING',
  'url': 'STRING',
  'title': 'STRING',
  'locations': 'LIST',
  'description': 'STRING',
  'name': 'STRING',
  'phases': 'LIST',
  'gender': 'STRING',
  'enrollment': 'STRING',
  'start_date': 'STRING',
  'sampling_method': 'STRING'},
 'Morphology': {'fsn': 'STRING', 'SNOMEDCT': 'STRING', 'name': 'STRING'}}

In [None]:
# Display the per-property metadata stored for the 'Condition' entry.
schema['Condition']['properties']

{'SNOMEDCT': {'unique': True,
  'indexed': True,
  'type': 'STRING',
  'existence': False},
 'name': {'unique': False,
  'indexed': False,
  'type': 'STRING',
  'existence': False},
 'UMLS': {'unique': False,
  'indexed': False,
  'type': 'STRING',
  'existence': False}}

In [None]:
# Print, one per line, the key of every schema entry whose type is 'node'.
node_labels = [name for name, entry in schema.items() if entry['type'] == 'node']
for name in node_labels:
    print(name)

Condition
Site
Institution
Category
Trial
Morphology


In [None]:
# variable_name = "t"
# output_directory = "tsv"

# for node_type in schema.keys():
#     if schema[node_type]['type'] == 'node':

#         records, summary, keys = driver.execute_query(f"""
#             MATCH ({variable_name}:{node_type})
#             RETURN {variable_name}
#             """,
#             database_="neo4j",
#         )
        
#         header = list(schema[node_type]['properties'].keys())
#         content = "\t".join(header) + "\n"
#         for record in records:
#             result = record.data()[f"{variable_name}"]
            
#             for h in header:
#                 if h in result:
#                     content += str(result[h]) + "\t"
#                 else:
#                     content += "\t"
#             content = content[:-1] + "\n"

#         with open(os.path.join(output_directory, f"{node_type}.tsv"), 'w') as f:
#             f.write(content)


In [23]:
# Export every node type to <output_directory>/<label>.json as newline-delimited
# JSON: one line per node, holding that node's properties.
variable_name = "t"
output_directory = "json"

# Nothing else in the notebook creates this directory, so create it here;
# otherwise open() below fails on a fresh checkout.
os.makedirs(output_directory, exist_ok=True)

for node_type in schema.keys():
    # The schema mixes nodes and relationships; only nodes are exported.
    if schema[node_type]['type'] != 'node':
        continue

    # The label is interpolated into the query text, so backtick-quote it
    # (doubling any embedded backtick) to keep labels with spaces or other
    # special characters from breaking the query.
    quoted_label = "`" + node_type.replace("`", "``") + "`"

    records, summary, keys = driver.execute_query(
        f"""
            MATCH ({variable_name}:{quoted_label})
            RETURN {variable_name}
            """,
        database_="neo4j",
    )

    # Write line by line instead of concatenating one large string in memory.
    with open(os.path.join(output_directory, f"{node_type}.json"), 'w') as f:
        for record in records:
            f.write(json.dumps(record.data()[variable_name]) + "\n")


In [4]:
# Name of the Cloud Storage bucket used by the gsutil upload and the BigQuery load.
bucket_name = "neo4j-bigquery-project"

In [None]:
#os.system(f'{PARAM["gsutil_path"]}/gsutil cp -r tsv gs://{bucket_name}/')

Copying file://tsv/Morphology.tsv [Content-Type=text/tab-separated-values]...
Copying file://tsv/Trial.tsv [Content-Type=text/tab-separated-values]...        
Copying file://tsv/Institution.tsv [Content-Type=text/tab-separated-values]...  
Copying file://tsv/Site.tsv [Content-Type=text/tab-separated-values]...         
- [4 files][ 84.3 KiB/ 84.3 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://tsv/Condition.tsv [Content-Type=text/tab-separated-values]...
Copying file://tsv/Category.tsv [Content-Type=text/tab-separated-values]...     
- [6 files][109.6 KiB/109.6 KiB]                                                
Operation completed over 6 objects/109.6 KiB.                                    


0

In [28]:
# Copy the local "json" directory to the bucket with gsutil.
# An argument list (no shell) avoids quoting problems in the configured path,
# and check=True raises CalledProcessError if gsutil exits non-zero, so a failed
# upload cannot go unnoticed. The cell still displays the exit status.
subprocess.run(
    [f'{PARAM["gsutil_path"]}/gsutil', "cp", "-r", "json", f"gs://{bucket_name}/"],
    check=True,
).returncode

Copying file://json/Condition.json [Content-Type=application/json]...
Copying file://json/Site.json [Content-Type=application/json]...                
Copying file://json/Institution.json [Content-Type=application/json]...         
Copying file://json/Category.json [Content-Type=application/json]...            
- [4 files][ 54.4 KiB/ 54.4 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://json/Trial.json [Content-Type=application/json]...
Copying file://json/Morphology.json [Content-Type=application/json]...          
\ [6 files][133.4 KiB/133.4 KiB]                                                
Operation completed over 6 objects/133.4 KiB.                                    


0

In [30]:
from google.cloud import bigquery
from google.cloud import storage

In [32]:
# BigQuery destination for the exported node tables.
bigquery_project = "vertex-ai-399007"
bigquery_dataset = "neo4j"

# Bind the client to the same project that the table ids use later, instead of
# relying on the ambient default project of the environment.
bq_client = bigquery.Client(project=bigquery_project)

# Create the dataset if it does not exist yet; exists_ok makes re-runs safe.
bq_client.create_dataset(dataset=f"{bigquery_project}.{bigquery_dataset}", exists_ok=True)

Dataset(DatasetReference('vertex-ai-399007', 'neo4j'))

In [33]:
# Create a Cloud Storage client and a handle to the bucket named by bucket_name.
bucket_client = storage.Client()
bucket = bucket_client.bucket(bucket_name)

In [6]:
# for f in bucket.list_blobs(prefix='tsv'):
    
#     full_path = f"gs://{bucket_name}/{f.name}"

#     filename = f.name.split("/")[1]
#     nodename = filename.split(".")[0]
#     print (full_path, nodename)

gs://neo4j-bigquery-project/tsv/Category.tsv Category
gs://neo4j-bigquery-project/tsv/Condition.tsv Condition
gs://neo4j-bigquery-project/tsv/Institution.tsv Institution
gs://neo4j-bigquery-project/tsv/Morphology.tsv Morphology
gs://neo4j-bigquery-project/tsv/Site.tsv Site
gs://neo4j-bigquery-project/tsv/Trial.tsv Trial


In [34]:
# Load every exported node file under json/ in the bucket into its own BigQuery
# table, named after the file (json/Trial.json -> <project>.<dataset>.Trial).

# The load configuration is the same for every file, so build it once.
# - autodetect: BigQuery infers the table schema from the JSON records
#   (no explicit schema is built from schema_dict).
# - NEWLINE_DELIMITED_JSON: required here, since the files are JSON lines.
# - WRITE_TRUNCATE: replace the table contents on each run, so re-running the
#   notebook does not append a second copy of every row.
job_config = bigquery.LoadJobConfig(
    autodetect=True,
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

for blob in bucket.list_blobs(prefix="json/"):
    # Skip anything under the prefix that is not an exported node file
    # (for example a folder placeholder object).
    if not blob.name.endswith(".json"):
        continue

    full_path = f"gs://{bucket_name}/{blob.name}"

    filename = blob.name.rsplit("/", 1)[-1]
    nodename = filename[: -len(".json")]
    print(full_path, nodename)

    table_id = f"{bigquery_project}.{bigquery_dataset}.{nodename}"

    load_job = bq_client.load_table_from_uri(
        full_path, table_id, job_config=job_config
    )  # Make an API request.

    load_job.result()  # Waits for the job to complete; raises if the load failed.

    destination_table = bq_client.get_table(table_id)  # Make an API request.
    print(f"Loaded {destination_table.num_rows} rows.")

gs://neo4j-bigquery-project/json/Category.json Category
Loaded 300 rows.
gs://neo4j-bigquery-project/json/Condition.json Condition
Loaded 23 rows.
gs://neo4j-bigquery-project/json/Institution.json Institution
Loaded 25 rows.
gs://neo4j-bigquery-project/json/Morphology.json Morphology
Loaded 7 rows.
gs://neo4j-bigquery-project/json/Site.json Site
Loaded 128 rows.
gs://neo4j-bigquery-project/json/Trial.json Trial
Loaded 24 rows.
