# Create Neo4j Graph Database

First run notebook [1a-Strains.ipynb](./1a-Strains.ipynb) to create the node and relationship files in the /data directory.

[Neo4j Import Guide](https://neo4j.com/developer/guide-import-csv/)

[Neo4j Batch Import](https://neo4j.com/docs/operations-manual/current/tools/import/)

In [2]:
import os

In [3]:
#!"$NEO4J_HOME"/bin/neo4j-admin import --database=graph.db --id-type=STRING --max-memory=250000 \
#    --nodes=../reference_data/nodes/Outbreak.csv \
#    --nodes=../reference_data/nodes/Pathogen.csv \
#    --nodes=../reference_data/nodes/Dashboard.csv \
#    --nodes=../data/nodes/Admin1.csv \
#    --nodes=../data/nodes/Admin2.csv \
#    --nodes=../data/nodes/City.csv \
#    --nodes=../data/nodes/Country.csv \
#    --nodes=../data/nodes/Host.csv \
#    --nodes=../data/nodes/PersonAnimal.csv \
#    --nodes=../data/nodes/Strain.csv \
#    --relationships=../reference_data/relationships/Pathogen-CAUSES-Outbreak.csv \
#    --relationships=../reference_data/relationships/Outbreak-EXPLORE_IN-Dashboard.csv \
#    --relationships=../reference_data/relationships/City-EXPLORE_IN-Dashboard.csv \
#    --relationships=../data/relationships/PersonAnimal-CARRIES-Strain.csv \
#    --relationships=../data/relationships/PersonAnimal-IS_A-Host.csv \
#    --relationships=../data/relationships/Strain-FOUND_IN-Admin1.csv \
#    --relationships=../data/relationships/Strain-FOUND_IN-Admin2.csv \
#    --relationships=../data/relationships/Strain-FOUND_IN-City.csv \
#    --relationships=../data/relationships/Strain-FOUND_IN-Country.csv \
#    --relationships=../data/relationships/PersonAnimal-LOCATED_IN-Admin1.csv \
#    --relationships=../data/relationships/PersonAnimal-LOCATED_IN-Admin2.csv \
#    --relationships=../data/relationships/PersonAnimal-LOCATED_IN-City.csv \
#    --relationships=../data/relationships/PersonAnimal-LOCATED_IN-Country.csv \
#    --relationships=../data/relationships/Pathogen-HAS-Strain.csv

In [4]:
from py2neo import Graph, Node, Relationship, NodeMatcher
import re

In [5]:
from py2neo import Database
graph = Graph(host="localhost",password='covid')
matcher = NodeMatcher(graph)

pattern_id = re.compile('.*?:ID\(([\w_]+)\)')
pattern_start = re.compile(':START_ID\(([\w_]+)\)')
pattern_end = re.compile(':END_ID\(([\w_]+)\)')

In [6]:
graph.delete_all()

In [7]:
for root, dirs, files in os.walk('../reference_data/nodes/'):
     for file in files:
        tx = graph.begin()
        with open(os.path.join(root, file), "r") as auto:
            print (file) 
            headers = auto.readline().strip().split(',')  
            matcher_id = pattern_id.match(headers[0]) 
            headers[0] = matcher_id.group(1) 
            for line in auto:
                properties = line.strip().split(',')
                node_props = {}
                node_labels = None
                for i in range(len(headers)):
                    if (headers[i] !=':LABEL'):
                        node_props[headers[i]] = properties[i]
                    else:
                        node_labels = properties[i].split(';')
                node = Node.cast(node_props)
                node.update_labels(node_labels) 
                # print (node)
                tx.create(node)
            tx.commit()
        

Dashboard.csv
Outbreak.csv
Pathogen.csv
ReferenceGenome.csv
ReferenceGenome-checkpoint.csv


In [8]:
for root, dirs, files in os.walk('../data/nodes/'):
     for file in files:
        tx = graph.begin()
        with open(os.path.join(root, file), "r") as auto:
            print (file) 
            headers = auto.readline().strip().split(',')  
            matcher_id = pattern_id.match(headers[0]) 
            headers[0] = matcher_id.group(1) 
            for line in auto:
                properties = line.strip().split(',')
                node_props = {}
                node_labels = None
                for i in range(len(headers)):
                    if (headers[i] !=':LABEL'):
                        node_props[headers[i]] = properties[i]
                    else:
                        node_labels = properties[i].split(';')
                node = Node.cast(node_props)
                node.update_labels(node_labels) 
                #print (node)
                tx.create(node)
            tx.commit()

Admin1.csv
Admin2.csv
City.csv
Country.csv
Host.csv
PersonAnimal.csv
Strain.csv


In [10]:
for root, dirs, files in os.walk('../reference_data/relationships/'):
     for file in files:
        tx = graph.begin()
        with open(os.path.join(root, file), "r") as auto:
            print (file) 
            headers = auto.readline().strip().split(',')  
            for line in auto:
                properties = line.strip().split(',')
                rel_props = {}
                rel_types = None
                rel_origin_prop = None
                rel_target_prop = None
                origin_id = None
                target_id = None 
                if (len(headers) == len(properties)):
                    for i in range(len(headers)):
                        if (headers[i] ==':TYPE'):
                            rel_types = properties[i]
                        elif (pattern_start.match(headers[i])):
                            rel_origin_prop = pattern_start.match(headers[i]).group(1)  
                            origin_id = properties[i]
                        elif (pattern_end.match(headers[i])):
                            rel_target_prop = pattern_end.match(headers[i]).group(1) 
                            target_id = properties[i]
                        else:
                            rel_props[headers[i]] = properties[i]
                    origin_node = matcher.match().where("_."+rel_origin_prop+" = '"+origin_id.replace('\'','\\\'')+"'").first()
                    target_node = matcher.match().where("_."+rel_target_prop+" = '"+target_id.replace('\'','\\\'')+"'").first()
                    relationship = Relationship(origin_node,rel_types,target_node)
                    #print(relationship)
                    tx.merge(relationship)
            tx.commit()

City-EXPLORE_IN-Dashboard.csv
Outbreak-EXPLORE_IN-Dashboard.csv
Pathogen-CAUSES-Outbreak.csv
ReferenceGenome-ENCODES-Pathogen.csv
City-EXPLORE_IN-Dashboard-checkpoint.csv


In [11]:
for root, dirs, files in os.walk('../data/relationships/'):
     for file in files:
        tx = graph.begin()
        with open(os.path.join(root, file), "r") as auto:
            print (file) 
            headers = auto.readline().strip().split(',')  
            for line in auto:
                properties = line.strip().split(',')
                rel_props = {}
                rel_types = None
                origin_id = None
                target_id = None
                if (len(headers) == len(properties)):
                    for i in range(len(headers)):
                        if (headers[i] ==':TYPE'):
                            rel_types = properties[i]
                        elif (pattern_start.match(headers[i])):
                            rel_origin_prop = pattern_start.match(headers[i]).group(1)  
                            origin_id = properties[i]
                        elif (pattern_end.match(headers[i])):
                            rel_target_prop = pattern_end.match(headers[i]).group(1) 
                            target_id = properties[i]
                        else:
                            rel_props[headers[i]] = properties[i]
                    origin_node = matcher.match().where("_."+rel_origin_prop+" = '"+origin_id.replace('\'','\\\'')+"'").first()
                    target_node = matcher.match().where("_."+rel_target_prop+" = '"+target_id.replace('\'','\\\'')+"'").first()
                    relationship = Relationship(origin_node,rel_types,target_node)
                    #print(relationship)
                    tx.merge(relationship)
            tx.commit()

Pathogen-HAS-Strain.csv
PersonAnimal-CARRIES-Strain.csv
PersonAnimal-IS_A-Host.csv
PersonAnimal-LOCATED_IN-Admin1.csv
PersonAnimal-LOCATED_IN-Admin2.csv
PersonAnimal-LOCATED_IN-City.csv
PersonAnimal-LOCATED_IN-Country.csv
Strain-FOUND_IN-Admin1.csv
Strain-FOUND_IN-Admin2.csv
Strain-FOUND_IN-City.csv
Strain-FOUND_IN-Country.csv
