In [1]:
TASK = "ncbi-taxonomy"
DATE = "2022-02-05"
DIR = "$(homedir())/workspace/$(DATE)-$(TASK)"
if !isdir(DIR)
    mkdir(DIR)
end
cd(DIR)

In [2]:
NEO4J_BIN_DIR = "/home/jupyter-cjprybol/software/neo4j-community-4.4.3/bin"
if !occursin(NEO4J_BIN_DIR, ENV["PATH"])
    ENV["PATH"] = "$(NEO4J_BIN_DIR):" * ENV["PATH"]
end 
DOMAIN = "ncbi-taxonomy.cjp.garden"
NODES_FILE = "$(DIR)/ncbi_taxonomy.nodes.tsv"
EDGES_FILE = "$(DIR)/ncbi_taxonomy.edges.tsv"
USERNAME="neo4j"
PASSWORD=readline(joinpath(homedir(), ".config", "neo4j", "ncbi-taxonomy.password.txt"));
ADDRESS="neo4j://$(DOMAIN):7687"
NEO4J_IMPORT_DIRECTORY="/var/lib/neo4j/import"
DATABASE = "neo4j"

"neo4j"

In [58]:
function list_databases(;address, username, password)
    cmd = "show databases"
    database = "system"
    cmd = cypher(;address, username, password, database, cmd)
    return DataFrames.DataFrame(uCSV.read(open(cmd), header=1, quotes='"', encodings=Dict("FALSE" => false, "TRUE" => true))...)
end

list_databases (generic function with 1 method)

In [59]:
function create_database(;database, address, username, password)
    current_databases = list_databases(;address, username, password)
    if database in current_databases[!, "name"]
        return
    else
        f = run
        cmd = "create database $(database)"
        # switch database to system, so that we can create the user-specific database in the system
        database = "system"
        run(cypher(;address, username, password, database, cmd, f))
    end
end

create_database (generic function with 1 method)

In [60]:
function cypher(;address, username, password, database, cmd)
    return `cypher-shell --address $address --username $username --password $password --database $(database) --format auto $(cmd)`
end

cypher (generic function with 1 method)

In [6]:
import Pkg
pkgs = [
    "DataFrames",
    "ProgressMeter",
    "Graphs",
    "MetaGraphs",
    "uCSV"
]

for pkg in pkgs
    try
        Pkg.add(pkg)
    catch
#         # tried to install an unregistered local package
    end
    eval(Meta.parse("import $pkg"))
end

[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m    Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/dev/Mycelia/docs/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/dev/Mycelia/docs/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/dev/Mycelia/docs/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/dev/Mycelia/docs/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/dev/Mycelia/docs/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/dev/Mycelia/docs/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/dev/Mycelia/docs/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/dev/Mycelia/docs/Manifest.toml`
[32m[1m   Reso

In [13]:
if !((NODES_FILE in readdir(DIR, join=true)) && (EDGES_FILE in readdir(DIR, join=true)))
    taxdump_url = "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
    taxdump_local_tarball = "$(DIR)/$(basename(taxdump_url))"

    if !isfile(taxdump_local_tarball)
        download(taxdump_url, taxdump_local_tarball)
    end

    taxdump_out = replace(taxdump_local_tarball, ".tar.gz" => "")
    if !isdir(taxdump_out)
        mkpath(taxdump_out)
        run(`tar -xvzf $(taxdump_local_tarball) -C $(taxdump_out)`)
    end

    readdir(taxdump_out)

#     Here we will create an in-memory dataframe to capture the contents of the names.dmp file

    # Taxonomy names file (names.dmp):
    # 	tax_id					-- the id of node associated with this name
    # 	name_txt				-- name itself
    # 	unique name				-- the unique variant of this name if name not unique
    # 	name class				-- (synonym, common name, ...)

    names_dmp = DataFrames.DataFrame(
        tax_id = Int[],
        name_txt = String[],
        unique_name = String[],
        name_class = String[]
    )
    ProgressMeter.@showprogress for line in split(read(open("$(taxdump_out)/names.dmp"), String), "\t|\n")
        if isempty(line)
            continue
        else
            (tax_id_string, name_txt, unique_name, name_class) = split(line, "\t|\t")
            tax_id = parse(Int, tax_id_string)
            row = (;tax_id, name_txt, unique_name, name_class)
            push!(names_dmp, row)
        end
    end
    names_dmp

#     We can see that there are sometimes multiple entries for each tax_id, the unique identifier that we will be using

    unique_tax_ids = unique(names_dmp[!, "tax_id"])

#     Here we will group the names.dmp data by tax_id, create a node in the graph for each tax_id, and sanitize and merge information appropriately

    ncbi_taxonomy = MetaGraphs.MetaDiGraph(length(unique_tax_ids))
    ProgressMeter.@showprogress for (index, group) in enumerate(collect(DataFrames.groupby(names_dmp, "tax_id")))
        MetaGraphs.set_prop!(ncbi_taxonomy, index, :tax_id, group[1, "tax_id"])
        for row in DataFrames.eachrow(group)
            unique_name = isempty(row["unique_name"]) ? row["name_txt"] : row["unique_name"]
            # remove quotes since neo4j doesn't like them
            unique_name = replace(unique_name, '"' => "")
            # replace spaces and dashes with underscores
            name_class = Symbol(replace(replace(row["name_class"], r"\s+" => "-"), "-" => "_"))
    #         name_class = Symbol(row["name_class"])
            if haskey(MetaGraphs.props(ncbi_taxonomy, index), name_class)
                current_value = MetaGraphs.get_prop(ncbi_taxonomy, index, name_class)
                if (current_value isa Array) && !(unique_name in current_value)
                    new_value = [current_value..., unique_name]
                    MetaGraphs.set_prop!(ncbi_taxonomy, index, name_class, new_value)
                elseif !(current_value isa Array) && (current_value != unique_name)
                    new_value = [current_value, unique_name]
                    MetaGraphs.set_prop!(ncbi_taxonomy, index, name_class, new_value)
                else
                    continue
                end
            else
                MetaGraphs.set_prop!(ncbi_taxonomy, index, name_class, unique_name)
            end
        end
    end

#     Here we can see that there are divisions projected onto the tree that will allow easy grouping by taxonomic "group"s such as primates, viruses, phages, etc.

    divisions = Dict()
    for line in split(read(open("$(taxdump_out)/division.dmp"), String), "\t|\n")
        if !isempty(line)
            (id_string, shorthand, full_name, notes) = split(line, "\t|\t")
            id = parse(Int, id_string)
            divisions[id] = Dict(:division_cde => shorthand, :division_name => full_name)
        end
    end
    divisions

#     And finally for the data import, here we will read in the nodes.dmp file which contains lots of other metadata about each node in the NCBI taxonomic tree. We will cross-reference the division information above to add the rest of the division information. It could be helpful to make divisions their own nodes and then create relationships between taxonomic nodes and division nodes, but we'll go with the metadata in the taxonomic nodes for now

    node_2_taxid_map = map(index -> ncbi_taxonomy.vprops[index][:tax_id], Graphs.vertices(ncbi_taxonomy))
    ProgressMeter.@showprogress for line in split(read(open("$(taxdump_out)/nodes.dmp"), String), "\t|\n")
        if isempty(line)
            continue
        else
            (tax_id_string, parent_tax_id_string, rank, embl_code, division_id_string) = split(line, "\t|\t")


            division_id = parse(Int, division_id_string)

            tax_id = parse(Int, tax_id_string)
            graph_tax_ids = searchsorted(node_2_taxid_map, tax_id)
            @assert length(graph_tax_ids) == 1
            graph_tax_id = first(graph_tax_ids)

            parent_tax_id = parse(Int, parent_tax_id_string)
            graph_parent_tax_ids = searchsorted(node_2_taxid_map, parent_tax_id)
            @assert length(graph_parent_tax_ids) == 1
            graph_parent_tax_id = first(graph_parent_tax_ids)

            Graphs.add_edge!(ncbi_taxonomy, graph_tax_id, graph_parent_tax_id)
            MetaGraphs.set_prop!(ncbi_taxonomy, graph_tax_id, :rank, rank)
            # these should probably be broken out as independent nodes!
            MetaGraphs.set_prop!(ncbi_taxonomy, graph_tax_id, :division_id, division_id)
            MetaGraphs.set_prop!(ncbi_taxonomy, graph_tax_id, :division_cde, divisions[division_id][:division_cde])
            MetaGraphs.set_prop!(ncbi_taxonomy, graph_tax_id, :division_name, divisions[division_id][:division_name])
        end
    end

#     Here we can see that there are an equal number of edges as their are nodes

    Graphs.ne(ncbi_taxonomy) == Graphs.nv(ncbi_taxonomy)

#     Here we'll produce a list of all of the metadata fields that are associated with our taxonomic nodes. Not every node will have all of these values, but this will allow us to write our in-memory graph to .tsv files for importing into neo4j

    column_names = Set(k for vertex in Graphs.vertices(ncbi_taxonomy) for k in keys(ncbi_taxonomy.vprops[vertex]))
    column_names = sort(collect(column_names))
    # column_names = filter(x -> string(x) != "in-part", column_names)

#     Here in the next 2 steps we write out .tsv files for our nodes + metadata and our edges

    open(NODES_FILE, "w") do io
        header = ["node", string.(column_names)...]
        println(io, join(header, '\t'))
        ProgressMeter.@showprogress for vertex in Graphs.vertices(ncbi_taxonomy)
            fields = String[]
            for k in column_names
                field = get(ncbi_taxonomy.vprops[vertex], k, "")
                field = string.(field)
                if field isa Array
                    field = join(field, ';')
                end
                push!(fields, field)
            end
            row = ["$(vertex)", fields...]
            println(io, join(row, '\t'))
        end
    end

    open(EDGES_FILE, "w") do io
        header = ["src", "dst"]
        println(io, join(header, '\t'))
        ProgressMeter.@showprogress for edge in Graphs.edges(ncbi_taxonomy)
            src_tax_id = ncbi_taxonomy.vprops[edge.src][:tax_id]
            dst_tax_id = ncbi_taxonomy.vprops[edge.dst][:tax_id]
            println(io, join(string.([src_tax_id, dst_tax_id]), "\t"))
        end
    end
else
    println("already done")
end

already done


Run me in a google cloud shell attached to an account with GCE enabled
```bash
# gcloud config set project [PROJECT_ID]
gcloud config set project genomics-290313
gcloud compute firewall-rules create allow-neo4j-bolt-https --allow tcp:7473,tcp:7474,tcp:7687 --source-ranges 0.0.0.0/0 --target-tags neo4j
# gcloud compute images list --project launcher-public | grep --extended-regexp "neo4j-(community|enterprise)-1-4-.*"
# neo4j-community-1-4-3-2-gds-apoc
gcloud compute instances create neo4j-taxonomy --image-project launcher-public --image neo4j-community-1-4-3-2-gds-apoc --tags neo4j
# ^ should add more to this. Disk size? instance size?
```

Notes from the neo4j.conf file
```bash
# Paths of directories in the installation.
dbms.directories.data=/var/lib/neo4j/data
#dbms.directories.plugins=/var/lib/neo4j/plugins
dbms.directories.logs=/var/lib/neo4j/logs
dbms.directories.lib=/usr/share/neo4j/lib
dbms.directories.run=/var/run/neo4j
#dbms.directories.metrics=/var/lib/neo4j/metrics
#dbms.directories.dumps.root=data/dumps

# This setting constrains all `LOAD CSV` import files to be under the `import` directory. Remove or comment it out to
# allow files to be loaded from anywhere in the filesystem; this introduces possible security problems. See the
# `LOAD CSV` section of the manual for details.
dbms.directories.import=/var/lib/neo4j/import
```

```bash
# The address at which this server can be reached by its clients. This may be the server's IP address or DNS name, or
# it may be the address of a reverse proxy which sits in front of the server. This setting may be overridden for
# individual connectors below.
#dbms.default_advertised_address=35.231.208.227
dbms.default_advertised_address=ncbi-taxonomy.cjp.garden
```

have to run `sudo neo4j start` on remote machine!!!

In [16]:
list_databases(address = ADDRESS, username = USERNAME, password = PASSWORD)

Unnamed: 0_level_0,name,address,role,requestedStatus,currentStatus,error
Unnamed: 0_level_1,String,String,String,String,String,String
1,neo4j,35.231.208.227:7687,standalone,online,online,
2,system,35.231.208.227:7687,standalone,online,online,


Here we can see a list of databases that we already have

The neo4j database name that we will use for this ncbi taxonomic tree is:

Here we will create the database if it doesn't exist

In [18]:
create_database(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE)

Here we will set constrains that no two nodes have the same taxonomic id and no two nodes have the same scientific name

Next we'll import some helpful packages

In [19]:
cmd = "CREATE CONSTRAINT ON (t:Taxonomy) ASSERT t.tax_id IS UNIQUE"
@time cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)

  0.023436 seconds (229 allocations: 15.594 KiB, 98.37% compilation time)


Process(`[4mcypher-shell[24m [4m--address[24m [4mneo4j://ncbi-taxonomy.cjp.garden:7687[24m [4m--username[24m [4mneo4j[24m [4m--password[24m [4mtempo-athlete-news-info-fresh-4482[24m [4m--database[24m [4mneo4j[24m [4m--format[24m [4mauto[24m [4m'CREATE CONSTRAINT ON (t:Taxonomy) ASSERT t.tax_id IS UNIQUE'[24m`, ProcessRunning)

An equivalent constraint already exists, 'Constraint( id=4, name='constraint_53f0c26a', type='UNIQUENESS', schema=(:Taxonomy {tax_id}), ownedIndex=3 )'.


In [20]:
cmd = "CREATE CONSTRAINT ON (t:Taxonomy) ASSERT t.`scientific name` IS UNIQUE"
@time cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)

  0.000368 seconds (92 allocations: 6.516 KiB)


Process(`[4mcypher-shell[24m [4m--address[24m [4mneo4j://ncbi-taxonomy.cjp.garden:7687[24m [4m--username[24m [4mneo4j[24m [4m--password[24m [4mtempo-athlete-news-info-fresh-4482[24m [4m--database[24m [4mneo4j[24m [4m--format[24m [4mauto[24m [4m'CREATE CONSTRAINT ON (t:Taxonomy) ASSERT t.\`scientific name\` IS UNIQUE'[24m`, ProcessRunning)

An equivalent constraint already exists, 'Constraint( id=6, name='constraint_f16727de', type='UNIQUENESS', schema=(:Taxonomy {scientific name}), ownedIndex=5 )'.


Here we will create the nodes

https://linuxize.com/post/how-to-use-scp-command-to-securely-transfer-files/
https://linuxize.com/post/how-to-setup-passwordless-ssh-login/

run me on remote neo4j server
```bash
ssh-keygen -t rsa -b 4096 -C "cameron.prybol@gmail.com"
```
add the .pub keys from each machine to the `~/.ssh/authorized_keys` on the other machine

on neo4j machine, use sudo to make an symlink between `/var/lib/neo4j/import` and `neo4j-import`

```bash
mkdir -p neo4j-import
sudo ln -s /var/lib/neo4j/import neo4j-import
```

sudo chmod 777 /var/lib/neo4j/import/

scp ncbi_taxonomy.* cameron_prybol@ncbi-taxonomy.cjp.garden:/var/lib/neo4j/import

scp ncbi_taxonomy.edges.tsv cameron_prybol@ncbi-taxonomy.cjp.garden:/var/lib/neo4j/import

In [21]:
run(`scp $NODES_FILE cameron_prybol@$(DOMAIN):$(NEO4J_IMPORT_DIRECTORY)`)

Process(`[4mscp[24m [4m/home/jupyter-cjprybol/workspace/2022-02-05-ncbi-taxonomy/ncbi_taxonomy.nodes.tsv[24m [4mcameron_prybol@ncbi-taxonomy.cjp.garden:/var/lib/neo4j/import[24m`, ProcessExited(0))

In [22]:
run(`scp $EDGES_FILE cameron_prybol@$(DOMAIN):$(NEO4J_IMPORT_DIRECTORY)`)

Process(`[4mscp[24m [4m/home/jupyter-cjprybol/workspace/2022-02-05-ncbi-taxonomy/ncbi_taxonomy.edges.tsv[24m [4mcameron_prybol@ncbi-taxonomy.cjp.garden:/var/lib/neo4j/import[24m`, ProcessExited(0))

In [55]:
cmd = 
"""
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM
'file:///$(basename(NODES_FILE))' AS row
FIELDTERMINATOR '\t'
MERGE (t:Taxonomy {tax_id: row.tax_id})
"""
cmd = rstrip(replace(cmd, '\n' => ' '))
@time cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)

  0.000518 seconds (94 allocations: 6.938 KiB)


Process(`[4mcypher-shell[24m [4m--address[24m [4mneo4j://ncbi-taxonomy.cjp.garden:7687[24m [4m--username[24m [4mneo4j[24m [4m--password[24m [4mtempo-athlete-news-info-fresh-4482[24m [4m--database[24m [4mneo4j[24m [4m--format[24m [4mauto[24m [4m"USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM 'file:///ncbi_taxonomy.nodes.tsv' AS row FIELDTERMINATOR '	' MERGE (t:Taxonomy {tax_id: row.tax_id})"[24m`, ProcessRunning)

In [None]:
# these both work for running locally
#cypher-shell --address neo4j://10.142.0.13:7687 --username neo4j --password tempo-athlete-news-info-fresh-4482 --database neo4j --format auto "USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM 'file:///ncbi_taxonomy.nodes.tsv' AS row FIELDTERMINATOR '\t' MERGE (t:Taxonomy {tax_id: row.tax_id})"
#cypher-shell --address neo4j://0.0.0.0:7687 --username neo4j --password tempo-athlete-news-info-fresh-4482 --database neo4j --format auto "USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM 'file:///ncbi_taxonomy.nodes.tsv' AS row FIELDTERMINATOR '\t' MERGE (t:Taxonomy {tax_id: row.tax_id})"

In the following commands, we will add metadata to the nodes in a piece-meal (column by column) fashion that will allow us to skip over null fields. Storing null pointers in Neo4j is discouraged (impossible?) and we will get errors if we try and set metadata properties to null values.

I tried to do this all in one command on the initial import by handling all of the nulls using the technique in [this post](https://markhneedham.com/blog/2014/08/22/neo4j-load-csv-handling-empty-columns/) but I kept getting Java errors

There is not enough memory to perform the current task. Please try increasing 'dbms.memory.heap.max_size' in the neo4j configuration (normally in 'conf/neo4j.conf' or, if you are using Neo4j Desktop, found through the user interface) or if you are running an embedded installation increase the heap by using '-Xmx' command line flag, and then restart the database.

In [83]:
# note to self, I should be able to programmatically generate this long list of metadata fields

# need to start over?
# match (n) delete n

# need to develop little by little over time?
# WITH row LIMIT 10

cmd = 
"""
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM
'file:///$(basename(NODES_FILE))' AS row
FIELDTERMINATOR '\\t'
CREATE (t:Taxonomy {
    tax_id: row.tax_id,
    scientific_name: row.scientific_name,
    division_cde: row.division_cde,
    division_id: row.division_id,
    division_name: row.division_name,
    rank: row.rank,
    acronym: row.acronym,
    in_part: row.in_part,
    includes: row.includes,
    common_name: row.common_name,
    genbank_common_name: row.genbank_common_name,
    blast_name: row.blast_name,
    synonym: row.synonym,
    genbank_synonym: row.genbank_synonym,
    type_material: row.type_material,
    authority: row.authority,
    genbank_acronym: row.genbank_acronym,
    equivalent_name: row.equivalent_name})
RETURN t LIMIT 10
"""
println(cmd)
cmd = rstrip(replace(cmd, '\n' => ' '))
cyper_cmd = cypher(address = "neo4j://0.0.0.0:7687", username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)

# ready to start consuming query after 61 ms, results consumed after another 86607 ms
# Added 2396777 nodes, Set 15244183 properties, Added 2396777 labels

USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM
'file:///ncbi_taxonomy.nodes.tsv' AS row
FIELDTERMINATOR '\t'
CREATE (t:Taxonomy {
    tax_id: row.tax_id,
    scientific_name: row.scientific_name,
    division_cde: row.division_cde,
    division_id: row.division_id,
    division_name: row.division_name,
    rank: row.rank,
    acronym: row.acronym,
    in_part: row.in_part,
    includes: row.includes,
    common_name: row.common_name,
    genbank_common_name: row.genbank_common_name,
    blast_name: row.blast_name,
    synonym: row.synonym,
    genbank_synonym: row.genbank_synonym,
    type_material: row.type_material,
    authority: row.authority,
    genbank_acronym: row.genbank_acronym,
    equivalent_name: row.equivalent_name})
RETURN t LIMIT 10



`[4mcypher-shell[24m [4m--address[24m [4mneo4j://0.0.0.0:7687[24m [4m--username[24m [4mneo4j[24m [4m--password[24m [4mtempo-athlete-news-info-fresh-4482[24m [4m--database[24m [4mneo4j[24m [4m--format[24m [4mauto[24m [4m"USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM 'file:///ncbi_taxonomy.nodes.tsv' AS row FIELDTERMINATOR '\t' CREATE (t:Taxonomy {     tax_id: row.tax_id,     scientific_name: row.scientific_name,     division_cde: row.division_cde,     division_id: row.division_id,     division_name: row.division_name,     rank: row.rank,     acronym: row.acronym,     in_part: row.in_part,     includes: row.includes,     common_name: row.common_name,     genbank_common_name: row.genbank_common_name,     blast_name: row.blast_name,     synonym: row.synonym,     genbank_synonym: row.genbank_synonym,     type_material: row.type_material,     authority: row.authority,     genbank_acronym: row.genbank_acronym,     equivalent_name: row.equivalent_name}) RETURN t LIMIT

And here in the final step we create the relationships between taxa and their parent nodes

In [85]:
cmd = 
"""
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM
'file:///$(basename(EDGES_FILE))' AS row
FIELDTERMINATOR '\\t'
MATCH (src:Taxonomy {tax_id: row.src})
MATCH (dst:Taxonomy {tax_id: row.dst})
MERGE (src)-[p:PARENT]->(dst)
"""
println(cmd)
cmd = rstrip(replace(cmd, '\n' => ' '))
cypher(address = "neo4j://0.0.0.0:7687", username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)

USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM
'file:///ncbi_taxonomy.edges.tsv' AS row
FIELDTERMINATOR '\t'
MATCH (src:Taxonomy {tax_id: row.src})
MATCH (dst:Taxonomy {tax_id: row.dst})
MERGE (src)-[p:PARENT]->(dst)

  0.000049 seconds (58 allocations: 5.234 KiB)


`[4mcypher-shell[24m [4m--address[24m [4mneo4j://0.0.0.0:7687[24m [4m--username[24m [4mneo4j[24m [4m--password[24m [4mtempo-athlete-news-info-fresh-4482[24m [4m--database[24m [4mneo4j[24m [4m--format[24m [4mauto[24m [4m"USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM 'file:///ncbi_taxonomy.edges.tsv' AS row FIELDTERMINATOR '\t' MATCH (src:Taxonomy {tax_id: row.src}) MATCH (dst:Taxonomy {tax_id: row.dst}) MERGE (src)-[p:PARENT]->(dst)"[24m`

Failed to obtain connection towards WRITE server. Known routing table is: Ttl 1644089305934, currentTime 1644089035978, routers [], writers [], readers [], database 'neo4j'
Failed to obtain connection towards WRITE server. Known routing table is: Ttl 1644089316828, currentTime 1644089046883, routers [], writers [], readers [], database 'neo4j'
Failed to obtain connection towards WRITE server. Known routing table is: Ttl 1644089334913, currentTime 1644089089649, routers [], writers [], readers [], database 'neo4j'
Failed to obtain connection towards WRITE server. Known routing table is: Ttl 1644089382426, currentTime 1644089125683, routers [], writers [], readers [], database 'neo4j'
Failed to obtain connection towards WRITE server. Known routing table is: Ttl 1644089486990, currentTime 1644089217065, routers [], writers [], readers [], database 'neo4j'


In [None]:
# ready to start consuming query after 99517 ms, results consumed after another 0 ms
# Created 2396777 relationships

And that is it! We've just rebuilt the NCBI taxonomy in neo4j to allow us to do downstream work in a taxonomy-aware way