# Initialize workspace

In [1]:
DATE_TASK = "2022-03-29-ecoli-tequatro-pangenome"
DIR = mkpath("$(homedir())/workspace/$DATE_TASK")
cd(DIR)
DATE, TASK = match(r"^(\d{4}-\d{2}-\d{2})-(.*)$", DATE_TASK).captures

2-element Vector{Union{Nothing, SubString{String}}}:
 "2022-03-29"
 "ecoli-tequatro-pangenome"

# Import packages

In [2]:
import Pkg
Pkg.update()
pkgs = [
"JSON",
"HTTP",
"Dates",
"uCSV",
"DelimitedFiles",
"DataFrames",
"ProgressMeter",
"BioSequences",
"FASTX",
"Distances",
"StatsPlots",
"StatsBase",
"Statistics",
"MultivariateStats",
"Random",
"Primes",
"SparseArrays",
"SHA",
"GenomicAnnotations",
"Combinatorics",
"OrderedCollections",
"Downloads",
"Clustering",
"Revise",
"Mmap",
"Graphs",
"MetaGraphs",
"FileIO",
"SHA",
"MD5"
]

for pkg in pkgs
    try
        eval(Meta.parse("import $pkg"))
    catch
        Pkg.add(pkg)
        eval(Meta.parse("import $pkg"))
    end
end

# works but can't update locally, need to push and restart kernel to activate changes
# "https://github.com/cjprybol/Mycelia.git#master",
# didn't work
# "$(homedir())/git/Mycelia#master",
pkg_path = "$(homedir())/git/Mycelia"
try
    eval(Meta.parse("import $(basename(pkg_path))"))
catch
    # Pkg.add(url=pkg)
    Pkg.develop(path=pkg_path)
    # pkg = replace(basename(pkg), ".git#master" => "")
    # pkg = replace(basename(pkg), "#master" => "")
    eval(Meta.parse("import $(basename(pkg_path))"))
end

[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m    Updating[22m[39m git-repo `https://github.com/cjprybol/Mycelia.git#master`
[32m[1m   Installed[22m[39m IntegerMathUtils ─ v0.1.0
[32m[1m   Installed[22m[39m Primes ─────────── v0.5.2
[32m[1m   Installed[22m[39m Latexify ───────── v0.15.14
[32m[1m    Updating[22m[39m `~/git/Mycelia/docs/Project.toml`
 [90m [27ebfcd6] [39m[93m↑ Primes v0.5.1 ⇒ v0.5.2[39m
[32m[1m    Updating[22m[39m `~/git/Mycelia/docs/Manifest.toml`
 [90m [18e54dd8] [39m[92m+ IntegerMathUtils v0.1.0[39m
 [90m [23fbe1c1] [39m[93m↑ Latexify v0.15.13 ⇒ v0.15.14[39m
 [90m [27ebfcd6] [39m[93m↑ Primes v0.5.1 ⇒ v0.5.2[39m
[32m[1mPrecompiling[22m[39m project...
[32m  ✓ [39m[90mIntegerMathUtils[39m
[32m  ✓ [39mPrimes
[32m  ✓ [39m[90mLatexify[39m
[32m  ✓ [39mPlots
[32m  ✓ [39mStatsPlots
[32m  ✓ [39mMycelia
  6 dependencies successfully precompiled in 97 seconds (252 already precompiled, 6

# Declare global parameters

In [3]:
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?&id=$(tax_id)
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?lvl=0&amp;id=2733124
# root_tax_id = 2733124

root_tax_id = 10663
host = "Escherichia"
database_id = "cb969e03"

"cb969e03"

In [4]:
# Prepend the local Neo4j `bin` directory to PATH (only once) so that external
# commands spawned from this notebook can find the Neo4j command-line tools.
NEO4J_BIN_DIR = "/home/jupyter-cjprybol/software/neo4j-community-4.4.3/bin"
if !occursin(NEO4J_BIN_DIR, ENV["PATH"])
    ENV["PATH"] = "$(NEO4J_BIN_DIR):" * ENV["PATH"]
end
# Connection settings passed to Mycelia.cypher by the upload functions below.
USERNAME="neo4j"
# The password is the first line of ~/.config/neo4j/<database_id>.pass, so it is
# never written into the notebook itself.
PASSWORD=readline(joinpath(homedir(), ".config", "neo4j", "$(database_id).pass"));
# NOTE(review): host pattern looks like a hosted Neo4j instance keyed by database_id — confirm.
ADDRESS="neo4j+s://$(database_id).databases.neo4j.io:7687"
DATABASE = "neo4j"

"neo4j"

# New functions (add me to library)

In [5]:
"""
    assess_edgemer(edgemer)

Split a (k+1)-mer into its two overlapping k-mers and report their orientation.

The source k-mer is built from positions `1:k` of `edgemer` and the destination
k-mer from positions `2:k+1`, where `k = length(edgemer) - 1`.

Returns the tuple
`(src, canonical_src, src_is_canonical, dst, canonical_dst, dst_is_canonical)`;
each `*_is_canonical` flag is `true` when the k-mer equals its canonical form.
"""
function assess_edgemer(edgemer)
    k = length(edgemer) - 1
    KmerType = BioSequences.BigDNAMer{k}

    # leading k-mer of the edge and its canonical representative
    src = KmerType(edgemer[position] for position in 1:k)
    canonical_src = BioSequences.canonical(src)

    # trailing k-mer of the edge and its canonical representative
    dst = KmerType(edgemer[position] for position in 2:(k + 1))
    canonical_dst = BioSequences.canonical(dst)

    return (
        src, canonical_src, src == canonical_src,
        dst, canonical_dst, dst == canonical_dst,
    )
end

assess_edgemer (generic function with 1 method)

In [6]:
"""
    has_identifier(graph, identifier)

Return `true` when `identifier` is already registered in the `:identifier`
index of `graph` (looked up in `graph.metaindex[:identifier]`), else `false`.
"""
function has_identifier(graph, identifier)
    identifier_index = graph.metaindex[:identifier]
    return haskey(identifier_index, identifier)
end

has_identifier (generic function with 1 method)

In [7]:
# TODO break me out into individual steps of addition at each level
# TODO re-write me so that I just write out node and edge tables and add them batch style
#
# fastx_to_metagraph(; graph, k, entity_file_map)
#
# Work in progress. As currently written this function only:
#   1. checks that `k` is odd and within 3:63,
#   2. marks `:identifier` as the indexing property of `graph`,
#   3. prints a SHA-256 digest for the "genome" file of every entity, and
#   4. returns `graph`.
# The code that would add Fasta / FastaRecord / Kmer vertices and their edges is
# commented out below and is kept for reference.
#
# Keyword arguments (all required):
#   graph           - metagraph to index (and, once re-enabled, to populate)
#   k               - k-mer length
#   entity_file_map - mapping entity => mapping that has a "genome" entry (a fastx path)
function fastx_to_metagraph(;graph, k, entity_file_map)
    @assert isodd(k) "k must be odd to avoid the possibility of kmers being reverse complements of themselves"
    @assert 3 <= k <= 63 "k must be at least 3 to allow for kmer overlaps and less than 64 to be compatible with type limits"
    # NOTE: kmer_type, edgemer_type, edge_orientations and canonical_kmers are only
    # referenced by the commented-out section below.
    kmer_type = BioSequences.BigDNAMer{k}
    edgemer_type = BioSequences.BigDNAMer{k+1}
    # one counter name per (src_is_canonical, dst_is_canonical) combination
    edge_orientations = Symbol.([
        "true_true",
        "true_false",
        "false_true",
        "false_false"
    ])
    canonical_kmers = Set{kmer_type}()
    # allow vertices to be looked up as graph[value, :identifier]
    MetaGraphs.set_indexing_prop!(graph, :identifier)
    for entity in keys(entity_file_map)
        # f_type = "Genome"
        fastx = entity_file_map[entity]["genome"]
        # NOTE(review): this hashes whatever Mycelia.open_fastx returns — confirm that is
        # a stream of the file contents rather than a record iterator.
        fastx_hash = SHA.bytes2hex(SHA.sha256(Mycelia.open_fastx(fastx)))
        @show fastx_hash
#         Graphs.add_vertex!(graph)
#         fasta_node = Graphs.nv(graph)
#         # node types are camel case
#         MetaGraphs.set_prop!(graph, fasta_node, :TYPE, "Fasta")
#         # node and edge properties are lowercase
#         # add entity identifier as a property that can be queried on
#         MetaGraphs.set_prop!(graph, fasta_node, :path, fastx)
#         # for identifier, set 
#         MetaGraphs.set_prop!(graph, fasta_node, :identifier, entity)
#         for record in Mycelia.open_fastx(fastx)
#             Graphs.add_vertex!(graph)
#             record_node = Graphs.nv(graph)
            
#             MetaGraphs.set_prop!(graph, record_node, :TYPE, "FastaRecord")
#             MetaGraphs.set_prop!(graph, record_node, :identifier, FASTX.identifier(record))
#             MetaGraphs.set_prop!(graph, record_node, :description, FASTX.description(record))
#             MetaGraphs.set_prop!(graph, record_node, :sequence, FASTX.sequence(record))
#             if typeof(FASTX.sequence(record)) == BioSequences.LongDNASeq
#                 sequence_type = "dna"
#             elseif typeof(FASTX.sequence(record)) == BioSequences.LongAminoAcidSeq
#                 sequence_type = "aa"
#             elseif typeof(FASTX.sequence(record)) == BioSequences.LongRNASeq
#                 sequence_type = "rna"
#             end
#             MetaGraphs.set_prop!(graph, record_node, :sequence_type, sequence_type)
#             edge = Graphs.Edge(fasta_node, record_node)
#             Graphs.add_edge!(graph, edge)
#             # edge types are all caps
#             MetaGraphs.set_prop!(graph, edge, :TYPE, "CONTAINS_RECORD")
#             # for kmer in collect(BioSequences.each(kmer_type, FASTX.sequence(record)))[1:2^3]
#             for kmer in BioSequences.each(kmer_type, FASTX.sequence(record))
#                 canonical_kmer = BioSequences.canonical(kmer)
#                 if canonical_kmer in canonical_kmers
#                     kmer_node = graph[canonical_kmer, :identifier]
#                 else
#                     push!(canonical_kmers, canonical_kmer)
#                     Graphs.add_vertex!(graph)
#                     kmer_node = Graphs.nv(graph)
#                     MetaGraphs.set_prop!(graph, kmer_node, :TYPE, "Kmer")
#                     MetaGraphs.set_prop!(graph, kmer_node, :identifier, BioSequences.canonical(kmer))
#                     MetaGraphs.set_prop!(graph, kmer_node, :sequence_type, sequence_type)
#                 end
#                 edge = Graphs.Edge(record_node, kmer_node)
#                 if !Graphs.has_edge(graph, edge)
#                     Graphs.add_edge!(graph, edge)
#                     MetaGraphs.set_prop!(graph, edge, :TYPE, "CONTAINS_KMER")
#                     # can't use vectors as properties :(
#                     MetaGraphs.set_prop!(graph, edge, :count, 0)
#                     MetaGraphs.set_prop!(graph, edge, Symbol(true), 0)
#                     MetaGraphs.set_prop!(graph, edge, Symbol(false), 0)
#                 end
#                 is_canonical = Symbol(BioSequences.iscanonical(kmer.fw))
#                 current_count = MetaGraphs.get_prop(graph, edge, is_canonical) + 1
#                 MetaGraphs.set_prop!(graph, edge, is_canonical, current_count)

#                 current_count = MetaGraphs.get_prop(graph, edge, Symbol(true)) + MetaGraphs.get_prop(graph, edge, Symbol(false))
#                 MetaGraphs.set_prop!(graph, edge, :count, current_count)                
#             end
#             # for edgemer in collect(BioSequences.each(edgemer_type, FASTX.sequence(record)))[1:2^3-1]
#             for edgemer in BioSequences.each(edgemer_type, FASTX.sequence(record))
#                 src, canonical_src, src_is_canonical, dst, canonical_dst, dst_is_canonical = assess_edgemer(edgemer.fw)
#                 if canonical_dst < canonical_src
#                     src, canonical_src, src_is_canonical, dst, canonical_dst, dst_is_canonical = assess_edgemer(edgemer.bw)
#                 end
#                 src_index, dst_index = graph[canonical_src, :identifier], graph[canonical_dst, :identifier]
#                 edge = Graphs.Edge(src_index, dst_index)
#                 if !Graphs.has_edge(graph, edge)
#                     Graphs.add_edge!(graph, edge)
#                     MetaGraphs.set_prop!(graph, edge, :TYPE, "CONNECTS_TO")
#                     for o in edge_orientations
#                         MetaGraphs.set_prop!(graph, edge, o, 0)
#                     end
#                     MetaGraphs.set_prop!(graph, edge, :count, 0)
#                 end
#                 orientations = Symbol("$(src_is_canonical)_$(dst_is_canonical)")
#                 current_count = MetaGraphs.get_prop(graph, edge, orientations) + 1
#                 MetaGraphs.set_prop!(graph, edge, orientations, current_count)
                
#                 current_count = sum(MetaGraphs.get_prop(graph, edge, o) for o in edge_orientations)
#                 MetaGraphs.set_prop!(graph, edge, :count, current_count) 
#             end
#         end
    end
    return graph
end

fastx_to_metagraph (generic function with 1 method)

In [8]:
"""
    upload_node_type_over_url(node_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE, window_size=100)

Upload every vertex of `graph` whose `:TYPE` property equals `node_type` to Neo4j.

Vertices are sent in batches of `window_size`. Each batch becomes a single Cypher
statement containing one `MERGE (nodeN:<node_type> {…})` clause per vertex, and is
executed by running the command returned by `Mycelia.cypher`. Every vertex property
except `:TYPE` is written as a single-quoted Cypher string literal; backslashes and
single quotes inside values are escaped so they cannot terminate the literal.

# Arguments
- `node_type`: value of `:TYPE` to select; also used as the node label.
- `graph`: metagraph whose vertices all carry a `:TYPE` property.
- `ADDRESS`, `USERNAME`, `PASSWORD`, `DATABASE`: forwarded to `Mycelia.cypher`.
- `window_size`: number of vertices per statement (positional, default `100`).
"""
function upload_node_type_over_url(node_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE, window_size=100)
    # Escape backslashes first so the backslashes introduced for quotes are not doubled.
    cypher_escape(value) = replace(replace(string(value), "\\" => "\\\\"), "'" => "\\'")

    vertices_of_type = [v for v in Graphs.vertices(graph) if graph.vprops[v][:TYPE] == node_type]

    V = length(vertices_of_type)
    windows = [i:min(i + window_size - 1, V) for i in 1:window_size:V]

    ProgressMeter.@showprogress for window in windows
        cmds = String[]
        for (i, v) in enumerate(vertices_of_type[window])
            # :TYPE is carried by the node label, so it is not repeated as a property
            params = [
                "$(string(param)):'$(cypher_escape(value))'"
                for (param, value) in MetaGraphs.props(graph, v) if param != :TYPE
            ]
            joint_params = join(params, ", ")
            # the per-batch index keeps the Cypher variable names unique within one statement
            push!(cmds, "MERGE (node$(i):$(node_type) {$(joint_params)})")
        end
        cmd = join(cmds, ' ')
        cypher_cmd = Mycelia.cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)
        run(cypher_cmd)
    end
end

upload_node_type_over_url (generic function with 2 methods)

In [74]:
"""
    upload_edge_type_over_url(src_type, dst_type, edge_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE; window_size=100)

Upload every edge of `graph` that runs from a vertex of `:TYPE == src_type` to a
vertex of `:TYPE == dst_type` to Neo4j.

Edges are sent in batches of `window_size`. For each edge the statement MERGEs the
source and destination nodes by label and `identifier`, then MERGEs the relationship
between them, carrying every edge property except `:TYPE` as a single-quoted Cypher
string literal. Backslashes and single quotes inside values are escaped so they
cannot terminate the literal.

# Arguments
- `src_type`, `dst_type`: `:TYPE` values selecting the edge endpoints.
- `edge_type`: expected `:TYPE` of every selected edge (asserted before uploading).
- `graph`: metagraph whose vertices carry `:TYPE` and `:identifier` properties.
- `ADDRESS`, `USERNAME`, `PASSWORD`, `DATABASE`: forwarded to `Mycelia.cypher`.

# Keywords
- `window_size`: number of edges per statement (default `100`).
"""
function upload_edge_type_over_url(src_type, dst_type, edge_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE; window_size=100)
    # Escape backslashes first so the backslashes introduced for quotes are not doubled.
    cypher_escape(value) = replace(replace(string(value), "\\" => "\\\\"), "'" => "\\'")

    src_nodes = filter(v -> MetaGraphs.get_prop(graph, v, :TYPE) == src_type, Graphs.vertices(graph))
    # a Set gives constant-time membership tests in the neighbour filter below
    dst_nodes = Set(filter(v -> MetaGraphs.get_prop(graph, v, :TYPE) == dst_type, Graphs.vertices(graph)))

    edges_to_upload = [
        Graphs.Edge(src_node, dst_node)
        for src_node in src_nodes
        for dst_node in Graphs.outneighbors(graph, src_node)
        if dst_node in dst_nodes
    ]
    # fail before anything is uploaded if an unexpected edge type links the two node types
    for this_edge in edges_to_upload
        @assert MetaGraphs.get_prop(graph, this_edge, :TYPE) == edge_type
    end

    N = length(edges_to_upload)
    windows = [i:min(i + window_size - 1, N) for i in 1:window_size:N]

    ProgressMeter.@showprogress for window in windows
        cmds = String[]
        for (i, e) in enumerate(edges_to_upload[window])
            edge_props = MetaGraphs.props(graph, e)
            src_props = MetaGraphs.props(graph, e.src)
            dst_props = MetaGraphs.props(graph, e.dst)

            params = [
                "$(string(param)):'$(cypher_escape(value))'"
                for (param, value) in edge_props if param != :TYPE
            ]
            joint_params = join(params, ", ")
            # an empty property map is omitted from the relationship pattern
            relationship_props = isempty(joint_params) ? "" : " {$(joint_params)}"

            cmd = string(
                "MERGE (src$(i):$(src_props[:TYPE]) {identifier: '$(cypher_escape(src_props[:identifier]))'}) ",
                "MERGE (dst$(i):$(dst_props[:TYPE]) {identifier: '$(cypher_escape(dst_props[:identifier]))'}) ",
                "MERGE (src$(i))-[r$(i):$(edge_props[:TYPE])$(relationship_props)]->(dst$(i))",
            )
            # the whole batch is sent as one line
            cmd = replace(cmd, '\n' => ' ')
            push!(cmds, cmd)
        end
        cmd = join(cmds, ' ')
        cypher_cmd = Mycelia.cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)
        run(cypher_cmd)
    end
end

upload_edge_type_over_url (generic function with 1 method)

In [10]:
"""
    upload_node_type_over_read_csv(node_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE, window_size=1)

Upload every vertex of `graph` whose `:TYPE` property equals `node_type` to Neo4j
through `LOAD CSV`.

For each selected vertex the function
1. writes the vertex properties to `<DIR>/temp_upload/node<v>.tsv`,
2. copies that file to `gs://neo4j-upload/` with `gsutil cp`,
3. obtains a signed URL for it with `gsutil signurl`, and
4. runs a `LOAD CSV … MERGE` statement built by `Mycelia.cypher`, mapping every
   property except `:TYPE` from the corresponding TSV column.

Relies on the notebook-level global `DIR`, on `gsutil` being on `PATH`, and on the
signing key at `~/.config/gcloud/url-signer-key.json`. Does nothing when no vertex
has the requested type.

# Arguments
- `node_type`: value of `:TYPE` to select; also used as the node label.
- `graph`: metagraph whose vertices all carry a `:TYPE` property.
- `ADDRESS`, `USERNAME`, `PASSWORD`, `DATABASE`: forwarded to `Mycelia.cypher`.
- `window_size`: batch size used to group vertices (positional, default `1`);
  vertices are still uploaded one file at a time.
"""
function upload_node_type_over_read_csv(node_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE, window_size=1)
    vertices_of_type = [v for v in Graphs.vertices(graph) if graph.vprops[v][:TYPE] == node_type]

    V = length(vertices_of_type)
    windows = [i:min(i + window_size - 1, V) for i in 1:window_size:V]

    temp_upload_dir = mkpath(joinpath(DIR, "temp_upload"))
    # gcloud iam service-accounts keys create ~/.config/gcloud/url-signer-key.json --iam-account="928365250020-compute@developer.gserviceaccount.com"
    signer_credential_path = "$(homedir())/.config/gcloud/url-signer-key.json"

    ProgressMeter.@showprogress for window in windows
        for v in vertices_of_type[window]
            node_props = MetaGraphs.props(graph, v)

            f = "node$v.tsv"
            local_f_path = "$(temp_upload_dir)/$(f)"
            uCSV.write(local_f_path, DataFrames.DataFrame(node_props), delim='\t')

            remote_f_path = "gs://neo4j-upload/$(f)"
            run(`gsutil cp $(local_f_path) $(remote_f_path)`)

            sign_url_cmd = `gsutil signurl $(signer_credential_path) $(remote_f_path)`
            # the do-block form closes the process stream and surfaces a failed exit status
            signed_url_table = open(sign_url_cmd) do io
                DataFrames.DataFrame(uCSV.read(io, header=1, delim='\t')...)
            end
            signed_url = signed_url_table[1, "Signed URL"]

            # :TYPE becomes the node label, every other property is read from its column
            parameters = ["$(p): row.$(p)" for p in keys(node_props) if p != :TYPE]
            parameters = "{" * join(parameters, ", ") * "}"

            cmd =
            """
            LOAD CSV WITH HEADERS FROM '$(signed_url)' AS row FIELDTERMINATOR '\t'
            MERGE (node:$(node_type) $(parameters))
            """

            cmd = rstrip(replace(cmd, '\n' => ' '))
            cypher_cmd = Mycelia.cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)
            run(cypher_cmd)
        end
    end
end

upload_node_type_over_read_csv (generic function with 2 methods)

In [11]:
"""
    add_node_table_to_graph!(; graph, table)

Add one vertex per row of `table` to `graph`, or reuse the vertex already indexed
under the row's `identifier`, and copy every non-missing column onto that vertex
as a property named after the column.

A property that already exists on the vertex is not overwritten; it is asserted to
equal the incoming value instead. Column names containing whitespace raise an error.
Returns the mutated `graph`.
"""
function add_node_table_to_graph!(;graph, table)
    ProgressMeter.@showprogress for row in DataFrames.eachrow(table)
        # look the vertex up through the :identifier index, appending a new one if absent
        v = if has_identifier(graph, row[:identifier])
            graph[row[:identifier], :identifier]
        else
            Graphs.add_vertex!(graph)
            Graphs.nv(graph)
        end

        for col in names(row)
            occursin(r"\s+", col) && error("space found in parameter name, please sanitize")
            ismissing(row[col]) && continue

            property = Symbol(col)
            if !MetaGraphs.has_prop(graph, v, property)
                MetaGraphs.set_prop!(graph, v, property, row[col])
            else
                # an existing value must agree with the table
                current_value = MetaGraphs.props(graph, v)[property]
                @assert row[col] == current_value
            end
        end
    end
    return graph
end

add_node_table_to_graph! (generic function with 1 method)

In [12]:
"""
    add_edge_table_to_graph!(; graph, table)

Add one edge per row of `table` to `graph`. The `src` and `dst` columns hold vertex
identifiers that are resolved through the `:identifier` index; every other
non-missing column is copied onto the edge as a property named after the column.

A property that already exists on the edge is not overwritten; it is asserted to
equal the incoming value instead. Column names containing whitespace raise an error.
Returns the mutated `graph`.
"""
function add_edge_table_to_graph!(;graph, table)
    ProgressMeter.@showprogress for row in DataFrames.eachrow(table)
        # edge_type = row[:TYPE]
        edge = Graphs.Edge(graph[row[:src], :identifier], graph[row[:dst], :identifier])
        Graphs.has_edge(graph, edge) || Graphs.add_edge!(graph, edge)

        for col in names(row)
            # the endpoint columns describe the edge itself, not its properties
            col in ("src", "dst") && continue
            occursin(r"\s+", col) && error("space found in parameter name, please sanitize")
            ismissing(row[col]) && continue

            property = Symbol(col)
            if !MetaGraphs.has_prop(graph, edge, property)
                MetaGraphs.set_prop!(graph, edge, property, row[col])
            else
                # an existing value must agree with the table
                current_value = MetaGraphs.props(graph, edge)[property]
                @assert row[col] == current_value
            end
        end
    end
    return graph
end

add_edge_table_to_graph! (generic function with 1 method)

# Script

## obtain NCBI taxonomy

In [13]:
# Fetch the NCBI taxonomy dump once and unpack it next to the tarball.
taxdump_url = "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
taxdump_local_tarball = "$(DIR)/$(basename(taxdump_url))"

if !isfile(taxdump_local_tarball)
    # same downloader as the genome downloads further down in this notebook
    Downloads.download(taxdump_url, taxdump_local_tarball)
end

taxdump_out = replace(taxdump_local_tarball, ".tar.gz" => "")
if !isdir(taxdump_out)
    mkpath(taxdump_out)
    try
        run(`tar -xvzf $(taxdump_local_tarball) -C $(taxdump_out)`)
    catch
        # remove the partial output so the isdir guard does not skip extraction next time
        rm(taxdump_out, recursive=true, force=true)
        rethrow()
    end
end

readdir(taxdump_out)

9-element Vector{String}:
 "citations.dmp"
 "delnodes.dmp"
 "division.dmp"
 "gc.prt"
 "gencode.dmp"
 "merged.dmp"
 "names.dmp"
 "nodes.dmp"
 "readme.txt"

In [14]:
#     Here we will create an in-memory dataframe to capture the contents of the names.dmp file

# Taxonomy names file (names.dmp):
# 	tax_id					-- the id of node associated with this name
# 	name_txt				-- name itself
# 	unique name				-- the unique variant of this name if name not unique
# 	name class				-- (synonym, common name, ...)

# One row per record of names.dmp; every field is kept as text.
names_dmp = DataFrames.DataFrame(
    tax_id = String[],
    name_txt = String[],
    unique_name = String[],
    name_class = String[]
)
# Records end with "\t|\n" and fields are separated by "\t|\t".
# read(path, String) opens and closes the file itself, so no handle is left open.
ProgressMeter.@showprogress for line in split(read("$(taxdump_out)/names.dmp", String), "\t|\n")
    # the terminator after the final record leaves one empty trailing piece
    isempty(line) && continue
    (tax_id, name_txt, unique_name, name_class) = split(line, "\t|\t")
    # tax_id = parse(Int, tax_id_string)
    row = (;tax_id, name_txt, unique_name, name_class)
    push!(names_dmp, row)
end
names_dmp
# all are unique
# unique!(names_dmp)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:25[39m39m


Unnamed: 0_level_0,tax_id,name_txt
Unnamed: 0_level_1,String,String
1,1,all
2,1,root
3,2,Bacteria
4,2,bacteria
5,2,eubacteria
6,2,Monera
7,2,Procaryotae
8,2,Prokaryotae
9,2,Prokaryota
10,2,prokaryote


In [15]:
# Pivot names_dmp from one row per name to one row per tax_id:
# the result has a "tax_id" column plus one nullable text column per distinct name_class.
names_dmp_transformed = DataFrames.DataFrame(
    Dict(
    [nc => Union{Missing, String}[] for nc in unique(names_dmp[!, "name_class"])]...,
    "tax_id" => String[]
    )
)

ProgressMeter.@showprogress for identifier_group in DataFrames.groupby(names_dmp, "tax_id")
    # start every column as missing; only name classes present for this tax_id are filled in
    fields = Dict{String, Union{String, Missing}}(n => missing for n in names(names_dmp_transformed))
    for name_class in DataFrames.groupby(identifier_group, "name_class")
        nc = name_class[1, "name_class"]
        # pool name_txt and unique_name, drop empties and duplicates, and pack them into
        # a single ";;;"-separated string
        all_names = unique(filter(n -> !isempty(n), (name_class[!, "name_txt"]...,name_class[!, "unique_name"]...)))
        fields[nc] = join(all_names, ";;;")
    end
    fields["tax_id"] = identifier_group[1, "tax_id"]
    # @show fields
    push!(names_dmp_transformed, fields)
end
names_dmp_transformed

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:06:09[39mm:59[39m


Unnamed: 0_level_0,acronym,authority
Unnamed: 0_level_1,String?,String?
1,missing,missing
2,missing,missing
3,missing,Azorhizobium Dreyfus et al. 1988 emend. Lang et al. 2013
4,missing,Azorhizobium caulinodans Dreyfus et al. 1988
5,missing,Buchnera aphidicola Munson et al. 1991
6,missing,"Cellvibrio (ex Winogradsky 1929) Blackall et al. 1986 emend. Humphry et al. 2003;;;Cellvibrio (ex Winogradsky 1929) Blackall et al. 1986 emend. Suarez et al. 2014;;;""Cellvibrio"" Winogradsky 1929"
7,missing,"""Cellulomonas gilvus"" (Hulcher and King 1958) Christopherson et al. 2013;;;""Cellvibrio gilvus"" Hulcher and King 1958"
8,missing,Dictyoglomus Saiki et al. 1985
9,missing,Dictyoglomus thermophilum Saiki et al. 1985
10,missing,Methylophilus Jenkins et al. 1987


In [16]:
#     Here we can see that there are divisions projected onto the tree that will allow easy grouping by taxonomic "group"s such as primates, viruses, phages, etc.

# One row per record of division.dmp; every field is kept as text.
division_table = DataFrames.DataFrame(
    division_id = String[],
    division_name = String[],
    division_cde = String[],
    notes = String[]
)
# Records end with "\t|\n" and fields are separated by "\t|\t".
# read(path, String) opens and closes the file itself, so no handle is left open.
for line in split(read("$(taxdump_out)/division.dmp", String), "\t|\n")
    # the terminator after the final record leaves one empty trailing piece
    isempty(line) && continue
    # the file lists the code before the name; the named tuple is matched to the table
    # columns by name, so the differing column order above is harmless
    (division_id, division_cde, division_name, notes) = split(line, "\t|\t")
    row = (;division_id, division_cde, division_name, notes)
    push!(division_table, row)
end
division_table

Unnamed: 0_level_0,division_id,division_name,division_cde,notes
Unnamed: 0_level_1,String,String,String,String
1,0,Bacteria,BCT,
2,1,Invertebrates,INV,
3,2,Mammals,MAM,
4,3,Phages,PHG,
5,4,Plants and Fungi,PLN,
6,5,Primates,PRI,
7,6,Rodents,ROD,
8,7,Synthetic and Chimeric,SYN,
9,8,Unassigned,UNA,No species nodes should inherit this division assignment
10,9,Viruses,VRL,


In [17]:
# One row per record of nodes.dmp; every field is kept as text.
node_table = DataFrames.DataFrame(
    tax_id = String[],
    parent_tax_id = String[],
    rank = String[],
    embl_code = String[],
    division_id = String[]
)
# Records end with "\t|\n" and fields are separated by "\t|\t".
# read(path, String) opens and closes the file itself, so no handle is left open.
ProgressMeter.@showprogress for line in split(read("$(taxdump_out)/nodes.dmp", String), "\t|\n")
    # the terminator after the final record leaves one empty trailing piece
    isempty(line) && continue
    # destructuring binds the first five fields; any further fields on the line are ignored
    (tax_id, parent_tax_id, rank, embl_code, division_id) = split(line, "\t|\t")
    row = (;tax_id, parent_tax_id, rank, embl_code, division_id)
    push!(node_table, row)
end
node_table

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:54[39m:12[39m


Unnamed: 0_level_0,tax_id,parent_tax_id,rank,embl_code,division_id
Unnamed: 0_level_1,String,String,String,String,String
1,1,1,no rank,,8
2,2,131567,superkingdom,,0
3,6,335928,genus,,0
4,7,6,species,AC,0
5,9,32199,species,BA,0
6,10,1706371,genus,,0
7,11,1707,species,CG,0
8,13,203488,genus,,0
9,14,13,species,DT,0
10,16,32011,genus,,0


In [18]:
joint_node_metadata = 
DataFrames.outerjoin(
    names_dmp_transformed, 
    DataFrames.outerjoin(node_table, division_table, on="division_id"),
    on="tax_id"
)

Unnamed: 0_level_0,acronym,authority
Unnamed: 0_level_1,String?,String?
1,missing,missing
2,missing,missing
3,missing,Azorhizobium Dreyfus et al. 1988 emend. Lang et al. 2013
4,missing,Azorhizobium caulinodans Dreyfus et al. 1988
5,missing,Buchnera aphidicola Munson et al. 1991
6,missing,"Cellvibrio (ex Winogradsky 1929) Blackall et al. 1986 emend. Humphry et al. 2003;;;Cellvibrio (ex Winogradsky 1929) Blackall et al. 1986 emend. Suarez et al. 2014;;;""Cellvibrio"" Winogradsky 1929"
7,missing,"""Cellulomonas gilvus"" (Hulcher and King 1958) Christopherson et al. 2013;;;""Cellvibrio gilvus"" Hulcher and King 1958"
8,missing,Dictyoglomus Saiki et al. 1985
9,missing,Dictyoglomus thermophilum Saiki et al. 1985
10,missing,Methylophilus Jenkins et al. 1987


In [19]:
# joint_node_metadata[!, "identifier"] .= map(id -> "tax_id_$(id)", joint_node_metadata[!, "tax_id"])
joint_node_metadata[!, "identifier"] .= joint_node_metadata[!, "tax_id"]

2411405-element Vector{String}:
 "1"
 "2"
 "6"
 "7"
 "9"
 "10"
 "11"
 "13"
 "14"
 "16"
 "17"
 "18"
 "19"
 ⋮
 "2929560"
 "2929561"
 "2929562"
 "2929563"
 "2929564"
 "2929565"
 "2929751"
 "2929752"
 "2929762"
 "2929835"
 "2929839"
 "2929841"

In [20]:
joint_node_metadata[!, "TYPE"] .= "Taxonomy"

2411405-element Vector{String}:
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 ⋮
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"

In [21]:
node_table = joint_node_metadata
for re in (r"\s+", r"-")
    for col in filter(n -> occursin(re, n), names(node_table))
        rename_map = col => replace(col, re => "_")
        @show rename_map
        DataFrames.rename!(node_table, rename_map)
    end
end

rename_map = "blast name" => "blast_name"
rename_map = "common name" => "common_name"
rename_map = "equivalent name" => "equivalent_name"
rename_map = "genbank acronym" => "genbank_acronym"
rename_map = "genbank common name" => "genbank_common_name"
rename_map = "scientific name" => "scientific_name"
rename_map = "type material" => "type_material"
rename_map = "in-part" => "in_part"


In [22]:
ProgressMeter.@showprogress for col in names(node_table)
    # strip away single-quotes
    node_table[!, col] .= map(x -> x isa AbstractString ? replace(x, "'" => "") : x, node_table[!, col])
end

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:07[39m


In [23]:
graph = MetaGraphs.MetaDiGraph()
MetaGraphs.set_indexing_prop!(graph, :identifier)

Set{Symbol} with 1 element:
  :identifier

In [24]:
add_node_table_to_graph!(graph=graph, table=joint_node_metadata[!, DataFrames.Not("parent_tax_id")])

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:02:25[39m


{2411405, 0} directed Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [28]:
edge_table = DataFrames.rename(joint_node_metadata[!, ["parent_tax_id", "identifier"]], ["identifier" => "dst", "parent_tax_id" => "src"])
edge_table[!, "TYPE"] .= "IS_PARENT_OF"
edge_table

Unnamed: 0_level_0,src,dst,TYPE
Unnamed: 0_level_1,String,String,String
1,1,1,IS_PARENT_OF
2,131567,2,IS_PARENT_OF
3,335928,6,IS_PARENT_OF
4,6,7,IS_PARENT_OF
5,32199,9,IS_PARENT_OF
6,1706371,10,IS_PARENT_OF
7,1707,11,IS_PARENT_OF
8,203488,13,IS_PARENT_OF
9,13,14,IS_PARENT_OF
10,32011,16,IS_PARENT_OF


In [29]:
add_edge_table_to_graph!(graph=graph, table=edge_table)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:45[39m0:53[39m


{2411405, 2411405} directed Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [None]:
# subset nodes - find all nodes above node of interest as well as all nodes below

In [68]:
# Edges in `graph` point from parent to child (src = parent_tax_id, dst = identifier),
# so a breadth-first search along incoming edges from the root taxon walks towards its
# ancestors, and one along outgoing edges walks towards its descendants.
in_bfs_tree = Graphs.bfs_tree(graph, graph["$(root_tax_id)", :identifier], dir=:in)
# Rebuild each tree as a subgraph of the metagraph so vertex properties can be read back.
in_subgraph, node_map = Graphs.induced_subgraph(graph, collect(Graphs.edges(in_bfs_tree)))
in_node_identifiers = [MetaGraphs.props(in_subgraph, v)[:identifier] for v in Graphs.vertices(in_subgraph)]
out_bfs_tree = Graphs.bfs_tree(graph, graph["$(root_tax_id)", :identifier], dir=:out)
# NOTE: node_map is overwritten here; the following cells use only the identifiers.
out_subgraph, node_map = Graphs.induced_subgraph(graph, collect(Graphs.edges(out_bfs_tree)))
out_node_identifiers = [MetaGraphs.props(out_subgraph, v)[:identifier] for v in Graphs.vertices(out_subgraph)]
# Identifiers found in either direction, without duplicates.
identifiers_to_keep = union(in_node_identifiers, out_node_identifiers)

407-element Vector{String}:
 "10239"
 "1"
 "10662"
 "28883"
 "10663"
 "1198136"
 "2731619"
 "2731341"
 "2731360"
 "2731618"
 "10664"
 "10665"
 "45406"
 ⋮
 "2025823"
 "2656520"
 "2716729"
 "2591056"
 "2591063"
 "1913049"
 "1863008"
 "1863009"
 "2024321"
 "2321390"
 "2691085"
 "2052930"

In [69]:
vertices_to_keep = [graph[i, :identifier] for i in identifiers_to_keep]

407-element Vector{Int64}:
    8265
       1
    8584
   12478
    8585
  933601
 2272589
 2272385
 2272404
 2272588
    8586
    8587
   25681
       ⋮
 1667767
 2214139
 2259927
 2155753
 2155760
 1567900
 1522687
 1522688
 1666423
 1909567
 2240787
 1689598

In [70]:
filtered_subgraph, vertex_map = Graphs.induced_subgraph(graph, vertices_to_keep)

({407, 407} directed Int64 metagraph with Float64 weights defined by :weight (default weight 1.0), [8265, 1, 8584, 12478, 8585, 933601, 2272589, 2272385, 2272404, 2272588  …  2259927, 2155753, 2155760, 1567900, 1522687, 1522688, 1666423, 1909567, 2240787, 1689598])

In [71]:
# 5 days?! @ 10 row bundles
# 2 days @ 100 row bundles
node_type_to_upload = "Taxonomy"
upload_node_type_over_url(node_type_to_upload, filtered_subgraph, ADDRESS, USERNAME, PASSWORD, DATABASE, 100)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:23[39m


In [77]:
src_type = dst_type = "Taxonomy"
edge_type = "IS_PARENT_OF"
upload_edge_type_over_url(src_type, dst_type, edge_type, filtered_subgraph, ADDRESS, USERNAME, PASSWORD, DATABASE)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:30[39m


In [None]:
child_tax_ids = vcat(Mycelia.taxonomic_id_to_children(root_tax_id), root_tax_id)

In [None]:
# TODO
# here is where we should apply a filter where host == Escherichia
# need to load host information into neo4j taxonomy

In [None]:
# refseq_metadata = Mycelia.load_refseq_metadata()
ncbi_metadata = Mycelia.load_genbank_metadata()

In [None]:
show(ncbi_metadata[1:1, :], allcols=true)

In [None]:
# Build one Boolean mask per criterion, then AND them together row-wise.
# keep assemblies whose taxid is the root taxon or one of its children
tax_id_filter = map(taxid -> taxid in child_tax_ids, ncbi_metadata[!, "taxid"])
# keep assemblies whose organism name mentions the host (case-insensitive match)
is_right_host = map(x -> occursin(Regex(host, "i"), x), ncbi_metadata[!, "organism_name"])
# keep assemblies with an empty "excluded_from_refseq" field
not_excluded = ncbi_metadata[!, "excluded_from_refseq"] .== ""
is_full = ncbi_metadata[!, "genome_rep"] .== "Full"
# assembly_levels = ["Complete Genome"]
assembly_levels = ["Complete Genome", "Chromosome"]
# assembly_levels = ["Complete Genome", "Chromosome", "Scaffold"]
# assembly_levels = ["Complete Genome", "Chromosome", "Scaffold", "Contig"]
assembly_level_filter = map(x -> x in assembly_levels, ncbi_metadata[!, "assembly_level"])
full_filter = is_full .& not_excluded .& assembly_level_filter .& tax_id_filter .& is_right_host
# number of assemblies that pass every criterion
count(full_filter)

In [None]:
# TODO
# here is another place we could enforce host == escherichia
# we'll use a manual filter as a temporary solution

In [None]:
# Subset the metadata to the assemblies that passed every filter. This has to run
# before the inspection cell below, which reads the subset.
ncbi_metadata_of_interest = ncbi_metadata[full_filter, :]

In [None]:
ncbi_metadata_of_interest[!, "ftp_path"]

In [None]:
# https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=genbank&id=GCA_021354775

In [None]:
# for col in names(ncbi_metadata_of_interest)
#     @show col, ncbi_metadata_of_interest[1, col]
# end

In [None]:
N_FASTAS = 2

In [None]:
# Maps assembly accession => (file type => local path) for every file that is available
# locally. Created here so that it exists before the download loop below fills it in.
entity_file_maps = Dict()

In [None]:
# `first(df, n)` returns at most n rows, so this also works when fewer than N_FASTAS
# assemblies passed the filter.
ProgressMeter.@showprogress for row in DataFrames.eachrow(first(ncbi_metadata_of_interest, N_FASTAS))
    entity_id = row["# assembly_accession"]
    entity_file_maps[entity_id] = Dict()
    # generally will want genome + GFF || genbank
    # everything else can be derived from these inputs
    for (file_type, extension) in [
            "genome" => "genomic.fna.gz",
            "annotations" => "genomic.gff.gz",
            "genbank" => "genomic.gbff.gz",
            # "proteins" => "protein.faa.gz",
        ]
        # files are grouped into one directory per file extension
        outdir = mkpath(joinpath(DIR, extension))
        url = Mycelia.ncbi_ftp_path_to_url(row["ftp_path"], extension)
        outfile = joinpath(outdir, basename(url))
        if !isfile(outfile)
            # best effort: a file that cannot be downloaded is reported and left out of the map
            try
                Downloads.download(url, outfile)
            catch e
                showerror(stdout, e)
            end
        end
        if isfile(outfile)
            entity_file_maps[entity_id][file_type] = outfile
        end
    end
end
entity_file_maps

In [None]:
# graph = Mycelia.fastx_to_kmer_graph(BioSequences.BigDNAMer{kmer_size}, fastx_files)

In [None]:
# use basename of files as identifiers

In [None]:
# kmer_size = Mycelia.assess_dnamer_saturation(fastx_files)
kmer_size = 7

In [None]:
# Construct `graph` from the downloaded files and report how long the call takes.
# NOTE(review): fastx_to_metagraph is called unqualified and is not defined in this
# part of the notebook — confirm it is defined before this cell is run.
@time graph = fastx_to_metagraph(kmer_size, entity_file_maps)

In [None]:
# Base path (inside DIR) for on-disk copies of the graph, tagged with the root
# taxon id and the k-mer size. The save / GFA export / load calls remain disabled.
graph_outfile = string(DIR, "/root-tax-id_", root_tax_id, ".k_", kmer_size, ".genome-graph")
# Mycelia.save_graph(graph, graph_outfile)
# Mycelia.graph_to_gfa(graph, graph_outfile * ".gfa")
# Mycelia.load_graph(graph_outfile)

In [None]:
# Cypher statement declaring that `identifier` must be unique across Kmer nodes;
# it is sent to the database and the round trip is timed.
cmd = "CREATE CONSTRAINT ON (k:Kmer) ASSERT k.identifier IS UNIQUE"
@time Mycelia.cypher(;
    address = ADDRESS,
    username = USERNAME,
    password = PASSWORD,
    database = DATABASE,
    cmd = cmd,
);

In [None]:
# Distinct values of the :TYPE property across all vertices of the graph.
# Uses the public MetaGraphs accessor rather than reaching into the graph's
# internal `vprops` field; every vertex is expected to carry a :TYPE property.
node_types = unique(MetaGraphs.get_prop(graph, v, :TYPE) for v in Graphs.vertices(graph))

In [None]:
# initialize graph

In [None]:
# add fasta records to graph

In [None]:
# add Fasta (file-level) nodes to Neo4j
# upload over URL

In [None]:
# Send the vertices of type "Fasta" from `graph` to the database identified by
# ADDRESS / USERNAME / PASSWORD / DATABASE.
# NOTE(review): upload_node_type_over_url is defined outside this section; the
# description above is inferred from its name and arguments — confirm.
upload_node_type_over_url("Fasta", graph, ADDRESS, USERNAME, PASSWORD, DATABASE)

In [None]:
# add fasta records to graph

In [None]:
# add fasta records to Neo4J
# batch file upload

In [None]:
# Send the vertices of type "FastaRecord" from `graph` to the database.
# NOTE(review): upload_node_type_over_read_csv is defined outside this section;
# presumably it uploads in bulk through a CSV file rather than per-request over
# the URL — confirm against its definition.
upload_node_type_over_read_csv("FastaRecord", graph, ADDRESS, USERNAME, PASSWORD, DATABASE)

In [None]:
# Edge selection for the next upload: edges of type "CONTAINS_RECORD" going from
# "Fasta" vertices to "FastaRecord" vertices. These globals are reassigned before
# each later edge upload.
src_type = "Fasta"
dst_type = "FastaRecord"
edge_type = "CONTAINS_RECORD"

In [None]:
# Upload the edges selected by src_type / dst_type / edge_type to the database.
# NOTE(review): upload_edge_type_over_url is defined outside this section; this is
# the only call that passes window_size (presumably a batch size) — the other
# edge uploads rely on the helper's default. Confirm that is intended.
upload_edge_type_over_url(src_type, dst_type, edge_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE, window_size=10)

In [None]:
# add kmers to graph

In [None]:
# add kmers to Neo4J
# add over URL

In [None]:
# Send the vertices of type "Kmer" from `graph` to the database.
# NOTE(review): upload_node_type_over_url is defined outside this section —
# description inferred from its name and arguments; confirm.
upload_node_type_over_url("Kmer", graph, ADDRESS, USERNAME, PASSWORD, DATABASE)

In [None]:
# add edges from records to kmers

In [None]:
# Upload the "CONTAINS_KMER" edges that go from "FastaRecord" vertices to "Kmer" vertices.
# Reassigns the src_type / dst_type / edge_type globals used by the previous edge upload.
src_type = "FastaRecord"
dst_type = "Kmer"
edge_type = "CONTAINS_KMER"
upload_edge_type_over_url(src_type, dst_type, edge_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE)

In [None]:
# upload to Neo4j

In [None]:
# Kmer to Kmer connections

In [None]:
# Upload the "CONNECTS_TO" edges between "Kmer" vertices.
# on merge with these Kmer to Kmer connections we'll need to increment the counts!
# any time we increase the number of genomes, the values will become out of date
# Reassigns the src_type / dst_type / edge_type globals used by the previous edge uploads.
src_type = "Kmer"
dst_type = "Kmer"
edge_type = "CONNECTS_TO"
upload_edge_type_over_url(src_type, dst_type, edge_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE)

In [None]:
# get the full list of kmers under the fasta of interest

In [None]:
# for each kmer, count the number of incoming "CONTAINS_KMER" edges

In [None]:
# divide by max count

In [None]:
# pull in annotations!!!

In [None]:
# Display the filtered NCBI metadata table that the entity metadata below is built from.
ncbi_metadata_of_interest

In [None]:
# Start the entity metadata table from the filtered NCBI metadata.
# Plain assignment: both names refer to the same table at this point (no copy is made).
entity_metadata_table = ncbi_metadata_of_interest

In [None]:
# Rename the accession column to "identifier", the column the graph-loading loop reads
# as each row's identifier. The non-mutating `rename` is used and its result is
# rebound to entity_metadata_table.
entity_metadata_table = DataFrames.rename(entity_metadata_table, "# assembly_accession" => "identifier")

In [None]:
# Set the "TYPE" column to "Entity" for every row.
entity_metadata_table[!, "TYPE"] .= "Entity"

In [None]:
# Display the table after the rename and the added "TYPE" column.
entity_metadata_table

In [None]:
# Print the first row with every column shown (no column truncation).
# `first(table, 1)` yields an empty table instead of throwing when there are no rows.
show(first(entity_metadata_table, 1), allcols=true)

In [None]:
# Inspect the graph's `indices` field.
# NOTE(review): presumably the set of property names usable for value => vertex lookup — confirm.
graph.indices

In [None]:
# List the keys stored under :identifier in the graph's `metaindex` field.
# NOTE(review): presumably these are the identifier values that currently map to a vertex — confirm.
keys(graph.metaindex[:identifier])

In [None]:
# Copy every row of entity_metadata_table onto a graph vertex, one property per column.
# - If the row's identifier is already indexed in the graph, that vertex is reused;
#   otherwise a new vertex is appended and used.
# - A property that is already set is never overwritten; a differing value only
#   produces a warning.
# NOTE(review): has_identifier is defined outside this section — assumed to return
# true when `identifier` is already indexed under :identifier in the graph.
for row in DataFrames.eachrow(entity_metadata_table)
    identifier = row["identifier"]
    # @show returns the value it prints, so the check runs only once per row
    identifier_exists = @show has_identifier(graph, identifier)
    if identifier_exists
        # look the vertex up through the :identifier index
        vertex = graph[identifier, :identifier]
    else
        Graphs.add_vertex!(graph)
        # the vertex that was just added has the highest index
        vertex = Graphs.nv(graph)
    end
    @show vertex
    for col in names(row)
        prop = Symbol(col)
        if !MetaGraphs.has_prop(graph, vertex, prop)
            MetaGraphs.set_prop!(graph, vertex, prop, row[col])
        else
            current_value = MetaGraphs.get_prop(graph, vertex, prop)
            # isequal instead of != so that `missing` values compare to a Bool
            # rather than raising a TypeError in the `if`
            if !isequal(current_value, row[col])
                @warn "property $(col)'s current value $(current_value) != this value $(row[col])"
            end
        end
    end
    @show MetaGraphs.props(graph, vertex)
end

In [None]:

    # NOTE(review): this cell holds only the body of a function — the `function ...`
    # header and any setup before this loop are missing from the cell. `graph`,
    # `entity_file_map`, `kmer_type`, `edgemer_type`, `canonical_kmers`,
    # `edge_orientations` and `assess_edgemer` are used below but not defined here;
    # confirm where they come from before running.
    #
    # For every entity: add one "Fasta" vertex for its genome file, one "FastaRecord"
    # vertex per record in that file, one "Kmer" vertex per distinct canonical k-mer,
    # and counted edges record–kmer ("CONTAINS_KMER") and kmer–kmer ("CONNECTS_TO").
    for entity in keys(entity_file_map)
        # f_type = "Genome"
        fastx = entity_file_map[entity]["genome"]
        Graphs.add_vertex!(graph)
        # the vertex that was just added has the highest index
        fasta_node = Graphs.nv(graph)
        # node types are camel case
        MetaGraphs.set_prop!(graph, fasta_node, :TYPE, "Fasta")
        # node and edge properties are lowercase
        # add entity identifier as a property that can be queried on
        MetaGraphs.set_prop!(graph, fasta_node, :path, fastx)
        MetaGraphs.set_prop!(graph, fasta_node, :identifier, entity)
        for record in Mycelia.open_fastx(fastx)
            Graphs.add_vertex!(graph)
            record_node = Graphs.nv(graph)
            
            MetaGraphs.set_prop!(graph, record_node, :TYPE, "FastaRecord")
            MetaGraphs.set_prop!(graph, record_node, :identifier, FASTX.identifier(record))
            MetaGraphs.set_prop!(graph, record_node, :description, FASTX.description(record))
            MetaGraphs.set_prop!(graph, record_node, :sequence, FASTX.sequence(record))
            # NOTE(review): sequence_type is only assigned for these three sequence
            # types; for any other type it is not assigned in this iteration — confirm
            # that cannot happen for the inputs used.
            if typeof(FASTX.sequence(record)) == BioSequences.LongDNASeq
                sequence_type = "dna"
            elseif typeof(FASTX.sequence(record)) == BioSequences.LongAminoAcidSeq
                sequence_type = "aa"
            elseif typeof(FASTX.sequence(record)) == BioSequences.LongRNASeq
                sequence_type = "rna"
            end
            MetaGraphs.set_prop!(graph, record_node, :sequence_type, sequence_type)
            # link the file-level vertex to this record
            edge = Graphs.Edge(fasta_node, record_node)
            Graphs.add_edge!(graph, edge)
            # edge types are all caps
            MetaGraphs.set_prop!(graph, edge, :TYPE, "CONTAINS_RECORD")
            # for kmer in collect(BioSequences.each(kmer_type, FASTX.sequence(record)))[1:2^3]
            for kmer in BioSequences.each(kmer_type, FASTX.sequence(record))
                canonical_kmer = BioSequences.canonical(kmer)
                # reuse the vertex of an already-seen canonical k-mer, otherwise create it
                if canonical_kmer in canonical_kmers
                    kmer_node = graph[canonical_kmer, :identifier]
                else
                    push!(canonical_kmers, canonical_kmer)
                    Graphs.add_vertex!(graph)
                    kmer_node = Graphs.nv(graph)
                    MetaGraphs.set_prop!(graph, kmer_node, :TYPE, "Kmer")
                    MetaGraphs.set_prop!(graph, kmer_node, :identifier, BioSequences.canonical(kmer))
                    MetaGraphs.set_prop!(graph, kmer_node, :sequence_type, sequence_type)
                end
                edge = Graphs.Edge(record_node, kmer_node)
                # first observation of this record–kmer pair: create the edge with zeroed counters
                if !Graphs.has_edge(graph, edge)
                    Graphs.add_edge!(graph, edge)
                    MetaGraphs.set_prop!(graph, edge, :TYPE, "CONTAINS_KMER")
                    # can't use vectors as properties :(
                    MetaGraphs.set_prop!(graph, edge, :count, 0)
                    MetaGraphs.set_prop!(graph, edge, Symbol(true), 0)
                    MetaGraphs.set_prop!(graph, edge, Symbol(false), 0)
                end
                # increment the :true or :false counter, chosen by whether kmer.fw is canonical
                is_canonical = Symbol(BioSequences.iscanonical(kmer.fw))
                current_count = MetaGraphs.get_prop(graph, edge, is_canonical) + 1
                MetaGraphs.set_prop!(graph, edge, is_canonical, current_count)

                # :count is kept equal to the sum of the :true and :false counters
                current_count = MetaGraphs.get_prop(graph, edge, Symbol(true)) + MetaGraphs.get_prop(graph, edge, Symbol(false))
                MetaGraphs.set_prop!(graph, edge, :count, current_count)                
            end
            # for edgemer in collect(BioSequences.each(edgemer_type, FASTX.sequence(record)))[1:2^3-1]
            for edgemer in BioSequences.each(edgemer_type, FASTX.sequence(record))
                src, canonical_src, src_is_canonical, dst, canonical_dst, dst_is_canonical = assess_edgemer(edgemer.fw)
                # when the canonical destination sorts before the canonical source,
                # re-assess from edgemer.bw instead (presumably so each k-mer pair is
                # recorded in a single orientation — confirm)
                if canonical_dst < canonical_src
                    src, canonical_src, src_is_canonical, dst, canonical_dst, dst_is_canonical = assess_edgemer(edgemer.bw)
                end
                # both k-mers were added as vertices by the k-mer loop above
                src_index, dst_index = graph[canonical_src, :identifier], graph[canonical_dst, :identifier]
                edge = Graphs.Edge(src_index, dst_index)
                # first observation of this k-mer pair: create the edge with one zeroed counter per orientation
                if !Graphs.has_edge(graph, edge)
                    Graphs.add_edge!(graph, edge)
                    MetaGraphs.set_prop!(graph, edge, :TYPE, "CONNECTS_TO")
                    for o in edge_orientations
                        MetaGraphs.set_prop!(graph, edge, o, 0)
                    end
                    MetaGraphs.set_prop!(graph, edge, :count, 0)
                end
                # counter name is "<src_is_canonical>_<dst_is_canonical>", e.g. :true_false;
                # it must be one of the symbols in edge_orientations
                orientations = Symbol("$(src_is_canonical)_$(dst_is_canonical)")
                current_count = MetaGraphs.get_prop(graph, edge, orientations) + 1
                MetaGraphs.set_prop!(graph, edge, orientations, current_count)
                
                # :count is kept equal to the sum over all orientation counters
                current_count = sum(MetaGraphs.get_prop(graph, edge, o) for o in edge_orientations)
                MetaGraphs.set_prop!(graph, edge, :count, current_count) 
            end
        end
    end
    # hand the populated graph back to the caller
    return graph
end