# Initialize workspace

In [2]:
DATE_TASK = "2022-03-29-ecoli-tequatro-pangenome"
DIR = mkpath("$(homedir())/workspace/$DATE_TASK")
cd(DIR)
DATE, TASK = match(r"^(\d{4}-\d{2}-\d{2})-(.*)$", DATE_TASK).captures

2-element Vector{Union{Nothing, SubString{String}}}:
 "2022-03-29"
 "ecoli-tequatro-pangenome"

# Import packages

In [56]:
import Pkg
Pkg.update()
pkgs = [
"JSON",
"HTTP",
"Dates",
"uCSV",
"DelimitedFiles",
"DataFrames",
"ProgressMeter",
"BioSequences",
"FASTX",
"Distances",
"StatsPlots",
"StatsBase",
"Statistics",
"MultivariateStats",
"Random",
"Primes",
"SparseArrays",
"SHA",
"GenomicAnnotations",
"Combinatorics",
"OrderedCollections",
"Downloads",
"Clustering",
"Revise",
"Mmap",
"Graphs",
"MetaGraphs",
"FileIO",
"SHA",
"MD5"
]

for pkg in pkgs
    try
        eval(Meta.parse("import $pkg"))
    catch
        Pkg.add(pkg)
        eval(Meta.parse("import $pkg"))
    end
end

# works but can't update locally, need to push and restart kernel to activate changes
# "https://github.com/cjprybol/Mycelia.git#master",
# didn't work
# "$(homedir())/git/Mycelia#master",
pkg_path = "$(homedir())/git/Mycelia"
try
    eval(Meta.parse("import $(basename(pkg_path))"))
catch
    # Pkg.add(url=pkg)
    Pkg.develop(path=pkg_path)
    # pkg = replace(basename(pkg), ".git#master" => "")
    # pkg = replace(basename(pkg), "#master" => "")
    eval(Meta.parse("import $(basename(pkg_path))"))
end

[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m    Updating[22m[39m git-repo `https://github.com/cjprybol/Mycelia.git#master`
[32m[1m  No Changes[22m[39m to `~/git/Mycelia/docs/Project.toml`
[32m[1m  No Changes[22m[39m to `~/git/Mycelia/docs/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m   Installed[22m[39m MD5 ─ v0.2.1
[32m[1m    Updating[22m[39m `~/git/Mycelia/docs/Project.toml`
 [90m [6ac74813] [39m[92m+ MD5 v0.2.1[39m
[32m[1m    Updating[22m[39m `~/git/Mycelia/docs/Manifest.toml`
 [90m [6ac74813] [39m[92m+ MD5 v0.2.1[39m
[32m[1mPrecompiling[22m[39m project...
[32m  ✓ [39mMD5
  1 dependency successfully precompiled in 2 seconds (256 already precompiled, 6 skipped during auto due to previous errors)


# Declare global parameters

In [9]:
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?&id=$(tax_id)
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?lvl=0&amp;id=2733124
# root_tax_id = 2733124

root_tax_id = 10663
host = "Escherichia"
database_id = "cb969e03"

"cb969e03"

In [11]:
NEO4J_BIN_DIR = "/home/jupyter-cjprybol/software/neo4j-community-4.4.3/bin"
if !occursin(NEO4J_BIN_DIR, ENV["PATH"])
    ENV["PATH"] = "$(NEO4J_BIN_DIR):" * ENV["PATH"]
end
USERNAME="neo4j"
PASSWORD=readline(joinpath(homedir(), ".config", "neo4j", "$(database_id).pass"));
ADDRESS="neo4j+s://$(database_id).databases.neo4j.io:7687"
DATABASE = "neo4j"

"neo4j"

# New functions (add me to library)

In [6]:
function assess_edgemer(edgemer)
    k = length(edgemer)-1
    kmer_type = BioSequences.BigDNAMer{k}
    src = kmer_type(edgemer[i] for i in 1:k)
    dst = kmer_type(edgemer[i] for i in 2:k+1)
    canonical_src = BioSequences.canonical(src)
    canonical_dst = BioSequences.canonical(dst)
    src_is_canonical = src == canonical_src
    dst_is_canonical = dst == canonical_dst
    # @show edgemer
    # @show src
    # @show canonical_src
    # @show src_is_canonical
    # @show dst
    # @show canonical_dst
    # @show dst_is_canonical
    return src, canonical_src, src_is_canonical, dst, canonical_dst, dst_is_canonical
end

assess_edgemer (generic function with 1 method)

In [62]:
function has_identifier(graph, identifier)
    return identifier in keys(graph.metaindex[:identifier])
end

has_identifier (generic function with 1 method)

In [None]:
# TODO break me out into indivual steps of additon at each level
# TODO re-write me so that I just write out node and edge tables and add them batch style
function fastx_to_metagraph(;graph, k, entity_file_map)
    @assert isodd(k) "k must be odd to avoid the possibility of kmers being reverse complements of themselves"
    @assert 3 <= k <= 63 "k must be at least 3 to allow for kmer overlaps and less than 64 to be compatible with type limits"
    kmer_type = BioSequences.BigDNAMer{k}
    edgemer_type = BioSequences.BigDNAMer{k+1}
    edge_orientations = Symbol.([
        "true_true",
        "true_false",
        "false_true",
        "false_false"
    ])
    canonical_kmers = Set{kmer_type}()
    MetaGraphs.set_indexing_prop!(graph, :identifier)
    for entity in keys(entity_file_map)
        # f_type = "Genome"
        fastx = entity_file_map[entity]["genome"]
        fastx_hash = SHA.bytes2hex(SHA.sha256(Mycelia.open_fastx(fastx)))
        @show fastx_hash
#         Graphs.add_vertex!(graph)
#         fasta_node = Graphs.nv(graph)
#         # node types are camel case
#         MetaGraphs.set_prop!(graph, fasta_node, :TYPE, "Fasta")
#         # node and edge properties are lowercase
#         # add entity identifier as a property that can be queried on
#         MetaGraphs.set_prop!(graph, fasta_node, :path, fastx)
#         # for identifier, set 
#         MetaGraphs.set_prop!(graph, fasta_node, :identifier, entity)
#         for record in Mycelia.open_fastx(fastx)
#             Graphs.add_vertex!(graph)
#             record_node = Graphs.nv(graph)
            
#             MetaGraphs.set_prop!(graph, record_node, :TYPE, "FastaRecord")
#             MetaGraphs.set_prop!(graph, record_node, :identifier, FASTX.identifier(record))
#             MetaGraphs.set_prop!(graph, record_node, :description, FASTX.description(record))
#             MetaGraphs.set_prop!(graph, record_node, :sequence, FASTX.sequence(record))
#             if typeof(FASTX.sequence(record)) == BioSequences.LongDNASeq
#                 sequence_type = "dna"
#             elseif typeof(FASTX.sequence(record)) == BioSequences.LongAminoAcidSeq
#                 sequence_type = "aa"
#             elseif typeof(FASTX.sequence(record)) == BioSequences.LongRNASeq
#                 sequence_type = "rna"
#             end
#             MetaGraphs.set_prop!(graph, record_node, :sequence_type, sequence_type)
#             edge = Graphs.Edge(fasta_node, record_node)
#             Graphs.add_edge!(graph, edge)
#             # edge types are all caps
#             MetaGraphs.set_prop!(graph, edge, :TYPE, "CONTAINS_RECORD")
#             # for kmer in collect(BioSequences.each(kmer_type, FASTX.sequence(record)))[1:2^3]
#             for kmer in BioSequences.each(kmer_type, FASTX.sequence(record))
#                 canonical_kmer = BioSequences.canonical(kmer)
#                 if canonical_kmer in canonical_kmers
#                     kmer_node = graph[canonical_kmer, :identifier]
#                 else
#                     push!(canonical_kmers, canonical_kmer)
#                     Graphs.add_vertex!(graph)
#                     kmer_node = Graphs.nv(graph)
#                     MetaGraphs.set_prop!(graph, kmer_node, :TYPE, "Kmer")
#                     MetaGraphs.set_prop!(graph, kmer_node, :identifier, BioSequences.canonical(kmer))
#                     MetaGraphs.set_prop!(graph, kmer_node, :sequence_type, sequence_type)
#                 end
#                 edge = Graphs.Edge(record_node, kmer_node)
#                 if !Graphs.has_edge(graph, edge)
#                     Graphs.add_edge!(graph, edge)
#                     MetaGraphs.set_prop!(graph, edge, :TYPE, "CONTAINS_KMER")
#                     # can't use vectors as properties :(
#                     MetaGraphs.set_prop!(graph, edge, :count, 0)
#                     MetaGraphs.set_prop!(graph, edge, Symbol(true), 0)
#                     MetaGraphs.set_prop!(graph, edge, Symbol(false), 0)
#                 end
#                 is_canonical = Symbol(BioSequences.iscanonical(kmer.fw))
#                 current_count = MetaGraphs.get_prop(graph, edge, is_canonical) + 1
#                 MetaGraphs.set_prop!(graph, edge, is_canonical, current_count)

#                 current_count = MetaGraphs.get_prop(graph, edge, Symbol(true)) + MetaGraphs.get_prop(graph, edge, Symbol(false))
#                 MetaGraphs.set_prop!(graph, edge, :count, current_count)                
#             end
#             # for edgemer in collect(BioSequences.each(edgemer_type, FASTX.sequence(record)))[1:2^3-1]
#             for edgemer in BioSequences.each(edgemer_type, FASTX.sequence(record))
#                 src, canonical_src, src_is_canonical, dst, canonical_dst, dst_is_canonical = assess_edgemer(edgemer.fw)
#                 if canonical_dst < canonical_src
#                     src, canonical_src, src_is_canonical, dst, canonical_dst, dst_is_canonical = assess_edgemer(edgemer.bw)
#                 end
#                 src_index, dst_index = graph[canonical_src, :identifier], graph[canonical_dst, :identifier]
#                 edge = Graphs.Edge(src_index, dst_index)
#                 if !Graphs.has_edge(graph, edge)
#                     Graphs.add_edge!(graph, edge)
#                     MetaGraphs.set_prop!(graph, edge, :TYPE, "CONNECTS_TO")
#                     for o in edge_orientations
#                         MetaGraphs.set_prop!(graph, edge, o, 0)
#                     end
#                     MetaGraphs.set_prop!(graph, edge, :count, 0)
#                 end
#                 orientations = Symbol("$(src_is_canonical)_$(dst_is_canonical)")
#                 current_count = MetaGraphs.get_prop(graph, edge, orientations) + 1
#                 MetaGraphs.set_prop!(graph, edge, orientations, current_count)
                
#                 current_count = sum(MetaGraphs.get_prop(graph, edge, o) for o in edge_orientations)
#                 MetaGraphs.set_prop!(graph, edge, :count, current_count) 
#             end
#         end
    end
    return graph
end

In [None]:
function upload_node_type_over_url(node_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE, window_size=100)
    node_type_params = Set{Symbol}()
    vertices_of_type = [v for v in Graphs.vertices(graph) if (graph.vprops[v][:TYPE] == node_type)]
    node_type_params = filter(param -> param != :TYPE, reduce(union, keys(graph.vprops[v]) for v in vertices_of_type))
    
    V = length(vertices_of_type)
    windows = [i:min(i+window_size-1,V) for i in 1:window_size:V]
    
    ProgressMeter.@showprogress for window in windows
        cmds = []
        for (i, v) in enumerate(vertices_of_type[window])
            params = ["$(string(param)):'$(MetaGraphs.get_prop(graph, v, param))'" for param in node_type_params]
            joint_params = join(params, ", ")
            cmd = "MERGE (node$(i):$(node_type) {$(joint_params)})"
            push!(cmds, cmd)
        end
        cmd = join(cmds, ' ')
        cypher_cmd = Mycelia.cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)
        run(cypher_cmd)
    end    
end

In [None]:
function upload_edge_type_over_url(src_type, dst_type, edge_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE; window_size=100)    
    src_nodes = filter(v -> MetaGraphs.get_prop(graph, v, :TYPE) == src_type, Graphs.vertices(graph))
    dst_nodes = filter(v -> MetaGraphs.get_prop(graph, v, :TYPE) == dst_type, Graphs.vertices(graph))
    edges_to_upload = []
    for src_node in src_nodes
        outneighbors = Graphs.outneighbors(graph, src_node)
        outneighbors = filter(outneighbor -> outneighbor in dst_nodes, outneighbors)
        for outneighbor in outneighbors
            this_edge = Graphs.Edge(src_node, outneighbor)
            @assert MetaGraphs.get_prop(graph, this_edge, :TYPE) == edge_type
            push!(edges_to_upload, this_edge)
        end
    end
    edge_type_params = filter(param -> param != :TYPE, reduce(union, keys(graph.eprops[e]) for e in edges_to_upload))
    
    N = length(edges_to_upload)
    windows = [i:min(i+window_size-1,N) for i in 1:window_size:N]
    
    ProgressMeter.@showprogress for window in windows
        cmds = []
        for (i, e) in enumerate(edges_to_upload[window])
            params = ["$(string(param)):'$(MetaGraphs.get_prop(graph, e, param))'" for param in edge_type_params]
            joint_params = join(params, ", ")
            node_cmds = 
            """
            MERGE (src$(i):$(MetaGraphs.props(graph, e.src)[:TYPE]) {identifier: '$(MetaGraphs.props(graph, e.src)[:identifier])'})
            MERGE (dst$(i):$(MetaGraphs.props(graph, e.dst)[:TYPE]) {identifier: '$(MetaGraphs.props(graph, e.dst)[:identifier])'})
            """
            # match_cmds = 
            # """
            # MATCH 
            #     (src$(i):$(MetaGraphs.props(graph, e.src)[:TYPE]) {identifier: '$(MetaGraphs.props(graph, e.src)[:identifier])'}),
            #     (dst$(i):$(MetaGraphs.props(graph, e.dst)[:TYPE]) {identifier: '$(MetaGraphs.props(graph, e.dst)[:identifier])'})
            # """
            if !isempty(joint_params)
                relationship_cmd = "MERGE (src$(i))-[r$(i):$(MetaGraphs.props(graph, e)[:TYPE]) {$(joint_params)}]->(dst$(i))"
            else
                relationship_cmd = "MERGE (src$(i))-[r$(i):$(MetaGraphs.props(graph, e)[:TYPE])]->(dst$(i))"
            end
            cmd = node_cmds * relationship_cmd
            cmd = replace(cmd, '\n' => ' ')
            push!(cmds, cmd)
        end
        cmd = join(cmds, ' ')
        cypher_cmd = Mycelia.cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)
        run(cypher_cmd)
        # println(cypher_cmd)
    end    
end

In [None]:
function upload_node_type_over_read_csv(node_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE, window_size=1)
        
    node_type_params = Set{Symbol}()
    vertices_of_type = [v for v in Graphs.vertices(graph) if (graph.vprops[v][:TYPE] == node_type)]
    node_type_params = filter(param -> param != :TYPE, reduce(union, keys(graph.vprops[v]) for v in vertices_of_type))
    
    V = length(vertices_of_type)
    windows = [i:min(i+window_size-1,V) for i in 1:window_size:V]
    
    temp_upload_dir = mkpath(joinpath(DIR, "temp_upload"))
    
    ProgressMeter.@showprogress for window in windows
        for (i, v) in enumerate(vertices_of_type[window])

            f = "node$v.tsv"
            local_f_path = "$(temp_upload_dir)/$(f)"
            uCSV.write(local_f_path, DataFrames.DataFrame(MetaGraphs.props(graph, v)), delim='\t')
            
            remote_f_path = "gs://neo4j-upload/$(f)"
            run(`gsutil cp $(local_f_path) $(remote_f_path)`)
            # gcloud iam service-accounts keys create ~/.config/gcloud/url-signer-key.json --iam-account="928365250020-compute@developer.gserviceaccount.com"
            signer_credential_path = "$(homedir())/.config/gcloud/url-signer-key.json"
            sign_url_cmd = `gsutil signurl $(signer_credential_path) $(remote_f_path)`
            signed_url_table = DataFrames.DataFrame(uCSV.read(open(sign_url_cmd), header=1, delim='\t')...)
            signed_url = signed_url_table[1, "Signed URL"]
            parameters = ["$(p): row.$(p)" for p in filter(x -> x != :TYPE, keys(MetaGraphs.props(graph, v)))]
            parameters = "{" * join(parameters, ", ") * "}"

            cmd =
            """
            LOAD CSV WITH HEADERS FROM '$(signed_url)' AS row FIELDTERMINATOR '\t'
            MERGE (node:$(node_type) $(parameters))
            """

            cmd = rstrip(replace(cmd, '\n' => ' '))
            cypher_cmd = Mycelia.cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)
            run(cypher_cmd)            
        end
    end    
end

In [65]:
function add_node_table_to_graph!(;graph, table)
    ProgressMeter.@showprogress for row in DataFrames.eachrow(table)
        if !has_identifier(graph, row[:identifier])
            Graphs.add_vertex!(graph)
            v = Graphs.nv(graph)
        else
            v = graph[row[:identifier], :identifier]
        end
        for col in names(row)
            scol = Symbol(col)
            if !ismissing(row[col])
                if MetaGraphs.has_prop(graph, v, scol)
                    current_value = MetaGraphs.props(graph, v)[scol]
                    @assert row[col] == current_value
                else
                    MetaGraphs.set_prop!(graph, v, scol, row[col])
                end
            end
        end
    end
    return graph
end

add_node_table_to_graph! (generic function with 1 method)

# Script

## obtain NCBI taxonomy

In [13]:
taxdump_url = "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
taxdump_local_tarball = "$(DIR)/$(basename(taxdump_url))"

if !isfile(taxdump_local_tarball)
    download(taxdump_url, taxdump_local_tarball)
end

taxdump_out = replace(taxdump_local_tarball, ".tar.gz" => "")
if !isdir(taxdump_out)
    mkpath(taxdump_out)
    run(`tar -xvzf $(taxdump_local_tarball) -C $(taxdump_out)`)
end

readdir(taxdump_out)

citations.dmp
delnodes.dmp
division.dmp
gencode.dmp
merged.dmp
names.dmp
nodes.dmp
gc.prt
readme.txt


9-element Vector{String}:
 "citations.dmp"
 "delnodes.dmp"
 "division.dmp"
 "gc.prt"
 "gencode.dmp"
 "merged.dmp"
 "names.dmp"
 "nodes.dmp"
 "readme.txt"

In [38]:
#     Here we will create an in-memory dataframe to capture the contents of the names.dmp file

# Taxonomy names file (names.dmp):
# 	tax_id					-- the id of node associated with this name
# 	name_txt				-- name itself
# 	unique name				-- the unique variant of this name if name not unique
# 	name class				-- (synonym, common name, ...)

names_dmp = DataFrames.DataFrame(
    tax_id = String[],
    name_txt = String[],
    unique_name = String[],
    name_class = String[]
)
ProgressMeter.@showprogress for line in split(read(open("$(taxdump_out)/names.dmp"), String), "\t|\n")
    if isempty(line)
        continue
    else
        (tax_id, name_txt, unique_name, name_class) = split(line, "\t|\t")
        # tax_id = parse(Int, tax_id_string)
        row = (;tax_id, name_txt, unique_name, name_class)
        push!(names_dmp, row)
    end
end
names_dmp
# all are unique
# unique!(names_dmp)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:22[39m0:05[39m


Unnamed: 0_level_0,tax_id,name_txt
Unnamed: 0_level_1,String,String
1,1,all
2,1,root
3,2,Bacteria
4,2,bacteria
5,2,eubacteria
6,2,Monera
7,2,Procaryotae
8,2,Prokaryotae
9,2,Prokaryota
10,2,prokaryote


In [39]:
# rename tax_id to identifier to maintain consistent langauge across node types

In [93]:
# DataFrames.rename!(names_dmp, "tax_id" => "identifier")
# names_dmp[!, "identifier"] .= names_dmp[!, "tax_id"]

3581658-element Vector{String}:
 "1"
 "1"
 "2"
 "2"
 "2"
 "2"
 "2"
 "2"
 "2"
 "2"
 "2"
 "6"
 "6"
 ⋮
 "2929752"
 "2929752"
 "2929762"
 "2929762"
 "2929762"
 "2929762"
 "2929762"
 "2929762"
 "2929835"
 "2929839"
 "2929841"
 "2929841"

In [94]:
show(names_dmp, allcols=true)

[1m3581658×5 DataFrame[0m
[1m     Row [0m│[1m tax_id  [0m[1m name_txt                          [0m[1m unique_name              [0m[1m name_class          [0m[1m identifier [0m
[1m         [0m│[90m String  [0m[90m String                            [0m[90m String                   [0m[90m String              [0m[90m String     [0m
─────────┼───────────────────────────────────────────────────────────────────────────────────────────────────────
       1 │ 1        all                                                          synonym              1
       2 │ 1        root                                                         scientific name      1
       3 │ 2        Bacteria                           Bacteria <bacteria>       scientific name      2
       4 │ 2        bacteria                                                     blast name           2
       5 │ 2        eubacteria                                                   genbank common name  2
       6 

In [52]:
names_dmp_transformed = DataFrames.DataFrame(
    Dict(
    [nc => Union{Missing, String}[] for nc in unique(names_dmp[!, "name_class"])]...,
    "tax_id" => String[]
)
)

Unnamed: 0_level_0,acronym,authority,blast name,common name,equivalent name,genbank acronym,genbank common name
Unnamed: 0_level_1,String?,String?,String?,String?,String?,String?,String?


In [55]:
ProgressMeter.@showprogress for identifier_group in DataFrames.groupby(names_dmp, "tax_id")
    fields = Dict{String, Union{String, Missing}}(n => missing for n in names(names_dmp_transformed))
    for name_class in DataFrames.groupby(identifier_group, "name_class")
        nc = name_class[1, "name_class"]
        all_names = unique(filter(n -> !isempty(n), (name_class[!, "name_txt"]...,name_class[!, "unique_name"]...)))
        fields[nc] = join(all_names, ";;;")
    end
    fields["tax_id"] = identifier_group[1, "tax_id"]
    # @show fields
    push!(names_dmp_transformed, fields)
end
names_dmp_transformed

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:06:54[39mm:30[39m


Unnamed: 0_level_0,acronym,authority
Unnamed: 0_level_1,String?,String?
1,missing,missing
2,missing,missing
3,missing,Azorhizobium Dreyfus et al. 1988 emend. Lang et al. 2013
4,missing,Azorhizobium caulinodans Dreyfus et al. 1988
5,missing,Buchnera aphidicola Munson et al. 1991
6,missing,"Cellvibrio (ex Winogradsky 1929) Blackall et al. 1986 emend. Humphry et al. 2003;;;Cellvibrio (ex Winogradsky 1929) Blackall et al. 1986 emend. Suarez et al. 2014;;;""Cellvibrio"" Winogradsky 1929"
7,missing,"""Cellulomonas gilvus"" (Hulcher and King 1958) Christopherson et al. 2013;;;""Cellvibrio gilvus"" Hulcher and King 1958"
8,missing,Dictyoglomus Saiki et al. 1985
9,missing,Dictyoglomus thermophilum Saiki et al. 1985
10,missing,Methylophilus Jenkins et al. 1987


In [49]:
graph = MetaGraphs.MetaDiGraph()
MetaGraphs.set_indexing_prop!(graph, :identifier)

Set{Symbol} with 1 element:
  :identifier

In [57]:
names_dmp_transformed[!, "TYPE"] .= "Taxonomy"

2573407-element Vector{String}:
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 ⋮
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"
 "Taxonomy"

In [58]:
table = names_dmp_transformed

Unnamed: 0_level_0,acronym,authority
Unnamed: 0_level_1,String?,String?
1,missing,missing
2,missing,missing
3,missing,Azorhizobium Dreyfus et al. 1988 emend. Lang et al. 2013
4,missing,Azorhizobium caulinodans Dreyfus et al. 1988
5,missing,Buchnera aphidicola Munson et al. 1991
6,missing,"Cellvibrio (ex Winogradsky 1929) Blackall et al. 1986 emend. Humphry et al. 2003;;;Cellvibrio (ex Winogradsky 1929) Blackall et al. 1986 emend. Suarez et al. 2014;;;""Cellvibrio"" Winogradsky 1929"
7,missing,"""Cellulomonas gilvus"" (Hulcher and King 1958) Christopherson et al. 2013;;;""Cellvibrio gilvus"" Hulcher and King 1958"
8,missing,Dictyoglomus Saiki et al. 1985
9,missing,Dictyoglomus thermophilum Saiki et al. 1985
10,missing,Methylophilus Jenkins et al. 1987


In [66]:
add_node_table_to_graph!(;graph, table)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:57[39m3:19[39m


{2411405, 0} directed Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [77]:
# uCSV.read("$(taxdump_out)/division.dmp", delim="\t|\n")

In [86]:
#     Here we can see that there are divisions projected onto the tree that will allow easy grouping by taxonomic "group"s such as primates, viruses, phages, etc.

division_table = DataFrames.DataFrame(
    division_id = String[],
    division_name = String[],
    division_cde = String[],
    notes = String[]
)
for line in split(read(open("$(taxdump_out)/division.dmp"), String), "\t|\n")
    if !isempty(line)
        (division_id, division_cde, division_name, notes) = split(line, "\t|\t")
        row = (;division_id, division_cde, division_name, notes)
        push!(division_table, row)
    end
end
division_table
division_table[!, "identifier"] .= division_table[!, "division_name"]
division_table[!, "TYPE"] .= "Division"

Unnamed: 0_level_0,division_id,division_name,division_cde,notes
Unnamed: 0_level_1,String,String,String,String
1,0,Bacteria,BCT,
2,1,Invertebrates,INV,
3,2,Mammals,MAM,
4,3,Phages,PHG,
5,4,Plants and Fungi,PLN,
6,5,Primates,PRI,
7,6,Rodents,ROD,
8,7,Synthetic and Chimeric,SYN,
9,8,Unassigned,UNA,No species nodes should inherit this division assignment
10,9,Viruses,VRL,


In [None]:
add_node_table_to_graph!(graph=graph, table=division_table)

In [95]:
node_table = DataFrames.DataFrame(
    tax_id = String[],
    parent_tax_id = String[],
    rank = String[],
    embl_code = String[],
    division_id = String[]
)
ProgressMeter.@showprogress for line in split(read(open("$(taxdump_out)/nodes.dmp"), String), "\t|\n")
    if isempty(line)
        continue
    else
        (tax_id, parent_tax_id, rank, embl_code, division_id) = split(line, "\t|\t")
        row = (;tax_id, parent_tax_id, rank, embl_code, division_id)
        push!(node_table, row)
    end
end
node_table

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:45[39m


Unnamed: 0_level_0,tax_id,parent_tax_id,rank,embl_code,division_id
Unnamed: 0_level_1,String,String,String,String,String
1,1,1,no rank,,8
2,2,131567,superkingdom,,0
3,6,335928,genus,,0
4,7,6,species,AC,0
5,9,32199,species,BA,0
6,10,1706371,genus,,0
7,11,1707,species,CG,0
8,13,203488,genus,,0
9,14,13,species,DT,0
10,16,32011,genus,,0


In [98]:
DataFrames.outerjoin(
    names_dmp_transformed, 
    DataFrames.outerjoin(node_table, division_table, on="division_id"),
    on="tax_id"
)

LoadError: ArgumentError: column name :tax_id not found in the data frame

In [100]:
names(names_dmp_transformed)

14-element Vector{String}:
 "acronym"
 "authority"
 "blast name"
 "common name"
 "equivalent name"
 "genbank acronym"
 "genbank common name"
 "identifier"
 "in-part"
 "includes"
 "scientific name"
 "synonym"
 "type material"
 "TYPE"

In [97]:
names_dmp_transformed

Unnamed: 0_level_0,acronym,authority
Unnamed: 0_level_1,String?,String?
1,missing,missing
2,missing,missing
3,missing,Azorhizobium Dreyfus et al. 1988 emend. Lang et al. 2013
4,missing,Azorhizobium caulinodans Dreyfus et al. 1988
5,missing,Buchnera aphidicola Munson et al. 1991
6,missing,"Cellvibrio (ex Winogradsky 1929) Blackall et al. 1986 emend. Humphry et al. 2003;;;Cellvibrio (ex Winogradsky 1929) Blackall et al. 1986 emend. Suarez et al. 2014;;;""Cellvibrio"" Winogradsky 1929"
7,missing,"""Cellulomonas gilvus"" (Hulcher and King 1958) Christopherson et al. 2013;;;""Cellvibrio gilvus"" Hulcher and King 1958"
8,missing,Dictyoglomus Saiki et al. 1985
9,missing,Dictyoglomus thermophilum Saiki et al. 1985
10,missing,Methylophilus Jenkins et al. 1987


Unnamed: 0_level_0,tax_id,parent_tax_id,rank,embl_code,division_id,division_name,division_cde
Unnamed: 0_level_1,String?,String?,String?,String?,String,String?,String?
1,1,1,no rank,,8,Unassigned,UNA
2,2,131567,superkingdom,,0,Bacteria,BCT
3,6,335928,genus,,0,Bacteria,BCT
4,7,6,species,AC,0,Bacteria,BCT
5,9,32199,species,BA,0,Bacteria,BCT
6,10,1706371,genus,,0,Bacteria,BCT
7,11,1707,species,CG,0,Bacteria,BCT
8,13,203488,genus,,0,Bacteria,BCT
9,14,13,species,DT,0,Bacteria,BCT
10,16,32011,genus,,0,Bacteria,BCT


Here we will set constrains that no two nodes have the same taxonomic id and no two nodes have the same scientific name

In [19]:
cmd = "CREATE CONSTRAINT ON (t:Taxonomy) ASSERT t.tax_id IS UNIQUE"
@time cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)

  0.023436 seconds (229 allocations: 15.594 KiB, 98.37% compilation time)


Process(`[4mcypher-shell[24m [4m--address[24m [4mneo4j://ncbi-taxonomy.cjp.garden:7687[24m [4m--username[24m [4mneo4j[24m [4m--password[24m [4mtempo-athlete-news-info-fresh-4482[24m [4m--database[24m [4mneo4j[24m [4m--format[24m [4mauto[24m [4m'CREATE CONSTRAINT ON (t:Taxonomy) ASSERT t.tax_id IS UNIQUE'[24m`, ProcessRunning)

An equivalent constraint already exists, 'Constraint( id=4, name='constraint_53f0c26a', type='UNIQUENESS', schema=(:Taxonomy {tax_id}), ownedIndex=3 )'.


In [20]:
cmd = "CREATE CONSTRAINT ON (t:Taxonomy) ASSERT t.`scientific name` IS UNIQUE"
@time cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)

  0.000368 seconds (92 allocations: 6.516 KiB)


Process(`[4mcypher-shell[24m [4m--address[24m [4mneo4j://ncbi-taxonomy.cjp.garden:7687[24m [4m--username[24m [4mneo4j[24m [4m--password[24m [4mtempo-athlete-news-info-fresh-4482[24m [4m--database[24m [4mneo4j[24m [4m--format[24m [4mauto[24m [4m'CREATE CONSTRAINT ON (t:Taxonomy) ASSERT t.\`scientific name\` IS UNIQUE'[24m`, ProcessRunning)

An equivalent constraint already exists, 'Constraint( id=6, name='constraint_f16727de', type='UNIQUENESS', schema=(:Taxonomy {scientific name}), ownedIndex=5 )'.


In [55]:
cmd = 
"""
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM
'file:///$(basename(NODES_FILE))' AS row
FIELDTERMINATOR '\t'
MERGE (t:Taxonomy {tax_id: row.tax_id})
"""
cmd = rstrip(replace(cmd, '\n' => ' '))
@time cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)

  0.000518 seconds (94 allocations: 6.938 KiB)


Process(`[4mcypher-shell[24m [4m--address[24m [4mneo4j://ncbi-taxonomy.cjp.garden:7687[24m [4m--username[24m [4mneo4j[24m [4m--password[24m [4mtempo-athlete-news-info-fresh-4482[24m [4m--database[24m [4mneo4j[24m [4m--format[24m [4mauto[24m [4m"USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM 'file:///ncbi_taxonomy.nodes.tsv' AS row FIELDTERMINATOR '	' MERGE (t:Taxonomy {tax_id: row.tax_id})"[24m`, ProcessRunning)

In [83]:
# note to self, I should be able to programmatically generate this long list of metadata fields

# need to start over?
# match (n) delete n

# need to develop little by little over time?
# WITH row LIMIT 10

cmd = 
"""
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM
'file:///$(basename(NODES_FILE))' AS row
FIELDTERMINATOR '\\t'
CREATE (t:Taxonomy {
    tax_id: row.tax_id,
    scientific_name: row.scientific_name,
    division_cde: row.division_cde,
    division_id: row.division_id,
    division_name: row.division_name,
    rank: row.rank,
    acronym: row.acronym,
    in_part: row.in_part,
    includes: row.includes,
    common_name: row.common_name,
    genbank_common_name: row.genbank_common_name,
    blast_name: row.blast_name,
    synonym: row.synonym,
    genbank_synonym: row.genbank_synonym,
    type_material: row.type_material,
    authority: row.authority,
    genbank_acronym: row.genbank_acronym,
    equivalent_name: row.equivalent_name})
RETURN t LIMIT 10
"""
println(cmd)
cmd = rstrip(replace(cmd, '\n' => ' '))
cyper_cmd = cypher(address = "neo4j://0.0.0.0:7687", username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)

# ready to start consuming query after 61 ms, results consumed after another 86607 ms
# Added 2396777 nodes, Set 15244183 properties, Added 2396777 labels

USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM
'file:///ncbi_taxonomy.nodes.tsv' AS row
FIELDTERMINATOR '\t'
CREATE (t:Taxonomy {
    tax_id: row.tax_id,
    scientific_name: row.scientific_name,
    division_cde: row.division_cde,
    division_id: row.division_id,
    division_name: row.division_name,
    rank: row.rank,
    acronym: row.acronym,
    in_part: row.in_part,
    includes: row.includes,
    common_name: row.common_name,
    genbank_common_name: row.genbank_common_name,
    blast_name: row.blast_name,
    synonym: row.synonym,
    genbank_synonym: row.genbank_synonym,
    type_material: row.type_material,
    authority: row.authority,
    genbank_acronym: row.genbank_acronym,
    equivalent_name: row.equivalent_name})
RETURN t LIMIT 10



`[4mcypher-shell[24m [4m--address[24m [4mneo4j://0.0.0.0:7687[24m [4m--username[24m [4mneo4j[24m [4m--password[24m [4mtempo-athlete-news-info-fresh-4482[24m [4m--database[24m [4mneo4j[24m [4m--format[24m [4mauto[24m [4m"USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM 'file:///ncbi_taxonomy.nodes.tsv' AS row FIELDTERMINATOR '\t' CREATE (t:Taxonomy {     tax_id: row.tax_id,     scientific_name: row.scientific_name,     division_cde: row.division_cde,     division_id: row.division_id,     division_name: row.division_name,     rank: row.rank,     acronym: row.acronym,     in_part: row.in_part,     includes: row.includes,     common_name: row.common_name,     genbank_common_name: row.genbank_common_name,     blast_name: row.blast_name,     synonym: row.synonym,     genbank_synonym: row.genbank_synonym,     type_material: row.type_material,     authority: row.authority,     genbank_acronym: row.genbank_acronym,     equivalent_name: row.equivalent_name}) RETURN t LIMIT

And here in the final step we create the relationships between taxa and their parent nodes

In [85]:
cmd = 
"""
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM
'file:///$(basename(EDGES_FILE))' AS row
FIELDTERMINATOR '\\t'
MATCH (src:Taxonomy {tax_id: row.src})
MATCH (dst:Taxonomy {tax_id: row.dst})
MERGE (src)-[p:PARENT]->(dst)
"""
println(cmd)
cmd = rstrip(replace(cmd, '\n' => ' '))
cypher(address = "neo4j://0.0.0.0:7687", username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)

USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM
'file:///ncbi_taxonomy.edges.tsv' AS row
FIELDTERMINATOR '\t'
MATCH (src:Taxonomy {tax_id: row.src})
MATCH (dst:Taxonomy {tax_id: row.dst})
MERGE (src)-[p:PARENT]->(dst)

  0.000049 seconds (58 allocations: 5.234 KiB)


`[4mcypher-shell[24m [4m--address[24m [4mneo4j://0.0.0.0:7687[24m [4m--username[24m [4mneo4j[24m [4m--password[24m [4mtempo-athlete-news-info-fresh-4482[24m [4m--database[24m [4mneo4j[24m [4m--format[24m [4mauto[24m [4m"USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM 'file:///ncbi_taxonomy.edges.tsv' AS row FIELDTERMINATOR '\t' MATCH (src:Taxonomy {tax_id: row.src}) MATCH (dst:Taxonomy {tax_id: row.dst}) MERGE (src)-[p:PARENT]->(dst)"[24m`

Failed to obtain connection towards WRITE server. Known routing table is: Ttl 1644089305934, currentTime 1644089035978, routers [], writers [], readers [], database 'neo4j'
Failed to obtain connection towards WRITE server. Known routing table is: Ttl 1644089316828, currentTime 1644089046883, routers [], writers [], readers [], database 'neo4j'
Failed to obtain connection towards WRITE server. Known routing table is: Ttl 1644089334913, currentTime 1644089089649, routers [], writers [], readers [], database 'neo4j'
Failed to obtain connection towards WRITE server. Known routing table is: Ttl 1644089382426, currentTime 1644089125683, routers [], writers [], readers [], database 'neo4j'
Failed to obtain connection towards WRITE server. Known routing table is: Ttl 1644089486990, currentTime 1644089217065, routers [], writers [], readers [], database 'neo4j'


In [None]:
# ready to start consuming query after 99517 ms, results consumed after another 0 ms
# Created 2396777 relationships

In [None]:
child_tax_ids = vcat(Mycelia.taxonomic_id_to_children(root_tax_id), root_tax_id)

In [None]:
# TODO
# here is where we should apply a filter where host == Escherichia
# need to load host information into neo4j taxonomy

In [None]:
# refseq_metadata = Mycelia.load_refseq_metadata()
ncbi_metadata = Mycelia.load_genbank_metadata()

In [None]:
show(ncbi_metadata[1:1, :], allcols=true)

In [None]:
tax_id_filter = map(taxid -> taxid in child_tax_ids, ncbi_metadata[!, "taxid"])
is_right_host = map(x -> occursin(Regex(host, "i"), x), ncbi_metadata[!, "organism_name"])
not_excluded = ncbi_metadata[!, "excluded_from_refseq"] .== ""
is_full = ncbi_metadata[!, "genome_rep"] .== "Full"
# assembly_levels = ["Complete Genome"]
assembly_levels = ["Complete Genome", "Chromosome"]
# assembly_levels = ["Complete Genome", "Chromosome", "Scaffold"]
# assembly_levels = ["Complete Genome", "Chromosome", "Scaffold", "Contig"]
assembly_level_filter = map(x -> x in assembly_levels, ncbi_metadata[!, "assembly_level"])
full_filter = is_full .& not_excluded .& assembly_level_filter .& tax_id_filter .& is_right_host
count(full_filter)

In [None]:
# TODO
# here is another place we could enforce host == escherichia
# we'll use a manual filter as a temporary solution

In [None]:
ncbi_metadata_of_interest[!, "ftp_path"]

In [None]:
ncbi_metadata_of_interest = ncbi_metadata[full_filter, :]

In [None]:
# https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=genbank&id=GCA_021354775

In [None]:
# for col in names(ncbi_metadata_of_interest)
#     @show col, ncbi_metadata_of_interest[1, col]
# end

In [None]:
N_FASTAS = 2

In [None]:
entity_file_maps

In [None]:
entity_file_maps = Dict()
ProgressMeter.@showprogress for row in DataFrames.eachrow(ncbi_metadata_of_interest[1:N_FASTAS, :])
    entity_id = row["# assembly_accession"]
    entity_file_maps[entity_id] = Dict()
    # generally will want genome + GFF || genbank
    # everything else can be derived from these inputs
    for (file_type, extension) in [
            "genome" => "genomic.fna.gz",
            "annotations" => "genomic.gff.gz",
            "genbank" => "genomic.gbff.gz",
            # "proteins" => "protein.faa.gz",
        ]
        outdir = mkpath(joinpath(DIR, extension))
        url = Mycelia.ncbi_ftp_path_to_url(row["ftp_path"], extension)
        outfile = joinpath(outdir, basename(url))
        if !isfile(outfile)
            try
                Downloads.download(url, outfile)
            catch e
                showerror(stdout, e)
            end
        end
        if isfile(outfile)
            entity_file_maps[entity_id][file_type] = outfile
        end
    end
end
entity_file_maps

In [None]:
# graph = Mycelia.fastx_to_kmer_graph(BioSequences.BigDNAMer{kmer_size}, fastx_files)

In [None]:
# use basename of files as identifiers

In [None]:
# kmer_size = Mycelia.assess_dnamer_saturation(fastx_files)
kmer_size = 7

In [None]:
@time graph = fastx_to_metagraph(kmer_size, entity_file_maps)

In [None]:
graph_outfile = "$DIR/root-tax-id_$(root_tax_id).k_$(kmer_size).genome-graph"
# Mycelia.save_graph(graph, graph_outfile)
# Mycelia.graph_to_gfa(graph, graph_outfile * ".gfa")
# Mycelia.load_graph(graph_outfile)

In [None]:
cmd = "CREATE CONSTRAINT ON (k:Kmer) ASSERT k.identifier IS UNIQUE"
@time Mycelia.cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd);

In [None]:
node_types = unique(graph.vprops[v][:TYPE] for v in Graphs.vertices(graph))

In [None]:
# initialize graph

In [None]:
# add fasta records to graph

In [None]:
# add fasta records to Neo4J
# add over URL

In [None]:
upload_node_type_over_url("Fasta", graph, ADDRESS, USERNAME, PASSWORD, DATABASE)

In [None]:
# add fasta records to graph

In [None]:
# add fasta records to Neo4J
# batch file upload

In [None]:
upload_node_type_over_read_csv("FastaRecord", graph, ADDRESS, USERNAME, PASSWORD, DATABASE)

In [None]:
src_type = "Fasta"
dst_type = "FastaRecord"
edge_type = "CONTAINS_RECORD"

In [None]:
upload_edge_type_over_url(src_type, dst_type, edge_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE, window_size=10)

In [None]:
# add kmers to graph

In [None]:
# add kmers to Neo4J
# add over URL

In [None]:
upload_node_type_over_url("Kmer", graph, ADDRESS, USERNAME, PASSWORD, DATABASE)

In [None]:
# add edges from records to kmers

In [None]:
src_type = "FastaRecord"
dst_type = "Kmer"
edge_type = "CONTAINS_KMER"
upload_edge_type_over_url(src_type, dst_type, edge_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE)

In [None]:
# upload to Neo4j

In [None]:
# Kmer to Kmer connections

In [None]:
# on merge with these Kmer to Kmer connections we'll need to increment the counts!
# any time we increase the number of genomes, the values will become out of date
src_type = "Kmer"
dst_type = "Kmer"
edge_type = "CONNECTS_TO"
upload_edge_type_over_url(src_type, dst_type, edge_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE)

In [None]:
# get the full list of kmers under the fasta of interest

In [None]:
# for each kmer, count the number of incoming "CONTAINS_KMER" edges

In [None]:
# divide by max count

In [None]:
# pull in annotations!!!

In [None]:
ncbi_metadata_of_interest

In [None]:
entity_metadata_table = ncbi_metadata_of_interest

In [None]:
entity_metadata_table = DataFrames.rename(entity_metadata_table, "# assembly_accession" => "identifier")

In [None]:
entity_metadata_table[!, "TYPE"] .= "Entity"

In [None]:
entity_metadata_table

In [None]:
show(entity_metadata_table[1:1, :], allcols=true)

In [None]:
graph.indices

In [None]:
keys(graph.metaindex[:identifier])

In [None]:
for row in DataFrames.eachrow(entity_metadata_table)
    identifier = row["identifier"]
    @show has_identifier(graph, identifier)
    if !has_identifier(graph, identifier)
        Graphs.add_vertex!(graph)
        vertex = Graphs.nv(graph)
    else
        vertex = graph[]
    end
    @show vertex
    for col in names(row)
        if !MetaGraphs.has_prop(graph, vertex, Symbol(col))
            MetaGraphs.set_prop!(graph, vertex, Symbol(col), row[col])
        else
            current_value = MetaGraphs.get_prop(graph, vertex, Symbol(col))
            if current_value != row[col]
                @warn "property $(col)'s current value $(current_value) != this value $(row[col])"
            end
        end
    end
    @show MetaGraphs.props(graph, vertex)
end

In [None]:

    for entity in keys(entity_file_map)
        # f_type = "Genome"
        fastx = entity_file_map[entity]["genome"]
        Graphs.add_vertex!(graph)
        fasta_node = Graphs.nv(graph)
        # node types are camel case
        MetaGraphs.set_prop!(graph, fasta_node, :TYPE, "Fasta")
        # node and edge properties are lowercase
        # add entity identifier as a property that can be queried on
        MetaGraphs.set_prop!(graph, fasta_node, :path, fastx)
        MetaGraphs.set_prop!(graph, fasta_node, :identifier, entity)
        for record in Mycelia.open_fastx(fastx)
            Graphs.add_vertex!(graph)
            record_node = Graphs.nv(graph)
            
            MetaGraphs.set_prop!(graph, record_node, :TYPE, "FastaRecord")
            MetaGraphs.set_prop!(graph, record_node, :identifier, FASTX.identifier(record))
            MetaGraphs.set_prop!(graph, record_node, :description, FASTX.description(record))
            MetaGraphs.set_prop!(graph, record_node, :sequence, FASTX.sequence(record))
            if typeof(FASTX.sequence(record)) == BioSequences.LongDNASeq
                sequence_type = "dna"
            elseif typeof(FASTX.sequence(record)) == BioSequences.LongAminoAcidSeq
                sequence_type = "aa"
            elseif typeof(FASTX.sequence(record)) == BioSequences.LongRNASeq
                sequence_type = "rna"
            end
            MetaGraphs.set_prop!(graph, record_node, :sequence_type, sequence_type)
            edge = Graphs.Edge(fasta_node, record_node)
            Graphs.add_edge!(graph, edge)
            # edge types are all caps
            MetaGraphs.set_prop!(graph, edge, :TYPE, "CONTAINS_RECORD")
            # for kmer in collect(BioSequences.each(kmer_type, FASTX.sequence(record)))[1:2^3]
            for kmer in BioSequences.each(kmer_type, FASTX.sequence(record))
                canonical_kmer = BioSequences.canonical(kmer)
                if canonical_kmer in canonical_kmers
                    kmer_node = graph[canonical_kmer, :identifier]
                else
                    push!(canonical_kmers, canonical_kmer)
                    Graphs.add_vertex!(graph)
                    kmer_node = Graphs.nv(graph)
                    MetaGraphs.set_prop!(graph, kmer_node, :TYPE, "Kmer")
                    MetaGraphs.set_prop!(graph, kmer_node, :identifier, BioSequences.canonical(kmer))
                    MetaGraphs.set_prop!(graph, kmer_node, :sequence_type, sequence_type)
                end
                edge = Graphs.Edge(record_node, kmer_node)
                if !Graphs.has_edge(graph, edge)
                    Graphs.add_edge!(graph, edge)
                    MetaGraphs.set_prop!(graph, edge, :TYPE, "CONTAINS_KMER")
                    # can't use vectors as properties :(
                    MetaGraphs.set_prop!(graph, edge, :count, 0)
                    MetaGraphs.set_prop!(graph, edge, Symbol(true), 0)
                    MetaGraphs.set_prop!(graph, edge, Symbol(false), 0)
                end
                is_canonical = Symbol(BioSequences.iscanonical(kmer.fw))
                current_count = MetaGraphs.get_prop(graph, edge, is_canonical) + 1
                MetaGraphs.set_prop!(graph, edge, is_canonical, current_count)

                current_count = MetaGraphs.get_prop(graph, edge, Symbol(true)) + MetaGraphs.get_prop(graph, edge, Symbol(false))
                MetaGraphs.set_prop!(graph, edge, :count, current_count)                
            end
            # for edgemer in collect(BioSequences.each(edgemer_type, FASTX.sequence(record)))[1:2^3-1]
            for edgemer in BioSequences.each(edgemer_type, FASTX.sequence(record))
                src, canonical_src, src_is_canonical, dst, canonical_dst, dst_is_canonical = assess_edgemer(edgemer.fw)
                if canonical_dst < canonical_src
                    src, canonical_src, src_is_canonical, dst, canonical_dst, dst_is_canonical = assess_edgemer(edgemer.bw)
                end
                src_index, dst_index = graph[canonical_src, :identifier], graph[canonical_dst, :identifier]
                edge = Graphs.Edge(src_index, dst_index)
                if !Graphs.has_edge(graph, edge)
                    Graphs.add_edge!(graph, edge)
                    MetaGraphs.set_prop!(graph, edge, :TYPE, "CONNECTS_TO")
                    for o in edge_orientations
                        MetaGraphs.set_prop!(graph, edge, o, 0)
                    end
                    MetaGraphs.set_prop!(graph, edge, :count, 0)
                end
                orientations = Symbol("$(src_is_canonical)_$(dst_is_canonical)")
                current_count = MetaGraphs.get_prop(graph, edge, orientations) + 1
                MetaGraphs.set_prop!(graph, edge, orientations, current_count)
                
                current_count = sum(MetaGraphs.get_prop(graph, edge, o) for o in edge_orientations)
                MetaGraphs.set_prop!(graph, edge, :count, current_count) 
            end
        end
    end
    return graph
end