# Kmer Graph

In [None]:
# Name of this analysis run: an ISO date followed by a short task label.
DATE_TASK = "2022-03-12-ecoli-tequatrovirus"
# Create the per-run workspace directory under the home directory and work from inside it.
DIR = mkpath("$(homedir())/workspace/$DATE_TASK")
cd(DIR)
# Split the run name into its date part and its task part.
DATE, TASK = match(r"^(\d{4}-\d{2}-\d{2})-(.*)$", DATE_TASK).captures

In [None]:
import Pkg
Pkg.update()
pkgs = [
"JSON",
"HTTP",
"Dates",
"uCSV",
"DelimitedFiles",
"DataFrames",
"ProgressMeter",
"BioSequences",
"FASTX",
"Distances",
"StatsPlots",
"StatsBase",
"Statistics",
"MultivariateStats",
"Random",
"Primes",
"SparseArrays",
"SHA",
"GenomicAnnotations",
"Combinatorics",
"OrderedCollections",
"Downloads",
"Clustering",
"Revise",
"Mmap",
"Graphs",
"MetaGraphs",
"FileIO"
]

# Import every package named in `pkgs`. If an import throws for any reason, add the
# package with Pkg and retry the import once (a second failure propagates).
for pkg in pkgs
    try
        eval(Meta.parse("import $pkg"))
    catch
        Pkg.add(pkg)
        eval(Meta.parse("import $pkg"))
    end
end

# Installing from the GitHub URL works, but local changes then need a push and a
# kernel restart before they take effect:
# "https://github.com/cjprybol/Mycelia.git#master",
# Pointing at the local checkout with a branch suffix did not work:
# "$(homedir())/git/Mycelia#master",
# Instead: import Mycelia by name and, if that fails, register the local checkout
# as a development package and retry the import.
pkg_path = "$(homedir())/git/Mycelia"
try
    eval(Meta.parse("import $(basename(pkg_path))"))
catch
    # Pkg.add(url=pkg)
    Pkg.develop(path=pkg_path)
    # pkg = replace(basename(pkg), ".git#master" => "")
    # pkg = replace(basename(pkg), "#master" => "")
    eval(Meta.parse("import $(basename(pkg_path))"))
end

In [None]:
# NCBI Taxonomy browser URL patterns for looking up a taxon by id:
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?&id=$(tax_id)
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?lvl=0&amp;id=2733124
# root_tax_id = 2733124

# Taxonomy id whose children are collected in the next cell.
root_tax_id = 10663
# Host name matched case-insensitively against organism names when filtering metadata.
host = "Escherichia"

In [None]:
# All child taxa reported for the root, followed by the root taxon itself.
child_tax_ids = [Mycelia.taxonomic_id_to_children(root_tax_id); root_tax_id]
# # child_tax_ids = vcat(child_tax_ids, root_tax_id)

In [None]:
# TODO
# here is where we should apply a filter where host == Escherichia
# need to load host information into neo4j taxonomy

In [None]:
# Load the GenBank metadata through Mycelia; the RefSeq variant is kept for reference.
# refseq_metadata = Mycelia.load_refseq_metadata()
ncbi_metadata = Mycelia.load_genbank_metadata()

In [None]:
# Print the first metadata row with every column shown, to inspect the available fields.
show(ncbi_metadata[1:1, :], allcols=true)

In [None]:
# Build one boolean mask per selection criterion, then AND them into `full_filter`.

# Membership is tested against a Set so each row costs one hash lookup instead of a
# scan over the whole `child_tax_ids` collection.
tax_id_filter = let wanted_tax_ids = Set(child_tax_ids)
    map(in(wanted_tax_ids), ncbi_metadata[!, "taxid"])
end
# The case-insensitive host pattern is compiled once rather than once per row.
is_right_host = let host_pattern = Regex(host, "i")
    map(x -> occursin(host_pattern, x), ncbi_metadata[!, "organism_name"])
end
# Keep only rows with an empty "excluded_from_refseq" field.
not_excluded = ncbi_metadata[!, "excluded_from_refseq"] .== ""
# Keep only rows whose genome representation is "Full".
is_full = ncbi_metadata[!, "genome_rep"] .== "Full"
# Accepted assembly levels; broader alternatives are kept below for reference.
# assembly_levels = ["Complete Genome"]
assembly_levels = ["Complete Genome", "Chromosome"]
# assembly_levels = ["Complete Genome", "Chromosome", "Scaffold"]
# assembly_levels = ["Complete Genome", "Chromosome", "Scaffold", "Contig"]
assembly_level_filter = map(x -> x in assembly_levels, ncbi_metadata[!, "assembly_level"])
full_filter = is_full .& not_excluded .& assembly_level_filter .& tax_id_filter .& is_right_host
# Number of metadata rows that pass every criterion.
count(full_filter)

In [None]:
# TODO
# here is another place we could enforce host == escherichia
# we'll use a manual filter as a temporary solution

In [None]:
# ncbi_metadata_of_interest = ncbi_metadata[full_filter, :]

In [None]:
# https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=genbank&id=GCA_021354775

In [None]:
# for col in names(ncbi_metadata_of_interest)
#     @show col, ncbi_metadata_of_interest[1, col]
# end

In [None]:
# # can I also get genbank record?????
# # for extension in ["genomic.fna.gz", "protein.faa.gz"]
# for extension in ["genomic.fna.gz", "protein.faa.gz", "genomic.gbff.gz"]
#     outdir = mkpath(joinpath(DIR, extension))
#     ProgressMeter.@showprogress for row in DataFrames.eachrow(ncbi_metadata_of_interest)
#         url = Mycelia.ncbi_ftp_path_to_url(row["ftp_path"], extension)
#         outfile = joinpath(outdir, basename(url))
#         if !isfile(outfile)
#             try
#                 Downloads.download(url, outfile)
#             catch e
#                 # @show e
#                 showerror(stdout, e)
#                 # @assert extension == "protein.faa.gz"
#                 # here is where we should call prodigal to fill in protein annotations if we don't otherwise see them
#             end
#         end
#     end
# end

In [None]:
# File suffix of the genome FASTA files used below; it doubles as the name of the
# sub-directory of DIR that holds them (created here if it does not exist yet).
extension = "genomic.fna.gz"
outdir = mkpath(joinpath(DIR, extension))

In [None]:
# Full paths of everything in `outdir`, skipping Jupyter checkpoint entries.
fastx_files = [
    path for path in readdir(outdir, join=true) if !occursin(".ipynb_checkpoints", path)
]

In [None]:
# Take the leading "GCA_<digits>.<digits>" accession from each file name as its entity id.
entities = [
    match(r"^(GCA_\d+\.\d+)_.*$", name).captures[1] for name in basename.(fastx_files)
]

In [None]:
# k-mer length used to build the graph; set by hand here instead of being derived
# from the (disabled) saturation assessment below.
# kmer_size = Mycelia.assess_dnamer_saturation(fastx_files)
kmer_size = 7

In [None]:
# graph = Mycelia.fastx_to_kmer_graph(BioSequences.BigDNAMer{kmer_size}, fastx_files)

In [None]:
# use basename of files as identifiers

In [None]:
"""
    assess_edgemer(edgemer)

Split an edge-mer of length k+1 into its two overlapping k-mers and report the
orientation of each.

Returns the 6-tuple
`(src, canonical_src, src_is_canonical, dst, canonical_dst, dst_is_canonical)`, where
`src` is built from positions `1:k`, `dst` from positions `2:k+1`, the `canonical_*`
values come from `BioSequences.canonical`, and each `*_is_canonical` flag is `true`
when the k-mer equals its canonical form.
"""
function assess_edgemer(edgemer)
    overlap = length(edgemer) - 1
    KmerType = BioSequences.BigDNAMer{overlap}

    # Prefix k-mer (source side of the edge) and its canonical form.
    src = KmerType(edgemer[i] for i in 1:overlap)
    canonical_src = BioSequences.canonical(src)

    # Suffix k-mer (destination side of the edge) and its canonical form.
    dst = KmerType(edgemer[i] for i in 2:overlap+1)
    canonical_dst = BioSequences.canonical(dst)

    return (
        src,
        canonical_src,
        src == canonical_src,
        dst,
        canonical_dst,
        dst == canonical_dst,
    )
end

In [None]:
"""
    fastx_to_metagraph(; k, fastx_files, entities)

Build a `MetaGraphs.MetaDiGraph` that links sequence files, their records, and the
canonical k-mers observed in those records.

# Keywords
- `k`: k-mer length; must be odd and satisfy `3 <= k <= 63`.
- `fastx_files`: paths of the sequence files, opened with `Mycelia.open_fastx`.
- `entities`: identifiers paired positionally (via `zip`) with `fastx_files`.

# Returns
The populated graph. Every vertex and edge carries a `:TYPE` property:
- `"Fasta"` vertices (`:path`, `:entity`), one per file.
- `"FastaRecord"` vertices (`:identifier`, `:description`, `:sequence`,
  `:sequence_type`), one per record, joined to their file by a `"CONTAINS"` edge.
- `"Kmer"` vertices (`:kmer`, `:sequence_type`), one per distinct canonical k-mer;
  `:kmer` is the graph's indexing property.
- record -> k-mer `"CONTAINS"` edges counting observations under `:true` (forward
  k-mer is canonical), `:false`, and their total under `:count`.
- k-mer -> k-mer `"CONNECTS_TO"` edges counting observations per
  `<src_is_canonical>_<dst_is_canonical>` orientation, plus their total `:count`.

# Throws
- `AssertionError` if `k` is even or outside `3:63`.
- `ArgumentError` if a record's sequence is not a `LongDNASeq`, `LongAminoAcidSeq`,
  or `LongRNASeq`.
"""
function fastx_to_metagraph(;k, fastx_files, entities)
    @assert isodd(k) "k must be odd to avoid the possibility of kmers being reverse complements of themselves"
    @assert 3 <= k <= 63 "k must be at least 3 to allow for kmer overlaps and less than 64 to be compatible with type limits"
    kmer_type = BioSequences.BigDNAMer{k}
    edgemer_type = BioSequences.BigDNAMer{k+1}
    # One counter per (source is canonical)_(destination is canonical) combination.
    edge_orientations = Symbol.([
        "true_true",
        "true_false",
        "false_true",
        "false_false"
    ])
    canonical_kmers = Set{kmer_type}()
    graph = MetaGraphs.MetaDiGraph()
    # Index vertices by :kmer so a k-mer's vertex can be looked up as graph[kmer, :kmer].
    MetaGraphs.set_indexing_prop!(graph, :kmer)
    for (fastx, entity) in zip(fastx_files, entities)
        Graphs.add_vertex!(graph)
        fasta_node = Graphs.nv(graph)
        # node types are camel case
        MetaGraphs.set_prop!(graph, fasta_node, :TYPE, "Fasta")
        # node and edge properties are lowercase
        # add entity identifier as a property that can be queried on
        MetaGraphs.set_prop!(graph, fasta_node, :path, fastx)
        MetaGraphs.set_prop!(graph, fasta_node, :entity, entity)
        for record in Mycelia.open_fastx(fastx)
            # Decode the record's sequence once and reuse it for the stored property,
            # the type label, and both k-mer passes.
            sequence = FASTX.sequence(record)

            Graphs.add_vertex!(graph)
            record_node = Graphs.nv(graph)

            MetaGraphs.set_prop!(graph, record_node, :TYPE, "FastaRecord")
            MetaGraphs.set_prop!(graph, record_node, :identifier, FASTX.identifier(record))
            MetaGraphs.set_prop!(graph, record_node, :description, FASTX.description(record))
            MetaGraphs.set_prop!(graph, record_node, :sequence, sequence)
            if sequence isa BioSequences.LongDNASeq
                sequence_type = "dna"
            elseif sequence isa BioSequences.LongAminoAcidSeq
                sequence_type = "aa"
            elseif sequence isa BioSequences.LongRNASeq
                sequence_type = "rna"
            else
                # Without this branch `sequence_type` would be unassigned and the next
                # statement would fail with an uninformative UndefVarError.
                throw(ArgumentError(
                    "unsupported sequence type $(typeof(sequence)) for record " *
                    "$(FASTX.identifier(record)) in $(fastx)"
                ))
            end
            MetaGraphs.set_prop!(graph, record_node, :sequence_type, sequence_type)
            edge = Graphs.Edge(fasta_node, record_node)
            Graphs.add_edge!(graph, edge)
            # edge types are all caps
            MetaGraphs.set_prop!(graph, edge, :TYPE, "CONTAINS")

            # Pass 1: one vertex per canonical k-mer, and per-orientation observation
            # counts on the record -> k-mer edge.
            for kmer in BioSequences.each(kmer_type, sequence)
                canonical_kmer = BioSequences.canonical(kmer)
                if canonical_kmer in canonical_kmers
                    kmer_node = graph[canonical_kmer, :kmer]
                else
                    push!(canonical_kmers, canonical_kmer)
                    Graphs.add_vertex!(graph)
                    kmer_node = Graphs.nv(graph)
                    MetaGraphs.set_prop!(graph, kmer_node, :TYPE, "Kmer")
                    MetaGraphs.set_prop!(graph, kmer_node, :sequence_type, sequence_type)
                    MetaGraphs.set_prop!(graph, kmer_node, :kmer, canonical_kmer)
                end
                edge = Graphs.Edge(record_node, kmer_node)
                if !Graphs.has_edge(graph, edge)
                    Graphs.add_edge!(graph, edge)
                    MetaGraphs.set_prop!(graph, edge, :TYPE, "CONTAINS")
                    # can't use vectors as properties :(
                    MetaGraphs.set_prop!(graph, edge, :count, 0)
                    MetaGraphs.set_prop!(graph, edge, Symbol(true), 0)
                    MetaGraphs.set_prop!(graph, edge, Symbol(false), 0)
                end
                # Bump the counter for the orientation in which the k-mer was observed.
                is_canonical = Symbol(BioSequences.iscanonical(kmer.fw))
                current_count = MetaGraphs.get_prop(graph, edge, is_canonical) + 1
                MetaGraphs.set_prop!(graph, edge, is_canonical, current_count)

                # :count is kept equal to the sum of both orientation counters.
                current_count = MetaGraphs.get_prop(graph, edge, Symbol(true)) + MetaGraphs.get_prop(graph, edge, Symbol(false))
                MetaGraphs.set_prop!(graph, edge, :count, current_count)
            end

            # Pass 2: connect consecutive k-mers. Each (k+1)-mer is read in whichever
            # strand puts the smaller canonical k-mer on the source side.
            for edgemer in BioSequences.each(edgemer_type, sequence)
                src, canonical_src, src_is_canonical, dst, canonical_dst, dst_is_canonical = assess_edgemer(edgemer.fw)
                if canonical_dst < canonical_src
                    src, canonical_src, src_is_canonical, dst, canonical_dst, dst_is_canonical = assess_edgemer(edgemer.bw)
                end
                src_index, dst_index = graph[canonical_src, :kmer], graph[canonical_dst, :kmer]
                edge = Graphs.Edge(src_index, dst_index)
                if !Graphs.has_edge(graph, edge)
                    Graphs.add_edge!(graph, edge)
                    MetaGraphs.set_prop!(graph, edge, :TYPE, "CONNECTS_TO")
                    for o in edge_orientations
                        MetaGraphs.set_prop!(graph, edge, o, 0)
                    end
                    MetaGraphs.set_prop!(graph, edge, :count, 0)
                end
                orientations = Symbol("$(src_is_canonical)_$(dst_is_canonical)")
                current_count = MetaGraphs.get_prop(graph, edge, orientations) + 1
                MetaGraphs.set_prop!(graph, edge, orientations, current_count)

                # :count is kept equal to the sum of the four orientation counters.
                current_count = sum(MetaGraphs.get_prop(graph, edge, o) for o in edge_orientations)
                MetaGraphs.set_prop!(graph, edge, :count, current_count)
            end
        end
    end
    return graph
end

In [None]:
# Build the graph from all input files and report how long construction takes.
@time graph = fastx_to_metagraph(k=kmer_size, fastx_files=fastx_files, entities=entities)

In [None]:
# Base path for graph output files; the name records the root taxon id and k-mer size.
graph_outfile = "$DIR/root-tax-id_$(root_tax_id).k_$(kmer_size).genome-graph"
# Saving, GFA export, and reloading are currently disabled:
# Mycelia.save_graph(graph, graph_outfile)
# Mycelia.graph_to_gfa(graph, graph_outfile * ".gfa")
# Mycelia.load_graph(graph_outfile)

In [None]:
# TODO: use new database

In [None]:
# Put the local Neo4j bin directory at the front of PATH unless PATH already mentions it.
NEO4J_BIN_DIR = "/home/jupyter-cjprybol/software/neo4j-community-4.4.3/bin"
occursin(NEO4J_BIN_DIR, ENV["PATH"]) || (ENV["PATH"] = "$(NEO4J_BIN_DIR):" * ENV["PATH"])

# Connection settings. The password is not stored in the notebook: it is read from
# the first line of a per-database file under the user's .config/neo4j directory.
database_id = "1f883654"
USERNAME="neo4j"
PASSWORD=readline(joinpath(homedir(), ".config", "neo4j", "$(database_id).pass"));
ADDRESS="neo4j+s://$(database_id).databases.neo4j.io:7687"
DATABASE = "neo4j"

In [None]:
# Enforce one Kmer node per k-mer. Kmer nodes in this notebook carry their sequence
# in the `kmer` property (there is no `sequence` property on them), so the uniqueness
# constraint has to target `k.kmer` to have any effect.
cmd = "CREATE CONSTRAINT ON (k:Kmer) ASSERT k.kmer IS UNIQUE"
# Every other use of Mycelia.cypher in this notebook passes its result to run(),
# so the command is executed here as well rather than only constructed.
# NOTE(review): re-running this cell once the constraint exists may make the
# command fail — confirm whether an idempotent form is wanted.
@time run(Mycelia.cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd))

In [None]:
# Distinct :TYPE labels across all vertices, in order of first appearance.
node_types = unique(MetaGraphs.get_prop(graph, v, :TYPE) for v in Graphs.vertices(graph))

In [None]:
# initialize graph

In [None]:
# add fasta records to graph

In [None]:
# add fasta records to Neo4J
# add over URL

In [None]:
# add fasta records to graph

In [None]:
# add fasta records to Neo4J
# batch file upload

In [None]:
# add kmers to graph

In [None]:
# add kmers to Neo4J
# add over URL

In [None]:
# upload fasta records as tsv files on batch

In [None]:
# Upload every vertex of the selected node type to Neo4j, sending up to 100 MERGE
# statements per cypher call.
# NOTE(review): `node_types[3:3]` picks a type by position; presumably "Kmer" given
# the order in which vertices are created — confirm.
for node_type in node_types[3:3]
    @show node_type
    vertices_of_type = [v for v in Graphs.vertices(graph) if (graph.vprops[v][:TYPE] == node_type)]

    # Every property name present on these vertices, minus the :TYPE label itself
    # (the label is expressed as the node label in the MERGE statement instead).
    node_type_params = filter(param -> param != :TYPE, reduce(union, keys(graph.vprops[v]) for v in vertices_of_type))

    window_size = 100
    V = length(vertices_of_type)
    windows = [i:min(i+window_size-1,V) for i in 1:window_size:V]

    ProgressMeter.@showprogress for window in windows
        cmds = String[]
        for (i, v) in enumerate(vertices_of_type[window])
            params = String[]
            for param in node_type_params
                value = string(MetaGraphs.get_prop(graph, v, param))
                # Values are embedded in single-quoted Cypher string literals, so
                # backslashes and single quotes must be escaped (backslashes first)
                # or they would terminate or corrupt the literal.
                escaped_value = replace(replace(value, "\\" => "\\\\"), "'" => "\\'")
                push!(params, "$(string(param)):'$(escaped_value)'")
            end
            joint_params = join(params, ", ")
            # Node variables are numbered within the batch so they stay distinct
            # inside the single combined statement.
            push!(cmds, "MERGE (node$(i):$(node_type) {$(joint_params)})")
        end
        cmd = join(cmds, ' ')
        cypher_cmd = Mycelia.cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)
        run(cypher_cmd)
    end
end

In [None]:
"""
    upload_node_type(graph, node_type)

Placeholder for uploading all vertices of `node_type` from `graph`.

Not implemented yet: both arguments are ignored and `nothing` is returned.
"""
function upload_node_type(graph, node_type)
    return nothing
end

In [None]:
# Alexander said that setting a unique constraint automatically creates an index?
# # index the kmer space to improve performance??
# cmd = 
# """
# CREATE INDEX
# FOR (k:Kmer)
# ON (k.kmer)
# """
# cmd = strip(replace(cmd, '\n' => ' '))
# cypher_cmd = Mycelia.cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)
# run(cypher_cmd)  

In [None]:
# create an arbitrary node upload

In [None]:
# for each node class
    # create an upload command that maps properties in julia graph to properties in Neo4J graph
    # upload in batches of 100
# end

In [None]:
# Upload the k-mers in the "kmer" column of `filtered_node_table` to Neo4j, in batches
# of up to 100 rows with one MERGE statement per row.
# NOTE(review): `filtered_node_table` is not defined in the visible part of this
# notebook — confirm where it comes from before running this cell.
t = filtered_node_table
window_size = 100
windows = map(1:window_size:DataFrames.nrow(t)) do first_row
    first_row:min(first_row + window_size - 1, DataFrames.nrow(t))
end
ProgressMeter.@showprogress for window in windows
    # Node variables are named after the absolute row index so they stay distinct
    # inside the single combined statement.
    cmds = map(zip(window, t[window, "kmer"])) do (w, kmer)
        "MERGE (k$(w):Kmer {kmer: '$kmer'})"
    end
    cmd = join(cmds, ' ')
    cypher_cmd = Mycelia.cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)
    run(cypher_cmd)
end

In [None]:
# Work on adding edges now!