# Kmer Graph

In [None]:
# TODO
# properly set types for parameters

In [1]:
DATE_TASK = "2022-03-24-ecoli-tequatro-pangenome"
DIR = mkpath("$(homedir())/workspace/$DATE_TASK")
cd(DIR)
DATE, TASK = match(r"^(\d{4}-\d{2}-\d{2})-(.*)$", DATE_TASK).captures

2-element Vector{Union{Nothing, SubString{String}}}:
 "2022-03-24"
 "ecoli-tequatro-pangenome"

In [2]:
import Pkg
Pkg.update()
pkgs = [
"JSON",
"HTTP",
"Dates",
"uCSV",
"DelimitedFiles",
"DataFrames",
"ProgressMeter",
"BioSequences",
"FASTX",
"Distances",
"StatsPlots",
"StatsBase",
"Statistics",
"MultivariateStats",
"Random",
"Primes",
"SparseArrays",
"SHA",
"GenomicAnnotations",
"Combinatorics",
"OrderedCollections",
"Downloads",
"Clustering",
"Revise",
"Mmap",
"Graphs",
"MetaGraphs",
"FileIO"
]

for pkg in pkgs
    try
        eval(Meta.parse("import $pkg"))
    catch
        Pkg.add(pkg)
        eval(Meta.parse("import $pkg"))
    end
end

# works but can't update locally, need to push and restart kernel to activate changes
# "https://github.com/cjprybol/Mycelia.git#master",
# didn't work
# "$(homedir())/git/Mycelia#master",
pkg_path = "$(homedir())/git/Mycelia"
try
    eval(Meta.parse("import $(basename(pkg_path))"))
catch
    # Pkg.add(url=pkg)
    Pkg.develop(path=pkg_path)
    # pkg = replace(basename(pkg), ".git#master" => "")
    # pkg = replace(basename(pkg), "#master" => "")
    eval(Meta.parse("import $(basename(pkg_path))"))
end

[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m    Updating[22m[39m git-repo `https://github.com/cjprybol/Mycelia.git#master`
[32m[1m   Installed[22m[39m DualNumbers ───── v0.6.8
[32m[1m   Installed[22m[39m RecipesPipeline ─ v0.5.2
[32m[1m   Installed[22m[39m PooledArrays ──── v1.4.1
[32m[1m   Installed[22m[39m Plots ─────────── v1.27.2
[32m[1m    Updating[22m[39m `~/git/Mycelia/docs/Project.toml`
 [90m [453d265d] [39m[93m~ Mycelia v0.1.0 `https://github.com/cjprybol/Mycelia.git#master#master` ⇒ v0.1.0 `https://github.com/cjprybol/Mycelia.git#master#master`[39m
 [90m [91a5bcdd] [39m[93m↑ Plots v1.27.1 ⇒ v1.27.2[39m
[32m[1m    Updating[22m[39m `~/git/Mycelia/docs/Manifest.toml`
 [90m [fa6b7ba4] [39m[93m↑ DualNumbers v0.6.7 ⇒ v0.6.8[39m
 [90m [453d265d] [39m[93m~ Mycelia v0.1.0 `https://github.com/cjprybol/Mycelia.git#master#master` ⇒ v0.1.0 `https://github.com/cjprybol/Mycelia.git#master#master`[39m
 [90m [91

In [13]:
function assess_edgemer(edgemer)
    k = length(edgemer)-1
    kmer_type = BioSequences.BigDNAMer{k}
    src = kmer_type(edgemer[i] for i in 1:k)
    dst = kmer_type(edgemer[i] for i in 2:k+1)
    canonical_src = BioSequences.canonical(src)
    canonical_dst = BioSequences.canonical(dst)
    src_is_canonical = src == canonical_src
    dst_is_canonical = dst == canonical_dst
    # @show edgemer
    # @show src
    # @show canonical_src
    # @show src_is_canonical
    # @show dst
    # @show canonical_dst
    # @show dst_is_canonical
    return src, canonical_src, src_is_canonical, dst, canonical_dst, dst_is_canonical
end

assess_edgemer (generic function with 1 method)

In [14]:
# TODO break me out into indivual steps of additon at each level
function fastx_to_metagraph(k, entity_file_map)
    @assert isodd(k) "k must be odd to avoid the possibility of kmers being reverse complements of themselves"
    @assert 3 <= k <= 63 "k must be at least 3 to allow for kmer overlaps and less than 64 to be compatible with type limits"
    kmer_type = BioSequences.BigDNAMer{k}
    edgemer_type = BioSequences.BigDNAMer{k+1}
    edge_orientations = Symbol.([
        "true_true",
        "true_false",
        "false_true",
        "false_false"
    ])
    canonical_kmers = Set{kmer_type}()
    graph = MetaGraphs.MetaDiGraph()
    MetaGraphs.set_indexing_prop!(graph, :identifier)
    for entity in keys(entity_file_map)
        # f_type = "Genome"
        fastx = entity_file_map[entity]["genome"]
        Graphs.add_vertex!(graph)
        fasta_node = Graphs.nv(graph)
        # node types are camel case
        MetaGraphs.set_prop!(graph, fasta_node, :TYPE, "Fasta")
        # node and edge properties are lowercase
        # add entity identifier as a property that can be queried on
        MetaGraphs.set_prop!(graph, fasta_node, :path, fastx)
        MetaGraphs.set_prop!(graph, fasta_node, :identifier, entity)
        for record in Mycelia.open_fastx(fastx)
            Graphs.add_vertex!(graph)
            record_node = Graphs.nv(graph)
            
            MetaGraphs.set_prop!(graph, record_node, :TYPE, "FastaRecord")
            MetaGraphs.set_prop!(graph, record_node, :identifier, FASTX.identifier(record))
            MetaGraphs.set_prop!(graph, record_node, :description, FASTX.description(record))
            MetaGraphs.set_prop!(graph, record_node, :sequence, FASTX.sequence(record))
            if typeof(FASTX.sequence(record)) == BioSequences.LongDNASeq
                sequence_type = "dna"
            elseif typeof(FASTX.sequence(record)) == BioSequences.LongAminoAcidSeq
                sequence_type = "aa"
            elseif typeof(FASTX.sequence(record)) == BioSequences.LongRNASeq
                sequence_type = "rna"
            end
            MetaGraphs.set_prop!(graph, record_node, :sequence_type, sequence_type)
            edge = Graphs.Edge(fasta_node, record_node)
            Graphs.add_edge!(graph, edge)
            # edge types are all caps
            MetaGraphs.set_prop!(graph, edge, :TYPE, "CONTAINS_RECORD")
            # for kmer in collect(BioSequences.each(kmer_type, FASTX.sequence(record)))[1:2^3]
            for kmer in BioSequences.each(kmer_type, FASTX.sequence(record))
                canonical_kmer = BioSequences.canonical(kmer)
                if canonical_kmer in canonical_kmers
                    kmer_node = graph[canonical_kmer, :identifier]
                else
                    push!(canonical_kmers, canonical_kmer)
                    Graphs.add_vertex!(graph)
                    kmer_node = Graphs.nv(graph)
                    MetaGraphs.set_prop!(graph, kmer_node, :TYPE, "Kmer")
                    MetaGraphs.set_prop!(graph, kmer_node, :identifier, BioSequences.canonical(kmer))
                    MetaGraphs.set_prop!(graph, kmer_node, :sequence_type, sequence_type)
                end
                edge = Graphs.Edge(record_node, kmer_node)
                if !Graphs.has_edge(graph, edge)
                    Graphs.add_edge!(graph, edge)
                    MetaGraphs.set_prop!(graph, edge, :TYPE, "CONTAINS_KMER")
                    # can't use vectors as properties :(
                    MetaGraphs.set_prop!(graph, edge, :count, 0)
                    MetaGraphs.set_prop!(graph, edge, Symbol(true), 0)
                    MetaGraphs.set_prop!(graph, edge, Symbol(false), 0)
                end
                is_canonical = Symbol(BioSequences.iscanonical(kmer.fw))
                current_count = MetaGraphs.get_prop(graph, edge, is_canonical) + 1
                MetaGraphs.set_prop!(graph, edge, is_canonical, current_count)

                current_count = MetaGraphs.get_prop(graph, edge, Symbol(true)) + MetaGraphs.get_prop(graph, edge, Symbol(false))
                MetaGraphs.set_prop!(graph, edge, :count, current_count)                
            end
            # for edgemer in collect(BioSequences.each(edgemer_type, FASTX.sequence(record)))[1:2^3-1]
            for edgemer in BioSequences.each(edgemer_type, FASTX.sequence(record))
                src, canonical_src, src_is_canonical, dst, canonical_dst, dst_is_canonical = assess_edgemer(edgemer.fw)
                if canonical_dst < canonical_src
                    src, canonical_src, src_is_canonical, dst, canonical_dst, dst_is_canonical = assess_edgemer(edgemer.bw)
                end
                src_index, dst_index = graph[canonical_src, :identifier], graph[canonical_dst, :identifier]
                edge = Graphs.Edge(src_index, dst_index)
                if !Graphs.has_edge(graph, edge)
                    Graphs.add_edge!(graph, edge)
                    MetaGraphs.set_prop!(graph, edge, :TYPE, "CONNECTS_TO")
                    for o in edge_orientations
                        MetaGraphs.set_prop!(graph, edge, o, 0)
                    end
                    MetaGraphs.set_prop!(graph, edge, :count, 0)
                end
                orientations = Symbol("$(src_is_canonical)_$(dst_is_canonical)")
                current_count = MetaGraphs.get_prop(graph, edge, orientations) + 1
                MetaGraphs.set_prop!(graph, edge, orientations, current_count)
                
                current_count = sum(MetaGraphs.get_prop(graph, edge, o) for o in edge_orientations)
                MetaGraphs.set_prop!(graph, edge, :count, current_count) 
            end
        end
    end
    return graph
end

fastx_to_metagraph (generic function with 1 method)

In [15]:
function upload_node_type_over_url(node_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE, window_size=100)
    node_type_params = Set{Symbol}()
    vertices_of_type = [v for v in Graphs.vertices(graph) if (graph.vprops[v][:TYPE] == node_type)]
    node_type_params = filter(param -> param != :TYPE, reduce(union, keys(graph.vprops[v]) for v in vertices_of_type))
    
    V = length(vertices_of_type)
    windows = [i:min(i+window_size-1,V) for i in 1:window_size:V]
    
    ProgressMeter.@showprogress for window in windows
        cmds = []
        for (i, v) in enumerate(vertices_of_type[window])
            params = ["$(string(param)):'$(MetaGraphs.get_prop(graph, v, param))'" for param in node_type_params]
            joint_params = join(params, ", ")
            cmd = "MERGE (node$(i):$(node_type) {$(joint_params)})"
            push!(cmds, cmd)
        end
        cmd = join(cmds, ' ')
        cypher_cmd = Mycelia.cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)
        run(cypher_cmd)
    end    
end

upload_node_type_over_url (generic function with 2 methods)

In [16]:
function upload_edge_type_over_url(src_type, dst_type, edge_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE; window_size=100)    
    src_nodes = filter(v -> MetaGraphs.get_prop(graph, v, :TYPE) == src_type, Graphs.vertices(graph))
    dst_nodes = filter(v -> MetaGraphs.get_prop(graph, v, :TYPE) == dst_type, Graphs.vertices(graph))
    edges_to_upload = []
    for src_node in src_nodes
        outneighbors = Graphs.outneighbors(graph, src_node)
        outneighbors = filter(outneighbor -> outneighbor in dst_nodes, outneighbors)
        for outneighbor in outneighbors
            this_edge = Graphs.Edge(src_node, outneighbor)
            @assert MetaGraphs.get_prop(graph, this_edge, :TYPE) == edge_type
            push!(edges_to_upload, this_edge)
        end
    end
    edge_type_params = filter(param -> param != :TYPE, reduce(union, keys(graph.eprops[e]) for e in edges_to_upload))
    
    N = length(edges_to_upload)
    windows = [i:min(i+window_size-1,N) for i in 1:window_size:N]
    
    ProgressMeter.@showprogress for window in windows
        cmds = []
        for (i, e) in enumerate(edges_to_upload[window])
            params = ["$(string(param)):'$(MetaGraphs.get_prop(graph, e, param))'" for param in edge_type_params]
            joint_params = join(params, ", ")
            node_cmds = 
            """
            MERGE (src$(i):$(MetaGraphs.props(graph, e.src)[:TYPE]) {identifier: '$(MetaGraphs.props(graph, e.src)[:identifier])'})
            MERGE (dst$(i):$(MetaGraphs.props(graph, e.dst)[:TYPE]) {identifier: '$(MetaGraphs.props(graph, e.dst)[:identifier])'})
            """
            # match_cmds = 
            # """
            # MATCH 
            #     (src$(i):$(MetaGraphs.props(graph, e.src)[:TYPE]) {identifier: '$(MetaGraphs.props(graph, e.src)[:identifier])'}),
            #     (dst$(i):$(MetaGraphs.props(graph, e.dst)[:TYPE]) {identifier: '$(MetaGraphs.props(graph, e.dst)[:identifier])'})
            # """
            if !isempty(joint_params)
                relationship_cmd = "MERGE (src$(i))-[r$(i):$(MetaGraphs.props(graph, e)[:TYPE]) {$(joint_params)}]->(dst$(i))"
            else
                relationship_cmd = "MERGE (src$(i))-[r$(i):$(MetaGraphs.props(graph, e)[:TYPE])]->(dst$(i))"
            end
            cmd = node_cmds * relationship_cmd
            cmd = replace(cmd, '\n' => ' ')
            push!(cmds, cmd)
        end
        cmd = join(cmds, ' ')
        cypher_cmd = Mycelia.cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)
        run(cypher_cmd)
        # println(cypher_cmd)
    end    
end

upload_edge_type_over_url (generic function with 1 method)

In [17]:
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?&id=$(tax_id)
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?lvl=0&amp;id=2733124
# root_tax_id = 2733124

root_tax_id = 10663
host = "Escherichia"
database_id = "191a822b"

"191a822b"

In [4]:
child_tax_ids = vcat(Mycelia.taxonomic_id_to_children(root_tax_id), root_tax_id)

MATCH (n)<-[*]-(n2) WHERE n.tax_id IS NOT NULL AND n.tax_id = "10663" RETURN DISTINCT n2.tax_id AS tax_id


389-element Vector{Int64}:
 2844206
 2724310
 2844259
 2697536
 2844181
 1837867
 2844208
 2508180
 2844129
 1651198
 2844214
 2184699
 2844202
       ⋮
 2656520
 2846199
 2591063
 2846680
 2321390
 2846682
 2052930
 2846681
 2691085
 2844196
 2081604
   10663

In [5]:
# TODO
# here is where we should apply a filter where host == Escherichia
# need to load host information into neo4j taxonomy

In [6]:
# refseq_metadata = Mycelia.load_refseq_metadata()
ncbi_metadata = Mycelia.load_genbank_metadata()

Unnamed: 0_level_0,# assembly_accession,bioproject,biosample,wgs_master,refseq_category
Unnamed: 0_level_1,String,String,String,String,String
1,GCA_000001215.4,PRJNA13812,SAMN02803731,,reference genome
2,GCA_000001405.29,PRJNA31257,,,reference genome
3,GCA_000001515.5,PRJNA13184,SAMN02981217,AACZ00000000.4,na
4,GCA_000001545.3,PRJNA20869,SAMN02981238,ABGA00000000.1,na
5,GCA_000001635.9,PRJNA20689,,,reference genome
6,GCA_000001735.2,PRJNA10719,SAMN03081427,,reference genome
7,GCA_000001765.3,PRJNA10626,SAMN00779672,AADE00000000.2,na
8,GCA_000001895.4,PRJNA10629,SAMN02808228,AABR00000000.7,na
9,GCA_000001905.1,PRJNA12569,SAMN02953622,AAGU00000000.3,representative genome
10,GCA_000001985.1,PRJNA19555,SAMN02953685,ABAR00000000.1,representative genome


In [7]:
show(ncbi_metadata[1:1, :], allcols=true)

[1m1×23 DataFrame[0m
[1m Row [0m│[1m # assembly_accession [0m[1m bioproject [0m[1m biosample    [0m[1m wgs_master [0m[1m refseq_category  [0m[1m taxid [0m[1m species_taxid [0m[1m organism_name           [0m[1m infraspecific_name [0m[1m isolate [0m[1m version_status [0m[1m assembly_level [0m[1m release_type [0m[1m genome_rep [0m[1m seq_rel_date [0m[1m asm_name               [0m[1m submitter                         [0m[1m gbrs_paired_asm [0m[1m paired_asm_comp [0m[1m ftp_path                          [0m[1m excluded_from_refseq [0m[1m relation_to_type_material [0m[1m asm_not_live_date [0m
[1m     [0m│[90m String               [0m[90m String     [0m[90m String       [0m[90m String     [0m[90m String           [0m[90m Int64 [0m[90m Int64         [0m[90m String                  [0m[90m String             [0m[90m String  [0m[90m String         [0m[90m String         [0m[90m String       [0m[90m String     [0m[90

In [8]:
tax_id_filter = map(taxid -> taxid in child_tax_ids, ncbi_metadata[!, "taxid"])
is_right_host = map(x -> occursin(Regex(host, "i"), x), ncbi_metadata[!, "organism_name"])
not_excluded = ncbi_metadata[!, "excluded_from_refseq"] .== ""
is_full = ncbi_metadata[!, "genome_rep"] .== "Full"
# assembly_levels = ["Complete Genome"]
assembly_levels = ["Complete Genome", "Chromosome"]
# assembly_levels = ["Complete Genome", "Chromosome", "Scaffold"]
# assembly_levels = ["Complete Genome", "Chromosome", "Scaffold", "Contig"]
assembly_level_filter = map(x -> x in assembly_levels, ncbi_metadata[!, "assembly_level"])
full_filter = is_full .& not_excluded .& assembly_level_filter .& tax_id_filter .& is_right_host
count(full_filter)

138

In [9]:
# TODO
# here is another place we could enforce host == escherichia
# we'll use a manual filter as a temporary solution

In [10]:
ncbi_metadata_of_interest = ncbi_metadata[full_filter, :]

Unnamed: 0_level_0,# assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid
Unnamed: 0_level_1,String,String,String,String,String,Int64
1,GCA_000836945.1,,,,reference genome,2681598
2,GCA_000870165.1,,,,na,2681602
3,GCA_000884775.1,,,,na,2681597
4,GCA_000899615.1,,,,na,1141141
5,GCA_000900835.1,,,,na,1054480
6,GCA_000902495.1,,,,na,1054834
7,GCA_000918255.1,,,,na,1495285
8,GCA_000925055.1,,,,na,1204522
9,GCA_001041535.1,,,,na,1434323
10,GCA_001310115.1,PRJNA32381,,,na,66711


In [None]:
# https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=genbank&id=GCA_021354775

In [None]:
# for col in names(ncbi_metadata_of_interest)
#     @show col, ncbi_metadata_of_interest[1, col]
# end

In [11]:
N_FASTAS = 2

2

In [12]:
entity_file_maps = Dict()
ProgressMeter.@showprogress for row in DataFrames.eachrow(ncbi_metadata_of_interest[1:N_FASTAS, :])
    entity_id = row["# assembly_accession"]
    entity_file_maps[entity_id] = Dict()
    for (file_type, extension) in [
            "proteins" => "protein.faa.gz",
            "genome" => "genomic.fna.gz",
            "genbank" => "genomic.gbff.gz"
        ]
        outdir = mkpath(joinpath(DIR, extension))
        url = Mycelia.ncbi_ftp_path_to_url(row["ftp_path"], extension)
        outfile = joinpath(outdir, basename(url))
        if !isfile(outfile)
            try
                Downloads.download(url, outfile)
            catch e
                showerror(stdout, e)
            end
        end
        if isfile(outfile)
            entity_file_maps[entity_id][file_type] = outfile
        end
    end
end
entity_file_maps

Dict{Any, Any} with 2 entries:
  "GCA_000870165.1" => Dict{Any, Any}("proteins"=>"/home/jupyter-cameron.prybol…
  "GCA_000836945.1" => Dict{Any, Any}("proteins"=>"/home/jupyter-cameron.prybol…

In [None]:
# graph = Mycelia.fastx_to_kmer_graph(BioSequences.BigDNAMer{kmer_size}, fastx_files)

In [None]:
# use basename of files as identifiers

In [18]:
# kmer_size = Mycelia.assess_dnamer_saturation(fastx_files)
kmer_size = 7

7

In [19]:
@time graph = fastx_to_metagraph(kmer_size, entity_file_maps)

 15.348164 seconds (25.41 M allocations: 1.950 GiB, 22.50% gc time)


{8153, 45775} directed Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [20]:
graph_outfile = "$DIR/root-tax-id_$(root_tax_id).k_$(kmer_size).genome-graph"
# Mycelia.save_graph(graph, graph_outfile)
# Mycelia.graph_to_gfa(graph, graph_outfile * ".gfa")
# Mycelia.load_graph(graph_outfile)

"/home/jupyter-cameron.prybol/workspace/2022-03-24-ecoli-tequatro-pangenome/root-tax-id_10663.k_7.genome-graph"

In [21]:
NEO4J_BIN_DIR = "/home/jupyter-cjprybol/software/neo4j-community-4.4.3/bin"
if !occursin(NEO4J_BIN_DIR, ENV["PATH"])
    ENV["PATH"] = "$(NEO4J_BIN_DIR):" * ENV["PATH"]
end
USERNAME="neo4j"
PASSWORD=readline(joinpath(homedir(), ".config", "neo4j", "$(database_id).pass"));
ADDRESS="neo4j+s://$(database_id).databases.neo4j.io:7687"
DATABASE = "neo4j"

"neo4j"

In [22]:
cmd = "CREATE CONSTRAINT ON (k:Kmer) ASSERT k.identifier IS UNIQUE"
@time Mycelia.cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd);

  0.109799 seconds (467.62 k allocations: 25.648 MiB, 97.84% compilation time)


`[4mcypher-shell[24m [4m--address[24m [4mneo4j+s://191a822b.databases.neo4j.io:7687[24m [4m--username[24m [4mneo4j[24m [4m--password[24m [4mpE3dLStRnrfQrF6FZaeCwMVI2ueSwAmLPbb5Myy75Lo[24m [4m--database[24m [4mneo4j[24m [4m--format[24m [4mauto[24m [4m'CREATE CONSTRAINT ON (k:Kmer) ASSERT k.identifier IS UNIQUE'[24m`

In [23]:
node_types = unique(graph.vprops[v][:TYPE] for v in Graphs.vertices(graph))

3-element Vector{String}:
 "Fasta"
 "FastaRecord"
 "Kmer"

In [None]:
# initialize graph

In [None]:
# add fasta records to graph

In [None]:
# add fasta records to Neo4J
# add over URL

In [25]:
upload_node_type_over_url("Fasta", graph, ADDRESS, USERNAME, PASSWORD, DATABASE)

In [26]:
# add fasta records to graph

In [None]:
# add fasta records to Neo4J
# batch file upload

In [27]:
function upload_node_type_over_read_csv(node_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE, window_size=1)
        
    node_type_params = Set{Symbol}()
    vertices_of_type = [v for v in Graphs.vertices(graph) if (graph.vprops[v][:TYPE] == node_type)]
    node_type_params = filter(param -> param != :TYPE, reduce(union, keys(graph.vprops[v]) for v in vertices_of_type))
    
    V = length(vertices_of_type)
    windows = [i:min(i+window_size-1,V) for i in 1:window_size:V]
    
    temp_upload_dir = mkpath(joinpath(DIR, "temp_upload"))
    
    ProgressMeter.@showprogress for window in windows
        for (i, v) in enumerate(vertices_of_type[window])

            f = "node$v.tsv"
            local_f_path = "$(temp_upload_dir)/$(f)"
            uCSV.write(local_f_path, DataFrames.DataFrame(MetaGraphs.props(graph, v)), delim='\t')
            
            remote_f_path = "gs://neo4j-upload/$(f)"
            run(`gsutil cp $(local_f_path) $(remote_f_path)`)
            # gcloud iam service-accounts keys create ~/.config/gcloud/url-signer-key.json --iam-account="928365250020-compute@developer.gserviceaccount.com"
            signer_credential_path = "$(homedir())/.config/gcloud/url-signer-key.json"
            sign_url_cmd = `gsutil signurl $(signer_credential_path) $(remote_f_path)`
            signed_url_table = DataFrames.DataFrame(uCSV.read(open(sign_url_cmd), header=1, delim='\t')...)
            signed_url = signed_url_table[1, "Signed URL"]
            parameters = ["$(p): row.$(p)" for p in filter(x -> x != :TYPE, keys(MetaGraphs.props(graph, v)))]
            parameters = "{" * join(parameters, ", ") * "}"

            cmd =
            """
            LOAD CSV WITH HEADERS FROM '$(signed_url)' AS row FIELDTERMINATOR '\t'
            MERGE (node:$(node_type) $(parameters))
            """

            cmd = rstrip(replace(cmd, '\n' => ' '))
            cypher_cmd = Mycelia.cypher(address = ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE, cmd = cmd)
            run(cypher_cmd)            
        end
    end    
end

upload_node_type_over_read_csv (generic function with 2 methods)

In [28]:
upload_node_type_over_read_csv("FastaRecord", graph, ADDRESS, USERNAME, PASSWORD, DATABASE)

Copying file:///home/jupyter-cameron.prybol/workspace/2022-03-24-ecoli-tequatro-pangenome/temp_upload/node2.tsv [Content-Type=text/tab-separated-values]...
/ [1 files][162.1 KiB/162.1 KiB]                                                
Operation completed over 1 objects/162.1 KiB.                                    
Copying file:///home/jupyter-cameron.prybol/workspace/2022-03-24-ecoli-tequatro-pangenome/temp_upload/node8094.tsv [Content-Type=text/tab-separated-values]...
/ [1 files][165.1 KiB/165.1 KiB]                                                
Operation completed over 1 objects/165.1 KiB.                                    


In [29]:
src_type = "Fasta"
dst_type = "FastaRecord"
edge_type = "CONTAINS_RECORD"

"CONTAINS_RECORD"

In [30]:
upload_edge_type_over_url(src_type, dst_type, edge_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE, window_size=10)

In [31]:
# add kmers to graph

In [32]:
# add kmers to Neo4J
# add over URL

In [33]:
upload_node_type_over_url("Kmer", graph, ADDRESS, USERNAME, PASSWORD, DATABASE)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:03:51[39m


In [34]:
# add edges from records to kmers

In [35]:
src_type = "FastaRecord"
dst_type = "Kmer"
edge_type = "CONTAINS_KMER"
upload_edge_type_over_url(src_type, dst_type, edge_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:09:35[39m


In [None]:
# upload to Neo4j

In [None]:
# Kmer to Kmer connections

In [37]:
# on merge with these Kmer to Kmer connections we'll need to increment the counts!
# any time we increase the number of genomes, the values will become out of date
src_type = "Kmer"
dst_type = "Kmer"
edge_type = "CONNECTS_TO"
upload_edge_type_over_url(src_type, dst_type, edge_type, graph, ADDRESS, USERNAME, PASSWORD, DATABASE)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:25:14[39m


In [None]:
# get the full list of kmers under the fasta of interest

In [None]:
# for each kmer, count the number of incoming "CONTAINS_KMER" edges

In [None]:
# divide by max count