In [1]:
import Pkg
pkgs = [
    "Eisenia",
    "Random",
    "Revise",
    "BioSequences",
    "Test",
    "LightGraphs",
    "Primes",
    "Statistics",
    "Plots",
    "uCSV",
    "DataFrames",
    "HTTP",
    "CodecZlib",
    "FASTX",
    "ProgressMeter",
    "MetaGraphs",
    "BioSymbols"
]

for pkg in pkgs
    try
        Pkg.add(pkg)
    catch
#         # tried to install an unregistered local package
    end
    eval(Meta.parse("import $pkg"))
end

[32m[1m   Updating[22m[39m registry at `~/.julia/registries/General`


[?25l    

[32m[1m   Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`




[32m[1m  Resolving[22m[39m package versions...
[32m[1m  Resolving[22m[39m package versions...
[32m[1m  Installed[22m[39m Qt_jll ─ v5.15.2+3
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Project.toml`
[32m[1mUpdating[22m[39m `~/.julia/environments/v1.5/Manifest.toml`
 [90m [ede63266] [39m[93m↑ Qt_jll v5.15.2+2 ⇒ v5.15.2+3[39m
[32m[1m  Resolving[22m[39m package versions...
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Project.toml`
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Manifest.toml`
[32m[1m  Resolving[22m[39m package versions...
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Project.toml`
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Manifest.toml`
[32m[1m  Resolving[22m[39m package versions...
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Project.toml`
[32m[1mNo Changes[22m[39m to `~/.julia/environments/v1.5/Manifest.toml`
[32m[1m  Resolving[22m[39m p

In [2]:
TASK = "making-neo4j-graphs"
DATE = "2021-03-18"
DIR = "$(homedir())/$(DATE)-$(TASK)"
if !isdir(DIR)
    mkdir(DIR)
end
cd(DIR)

In [3]:
if !occursin("cypher-shell", ENV["PATH"])
    ENV["PATH"] = "/Users/cameronprybol/Software/cypher-shell:$(ENV["PATH"])"
end

"/Users/cameronprybol/Software/cypher-shell:/Applications/Julia-1.5.app/Contents/Resources/julia/bin:/Applications/Julia-1.5.app/Contents/Resources/julia/bin:/Users/cameronprybol/miniconda/bin:/Users/cameronprybol/.gem/ruby/2.6.0/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"

In [4]:
function get_genome_from_ftp_directory(ftp_path)
    fasta_url = "$(ftp_path)/$(basename(ftp_path))_genomic.fna.gz"
    fasta_buffer = CodecZlib.GzipDecompressorStream(IOBuffer(HTTP.get(fasta_url).body))
    return collect(FASTX.FASTA.Reader(fasta_buffer))
end

get_genome_from_ftp_directory (generic function with 1 method)

In [5]:
function list_databases(;address="neo4j://localhost:7687", username="neo4j", password="password")
    io = open(`cypher-shell --address $(address) --username $(username) --password $(password) --database system --format auto "show databases"`)
    return DataFrames.DataFrame(uCSV.read(io, header=1, quotes='"', encodings=Dict("FALSE" => false, "TRUE" => true))...)
end

list_databases (generic function with 1 method)

In [6]:
function create_database(database_id; address="neo4j://localhost:7687", username="neo4j", password="password")
    current_databases = list_databases(;address, username, password)
    if database_id in current_databases[!, "name"]
        return true
    else
        run(`cypher-shell --address $(address) --username $(username) --password $(password) --database system --format auto "create database $(database_id)"`)
    end
end

create_database (generic function with 1 method)

In [7]:
function cypher(database_id, cmd; address="neo4j://localhost:7687", username="neo4j", password="password")
    read(`cypher-shell --address $(address) --username $(username) --password $(password) --database $(database_id) --format auto $(cmd)`, String)
end

cypher (generic function with 1 method)

In [8]:
function reverse_edge(edge)
    new_src = edge.dst.node
    @assert new_src == BioSequences.canonical(new_src)
    new_dst = edge.src.node
    @assert new_dst == BioSequences.canonical(new_dst)
    new_src_orientation = !edge.dst.orientation
    new_dst_orientaion = !edge.src.orientation
    new_edge = (src = (node = new_src, orientation = new_src_orientation),
                dst = (node = new_dst, orientation = new_dst_orientaion))
    return new_edge
end

reverse_edge (generic function with 1 method)

In [9]:
function canonical(edge)
    if edge.src.node < edge.dst.node
        return edge
    else
        return reverse_edge(edge)
    end
end

canonical (generic function with 1 method)

In [10]:
# https://genomevolution.org/wiki/index.php/Ambiguous_nucleotide
function replace_ambiguous(seq)
    for (i, x) in enumerate(seq)
        if x == BioSequences.DNA_N
            seq[i] = rand([BioSequences.DNA_A, BioSequences.DNA_C, BioSequences.DNA_G, BioSequences.DNA_T])
        elseif x == BioSequences.DNA_R
            seq[i] = rand([BioSequences.DNA_A, BioSequences.DNA_G])
        elseif x == BioSequences.DNA_Y
            seq[i] = rand([BioSequences.DNA_T, BioSequences.DNA_C])
        elseif x == BioSequences.DNA_K
            seq[i] = rand([BioSequences.DNA_G, BioSequences.DNA_T])
        elseif x == BioSequences.DNA_M
            seq[i] = rand([BioSequences.DNA_A, BioSequences.DNA_C])
        elseif x == BioSequences.DNA_S
            seq[i] = rand([BioSequences.DNA_G, BioSequences.DNA_C])
        elseif x == BioSequences.DNA_W
            seq[i] = rand([BioSequences.DNA_A, BioSequences.DNA_T])
        elseif x == BioSequences.DNA_B
            seq[i] = rand([BioSequences.DNA_C, BioSequences.DNA_G, BioSequences.DNA_T])
        elseif x == BioSequences.DNA_D
            seq[i] = rand([BioSequences.DNA_A, BioSequences.DNA_G, BioSequences.DNA_T])
        elseif x == BioSequences.DNA_H
            seq[i] = rand([BioSequences.DNA_A, BioSequences.DNA_C, BioSequences.DNA_T])
        elseif x == BioSequences.DNA_V
            seq[i] = rand([BioSequences.DNA_A, BioSequences.DNA_C, BioSequences.DNA_G])
        end
    end
    return seq
end

replace_ambiguous (generic function with 1 method)

In [11]:
db = "genbank"
assembly_summary_url = "https://ftp.ncbi.nlm.nih.gov/genomes/$(db)/assembly_summary_$(db).txt"
assembly_summary_file = "$(DIR)/$(basename(assembly_summary_url))"
if !isfile(assembly_summary_file)
    open(assembly_summary_file, "w") do io
        assembly_summary_response = HTTP.get(assembly_summary_url)
        write(io, assembly_summary_response.body)
    end
end
assembly_summary_table = DataFrames.DataFrame(uCSV.read(assembly_summary_file, delim='\t', header=2, skipmalformed=true)...)

Unnamed: 0_level_0,# assembly_accession,bioproject,biosample,wgs_master,refseq_category
Unnamed: 0_level_1,String,String,String,String,String
1,GCA_000001215.4,PRJNA13812,SAMN02803731,,reference genome
2,GCA_000001405.28,PRJNA31257,,,reference genome
3,GCA_000001515.5,PRJNA13184,SAMN02981217,AACZ00000000.4,na
4,GCA_000001545.3,PRJNA20869,SAMN02981238,ABGA00000000.1,na
5,GCA_000001635.9,PRJNA20689,,,reference genome
6,GCA_000001735.2,PRJNA10719,SAMN03081427,,reference genome
7,GCA_000001765.3,PRJNA10626,SAMN00779672,AADE00000000.2,na
8,GCA_000001895.4,PRJNA10629,SAMN02808228,AABR00000000.7,na
9,GCA_000001905.1,PRJNA12569,SAMN02953622,AAGU00000000.3,representative genome
10,GCA_000001985.1,PRJNA19555,SAMN02953685,ABAR00000000.1,representative genome


In [12]:
genome_size_table_url = "ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/species_genome_size.txt.gz"
genome_size_table_file = "$(DIR)/$(basename(genome_size_table_url))"
if !isfile(genome_size_table_file)
    open(genome_size_table_file, "w") do io
        genome_size_table_response = HTTP.get(genome_size_table_url)
        write(io, genome_size_table_response.body)
    end
end
genome_size_table_buffer = CodecZlib.GzipDecompressorStream(open(genome_size_table_file))
genome_size_table = DataFrames.DataFrame(uCSV.read(genome_size_table_buffer, header=1, delim='\t')...)

Unnamed: 0_level_0,#species_taxid,min_ungapped_length,max_ungapped_length,expected_ungapped_length
Unnamed: 0_level_1,Int64,Int64,Int64,Int64
1,9,318000,956000,636876
2,24,2950000,6050000,4500062
3,34,7405000,11109000,9257167
4,56,5994000,15000000,11989871
5,69,4891000,7337000,6113769
6,114,7212000,10820000,9015893
7,122,6222000,9334000,7777997
8,124,5461000,9315000,7387897
9,139,813000,1786000,1299537
10,140,688000,2067000,1377933


In [13]:
joint_table = DataFrames.innerjoin(assembly_summary_table, genome_size_table, on=["species_taxid" => "#species_taxid"])
sort!(joint_table, "expected_ungapped_length")

Unnamed: 0_level_0,# assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid
Unnamed: 0_level_1,String,String,String,String,String,Int64
1,GCA_002829805.1,,,,na,230604
2,GCA_003972145.1,,,,na,230604
3,GCA_003972165.1,,,,na,230604
4,GCA_003972185.1,,,,na,230604
5,GCA_003972205.1,,,,na,230604
6,GCA_003972225.1,,,,na,230604
7,GCA_003972245.1,,,,na,230604
8,GCA_003972265.1,,,,na,230604
9,GCA_000851485.1,,,,reference genome,154834
10,GCA_000856565.1,,,,na,12475


In [None]:
# 112903 on refseq
# 795 on genbank!

In [14]:
group = first(DataFrames.groupby(joint_table, "taxid"))

tax_id = group[1, "taxid"]

230604

In [15]:
# cmd = 
# """
# MATCH (t:Taxonomy) WHERE
# t.division_name = "Phages" AND t.rank = "family" AND t.scientific_name = "Myoviridae <Myoviridae>"
# MATCH (x)-[*]->(t)
# WHERE size(()-->(x)) = 0
# return x.tax_id
# """

# myoviridae_tax_ids = parse.(Int, DataFrames.DataFrame(uCSV.read(IOBuffer(cypher("ncbitaxonomy", cmd)), header=1, quotes='"')...)["x.tax_id"])
# group = joint_table[map(x -> x in myoviridae_tax_ids, joint_table[!, "taxid"]), :]

In [16]:
show(group, allcols=true)

8×26 SubDataFrame
│ Row │ # assembly_accession │ bioproject │ biosample │ wgs_master │
│     │ [90mString[39m               │ [90mString[39m     │ [90mString[39m    │ [90mString[39m     │
├─────┼──────────────────────┼────────────┼───────────┼────────────┤
│ 1   │ GCA_002829805.1      │            │           │            │
│ 2   │ GCA_003972145.1      │            │           │            │
│ 3   │ GCA_003972165.1      │            │           │            │
│ 4   │ GCA_003972185.1      │            │           │            │
│ 5   │ GCA_003972205.1      │            │           │            │
│ 6   │ GCA_003972225.1      │            │           │            │
│ 7   │ GCA_003972245.1      │            │           │            │
│ 8   │ GCA_003972265.1      │            │           │            │

│ Row │ refseq_category │ taxid  │ species_taxid │ organism_name     │
│     │ [90mString[39m          │ [90mInt64[39m  │ [90mInt64[39m         │ [90mString[39m            │


In [None]:
# database_id = "taxid$(taxid)"

In [None]:
# # delete the database if it exists
# cypher("system", "DROP DATABASE $(database_id)")
# # build it again
# create_database(database_id)

In [None]:
# for (accession, ftp_path) in DataFrames.eachrow(group[!, ["# assembly_accession", "ftp_path"]])
#     for fasta in get_genome_from_ftp_directory(ftp_path)
#         identifier = FASTX.identifier(fasta)
#         description = FASTX.description(fasta)
#         sequence = FASTX.sequence(fasta)
#         cypher_call = "MERGE (g:GENOME {accession: '$(accession)'})<-[s:SOURCE]-(f:FASTA {identifier: '$(identifier)', description: '$(description)', sequence: '$(sequence)'}) RETURN *"
#         cypher(database_id, cypher_call)
#     end
# end

In [None]:
# graphs = []
# for record in get_genome_from_ftp_directory(group[1, "ftp_path"])
#     graph = Eisenia.KmerGraph(BioSequences.DNAMer{7}, record)
#     push!(graphs, graph)
# end
# graphs

In [None]:
# genome_accession = group[1, "# assembly_accession"]
# genome = get_genome_from_ftp_directory(group[1, "ftp_path"])

In [17]:
k = 13
node_type = BioSequences.DNAMer{k}

BioSequences.Mer{BioSequences.DNAAlphabet{2},13}

In [18]:
edge_type = typeof(
    (src = (node = node_type(BioSequences.randdnaseq(k)), orientation = true),
    dst = (node = node_type(BioSequences.randdnaseq(k)), orientation = true)))

NamedTuple{(:src, :dst),Tuple{NamedTuple{(:node, :orientation),Tuple{BioSequences.Mer{BioSequences.DNAAlphabet{2},13},Bool}},NamedTuple{(:node, :orientation),Tuple{BioSequences.Mer{BioSequences.DNAAlphabet{2},13},Bool}}}}

In [19]:
# edge_type = Pair{Pair{node_type, Bool}, Pair{node_type, Bool}}
evidence_type = typeof((accession = "", identifier = "", index = 1, orientation = true))

NamedTuple{(:accession, :identifier, :index, :orientation),Tuple{String,String,Int64,Bool}}

In [26]:
graph = Dict{Symbol, Any}(
    :nodes => Dict{node_type, Set{evidence_type}}(),
    :edges => Dict{edge_type, Set{evidence_type}}())
n = k+1
ProgressMeter.@showprogress for (accession, ftp_path) in DataFrames.eachrow(group[!, ["# assembly_accession", "ftp_path"]])
    genome = get_genome_from_ftp_directory(ftp_path)
    for record in genome
        seq = replace_ambiguous(FASTX.sequence(record))
        for i in 1:length(seq)-n+1
            slice = view(seq,i:i+n-1)
            raw_src = node_type(slice[1:end-1])
            canonical_src = BioSequences.canonical(raw_src)
            src_orientation = canonical_src == raw_src
            src_evidence = (
                accession = accession,
                identifier = FASTX.identifier(record),
                index = i,
                orientation = src_orientation)
            graph[:nodes][canonical_src] = push!(get(graph[:nodes], canonical_src, Set{evidence_type}()), src_evidence)

            raw_dst = node_type(slice[2:end])
            canonical_dst = BioSequences.canonical(raw_dst)
            dst_orientation = canonical_dst == raw_dst
            dst_evidence = (
                accession = accession,
                identifier = FASTX.identifier(record),
                index = i+1,
                orientation = dst_orientation)
            graph[:nodes][canonical_dst] = push!(get(graph[:nodes], canonical_dst, Set{evidence_type}()), dst_evidence)

            edge = (src = (node = canonical_src, orientation = src_orientation),
                    dst = (node = canonical_dst, orientation = dst_orientation))
            canonical_edge = canonical(edge)

            edge_orientation = canonical_edge == edge

            edge_evidence = (
                accession = accession,
                identifier = FASTX.identifier(record),
                index = i,
                orientation = edge_orientation)

            graph[:edges][canonical_edge] = push!(get(graph[:edges], canonical_edge, Set{evidence_type}()), edge_evidence)
        end
    end
end
graph[:nodes] = sort(graph[:nodes])

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:02[39m


OrderedCollections.OrderedDict{BioSequences.Mer{BioSequences.DNAAlphabet{2},13},Set{NamedTuple{(:accession, :identifier, :index, :orientation),Tuple{String,String,Int64,Bool}}}} with 1237 entries:
  AAAACAAGCAATG => Set(NamedTuple{(:accession, :identifier, :index, :orientatio…
  AAAACAAGCCGAA => Set(NamedTuple{(:accession, :identifier, :index, :orientatio…
  AAAACAAGCTTCG => Set(NamedTuple{(:accession, :identifier, :index, :orientatio…
  AAAACATTTGTGC => Set(NamedTuple{(:accession, :identifier, :index, :orientatio…
  AAAACCCTTACGG => Set(NamedTuple{(:accession, :identifier, :index, :orientatio…
  AAAACCCTTATGG => Set(NamedTuple{(:accession, :identifier, :index, :orientatio…
  AAAACGAGCCAAA => Set(NamedTuple{(:accession, :identifier, :index, :orientatio…
  AAAACGAGCCACA => Set(NamedTuple{(:accession, :identifier, :index, :orientatio…
  AAAACTACTTGGA => Set(NamedTuple{(:accession, :identifier, :index, :orientatio…
  AAAACTGGCATAC => Set(NamedTuple{(:accession, :identifier, :index, :orien

In [None]:
# canonical_edge = canonical(edge)
# src_node = canonical_edge.src.orientation ? canonical_edge.src.node : BioSequences.reverse_complement(canonical_edge.src.node)
# dst_node = canonical_edge.dst.orientation ? canonical_edge.dst.node : BioSequences.reverse_complement(canonical_edge.dst.node)
# @assert BioSequences.LongSequence(src_node)[2:end] == BioSequences.LongSequence(dst_node)[1:end-1]

In [None]:
# seq = push!(BioSequences.LongSequence(src_node), last(dst_node))
# slice == seq || slice == BioSequences.reverse_complement(seq)

In [27]:
graph

Dict{Symbol,Any} with 2 entries:
  :nodes => OrderedCollections.OrderedDict{BioSequences.Mer{BioSequences.DNAAlp…
  :edges => Dict{NamedTuple{(:src, :dst),Tuple{NamedTuple{(:node, :orientation)…

In [28]:
pan_meta_genome_graph = MetaGraphs.MetaDiGraph(length(graph[:nodes]))

MetaGraphs.set_prop!(pan_meta_genome_graph, :k, k)

kmers = collect(keys(graph[:nodes]))

ProgressMeter.@showprogress for (i, (node, evidence)) in enumerate(graph[:nodes])
    MetaGraphs.set_prop!(pan_meta_genome_graph, i, :sequence, node)
    MetaGraphs.set_prop!(pan_meta_genome_graph, i, :evidence, evidence)
end

ProgressMeter.@showprogress for (edge, edge_evidence) in graph[:edges]
    src_indices = searchsorted(kmers, edge.src.node)
    dst_indices = searchsorted(kmers, edge.dst.node)
    @assert length(src_indices) == length(dst_indices) == 1
    src_index = first(src_indices)
    dst_index = first(dst_indices)
    LightGraphs.add_edge!(pan_meta_genome_graph, LightGraphs.Edge(src_index, dst_index))
    MetaGraphs.set_prop!(
        pan_meta_genome_graph,
        LightGraphs.Edge(src_index, dst_index),
        :orientation,
        edge.src.orientation => edge.dst.orientation)
    MetaGraphs.set_prop!(
        pan_meta_genome_graph,
        LightGraphs.Edge(src_index, dst_index),
        :evidence,
        edge_evidence)
end
pan_meta_genome_graph

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


{1237, 1267} directed Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

In [29]:
graph = nothing

In [30]:
GC.gc()

In [31]:
GFA_file = "$(DIR)/$(tax_id).kmer-graph.gfa"
open(GFA_file, "w") do io
    println(io, "H\tVN:Z:1.0")
    for vertex in LightGraphs.vertices(pan_meta_genome_graph)
        kmer = MetaGraphs.get_prop(pan_meta_genome_graph, vertex, :sequence)
        evidence = MetaGraphs.get_prop(pan_meta_genome_graph, vertex, :evidence)
        depth_of_coverage = length(evidence)
        fields = ["S", "$vertex", string(kmer), "KC:i:$(depth_of_coverage)"]
        line = join(fields, '\t')
        println(io, line)
    end
    for edge in LightGraphs.edges(pan_meta_genome_graph)
        orientations = MetaGraphs.get_prop(pan_meta_genome_graph, edge, :orientation)
        overlap = MetaGraphs.get_prop(pan_meta_genome_graph, :k) - 1
        link = ["L",
                    edge.src,
                    first(orientations) ? '+' : '-',
                    edge.dst,
                    last(orientations) ? '+' : '-',
                    "$(overlap)M"]
        line = join(link, '\t')
        println(io, line)
    end
end

In [236]:
visited = falses(length(kmers))

1237-element BitArray{1}:
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 ⋮
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0

In [237]:
untigs = []

Any[]

In [1839]:
sum(visited)/length(visited)*100

100.0

In [1853]:
# figure out why these won't merge!
# 1052 - 394,115,959,3,17,59,202,112,114,201,58,16,2,1140,849,1072 - 1053,875,303,977,353,1167,890

# v = rand(findall(.!visited))
v = 1053

1053

In [1854]:
forward_kmer = pan_meta_genome_graph.vprops[v][:sequence]

DNA 13-mer:
GCTTGTTTTACAC

In [1855]:
forward_path = [forward_kmer]

1-element Array{BioSequences.Mer{BioSequences.DNAAlphabet{2},13},1}:
 GCTTGTTTTACAC

In [1856]:
function find_viable_neighbors(kmers, kmer)
    viable_neighbors = typeof(kmer)[]
    for neighbor in BioSequences.neighbors(kmer)
        canonical_neighbor = BioSequences.canonical(neighbor)
        neighbor_is_canonical = BioSequences.iscanonical(neighbor)
        neighbor_indices = searchsorted(kmers, canonical_neighbor)
        if !isempty(neighbor_indices)
            push!(viable_neighbors, neighbor)
        end
    end
    return viable_neighbors
end

find_viable_neighbors (generic function with 1 method)

In [1857]:
this_path = forward_path

1-element Array{BioSequences.Mer{BioSequences.DNAAlphabet{2},13},1}:
 GCTTGTTTTACAC

In [1858]:
done = false
while !done
    viable_neighbors = find_viable_neighbors(kmers, last(this_path))
    if length(viable_neighbors) == 1
        viable_neighbor = first(viable_neighbors)
        reversed_neighbor = BioSequences.reverse_complement(viable_neighbor)
        possible_backtracking_kmers = find_viable_neighbors(kmers, reversed_neighbor)
        only_one_backtrack_option = length(possible_backtracking_kmers) == 1
        backtrack_option_is_current_state = BioSequences.reverse_complement(first(possible_backtracking_kmers)) == last(this_path)
        if only_one_backtrack_option && backtrack_option_is_current_state
            push!(this_path, viable_neighbor)
            viable_neighbors = find_viable_neighbors(kmers, last(this_path))
        else
            done = true
        end
    else
        done = true
    end
end
this_path

7-element Array{BioSequences.Mer{BioSequences.DNAAlphabet{2},13},1}:
 GCTTGTTTTACAC
 CTTGTTTTACACG
 TTGTTTTACACGT
 TGTTTTACACGTC
 GTTTTACACGTCT
 TTTTACACGTCTA
 TTTACACGTCTAG

In [1859]:
reverse_path = [BioSequences.reverse_complement(forward_kmer)]

1-element Array{BioSequences.Mer{BioSequences.DNAAlphabet{2},13},1}:
 GTGTAAAACAAGC

In [1860]:
this_path = reverse_path

1-element Array{BioSequences.Mer{BioSequences.DNAAlphabet{2},13},1}:
 GTGTAAAACAAGC

In [1861]:
done = false
while !done
    viable_neighbors = find_viable_neighbors(kmers, last(this_path))
    if length(viable_neighbors) == 1
        viable_neighbor = first(viable_neighbors)
        reversed_neighbor = BioSequences.reverse_complement(viable_neighbor)
        possible_backtracking_kmers = find_viable_neighbors(kmers, reversed_neighbor)
        only_one_backtrack_option = length(possible_backtracking_kmers) == 1
        backtrack_option_is_current_state = BioSequences.reverse_complement(first(possible_backtracking_kmers)) == last(this_path)
        if only_one_backtrack_option && backtrack_option_is_current_state
            push!(this_path, viable_neighbor)
            viable_neighbors = find_viable_neighbors(kmers, last(this_path))
        else
            done = true
        end
    else
        done = true
    end
end
this_path

1-element Array{BioSequences.Mer{BioSequences.DNAAlphabet{2},13},1}:
 GTGTAAAACAAGC

In [1862]:
path = [BioSequences.reverse_complement.(reverse(reverse_path))..., forward_path[2:end]...]

7-element Array{BioSequences.Mer{BioSequences.DNAAlphabet{2},13},1}:
 GCTTGTTTTACAC
 CTTGTTTTACACG
 TTGTTTTACACGT
 TGTTTTACACGTC
 GTTTTACACGTCT
 TTTTACACGTCTA
 TTTACACGTCTAG

In [1863]:
oriented_index_path = [
    (searchsortedfirst(kmers, BioSequences.canonical(kmer)), BioSequences.iscanonical(kmer)) for kmer in path
]

7-element Array{Tuple{Int64,Bool},1}:
 (1053, 1)
 (875, 0)
 (303, 0)
 (977, 0)
 (353, 0)
 (1167, 0)
 (890, 0)

In [1864]:
for (index, orientation) in oriented_index_path
    visited[index] = true
end

In [1865]:
sequence = BioSequences.LongSequence(path[1])
for kmer in path[2:end]
    push!(sequence, kmer[end])
end
sequence

19nt DNA Sequence:
GCTTGTTTTACACGTCTAG

In [1866]:
push!(untigs, (sequence = x, oriented_kmer_path = path, oriented_index_path = oriented_index_path))

100-element Array{Any,1}:
 (sequence = GTGTAAATCCCGGACTCTTCACTTGAAAGATCAATGTCAATCCAAGTAGTTTTCCGAGGATTACCAGCAAAGTGCGATAC, oriented_kmer_path = BioSequences.Mer{BioSequences.DNAAlphabet{2},13}[AATACCGTAAAAC, ATACCGTAAAACA, TACCGTAAAACAT, ACCGTAAAACATT, CCGTAAAACATTT, CGTAAAACATTTG, GTAAAACATTTGT, TAAAACATTTGTG, AAAACATTTGTGC, AAACATTTGTGCA, AACATTTGTGCAG, ACATTTGTGCAGC, CATTTGTGCAGCA], oriented_index_path = Tuple{Int64,Bool}[(142, 1), (470, 1), (562, 0), (185, 0), (54, 0), (623, 0), (197, 0), (650, 0), (4, 1), (19, 1), (68, 1), (227, 1), (723, 1)])
 (sequence = GTGTAAATCCCGGACTCTTCACTTGAAAGATCAATGTCAATCCAAGTAGTTTTCCGAGGATTACCAGCAAAGTGCGATAC, oriented_kmer_path = BioSequences.Mer{BioSequences.DNAAlphabet{2},13}[ATGCGAGGATCAT, TGCGAGGATCATG, GCGAGGATCATGG, CGAGGATCATGGG, GAGGATCATGGGG, AGGATCATGGGGC, GGATCATGGGGCA, GATCATGGGGCAT, ATCATGGGGCATG, TCATGGGGCATGA  …  ATTTATACTCAGA, TTTATACTCAGAT, TTATACTCAGATG, TATACTCAGATGC, ATACTCAGATGCG, TACTCAGATGCGA, ACTCAGATGCGAG, CTCAGATGCGAGA, TCAGATGCG

In [1867]:
println(join([index for (index, orientation) in oriented_index_path], ','))

1053,875,303,977,353,1167,890


In [1806]:
# 394,115,959,3,17,59,202,112,114,201,58,16,2,1140,849,1072 - 1053

# 1052 - 394

# 672-672 self loop - should use a large enough k to avoid these

In [None]:
# # while length(inneighbors) == 1
    
# end

In [43]:
# LightGraphs.outneighbors(pan_meta_genome_graph, v)

1-element Array{Int64,1}:
 728

In [38]:
# for v in LightGraphs.vertices(pan_meta_genome_graph)
#     indegree = LightGraphs.indegree(pan_meta_genome_graph, v)
#     outdegree = LightGraphs.outdegree(pan_meta_genome_graph, v)
#     if indegree + outdegree == 1
# #         @show v, indegree, outdegree, "ENDCAP"
#     elseif (indegree == 1) && (outdegree == 1)
# #         @show v, indegree, outdegree, "CONNECTIONPOINT"
# #     elseif (indegree + outdegree  3)
#     end
# #     @show LightGraphs.indegree(pan_meta_genome_graph, v)
# #     @show LightGraphs.outdegree(pan_meta_genome_graph, v)
# end

In [None]:
# function take_a_walk(graph, path)
    
# #     @show path
#     if issorted(path)
#         edge = LightGraphs.Edge(path...)
# #         @show edge
#         orientations = graph.eprops[edge][:orientation]
# #         @show orientations
#     else
#         edge = LightGraphs.Edge(sort(path)...)
#         orientations = graph.eprops[edge][:orientation]
#         orientations = !orientations.second => !orientations.first
# #         @show orientations
#     end
#     oriented_path = [
#         (node = first(path), orientation = first(orientations)), 
#         (node = last(path), orientation = last(orientations))
#         ]
# #     @show oriented_path
    
#     neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, oriented_path[end].node)
# #     @show neighbors
    
# #     novel_neighbors = filter(x -> !(x in map(x -> x.node, oriented_path)), neighbors)
#     novel_neighbors = filter(x -> x != map(x -> x.node, oriented_path)[end-1], neighbors)
# #     @show novel_neighbors
    
#     while length(novel_neighbors) == 1
#         novel_neighbor = first(novel_neighbors)
#         if novel_neighbor > oriented_path[end].node
#             edge = LightGraphs.Edge(oriented_path[end].node, novel_neighbor)
#             orientations = graph.eprops[edge][:orientation]
#             if first(orientations) != last(oriented_path).orientation
#                 break
# #                 @error "incompatible"
#             end
#         else
#             edge = LightGraphs.Edge(novel_neighbor, oriented_path[end].node)
#             orientations = graph.eprops[edge][:orientation]
#             orientations = !orientations.second => !orientations.first
#             if first(orientations) != last(oriented_path).orientation
# #                 @error "incompatible"
#                 break
#             end
#         end        
    
#         push!(oriented_path, (node = novel_neighbor, orientation = last(orientations)))
        
# #         neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, path[end])
#         neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, oriented_path[end].node)
# #         @show neighbors
        
# #         novel_neighbors = filter(x -> !(x in map(x -> x.node, oriented_path)), neighbors)
#         novel_neighbors = filter(x -> x != map(x -> x.node, oriented_path)[end-1], neighbors)
# #         novel_neighbors = filter(x -> !(x in path), neighbors)
# #         @show novel_neighbors
#     end
#     return oriented_path
# end

In [None]:
# simplified_graph = MetaGraphs.MetaGraph()

In [None]:
# canonical_kmers = sort(collect(v[:sequence] for v in values(pan_meta_genome_graph.vprops)))

In [None]:
# function oriented_neighbors(graph, kmer, orientation)
#     canonical_kmers = [graph.vprops[v][:sequence] for v in LightGraphs.vertices(graph)]
# #     display(canonical_kmers)
    
#     oriented_kmer = orientation ? kmer : BioSequences.reverse_complement(kmer)
# #     display(kmer)
# #     display(oriented_kmer)
    
#     neighbors = collect(Iterators.filter(
#         neighbor -> BioSequences.canonical(neighbor) in canonical_kmers,
#         BioSequences.neighbors(oriented_kmer)))
# end

In [None]:
# for v in LightGraphs.vertices(pan_meta_genome_graph)
#     kmer = pan_meta_genome_graph.vprops[v][:sequence]
#     forward_neighbors = oriented_neighbors(pan_meta_genome_graph, kmer, true)
#     reverse_neighbors = oriented_neighbors(pan_meta_genome_graph, kmer, false)
# end

In [None]:
# oriented_neighbors(pan_meta_genome_graph, pan_meta_genome_graph.vprops[1][:sequence], true)

In [None]:
# oriented_neighbors(pan_meta_genome_graph, pan_meta_genome_graph.vprops[1][:sequence], false)

In [None]:
# downstream_neighbors = collect(Iterators.filter(
#     neighbor -> BioSequences.canonical(neighbor) in canonical_kmers,
#     BioSequences.neighbors(pan_meta_genome_graph.vprops[1][:sequence])))

In [None]:
# upstream_neighbors = collect(Iterators.filter(
#     neighbor -> BioSequences.canonical(neighbor) in canonical_kmers,
#     BioSequences.neighbors(BioSequences.reverse_complement(pan_meta_genome_graph.vprops[1][:sequence]))))

In [None]:
# visited = falses(length(LightGraphs.vertices(pan_meta_genome_graph)));

In [None]:
# function non_branching_walk(graph, kmer, orientation)

# #     canonical_kmers = sort(collect(v[:sequence] for v in values(graph.vprops)))
    
#     neighbors = oriented_neighbors(graph, kmer, orientation)
# #     @show kmer, orientation, neighbors

#     oriented_kmer = orientation ? kmer : BioSequences.reverse_complement(kmer)
#     path = [oriented_kmer]

# #     neighbors = collect(Iterators.filter(
# #         neighbor -> BioSequences.canonical(neighbor) in canonical_kmers,
# #         BioSequences.neighbors(last(path))))
#     done = false
#     while !done
# #         @show "here"
#         number_of_downstream_neighbors = length(neighbors)
#         if number_of_downstream_neighbors != 1
#             done = true
#             break
#         end
#         downstream_neighbor = first(neighbors)
#         downstream_neighbor_orientation = downstream_neighbor == BioSequences.canonical(downstream_neighbor)
#         upstream_neighbors_of_downstream_neighbor = oriented_neighbors(graph, downstream_neighbor, !downstream_neighbor_orientation)
#         if length(upstream_neighbors_of_downstream_neighbor) != 1
#             done = true
#             break
#         end
#         push!(path, downstream_neighbor)
#         neighbors = oriented_neighbors(graph, BioSequences.canonical(downstream_neighbor), downstream_neighbor_orientation)
#     end
#     return path
# end

In [None]:
# canonical_kmers = sort(collect(v[:sequence] for v in values(pan_meta_genome_graph.vprops)))

In [None]:
# visited = falses(length(LightGraphs.vertices(pan_meta_genome_graph)))
# unbranched_paths = []
# ProgressMeter.@showprogress for v in collect(LightGraphs.vertices(pan_meta_genome_graph))
# #     @show v
#     if !visited[v]
#         forward_walk = non_branching_walk(pan_meta_genome_graph, pan_meta_genome_graph.vprops[v][:sequence], true)
#         reverse_walk = non_branching_walk(pan_meta_genome_graph, pan_meta_genome_graph.vprops[v][:sequence], false)
#         reversed_reverse_walk = BioSequences.reverse_complement.(reverse_walk[end:-1:1])
#         @assert reversed_reverse_walk[end] == forward_walk[1]
#         walk = [reversed_reverse_walk[1:end-1]..., forward_walk...]
#         oriented_path = [(node = findfirst(x -> x == BioSequences.canonical(n), canonical_kmers), orientation = n == BioSequences.canonical(n)) for n in walk]
#         push!(unbranched_paths, oriented_path)
#         for n in oriented_path
#             visited[n.node] = true
#         end
#     end
# end
# # unique!(unbranched_paths)
# @assert all(visited)
# # unbranched_paths

In [None]:
# unique(unbranched_paths)

In [None]:
# for (i, path) in enumerate(unbranched_paths)
#     nodes = map(x -> x.node, path)
#     @show i
#     for (j, path2) in enumerate(unbranched_paths)
#         if i != j
#             nodes2 = map(x -> x.node, path2)
#             if nodes2[1] == nodes[1]
#                 @show nodes2 == nodes
#                 @show nodes
#                 @show nodes2
#                 @show j
# #                 @show "hit"
#             elseif nodes2[1] == nodes[end]
#                 @show nodes
#                 @show nodes2
# #                 @show "hit"
#             elseif nodes2[end] == nodes[1]
#                 @show nodes
#                 @show nodes2
# #                 @show "hit"
#             elseif nodes2[end] == nodes[end]
# #                 @show "hit"
#                 @show nodes
#                 @show nodes2
#             end
#         end
#     end
# end

In [None]:
# for v in collect(LightGraphs.vertices(simplified_graph))[1:1]
#     oriented_path = simplified_graph.vprops[v][:oriented_path]
#     path_nodes = [x.node for x in oriented_path]
# #     @show oriented_path
#     @show path_nodes
#     src = first(oriented_path)
#     dst = last(oriented_path)
# #     @show src, dst
#     for v2 in filter(x -> x != v, LightGraphs.vertices(simplified_graph))
#         v2_oriented_path = simplified_graph.vprops[v2][:oriented_path]
#         v2_nodes = [x.node for x in v2_oriented_path]
# #         @show v2_nodes
# #         @show v2_oriented_path
#         if (first(v2_nodes) == src.node)
#             @show first(v2_oriented_path)
#             @show src
# #             @show "hit!"
# #             @show v2_nodes
# #             @show v2
# #             LightGraphs.add_edge!(simplified_graph, v, v2)
#         elseif last(v2_nodes) == src.node
#             @show last(v2_oriented_path)
#             @show src
# #             @show v2
# #             @show v2_nodes
#         end
#     end
# #     downstream_neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, last(oriented_path).node)
# #     @show downstream_neighbors
# #     filter!(x -> !(x in path_nodes), downstream_neighbors)
# #     @show downstream_neighbors
# end
# simplified_graph

In [None]:
# findall(unbranched_paths[6] .!= unbranched_paths[7])

In [None]:
# not_same = findall(unbranched_paths[6] .!= unbranched_paths[7])
# unbranched_paths[6][83:97]

In [None]:
# unbranched_paths[7][83:97]

In [None]:
# for path in unbranched_paths
#     x = map(x -> x.node, path)
#     display((path[1].node => path[1].orientation, path[end].node => path[end].orientation))
# #     @show path
# end

In [None]:
# for v in collect(LightGraphs.vertices(simplified_graph))[1:1]
#     oriented_path = simplified_graph.vprops[v][:oriented_path]
#     path_nodes = [x.node for x in oriented_path]
# #     @show oriented_path
#     @show path_nodes
#     src = first(oriented_path)
#     dst = last(oriented_path)
# #     @show src, dst
#     for v2 in filter(x -> x != v, LightGraphs.vertices(simplified_graph))
#         v2_oriented_path = simplified_graph.vprops[v2][:oriented_path]
#         v2_nodes = [x.node for x in v2_oriented_path]
# #         @show v2_nodes
# #         @show v2_oriented_path
#         if (first(v2_nodes) == src.node)
#             @show first(v2_oriented_path)
#             @show src
# #             @show "hit!"
# #             @show v2_nodes
# #             @show v2
# #             LightGraphs.add_edge!(simplified_graph, v, v2)
#         elseif last(v2_nodes) == src.node
#             @show last(v2_oriented_path)
#             @show src
# #             @show v2
# #             @show v2_nodes
#         end
#     end
# #     downstream_neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, last(oriented_path).node)
# #     @show downstream_neighbors
# #     filter!(x -> !(x in path_nodes), downstream_neighbors)
# #     @show downstream_neighbors
# end
# simplified_graph

In [None]:
# visited = falses(length(LightGraphs.vertices(pan_meta_genome_graph)))
# unbranched_paths = []
# ProgressMeter.@showprogress for v in collect(LightGraphs.vertices(pan_meta_genome_graph))
#     neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, v)
#     walks = []
#     if !visited[v]
#         if length(neighbors) == 0
# #             @show "island"
#             walks = [[(node = v, orientation = true)]]
#         elseif length(neighbors) == 1
# #             @show "tip"
#             walks = [take_a_walk(pan_meta_genome_graph, [v, first(neighbors)])]
#         elseif length(neighbors) == 2
# #             @show "bridge"
#             forward_walk = take_a_walk(pan_meta_genome_graph, [v, first(neighbors)])
#             reverse_walk = take_a_walk(pan_meta_genome_graph, [v, last(neighbors)])
#             reverse_walk = [(node = v.node, orientation = !v.orientation) for v in reverse(reverse_walk)]
#             @assert last(reverse_walk) == first(forward_walk)
#             walks = [[reverse_walk[1:end-1]..., forward_walk...]]
# #         elseif length(neighbors) > 2
# # #             @show "intersection"
# #             walks = []
# #             for neighbor in neighbors
# #                 push!(walks, take_a_walk(pan_meta_genome_graph, [v, neighbor]))
# #             end
#         end
#         for walk in walks
#             if walk[1].node == 156
# #                 @show "here!!, $v"
# #                 @show neighbors
# #                 @show walks
#             end
#             push!(unbranched_paths, walk)
#             for v in walk
#                 visited[v.node] = true
#             end
#         end
#     end
# end
# @assert all(visited)
# unbranched_paths

In [None]:
# for unbranched_path in unbranched_paths
#     reconstructed_sequence = BioSequences.LongDNASeq(pan_meta_genome_graph.vprops[first(unbranched_path).node][:sequence])
#     if !first(unbranched_path).orientation
#         reconstructed_sequence = BioSequences.reverse_complement(reconstructed_sequence)
#     end
#     for (v, orientation) in unbranched_path[2:end]
#         v_kmer = pan_meta_genome_graph.vprops[v][:sequence]
#         if !orientation
#             v_kmer = BioSequences.reverse_complement(v_kmer)
#         end
#         k = length(v_kmer)
#         indices = length(reconstructed_sequence)-k+2:length(reconstructed_sequence)
#         @assert length(indices) == k-1
#         existing_overlap = reconstructed_sequence[indices]
#         kmer_overlap = BioSequences.LongDNASeq(v_kmer)[1:end-1]
#         @assert existing_overlap == kmer_overlap
#         push!(reconstructed_sequence, v_kmer[end])    
#     end
#     LightGraphs.add_vertex!(simplified_graph)
#     index = length(LightGraphs.vertices(simplified_graph))
#     MetaGraphs.set_prop!(simplified_graph, index, :sequence, reconstructed_sequence)
#     MetaGraphs.set_prop!(simplified_graph, index, :oriented_path, unbranched_path)
# end
# simplified_graph

In [None]:
# for v in collect(LightGraphs.vertices(simplified_graph))[1:1]
#     oriented_path = simplified_graph.vprops[v][:oriented_path]
#     path_nodes = [x.node for x in oriented_path]
# #     @show oriented_path
#     @show path_nodes
#     src = first(oriented_path)
#     dst = last(oriented_path)
# #     @show src, dst
#     for v2 in filter(x -> x != v, LightGraphs.vertices(simplified_graph))
#         v2_oriented_path = simplified_graph.vprops[v2][:oriented_path]
#         v2_nodes = [x.node for x in v2_oriented_path]
# #         @show v2_nodes
# #         @show v2_oriented_path
#         if (first(v2_nodes) == src.node)
#             @show first(v2_oriented_path)
#             @show src
# #             @show "hit!"
# #             @show v2_nodes
# #             @show v2
# #             LightGraphs.add_edge!(simplified_graph, v, v2)
#         elseif last(v2_nodes) == src.node
#             @show last(v2_oriented_path)
#             @show src
# #             @show v2
# #             @show v2_nodes
#         end
#     end
# #     downstream_neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, last(oriented_path).node)
# #     @show downstream_neighbors
# #     filter!(x -> !(x in path_nodes), downstream_neighbors)
# #     @show downstream_neighbors
# end
# simplified_graph

In [None]:
# GFA_file = "$(DIR)/$(tax_id).simplified-graph.gfa"
# open(GFA_file, "w") do io
#     println(io, "H\tVN:Z:1.0")
#     for vertex in LightGraphs.vertices(simplified_graph)
#         sequence = MetaGraphs.get_prop(simplified_graph, vertex, :sequence)
# #         evidence = MetaGraphs.get_prop(simplified_graph, vertex, :evidence)
# #         depth_of_coverage = length(evidence)
#         depth_of_coverage = 1
#         fields = ["S", "$vertex", sequence, "KC:i:$(depth_of_coverage)"]
#         line = join(fields, '\t')
#         println(io, line)
#     end
#     for edge in LightGraphs.edges(simplified_graph)
# #         orientations = MetaGraphs.get_prop(pan_meta_genome_graph, edge, :orientation)
# #         overlap = MetaGraphs.get_prop(pan_meta_genome_graph, :k) - 1
# #         link = ["L",
# #                     edge.src,
# #                     first(orientations) ? '+' : '-',
# #                     edge.dst,
# #                     last(orientations) ? '+' : '-',
# #                     "$(overlap)M"]
#         link = ["L",
#                     edge.src,
#                     '+',
#                     edge.dst,
#                     '+',
#                     "13M"]
#         line = join(link, '\t')
#         println(io, line)
#     end
# end

In [None]:
# visited = falses(length(LightGraphs.vertices(pan_meta_genome_graph)))
# unbranched_paths = []
# ProgressMeter.@showprogress for v in collect(LightGraphs.vertices(pan_meta_genome_graph))[1:1]
#     neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, v)
#     @show v, neighbors
#     if !visited[v] && length(neighbors) == 2
# #         @show 
#         a, b = LightGraphs.all_neighbors(pan_meta_genome_graph, v)
#         forward_walk = [v, a]
        
#         forward_neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, forward_walk[end])
#         novel_forward_neighbors = filter(x -> x != forward_walk[end-1], forward_neighbors)
        
#         while length(novel_forward_neighbors) == 1
#             push!(forward_walk, first(novel_forward_neighbors))
#             forward_neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, forward_walk[end])
#             novel_forward_neighbors = filter(x -> x != forward_walk[end-1], forward_neighbors)            
#         end
#         reverse_walk = [v, b]
        
#         reverse_neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, reverse_walk[end])
#         novel_reverse_neighbors = filter(x -> x != reverse_walk[end-1], reverse_neighbors)
        
#         while length(novel_reverse_neighbors) == 1
#             push!(reverse_walk, first(novel_reverse_neighbors))
#             reverse_neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, reverse_walk[end])
#             novel_reverse_neighbors = filter(x -> x != reverse_walk[end-1], reverse_neighbors)         
#         end
#         walk = [reverse(reverse_walk)[1:end-1]..., forward_walk...]
# #         @show walk
#         push!(unbranched_paths, walk)
#         for v in walk
#             visited[v] = true
#         end
#     end
# end

In [None]:
# simplified_graph = MetaGraphs.MetaDiGraph()

In [None]:
# simplify graph by reducing all non-branching paths

In [None]:
# visited = falses(length(LightGraphs.vertices(pan_meta_genome_graph)))
# unbranched_paths = []
# ProgressMeter.@showprogress for v in collect(LightGraphs.vertices(pan_meta_genome_graph))
#     neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, v)
#     if !visited[v] && length(neighbors) == 2
# #         @show 
#         a, b = LightGraphs.all_neighbors(pan_meta_genome_graph, v)
#         forward_walk = [v, a]
        
#         forward_neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, forward_walk[end])
#         novel_forward_neighbors = filter(x -> x != forward_walk[end-1], forward_neighbors)
        
#         while length(novel_forward_neighbors) == 1
#             push!(forward_walk, first(novel_forward_neighbors))
#             forward_neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, forward_walk[end])
#             novel_forward_neighbors = filter(x -> x != forward_walk[end-1], forward_neighbors)            
#         end
#         reverse_walk = [v, b]
        
#         reverse_neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, reverse_walk[end])
#         novel_reverse_neighbors = filter(x -> x != reverse_walk[end-1], reverse_neighbors)
        
#         while length(novel_reverse_neighbors) == 1
#             push!(reverse_walk, first(novel_reverse_neighbors))
#             reverse_neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, reverse_walk[end])
#             novel_reverse_neighbors = filter(x -> x != reverse_walk[end-1], reverse_neighbors)         
#         end
#         walk = [reverse(reverse_walk)[1:end-1]..., forward_walk...]
# #         @show walk
#         push!(unbranched_paths, walk)
#         for v in walk
#             visited[v] = true
#         end
#     end
# end

In [None]:
# ProgressMeter.@showprogress for v in collect(LightGraphs.vertices(pan_meta_genome_graph))
#     neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, v)
#     if !visited[v] && length(neighbors) > 2
#         for neighbor in neighbors
#             walk = [v, neighbor]
#             forward_neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, walk[end])
#             novel_forward_neighbors = filter(x -> x != walk[end-1], forward_neighbors)            
#             while length(novel_forward_neighbors) == 1
#                 push!(walk, first(novel_forward_neighbors))
#                 forward_neighbors = LightGraphs.all_neighbors(pan_meta_genome_graph, walk[end])
#                 novel_forward_neighbors = filter(x -> x != walk[end-1], forward_neighbors)            
#             end
#             push!(unbranched_paths, walk)
#             for v in walk
#                 visited[v] = true
#             end
#         end
#     end
# end
# @assert all(visited)

In [None]:
# unbranched_paths

In [None]:
# unbranched_paths

In [None]:
# for x in findall(.!visited)
#     push!(unbranched_paths, [x])
#     visited[x] = true
# end

In [None]:
# count(.!visited)

In [None]:
# map(x -> LightGraphs.all_neighbors(pan_meta_genome_graph, x), findall(.!visited))

In [None]:
# cypher_command = join(["MERGE (:DNAMer {sequence: \"$(kmer)\"})" for kmer in graph.kmers], '\n')
# cypher(database_id, cypher_command)

In [None]:
# edge = {orientation: [true, true], evidence: ["source-identifier_sequence-identifier_index_orientation"]}
# node = {sequence: "", evidence: ["source-identifier_sequence-identifier_index_orientation"]}

In [None]:
# # cyper_commands = []
# ProgressMeter.@showprogress for edge_evidence in collect(graph.edge_evidence)
#     edge = first(edge_evidence)
#     evidence = last(edge_evidence)
#     src_kmer = graph.kmers[edge.src]
#     dst_kmer = graph.kmers[edge.dst]
#     possible_orientations = map(bool -> Eisenia.assess_path_orientations([edge.src, edge.dst], graph.kmers, bool), [true, false])
#     possible_orientations = filter!(x -> x != nothing, possible_orientations)
#     orientations = first(possible_orientations)
#     string_orienations = string(Int.(orientations))
#     cypher_command = 
#     """
#     MATCH (src:DNAMer {sequence: "$(src_kmer)"})
#     MATCH (dst:DNAMer {sequence: "$(dst_kmer)"})
#     MERGE (src)-[edge:EDGE {orientations: $(string_orienations)}]->(dst)
#     RETURN *
#     """
# #     push!(cyper_commands, cypher_command)
#     cypher(database_id, cypher_command)
# end

In [None]:
# # cyper_commands = []
# ProgressMeter.@showprogress for edge_evidence in collect(graph.edge_evidence)
#     edge = first(edge_evidence)
#     evidence = last(edge_evidence)
#     src_kmer = graph.kmers[edge.src]
#     dst_kmer = graph.kmers[edge.dst]
#     unique_sources = unique(map(e -> e.record_identifier, evidence))
#     for unique_source in unique_sources
#         cypher_command = 
#         """
#         MATCH (src:DNAMer {sequence: "$(src_kmer)"})
#         MATCH (dst:DNAMer {sequence: "$(dst_kmer)"})
#         MATCH (f:FASTA) WHERE f.identifier = "$(unique_source)"
#         MERGE (src)-[src_f:CONTAINED_IN]->(f)
#         MERGE (dst)-[dst_f:CONTAINED_IN]->(f)
#         RETURN *
#         """
#         cypher(database_id, cypher_command)
#     end
# end