In [1]:
# Dated working directory for this analysis, created under the home directory if absent.
DATE = "2021-05-14"
TASK = "phage-k-pangenome"
DIR = mkpath(string(homedir(), "/", DATE, "-", TASK))

"/Users/cameronprybol/2021-05-14-phage-k-pangenome"

In [2]:
import LightGraphs
import MetaGraphs
import BioSequences

┌ Info: Precompiling LightGraphs [093fc24a-ae57-5d10-9952-331d41423f4d]
└ @ Base loading.jl:1278
┌ Info: Precompiling MetaGraphs [626554b9-1ddb-594c-aa3c-2596fe9399a5]
└ @ Base loading.jl:1278
┌ Info: Precompiling BioSequences [7e6ae17a-c86d-528c-b3b9-7f778a29fe59]
└ @ Base loading.jl:1278
[ Info: Compiling bit-parallel GC counter for LongSequence{<:NucleicAcidAlphabet}
[ Info: Compiling bit-parallel mismatch counter for LongSequence{<:NucleicAcidAlphabet}
[ Info: Compiling bit-parallel match counter for LongSequence{<:NucleicAcidAlphabet}
[ Info: Compiling bit-parallel ambiguity counter...
[ Info: 	For a single LongSequence{<:NucleicAcidAlphabet}
[ Info: 	For a pair of LongSequence{<:NucleicAcidAlphabet}s
[ Info: Compiling bit-parallel certainty counter for LongSequence{<:NucleicAcidAlphabet}
[ Info: Compiling bit-parallel gap counter for LongSequence{<:NucleicAcidAlphabet}


In [3]:
import uCSV
import DataFrames
import FASTX
import HTTP
import CodecZlib
import DataStructures

┌ Info: Precompiling FASTX [c2308a5c-f048-11e8-3e8a-31650f418d12]
└ @ Base loading.jl:1278


In [4]:
import Pkg

In [None]:
"""
    add_evidence!(kmer_graph, index::Int, evidence)

Record `evidence` on vertex `index` of `kmer_graph`.

The vertex's `:evidence` property is a `Set`; it is created on first use and
extended on later calls.
"""
function add_evidence!(kmer_graph, index::Int, evidence)
    # First observation for this vertex: start a fresh evidence set.
    if !MetaGraphs.has_prop(kmer_graph, index, :evidence)
        return MetaGraphs.set_prop!(kmer_graph, index, :evidence, Set([evidence]))
    end
    return push!(MetaGraphs.get_prop(kmer_graph, index, :evidence), evidence)
end

"""
    add_evidence!(kmer_graph, edge::LightGraphs.SimpleGraphs.AbstractSimpleEdge, evidence)

Record `evidence` on `edge` of `kmer_graph`.

The edge's `:evidence` property is a `Set`; it is created on first use and
extended on later calls.
"""
function add_evidence!(kmer_graph, edge::LightGraphs.SimpleGraphs.AbstractSimpleEdge, evidence)
    # First observation for this edge: start a fresh evidence set.
    if !MetaGraphs.has_prop(kmer_graph, edge, :evidence)
        return MetaGraphs.set_prop!(kmer_graph, edge, :evidence, Set([evidence]))
    end
    return push!(MetaGraphs.get_prop(kmer_graph, edge, :evidence), evidence)
end

In [None]:
"""
    count_kmers(::Type{KMER_TYPE}, sequence::BioSequences.LongSequence)

Count how often each k-mer of type `KMER_TYPE` occurs in `sequence`, using the
`fw` field of every item yielded by `BioSequences.each`.

Returns an `OrderedDict` from k-mer to count, in order of first appearance.
"""
function count_kmers(::Type{KMER_TYPE}, sequence::BioSequences.LongSequence) where KMER_TYPE
    tallies = DataStructures.OrderedDict{KMER_TYPE, Int}()
    for mer_pair in BioSequences.each(KMER_TYPE, sequence)
        forward = mer_pair.fw
        tallies[forward] = get(tallies, forward, 0) + 1
    end
    return tallies
end

"""
    count_kmers(::Type{KMER_TYPE}, record)

Count the k-mers of a single FASTA or FASTQ `record` by delegating to the
sequence method on `FASTX.sequence(record)`.
"""
function count_kmers(::Type{KMER_TYPE}, record::R) where {KMER_TYPE, R <: Union{FASTX.FASTA.Record, FASTX.FASTQ.Record}}
    record_sequence = FASTX.sequence(record)
    return count_kmers(KMER_TYPE, record_sequence)
end

"""
    count_kmers(::Type{KMER_TYPE}, sequences)

Count k-mers across every element of the iterable `sequences`, summing the
per-element counts.

Returns an `OrderedDict` from k-mer to total count, sorted by k-mer.
"""
function count_kmers(::Type{KMER_TYPE}, sequences) where KMER_TYPE
    totals = DataStructures.OrderedDict{KMER_TYPE, Int}()
    foreach(sequences) do sequence
        # Add this element's counts into the running totals.
        merge!(+, totals, count_kmers(KMER_TYPE, sequence))
    end
    return sort!(totals)
end

In [None]:
"""
    get_kmer_index(kmers, kmer)

Return the position of `kmer` in the sorted, indexable collection `kmers`.

`kmers` must be sorted in ascending order, because the lookup is a binary search.

# Throws
- `AssertionError` (with `kmer` as its message) if `kmer` is not present,
  including when it sorts after every element of `kmers` or `kmers` is empty.
"""
function get_kmer_index(kmers, kmer)
    index = searchsortedfirst(kmers, kmer)
    # `searchsortedfirst` returns `lastindex(kmers) + 1` when `kmer` sorts after every
    # element, so the bound has to be checked before indexing.
    if index > lastindex(kmers) || kmers[index] != kmer
        throw(AssertionError("$kmer"))
    end
    return index
end

In [None]:
"""
	function get_sequence(;db=""::String, accession=""::String, ftp=""::String)

Get dna (db = "nuccore") or protein (db = "protein") sequences from NCBI
or get fasta directly from FTP site
"""
function get_sequence(;db=""::String, accession=""::String, ftp=""::String)
    if !isempty(db) && !isempty(accession)
        # API will block if we request more than 3 times per second, so set a 1/2 second sleep to set max of 2 requests per second when looping
        sleep(0.5)
        url = "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=$(db)&report=fasta&id=$(accession)"
        return FASTX.FASTA.Reader(IOBuffer(HTTP.get(url).body))
    elseif !isempty(ftp)
        return FASTX.FASTA.Reader(CodecZlib.GzipDecompressorStream(IOBuffer(HTTP.get(ftp).body)))
    else
        @error "invalid call"
    end
end

In [None]:
# Find the directory named "git" (any capitalisation) directly under the home directory.
Git_directory = string(
    homedir(),
    "/",
    first(filter(entry -> occursin(r"^git$"i, entry), readdir(homedir()))),
)
path = "$(Git_directory)/Eisenia"

# Make the local Eisenia checkout loadable, update and instantiate its project,
# then call `Pkg.activate()` with no arguments before importing it.
push!(LOAD_PATH, path)
Pkg.activate(path)
Pkg.update()
Pkg.instantiate()
Pkg.activate()
import Eisenia

In [None]:
# Load the metadata table from the working directory.
ncbi_staph_phage_metadata = DataFrames.DataFrame(
    uCSV.read("$(DIR)/sequences.csv", header = 1, quotes = '"', typedetectrows = 100)...
)

# Keep only the rows that match every required column value, applied one filter at a time.
for (column, required_value) in (
    "Nuc_Completeness" => "complete",
    "Genus" => "Kayvirus",
    "Sequence_Type" => "RefSeq",
)
    global ncbi_staph_phage_metadata
    ncbi_staph_phage_metadata =
        ncbi_staph_phage_metadata[ncbi_staph_phage_metadata[!, column] .== required_value, :]
end
ncbi_staph_phage_metadata

In [None]:
# Print the filtered table without truncating columns.
show(ncbi_staph_phage_metadata; allcols = true)

In [None]:
# k-mer length for the graph vertices (k = 3 is the commented-out alternative).
# k = 3
k = 5
# Vertex k-mer type: a DNA k-mer of length `k`.
KMER_TYPE = BioSequences.DNAMer{k}

In [None]:
# Edges are (k+1)-mers: each one spans two k-mers that overlap by k-1 bases.
EDGE_MER = BioSequences.DNAMer{k + 1}

In [None]:
# Gather the distinct k-mers observed across every selected accession.
observed_kmers = Set{KMER_TYPE}()
for accession in ncbi_staph_phage_metadata[!, "Accession"]
    fastx = collect(get_sequence(db = "nuccore", accession = accession))
    union!(observed_kmers, keys(count_kmers(KMER_TYPE, fastx)))
end

# Add the reverse complement of every observed k-mer. Iterate over a snapshot:
# inserting into a Set while iterating that same Set is not safe.
for kmer in collect(observed_kmers)
    push!(observed_kmers, BioSequences.reverse_complement(kmer))
end

# Later cells look k-mers up by sorted position (`get_kmer_index`) and number the graph
# vertices with `enumerate(kmers)`, so `kmers` must be a sorted vector, not a Set.
# Set elements are already unique, so sorting is all that is needed.
kmers = sort!(collect(observed_kmers))

In [None]:
# The four unambiguous DNA bases, in A, C, G, T order.
nucleotides = [
    BioSequences.DNA_A,
    BioSequences.DNA_C,
    BioSequences.DNA_G,
    BioSequences.DNA_T,
]

In [None]:
# Enumerate every possible DNA k-mer over `nucleotides`, in sorted order.
# The alphabet is repeated `k` times rather than a hard-coded five, so the enumeration
# stays in sync with `k` if it is changed.
all_kmers = sort(
    BioSequences.DNAMer.(vec(collect(Iterators.product(ntuple(_ -> nucleotides, k)...))))
)

In [None]:
# Possible k-mers that were never observed in the selected genomes.
unhit_kmers = filter(kmer -> kmer ∉ kmers, all_kmers)

In [None]:
# Convert the unobserved k-mers to LongSequence values so they can be sliced below.
unhit_sequences = map(BioSequences.LongSequence, unhit_kmers)

In [None]:
# Every (k-1)-length prefix, followed by every (k-1)-length suffix, of the unobserved k-mers.
unhit_fourmers = vcat(
    map(s -> s[1:(end - 1)], unhit_sequences),
    map(s -> s[2:end], unhit_sequences),
)
# Display the distinct canonical forms in sorted order (`unhit_fourmers` itself is unchanged).
unique!(sort!(map(BioSequences.canonical, unhit_fourmers)))

In [None]:
# Directed property graph with one vertex per k-mer; the graph records its own k.
kmer_graph = MetaGraphs.MetaDiGraph(length(kmers))
MetaGraphs.set_prop!(kmer_graph, :k, k)

In [None]:
# Label vertex i with the i-th k-mer in iteration order of `kmers`.
foreach(enumerate(kmers)) do (vertex, kmer)
    MetaGraphs.set_prop!(kmer_graph, vertex, :kmer, kmer)
end

In [None]:
# Thread every record through the graph. Each (k+1)-mer contributes one edge between its
# prefix k-mer and its suffix k-mer, once for the `fw` strand and once for the `bw` strand,
# and every vertex/edge touched records where the observation came from.
for accession in ncbi_staph_phage_metadata[!, "Accession"]
    fastx = collect(get_sequence(db = "nuccore", accession = accession))
    for record in fastx
        sequence = FASTX.sequence(record)
        record_identifier = FASTX.identifier(record)
        for sequence_edge in BioSequences.each(EDGE_MER, sequence)
            edge_position = sequence_edge.position
            # Convert each strand of the (k+1)-mer once so it can be sliced into k-mers.
            forward_edge = BioSequences.LongDNASeq(sequence_edge.fw)
            reverse_edge = BioSequences.LongDNASeq(sequence_edge.bw)

            # forward

            forward_source = BioSequences.DNAMer(forward_edge[1:end-1])
            forward_destination = BioSequences.DNAMer(forward_edge[2:end])

            forward_source_index = get_kmer_index(kmers, forward_source)
            forward_source_evidence = (identifier = record_identifier, position = edge_position, orientation = true)
            add_evidence!(kmer_graph, forward_source_index, forward_source_evidence)

            forward_destination_index = get_kmer_index(kmers, forward_destination)
            forward_destination_evidence = (identifier = record_identifier, position = edge_position + 1, orientation = true)
            add_evidence!(kmer_graph, forward_destination_index, forward_destination_evidence)

            graph_edge = LightGraphs.Edge(forward_source_index, forward_destination_index)
            LightGraphs.add_edge!(kmer_graph, graph_edge)
            forward_edge_evidence = (identifier = record_identifier, position = edge_position, orientation = true)
            add_evidence!(kmer_graph, graph_edge, forward_edge_evidence)

            # reverse

            # Both reverse k-mers must come from the `bw` strand. Taking the destination
            # from `fw` would just repeat the forward destination.
            reverse_source = BioSequences.DNAMer(reverse_edge[1:end-1])
            reverse_destination = BioSequences.DNAMer(reverse_edge[2:end])

            # The reverse source covers the same bases as the forward destination,
            # hence position + 1; the reverse destination mirrors the forward source.
            reverse_source_index = get_kmer_index(kmers, reverse_source)
            reverse_source_evidence = (identifier = record_identifier, position = edge_position + 1, orientation = false)
            add_evidence!(kmer_graph, reverse_source_index, reverse_source_evidence)

            reverse_destination_index = get_kmer_index(kmers, reverse_destination)
            reverse_destination_evidence = (identifier = record_identifier, position = edge_position, orientation = false)
            add_evidence!(kmer_graph, reverse_destination_index, reverse_destination_evidence)

            graph_edge = LightGraphs.Edge(reverse_source_index, reverse_destination_index)
            LightGraphs.add_edge!(kmer_graph, graph_edge)
            reverse_edge_evidence = (identifier = record_identifier, position = edge_position, orientation = false)
            add_evidence!(kmer_graph, graph_edge, reverse_edge_evidence)
        end
    end
end

In [None]:
# Write the k-mer graph as GFA: a header line, one segment (S) line per vertex,
# and one link (L) line per edge.
outfile = "test.$(k).gfa"
open(outfile, "w") do io
    println(io, "H\tVN:Z:1.0")

    # Segments: vertex number, its k-mer, and the size of its evidence set as RC.
    for vertex in LightGraphs.vertices(kmer_graph)
        kmer = MetaGraphs.get_prop(kmer_graph, vertex, :kmer)
        depth = length(MetaGraphs.get_prop(kmer_graph, vertex, :evidence))
        join(io, ("S", vertex, kmer, "RC:i:$(depth)"), '\t')
        println(io)
    end

    # Links: all written with '+' orientation; the overlap is k-1 matching bases.
    for edge in LightGraphs.edges(kmer_graph)
        overlap = MetaGraphs.get_prop(kmer_graph, :k) - 1
        join(io, ("L", edge.src, '+', edge.dst, '+', "$(overlap)M"), '\t')
        println(io)
    end
end