In [1]:
DATE = "2021-05-14"
TASK = "phage-k-pangenome"
DIR = mkpath("$(homedir())/$(DATE)-$(TASK)")

"/home/jovyan/2021-05-14-phage-k-pangenome"

In [2]:
import LightGraphs
import MetaGraphs
import BioSequences

In [3]:
import uCSV
import DataFrames
import FASTX
import HTTP
import CodecZlib
import DataStructures

In [4]:
import Pkg

In [5]:
function add_evidence!(kmer_graph, index::Int, evidence)
    if MetaGraphs.has_prop(kmer_graph, index, :evidence)
        push!(kmer_graph.vprops[index][:evidence], evidence)
    else
        MetaGraphs.set_prop!(kmer_graph, index, :evidence, Set([evidence]))
    end
end

function add_evidence!(kmer_graph, edge::LightGraphs.SimpleGraphs.AbstractSimpleEdge, evidence)
    if MetaGraphs.has_prop(kmer_graph, edge, :evidence)
        push!(kmer_graph.eprops[edge][:evidence], evidence)
    else
        MetaGraphs.set_prop!(kmer_graph, edge, :evidence, Set([evidence]))
    end
end

add_evidence! (generic function with 2 methods)

In [6]:
function count_kmers(::Type{KMER_TYPE}, sequence::BioSequences.LongSequence) where KMER_TYPE
    kmer_counts = DataStructures.OrderedDict{KMER_TYPE, Int}()
    kmer_iterator = (kmer.fw for kmer in BioSequences.each(KMER_TYPE, sequence))
    for kmer in kmer_iterator
        kmer_counts[kmer] = get(kmer_counts, kmer, 0) + 1
    end
    return kmer_counts
end

function count_kmers(::Type{KMER_TYPE}, record::R) where {KMER_TYPE, R <: Union{FASTX.FASTA.Record, FASTX.FASTQ.Record}}
    return count_kmers(KMER_TYPE, FASTX.sequence(record))    
end

function count_kmers(::Type{KMER_TYPE}, sequences) where KMER_TYPE
    joint_kmer_counts = DataStructures.OrderedDict{KMER_TYPE, Int}()
    for sequence in sequences
        sequence_kmer_counts = count_kmers(KMER_TYPE, sequence)
        merge!(+, joint_kmer_counts, sequence_kmer_counts)
    end
    sort!(joint_kmer_counts)
end

count_kmers (generic function with 3 methods)

In [29]:
function get_kmer_index(kmers, kmer)
    index = searchsortedfirst(kmers, kmer)
    @assert kmers[index] == kmer "$kmer"
    return index
end

get_kmer_index (generic function with 1 method)

In [11]:
"""
	function get_sequence(;db=""::String, accession=""::String, ftp=""::String)

Get dna (db = "nuccore") or protein (db = "protein") sequences from NCBI
or get fasta directly from FTP site
"""
function get_sequence(;db=""::String, accession=""::String, ftp=""::String)
    if !isempty(db) && !isempty(accession)
        # API will block if we request more than 3 times per second, so set a 1/2 second sleep to set max of 2 requests per second when looping
        sleep(0.5)
        url = "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=$(db)&report=fasta&id=$(accession)"
        return FASTX.FASTA.Reader(IOBuffer(HTTP.get(url).body))
    elseif !isempty(ftp)
        return FASTX.FASTA.Reader(CodecZlib.GzipDecompressorStream(IOBuffer(HTTP.get(ftp).body)))
    else
        @error "invalid call"
    end
end

get_sequence

In [8]:
Git_directory = homedir() * "/" * first(filter(x -> occursin(r"^git$"i, x), readdir(homedir())))
path = "$(Git_directory)/Eisenia"
push!(LOAD_PATH, path); Pkg.activate(path); Pkg.update(); Pkg.instantiate(); Pkg.activate()
import Eisenia

[32m[1m Activating[22m[39m environment at `~/Git/Eisenia/Project.toml`
[32m[1m   Updating[22m[39m registry at `/opt/julia/registries/General`
######################################################################### 100.0%
[32m[1mNo Changes[22m[39m to `~/Git/Eisenia/Project.toml`
[32m[1mNo Changes[22m[39m to `~/Git/Eisenia/Manifest.toml`
[32m[1m Activating[22m[39m environment at `/opt/julia/environments/v1.5/Project.toml`
┌ Info: Precompiling Eisenia [453d265d-8292-4a7b-a57c-dce3f9ae6acd]
└ @ Base loading.jl:1278
│ - If you have Eisenia checked out for development and have
│   added MetaGraphs as a dependency but haven't updated your primary
│   environment's manifest file, try `Pkg.resolve()`.
│ - Otherwise you may need to report an issue with Eisenia


In [9]:
ncbi_staph_phage_metadata = DataFrames.DataFrame(uCSV.read("$(DIR)/sequences.csv", header=1, quotes='"', typedetectrows=100)...)

ncbi_staph_phage_metadata = ncbi_staph_phage_metadata[ncbi_staph_phage_metadata[!, "Nuc_Completeness"] .== "complete", :]

ncbi_staph_phage_metadata = ncbi_staph_phage_metadata[ncbi_staph_phage_metadata[!, "Genus"] .== "Kayvirus", :]

ncbi_staph_phage_metadata = ncbi_staph_phage_metadata[ncbi_staph_phage_metadata[!, "Sequence_Type"] .== "RefSeq", :]

Unnamed: 0_level_0,Accession,SRA_Accession,Submitters
Unnamed: 0_level_1,String,String,String
1,NC_047720,,"Vandersteegen,K., Mattheus,W., Ceyssens,P.J., Bilocq,F., De Vos,D., Pirnay,J.P., Noben,J.P., Merabishvili,M., Lipinska,U., Hermans,K., Lavigne,R."
2,NC_047721,,"Kirby,A.E."
3,NC_047722,,"Lobocka,M., Hejnowicz,M.S., Dabrowski,K., Gozdek,A., Kosakowski,J., Witkowska,M., Ulatowska,M.I., Weber-Dabrowska,B., Kwiatek,M., Parasion,S., Gawor,J., Kosowska,H., Glowacka,A."
4,NC_047723,,"Lobocka,M., Hejnowicz,M.S., Dabrowski,K., Gozdek,A., Kosakowski,J., Witkowska,M., Ulatowska,M.I., Weber-Dabrowska,B., Kwiatek,M., Parasion,S., Gawor,J., Kosowska,H., Glowacka,A."
5,NC_047724,,"Lobocka,M., Hejnowicz,M.S., Dabrowski,K., Gozdek,A., Kosakowski,J., Witkowska,M., Ulatowska,M.I., Weber-Dabrowska,B., Kwiatek,M., Parasion,S., Gawor,J., Kosowska,H., Glowacka,A."
6,NC_047725,,"Lobocka,M., Hejnowicz,M.S., Dabrowski,K., Gozdek,A., Kosakowski,J., Witkowska,M., Ulatowska,M.I., Weber-Dabrowska,B., Kwiatek,M., Parasion,S., Gawor,J., Kosowska,H., Glowacka,A."
7,NC_047726,,"Lobocka,M., Hejnowicz,M.S., Dabrowski,K., Gozdek,A., Kosakowski,J., Witkowska,M., Ulatowska,M.I., Weber-Dabrowska,B., Kwiatek,M., Parasion,S., Gawor,J., Kosowska,H., Glowacka,A."
8,NC_047727,,"Lobocka,M., Hejnowicz,M.S., Dabrowski,K., Gozdek,A., Kosakowski,J., Witkowska,M., Ulatowska,M.I., Weber-Dabrowska,B., Kwiatek,M., Parasion,S., Gawor,J., Kosowska,H., Glowacka,A."
9,NC_047728,,"Ulatowska,M.I., Weber-Dabrowska,B., Sadowy,E., Krzyszton-Russjan,J., Gorski,A., Lobocka,M.B."
10,NC_047729,,"Zhang,X., Kang,H., Tong,Y."


In [10]:
show(ncbi_staph_phage_metadata, allcols=true)

21×21 DataFrame
│ Row │ Accession │ SRA_Accession │
│     │ [90mString[39m    │ [90mString[39m        │
├─────┼───────────┼───────────────┤
│ 1   │ NC_047720 │               │
│ 2   │ NC_047721 │               │
│ 3   │ NC_047722 │               │
│ 4   │ NC_047723 │               │
│ 5   │ NC_047724 │               │
│ 6   │ NC_047725 │               │
│ 7   │ NC_047726 │               │
│ 8   │ NC_047727 │               │
│ 9   │ NC_047728 │               │
│ 10  │ NC_047729 │               │
│ 11  │ NC_047730 │               │
│ 12  │ NC_047731 │               │
│ 13  │ NC_047732 │               │
│ 14  │ NC_025416 │               │
│ 15  │ NC_025426 │               │
│ 16  │ NC_023573 │               │
│ 17  │ NC_023009 │               │
│ 18  │ NC_019726 │               │
│ 19  │ NC_019448 │               │
│ 20  │ NC_007066 │               │
│ 21  │ NC_005880 │               │

│ Row │ Submitters                                                                                 

In [32]:
# k = 3
k = 5
KMER_TYPE = BioSequences.DNAMer{k}

BioSequences.Mer{BioSequences.DNAAlphabet{2},5}

In [33]:
EDGE_MER = BioSequences.DNAMer{k+1}

BioSequences.Mer{BioSequences.DNAAlphabet{2},6}

In [34]:
accession = ncbi_staph_phage_metadata[1, "Accession"]
fastx = collect(get_sequence(db = "nuccore", accession = accession))
kmers = Set(collect(keys(count_kmers(KMER_TYPE, fastx))))
for accession in ncbi_staph_phage_metadata[2:end, "Accession"]
    fastx = collect(get_sequence(db = "nuccore", accession = accession))
    kmers = union!(kmers, collect(keys(count_kmers(KMER_TYPE, fastx))))
end
for kmer in kmers
    push!(kmers, BioSequences.reverse_complement(kmer))
end
unique!(sort!(collect(kmers)))

996-element Array{BioSequences.Mer{BioSequences.DNAAlphabet{2},5},1}:
 AAAAA
 AAAAC
 AAAAG
 AAAAT
 AAACA
 AAACC
 AAACG
 AAACT
 AAAGA
 AAAGC
 AAAGG
 AAAGT
 AAATA
 ⋮
 TTTCA
 TTTCC
 TTTCG
 TTTCT
 TTTGA
 TTTGC
 TTTGG
 TTTGT
 TTTTA
 TTTTC
 TTTTG
 TTTTT

In [35]:
nucleotides = [BioSequences.DNA_A, BioSequences.DNA_C, BioSequences.DNA_G, BioSequences.DNA_T]

4-element Array{BioSymbols.DNA,1}:
 DNA_A
 DNA_C
 DNA_G
 DNA_T

In [44]:
all_kmers = sort(BioSequences.DNAMer.(vec(collect(Iterators.product(nucleotides, nucleotides, nucleotides, nucleotides, nucleotides)))))

1024-element Array{BioSequences.Mer{BioSequences.DNAAlphabet{2},5},1}:
 AAAAA
 AAAAC
 AAAAG
 AAAAT
 AAACA
 AAACC
 AAACG
 AAACT
 AAAGA
 AAAGC
 AAAGG
 AAAGT
 AAATA
 ⋮
 TTTCA
 TTTCC
 TTTCG
 TTTCT
 TTTGA
 TTTGC
 TTTGG
 TTTGT
 TTTTA
 TTTTC
 TTTTG
 TTTTT

In [47]:
unhit_kmers = [kmer for kmer in all_kmers if !(kmer in kmers)]

28-element Array{BioSequences.Mer{BioSequences.DNAAlphabet{2},5},1}:
 ACGCG
 AGATC
 AGCGA
 AGGCC
 CCGCG
 CGATC
 CGCGA
 CGCGC
 CGCGG
 CGCGT
 CGGCC
 CTCGC
 GATCA
 ⋮
 GCGAA
 GCGAG
 GCGCG
 GGATC
 GGCCG
 GGCCT
 GGCGA
 TCGCC
 TCGCG
 TCGCT
 TGATC
 TTCGC

In [49]:
unhit_sequences = BioSequences.LongSequence.(unhit_kmers)

28-element Array{BioSequences.LongSequence{BioSequences.DNAAlphabet{2}},1}:
 ACGCG
 AGATC
 AGCGA
 AGGCC
 CCGCG
 CGATC
 CGCGA
 CGCGC
 CGCGG
 CGCGT
 CGGCC
 CTCGC
 GATCA
 ⋮
 GCGAA
 GCGAG
 GCGCG
 GGATC
 GGCCG
 GGCCT
 GGCGA
 TCGCC
 TCGCG
 TCGCT
 TGATC
 TTCGC

In [52]:
unhit_fourmers = vcat(
    [s[1:end-1] for s in unhit_sequences],
    [s[2:end] for s in unhit_sequences]
)
unique!(sort!(BioSequences.canonical.(unhit_fourmers)))

17-element Array{BioSequences.LongSequence{BioSequences.DNAAlphabet{2}},1}:
 ACGC
 AGAT
 AGCG
 AGGC
 ATCA
 ATCC
 ATCG
 CCGC
 CGAA
 CGAG
 CGCC
 CGCG
 CGGC
 GATC
 GCGA
 GCGC
 GGCC

In [25]:
kmer_graph = MetaGraphs.MetaDiGraph(length(kmers))
MetaGraphs.set_prop!(kmer_graph, :k, k)

true

In [26]:
for (vertex, kmer) in enumerate(kmers)
    MetaGraphs.set_prop!(kmer_graph, vertex, :kmer, kmer)
end

In [30]:
for accession in ncbi_staph_phage_metadata[!, "Accession"]
    fastx = collect(get_sequence(db = "nuccore", accession = accession))
    for record in fastx
        sequence = FASTX.sequence(record)
        record_identifier = FASTX.identifier(record) 
        edge_iterator = BioSequences.each(EDGE_MER, sequence)
        for sequence_edge in edge_iterator
            # forward

            forward_source = BioSequences.DNAMer(BioSequences.LongDNASeq(sequence_edge.fw)[1:end-1])
            forward_destination = BioSequences.DNAMer(BioSequences.LongDNASeq(sequence_edge.fw)[2:end])
            
            forward_source_index = get_kmer_index(kmers, forward_source)
            forward_source_evidence = (identifier = FASTX.identifier(record), position = sequence_edge.position, orientation = true)
            add_evidence!(kmer_graph, forward_source_index, forward_source_evidence)

            forward_destination_index = get_kmer_index(kmers, forward_destination)
            forward_destination_evidence = (identifier = FASTX.identifier(record), position = sequence_edge.position + 1, orientation = true)
            add_evidence!(kmer_graph, forward_destination_index, forward_destination_evidence)

            graph_edge = LightGraphs.Edge(forward_source_index, forward_destination_index)
            LightGraphs.add_edge!(kmer_graph, graph_edge)
            forward_edge_evidence = (identifier = FASTX.identifier(record), position = sequence_edge.position, orientation = true)
            add_evidence!(kmer_graph, graph_edge, forward_edge_evidence)

            # reverse

            reverse_source = BioSequences.DNAMer(BioSequences.LongDNASeq(sequence_edge.bw)[1:end-1])
            reverse_destination = BioSequences.DNAMer(BioSequences.LongDNASeq(sequence_edge.fw)[2:end])
            
            reverse_source_index = get_kmer_index(kmers, reverse_source)
            reverse_source_evidence = (identifier = FASTX.identifier(record), position = sequence_edge.position + 1, orientation = false)
            add_evidence!(kmer_graph, reverse_source_index, reverse_source_evidence)
            
            reverse_destination_index = get_kmer_index(kmers, reverse_destination)
            reverse_destination_evidence = (identifier = FASTX.identifier(record), position = sequence_edge.position, orientation = false)
            add_evidence!(kmer_graph, reverse_destination_index, reverse_destination_evidence)

            graph_edge = LightGraphs.Edge(reverse_source_index, reverse_destination_index)
            LightGraphs.add_edge!(kmer_graph, graph_edge)
            reverse_edge_evidence = (identifier = FASTX.identifier(record), position = sequence_edge.position, orientation = false)
            add_evidence!(kmer_graph, graph_edge, reverse_edge_evidence)
        end
    end
end

LoadError: AssertionError: GGCCA

In [28]:
outfile = "test.$(k).gfa"
open(outfile, "w") do io
    println(io, "H\tVN:Z:1.0")
    for vertex in LightGraphs.vertices(kmer_graph)
        kmer = kmer_graph.vprops[vertex][:kmer]
        depth = length(kmer_graph.vprops[vertex][:evidence])
        fields = ["S", "$vertex", kmer, "RC:i:$(depth)"]
        line = join(fields, '\t')
        println(io, line)
    end
    for edge in LightGraphs.edges(kmer_graph)
        overlap = kmer_graph.gprops[:k] - 1
        link = ["L",
                    edge.src,
                    '+',
                    edge.dst,
                    '+',
                    "$(overlap)M"]
        line = join(link, '\t')
        println(io, line)
    end
end

LoadError: KeyError: key :evidence not found