In [1]:
# Run date; also used as the name of the working directory below.
DATE = "2021-07-10"
# Short label for this analysis.
TASK = "enterococcus-pangenome"
# Dated working directory under the user's home; mkpath creates it if needed
# and returns the path.
DIR = mkpath(string(homedir(), "/", DATE))

"/Users/cameronprybol/2021-07-10"

In [2]:
# Third-party packages this notebook depends on.
pkgs = [
"LightGraphs",
"MetaGraphs",
"BioSequences",
"uCSV",
"DataFrames",
"FASTX",
"HTTP",
"CodecZlib",
"DataStructures",
"Revise",
"ProgressMeter",
"BenchmarkTools",
"StatsBase"
]

import Pkg
Pkg.add(pkgs)
# Import (rather than `using`) each package so every call stays module-qualified.
# `@eval` splices the package name in as a Symbol instead of building and parsing
# a string of source code.
for pkg in pkgs
    @eval import $(Symbol(basename(pkg)))
end

import Mycelia



In [3]:
# Download the RefSeq assembly summary and split the response body into lines.
refseq_summary_lines = HTTP.get("https://ftp.ncbi.nih.gov/genomes/refseq/assembly_summary_refseq.txt").body |> IOBuffer |> readlines;

In [4]:
# Skip the first line (a comment); the next line is the tab-separated header row.
refseq_summary_table = let summary_text = join(refseq_summary_lines[2:end], '\n')
    DataFrames.DataFrame(uCSV.read(IOBuffer(summary_text), header=1, delim='\t')...)
end;

In [5]:
# filter down to only include Enterococcus records (case-insensitive match on the
# organism name); the regex literal is compiled once instead of once per row
enterococcus_summary_table = refseq_summary_table[findall(name -> occursin(r"Enterococcus"i, name), refseq_summary_table[!, "organism_name"]), :];

In [6]:
# keep only the rows whose genome_rep is "Full", dropping partial genomes
enterococcus_summary_table = enterococcus_summary_table[enterococcus_summary_table.genome_rep .== "Full", :];

In [7]:
# Tally how many of the remaining assemblies fall into each assembly level.
StatsBase.countmap(enterococcus_summary_table.assembly_level)

Dict{String, Int64} with 4 entries:
  "Scaffold"        => 2213
  "Contig"          => 2456
  "Chromosome"      => 83
  "Complete Genome" => 343

In [None]:
# pull genomes and look at the differences in FASTA contents between "Complete Genome" assemblies and the rest
# I'm worried that "Complete Genome" assemblies also contain plasmids;
# if they do, we'll want to filter those out

In [8]:
# Row position of the first assembly whose level is "Complete Genome".
i = findfirst(==("Complete Genome"), enterococcus_summary_table[!, "assembly_level"])

1

In [9]:
# Print the selected row as a one-row table, without eliding any columns.
show(stdout, enterococcus_summary_table[i:i, :]; allcols=true)

[1m1×23 DataFrame[0m
[1m Row [0m│[1m # assembly_accession [0m[1m bioproject  [0m[1m biosample    [0m[1m wgs_master [0m[1m refseq_category [0m[1m taxid  [0m[1m species_taxid [0m[1m organism_name              [0m[1m infraspecific_name [0m[1m isolate [0m[1m version_status [0m[1m assembly_level  [0m[1m release_type [0m[1m genome_rep [0m[1m seq_rel_date [0m[1m asm_name [0m[1m submitter [0m[1m gbrs_paired_asm [0m[1m paired_asm_comp [0m[1m ftp_path                          [0m[1m excluded_from_refseq [0m[1m relation_to_type_material [0m[1m asm_not_live_date [0m
[1m     [0m│[90m String               [0m[90m String      [0m[90m String       [0m[90m String     [0m[90m String          [0m[90m Int64  [0m[90m Int64         [0m[90m String                     [0m[90m String             [0m[90m String  [0m[90m String         [0m[90m String          [0m[90m String       [0m[90m String     [0m[90m String       [0m[90m Str

In [10]:
# Rewrite the ftp:// URL of the selected assembly as https://.
ftp_path = replace(enterococcus_summary_table[i, "ftp_path"], "ftp://" => "https://")
# The report file is named after the last component of the assembly directory.
assembly_report_path = "$(ftp_path)/$(basename(ftp_path))_assembly_report.txt"

"https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/007/785/GCF_000007785.1_ASM778v1/GCF_000007785.1_ASM778v1_assembly_report.txt"

In [11]:
# URL of the gzip-compressed genomic FASTA inside the assembly directory.
genome_path = string(ftp_path, "/", basename(ftp_path), "_genomic.fna.gz")

"https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/007/785/GCF_000007785.1_ASM778v1/GCF_000007785.1_ASM778v1_genomic.fna.gz"

In [12]:
# Open the remote genome; per the displayed output this yields a FASTA reader over a
# gzip-decompressing stream rather than the records themselves.
Mycelia.get_sequence(ftp = genome_path)

FASTX.FASTA.Reader{TranscodingStreams.TranscodingStream{CodecZlib.GzipDecompressor, IOBuffer}}(BioGenerics.Automa.State{TranscodingStreams.TranscodingStream{CodecZlib.GzipDecompressor, IOBuffer}}(TranscodingStreams.TranscodingStream{CodecZlib.GzipDecompressor, IOBuffer}(<mode=idle>), 1, 1, false), nothing)

In [13]:
# Materialize every FASTA record of the assembly into a vector.
# Calls Mycelia.get_sequence directly rather than reading the notebook's `ans`, which
# only holds the reader when the previous cell was the last thing evaluated.
fasta = collect(Mycelia.get_sequence(ftp = genome_path))

4-element Vector{FASTX.FASTA.Record}:
 FASTX.FASTA.Record:
   identifier: NC_004668.1
  description: Enterococcus faecalis V583, complete sequence
     sequence: TTTTAAGTTATCCACATTTTTTAGATAACCAAAATTTAA…
 FASTX.FASTA.Record:
   identifier: NC_004669.1
  description: Enterococcus faecalis V583 plasmid pTEF1, complete sequence
     sequence: AATTGCTTTATTTTAAATAATGTTTTGGTATTCTTAAAA…
 FASTX.FASTA.Record:
   identifier: NC_004671.1
  description: Enterococcus faecalis V583 plasmid pTEF2, complete sequence
     sequence: GATATAATAAGAAAGCAGGCAAATACTGTTATATCAGTA…
 FASTX.FASTA.Record:
   identifier: NC_004670.1
  description: Enterococcus faecalis V583 plasmid pTEF3, complete sequence
     sequence: ATTATATTTCTTTTATACTTAACAGCTATATAATTTTTG…

In [None]:
# initialize a metagraph with these fasta records

In [14]:
"""
    cypher(cmd; address, username, password, format, database)

Build (but do not run) the `cypher-shell` command line that executes the Cypher
statement `cmd`. Pass the result to `run` to execute it.

# Keywords
- `address`: server URI (default `"neo4j://localhost:7687"`).
- `username`, `password`: credentials (defaults `"neo4j"` / `"password"`).
- `format`: value for `--format` (default `"auto"`).
- `database`: value for `--database` (default `"system"`).

# Returns
A `Cmd` whose final argument is `cmd`.
"""
function cypher(cmd;
    address="neo4j://localhost:7687",
    username="neo4j",
    password="password",
    format="auto",
    database="system"
    )
    # Interpolating a Cmd into another Cmd splices its arguments in order.
    connection_flags = `--address $(address) --username $(username) --password $(password)`
    session_flags = `--format $(format) --database $(database)`
    return `cypher-shell $(connection_flags) $(session_flags) $(cmd)`
end

cypher (generic function with 1 method)

In [15]:
# Drop any database left over from an earlier run. `if exists` keeps the statement
# from failing (and `run` from throwing on a non-zero exit) when the database has
# not been created yet, e.g. on the very first run.
cmd = "drop database Enterococcus if exists"
run(cypher(database="system", cmd))

Process(`[4mcypher-shell[24m [4m--address[24m [4mneo4j://localhost:7687[24m [4m--username[24m [4mneo4j[24m [4m--password[24m [4mpassword[24m [4m--format[24m [4mauto[24m [4m--database[24m [4msystem[24m [4m'drop database Enterococcus'[24m`, ProcessExited(0))

In [16]:
# Create the Enterococcus database; the statement is issued against `system`.
cmd = "create database Enterococcus"
run(cypher(cmd; database="system"))

Process(`[4mcypher-shell[24m [4m--address[24m [4mneo4j://localhost:7687[24m [4m--username[24m [4mneo4j[24m [4m--password[24m [4mpassword[24m [4m--format[24m [4mauto[24m [4m--database[24m [4msystem[24m [4m'create database Enterococcus'[24m`, ProcessExited(0))

In [17]:
# Empty directed metagraph that holds the Julia-side copy of the records
# (the notebook also inserts them into Neo4j).
enterococcus_db = MetaGraphs.MetaDiGraph()

{0, 0} directed Int64 metagraph with Float64 weights defined by :weight (default weight 1.0)

# step 1 insert fasta record into julia database

In [19]:
# Take the first FASTA record and give it a vertex of its own.
record = first(fasta)
LightGraphs.add_vertex!(enterococcus_db)
# The vertex just added has the highest index.
vi = LightGraphs.nv(enterococcus_db)
# Attach all record fields to the vertex in one call.
MetaGraphs.set_props!(enterococcus_db, vi, Dict(
    :type => "FASTA",
    :identifier => FASTX.identifier(record),
    :description => FASTX.description(record),
    :sequence => FASTX.sequence(record),
))
enterococcus_db.vprops[vi]

# step 1.5, insert fasta record into neo database

In [30]:
# Stage the FASTA record as a one-row TSV named after its identifier, then have Neo4j
# load that file and create a FASTA node from it.
# NOTE(review): this file is written under DIR, whereas the entity-metadata cell writes
# under Mycelia.neo_import_dir — confirm Neo4j is permitted to LOAD CSV from DIR.
record_csv_file = DIR * "/" * FASTX.identifier(record) * ".tsv"

# One-row table; column names become the TSV header.
record_table = DataFrames.DataFrame(
    :type => "FASTA",
    :identifier => FASTX.identifier(record),
    :description => FASTX.description(record),
    :sequence => FASTX.sequence(record)
);

uCSV.write(record_csv_file, header = names(record_table), data = collect(DataFrames.eachcol(record_table)), delim='\t')

# The CREATE below copies identifier, description and sequence onto the node; the
# `type` column is written to the file but not read back (the node label is FASTA).
cmd = 
"""
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM
'file://$(record_csv_file)' AS line
FIELDTERMINATOR '\t'
with line
CREATE (f:FASTA {identifier: line.`identifier`, description: line.`description`, sequence: line.`sequence`})
"""

run(cypher(database="Enterococcus", cmd))

# step 2 determine entity classification
# either known or inferred by blasting

# Add to Neo

In [45]:
# The first column comes from the NCBI header as "# assembly_accession"; strip the
# "# " prefix so the name can be used as an unquoted Cypher property key below.
# Guarded so that re-running this cell does not throw once the column has already
# been renamed.
if "# assembly_accession" in names(enterococcus_summary_table)
    DataFrames.rename!(enterococcus_summary_table, "# assembly_accession" => "assembly_accession")
end
# step 2 insert entity into database
# one-row table holding the metadata of the selected assembly
t = enterococcus_summary_table[i:i, :]

# stage the row as a TSV under the Neo4j import directory
f = "$(Mycelia.neo_import_dir)/enterococcus-entity-metadata.tsv"
uCSV.write(f, data=collect(DataFrames.eachcol(t)), header=names(t), delim='\t')
# build "col: line.`col`, ..." so that every column becomes a property of the node
node_metadata_descriptors = join(["$n: line.`$n`" for n in names(t)], ", ")
node_metadata = "(n:ENTITY {$(node_metadata_descriptors)})"
# percent-encode spaces so the path can be embedded in the file:// URL
normalized_f = replace(f, " " => "%20")

cmd = 
"""
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM
'file://$normalized_f' AS line
FIELDTERMINATOR '\t'
with line
CREATE $(node_metadata)
"""

run(cypher(database="Enterococcus", cmd))

# read the entity nodes back to confirm the insert
cmd = 
"""
MATCH (e:ENTITY) RETURN e
"""
run(cypher(database="Enterococcus", cmd))

# Add to julia db

In [82]:
# Add a new vertex for the entity and remember its index; after add_vertex! the
# vertex count is the index of the vertex just added.
LightGraphs.add_vertex!(enterococcus_db)
vi = LightGraphs.nv(enterococcus_db)
# The entity's :type and metadata properties are attached in the cells that follow.

2

In [88]:
# Tag the new vertex as an ENTITY (the FASTA vertex carries :type "FASTA").
MetaGraphs.set_prop!(enterococcus_db, vi, :type, "ENTITY")

true

In [None]:
# for column in dataframe row, add metadata to node

In [86]:
# Copy every column of the one-row table `t` onto the entity vertex as a property.
# `t` always has exactly one row, so it is indexed with 1; `t[i, n]` reused the row
# position from the full summary table and only worked while `i` happened to be 1.
for n in names(t)
    MetaGraphs.set_prop!(enterococcus_db, vi, Symbol(n), t[1, n])
end

In [89]:
# Display every property currently stored on the entity vertex.
enterococcus_db.vprops[vi]

Dict{Symbol, Any} with 24 entries:
  :relation_to_type_material => ""
  :infraspecific_name        => "strain=V583"
  :organism_name             => "Enterococcus faecalis V583"
  :taxid                     => 226185
  :asm_name                  => "ASM778v1"
  :ftp_path                  => "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000…
  :asm_not_live_date         => "na"
  :bioproject                => "PRJNA224116"
  :release_type              => "Major"
  :paired_asm_comp           => "identical"
  :biosample                 => "SAMN02603978"
  :species_taxid             => 1351
  :assembly_level            => "Complete Genome"
  :isolate                   => ""
  :excluded_from_refseq      => ""
  :seq_rel_date              => "2003/03/28"
  :version_status            => "latest"
  :wgs_master                => ""
  :gbrs_paired_asm           => "GCA_000007785.1"
  :submitter                 => "TIGR"
  :type                      => "ENTITY"
  :genome_rep                => "Full"

In [None]:
# step 3 add connections between fasta records and entity

In [90]:
# points from entity to fasta
# NOTE(review): the vertex ids are hard-coded — 2 is the ENTITY vertex and 1 the FASTA
# vertex only for the insertion order used in this notebook.
LightGraphs.add_edge!(enterococcus_db, 2, 1)

true

In [102]:
# Mirror the entity -> fasta edge in Neo4j: look up the ENTITY node by assembly
# accession and the FASTA node by identifier, then MERGE a HAS_SEQUENCE relationship
# between them. Both values are interpolated into the query text as quoted literals.
cmd =
"""
MATCH (e:ENTITY {assembly_accession: '$(t[1, "assembly_accession"])'})
MATCH (f:FASTA {identifier: '$(FASTX.identifier(record))'})
MERGE (e)-[ef:HAS_SEQUENCE]->(f)
return e.assembly_accession,f.identifier,ef
"""
run(cypher(database="Enterococcus", cmd))

e.assembly_accession, f.identifier, ef
"GCF_000007785.1", "NC_004668.1", [:HAS_SEQUENCE]


Process(`[4mcypher-shell[24m [4m--address[24m [4mneo4j://localhost:7687[24m [4m--username[24m [4mneo4j[24m [4m--password[24m [4mpassword[24m [4m--format[24m [4mauto[24m [4m--database[24m [4mEnterococcus[24m [4m"MATCH (e:ENTITY {assembly_accession: 'GCF_000007785.1'})[24m
[4mMATCH (f:FASTA {identifier: 'NC_004668.1'})[24m
[4mMERGE (e)-[ef:HAS_SEQUENCE]->(f)[24m
[4mreturn e.assembly_accession,f.identifier,ef[24m
[4m"[24m`, ProcessExited(0))

In [None]:
# step 4, extract fasta files and generate a list of all canonical kmers