In [1]:
# Per-analysis working directory named "<date>-<task>" under the user's home.
# It is created on first run and then made the current directory.
TODAY = "2021-12-20"
TASK = "phylogenetic-determination-AAI"
DIR = "$(homedir())/$(TODAY)-$(TASK)"
isdir(DIR) || mkdir(DIR)
cd(DIR)

In [2]:
# alternative workflow would be to utilize ezAAI http://leb.snu.ac.kr/ezaai
# or recreate ezAAI with diamond
# use CheckM

# use the genus assignment of the phage, and then assume the family level assignment from the genus classification
# require that the scale-dependent, correlation-based, and probability-based genus assignments are consistent; otherwise flag

In [3]:
import Pkg

pkgs = [
"JSON",
"HTTP",
"Dates",
"uCSV",
"DelimitedFiles",
"DataFrames",
"ProgressMeter",
"BioSequences",
"FASTX",
"Distances",
"Plots",
"StatsPlots",
"StatsBase",
"Statistics",
"Mmap",
"MultivariateStats",
"PyCall",
"Random",
"Primes",
"Revise",
"SparseArrays",
"SHA",
"Mycelia",
"GenomicAnnotations",
"BioFetch",
"Combinatorics",
"StaticArrays",
"BioSymbols",
"RollingFunctions",
"OrderedCollections"
]

# Import every package named in `pkgs`. If the first import attempt throws for
# any reason, install the package with Pkg and try the import once more.
for pkg in pkgs
    # Same expression that parsing "import <pkg>" would produce.
    import_expr = Expr(:import, Expr(:., Symbol(pkg)))
    try
        eval(import_expr)
    catch
        Pkg.add(pkg)
        eval(import_expr)
    end
end

In [4]:
# Location of the "docs/metadata" folder inside the installed Mycelia package.
MYCELIA_METADATA = joinpath(pkgdir(Mycelia), "docs", "metadata")

"/home/jupyter-cjprybol/.julia/dev/Mycelia/docs/metadata"

In [5]:
# run(`conda install -c bioconda comparem`)

In [19]:
# Folder that receives one FASTA file per accession; created if missing.
genomes_dir = "$(DIR)/genomes"
mkpath(genomes_dir)

"/home/jupyter-cjprybol/2021-12-20-phylogenetic-determination-AAI/genomes"

In [57]:
# Download the nucleotide record(s) for every accession in `entity_metadata`
# into "<genomes_dir>/<accession>.fna", skipping accessions that already have
# a non-empty file on disk.
#
# NOTE: `entity_metadata` is assigned in a later cell of this notebook, so this
# cell must be executed after that one.
ProgressMeter.@showprogress for accession in entity_metadata[!, "Accession"]
    fasta_file = joinpath(genomes_dir, "$(accession).fna")
    # `filesize`, not `isempty`: `isempty(fasta_file)` would test whether the
    # path *string* is empty (never true), so zero-byte files left behind by a
    # failed download would never be fetched again.
    if !isfile(fasta_file) || filesize(fasta_file) == 0
        try
            open(fasta_file, "w") do io
                fastx_io = FASTX.FASTA.Writer(io)
                for record in Mycelia.get_sequence(db="nuccore", accession=accession)
                    write(fastx_io, record)
                end
                close(fastx_io)
            end
        catch
            # Do not leave a partial file that a later run would treat as a
            # completed download; remove it and surface the original error.
            rm(fasta_file; force=true)
            rethrow()
        end
    end
end

In [6]:
# https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=Bacteriophage,%20all%20taxids&Completeness_s=complete
# Load the metadata table exported from the URL above and order it by accession.
entity_metadata_full = DataFrames.DataFrame(
    uCSV.read(
        "$(MYCELIA_METADATA)/2021-11-13-ncbi-complete-bacteriophage.csv";
        header=1,
        quotes='"',
    )...,
)
sort!(entity_metadata_full, "Accession")
# Keep only rows marked as complete that also carry a non-empty genus label.
entity_metadata_full = filter(
    row -> row["Nuc_Completeness"] == "complete" && !isempty(row["Genus"]),
    entity_metadata_full,
)
# Tally how many rows carry each genus label, most frequent genus first.
genera_counts = sort(
    collect(StatsBase.countmap(entity_metadata_full[!, "Genus"]));
    by=last,
    rev=true,
)

964-element Vector{Pair{String, Int64}}:
       "Fromanvirus" => 905
     "Gequatrovirus" => 371
      "Cheoctovirus" => 317
        "Skunavirus" => 311
       "Pegunavirus" => 240
     "Tequatrovirus" => 222
    "Timquatrovirus" => 212
       "Pahexavirus" => 201
      "Bixzunavirus" => 174
        "Pbunavirus" => 168
   "Sinsheimervirus" => 156
        "Microvirus" => 138
       "Kostyavirus" => 127
                     ⋮
 "Raunefjordenvirus" => 1
         "Podivirus" => 1
  "Saintgironsvirus" => 1
       "Nahantvirus" => 1
       "Yonginvirus" => 1
        "Kajamvirus" => 1
       "Pekhitvirus" => 1
       "Cyclitvirus" => 1
          "Sasvirus" => 1
     "Seongbukvirus" => 1
        "Jalkavirus" => 1
      "Capvunavirus" => 1

In [9]:
# Genus => count pairs for genera that occur more than once.
non_singleton_genera_counts = filter(pair -> last(pair) > 1, genera_counts)

930-element Vector{Pair{String, Int64}}:
           "Fromanvirus" => 905
         "Gequatrovirus" => 371
          "Cheoctovirus" => 317
            "Skunavirus" => 311
           "Pegunavirus" => 240
         "Tequatrovirus" => 222
        "Timquatrovirus" => 212
           "Pahexavirus" => 201
          "Bixzunavirus" => 174
            "Pbunavirus" => 168
       "Sinsheimervirus" => 156
            "Microvirus" => 138
           "Kostyavirus" => 127
                         ⋮
          "Vilniusvirus" => 2
 "Grisebachstrassevirus" => 2
          "Nampongvirus" => 2
       "Goettingenvirus" => 2
      "Chakrabartyvirus" => 2
         "Mingyongvirus" => 2
      "Skarprettervirus" => 2
        "Shirahamavirus" => 2
           "Nanhaivirus" => 2
         "Predatorvirus" => 2
         "Kungbxnavirus" => 2
           "Kilunavirus" => 2

In [14]:
# Distinct per-genus counts. `genera_counts` was sorted in descending order,
# so reversing the de-duplicated counts yields them in ascending order.
sorted_counts = reverse!(unique(map(last, non_singleton_genera_counts)))

87-element Vector{Int64}:
   2
   3
   4
   5
   6
   7
   8
   9
  10
  11
  12
  13
  14
   ⋮
 138
 156
 168
 174
 201
 212
 222
 240
 311
 317
 371
 905

In [62]:
# Threshold on per-genus genome count: the final entry of `sorted_counts`.
repeat_count = last(sorted_counts)

905

In [63]:
# Names of the genera whose count does not exceed `repeat_count`.
# NOTE(review): this filters `genera_counts`, not `non_singleton_genera_counts`,
# so singleton genera are kept as well — confirm that this is intended.
selected_genera = map(first, filter(pair -> last(pair) <= repeat_count, genera_counts))

964-element Vector{String}:
 "Fromanvirus"
 "Gequatrovirus"
 "Cheoctovirus"
 "Skunavirus"
 "Pegunavirus"
 "Tequatrovirus"
 "Timquatrovirus"
 "Pahexavirus"
 "Bixzunavirus"
 "Pbunavirus"
 "Sinsheimervirus"
 "Microvirus"
 "Kostyavirus"
 ⋮
 "Raunefjordenvirus"
 "Podivirus"
 "Saintgironsvirus"
 "Nahantvirus"
 "Yonginvirus"
 "Kajamvirus"
 "Pekhitvirus"
 "Cyclitvirus"
 "Sasvirus"
 "Seongbukvirus"
 "Jalkavirus"
 "Capvunavirus"

In [64]:
# Restrict the metadata table to rows whose genus is one of `selected_genera`.
entity_metadata = entity_metadata_full[in.(entity_metadata_full[!, "Genus"], Ref(selected_genera)), :];

In [65]:
# ezaai_db_dir = mkpath("$(DIR)/ezaai")
# ezaai_jar = joinpath(homedir(), "software", "bin", "ezaai.jar")
# ProgressMeter.@showprogress for row in DataFrames.eachrow(entity_metadata[1:3, :])
#     accession = row["Accession"]
#     descriptive_identifier = row["GenBank_Title"]
#     fasta_file = "$(genomes_dir)/$(accession).fna"
#     db_file = "$(ezaai_db_dir)/$(accession).db"
# #     @show fasta_file
# #     @show db_file
#     if !isfile(db_file)
#         cmd = `java -jar $(ezaai_jar) extract -i $(fasta_file) -o $(db_file) -l $(descriptive_identifier)`
# #         @show cmd
#         run(cmd)
#     end
# end
# run(`java -jar $(ezaai_jar) calculate -i $(ezaai_db_dir) -j $(ezaai_db_dir) -o $(ezaai_db_dir)/aai.tsv`)

In [67]:
# Output folder for the CompareM AAI run, tagged with the count threshold in use.
comparem_aai_dir = "$(DIR)/comparem_aai_count-$(repeat_count)"
mkpath(comparem_aai_dir)

"/home/jupyter-cjprybol/2021-12-20-phylogenetic-determination-AAI/comparem_aai_count-905"

In [68]:
# Write the FASTA path of every selected accession, one per line, to a list
# file, then read the file back so the written paths can be inspected.
genome_list = "$(DIR)/fasta_list.txt"
open(genome_list, "w") do io
    foreach(entity_metadata[!, "Accession"]) do accession
        println(io, "$(genomes_dir)/$(accession).fna")
    end
end
readlines(genome_list)

13172-element Vector{String}:
 "/home/jupyter-cjprybol/2021-12-20-phylogenetic-determination-AAI/genomes/AB002632.1.fna"
 "/home/jupyter-cjprybol/2021-12-20-phylogenetic-determination-AAI/genomes/AB008550.1.fna"
 "/home/jupyter-cjprybol/2021-12-20-phylogenetic-determination-AAI/genomes/AB012573.1.fna"
 "/home/jupyter-cjprybol/2021-12-20-phylogenetic-determination-AAI/genomes/AB012574.1.fna"
 "/home/jupyter-cjprybol/2021-12-20-phylogenetic-determination-AAI/genomes/AB043678.1.fna"
 "/home/jupyter-cjprybol/2021-12-20-phylogenetic-determination-AAI/genomes/AB043679.1.fna"
 "/home/jupyter-cjprybol/2021-12-20-phylogenetic-determination-AAI/genomes/AB045978.2.fna"
 "/home/jupyter-cjprybol/2021-12-20-phylogenetic-determination-AAI/genomes/AB063393.2.fna"
 "/home/jupyter-cjprybol/2021-12-20-phylogenetic-determination-AAI/genomes/AB102868.1.fna"
 "/home/jupyter-cjprybol/2021-12-20-phylogenetic-determination-AAI/genomes/AB231700.1.fna"
 "/home/jupyter-cjprybol/2021-12-20-phylogenetic-determinati

In [None]:
# run(`comparem aai_wf --cpus $(Sys.CPU_THREADS) $(genome_list) $(comparem_aai_dir)`)
# did not finish

[2021-12-21 21:31:27] INFO: CompareM v0.1.2
[2021-12-21 21:31:27] INFO: comparem aai_wf --cpus 2 /home/jupyter-cjprybol/2021-12-20-phylogenetic-determination-AAI/fasta_list.txt /home/jupyter-cjprybol/2021-12-20-phylogenetic-determination-AAI/comparem_aai_count-905
[2021-12-21 21:31:27] INFO: Identifying genes within genomes: 
  Finished processing 8178 of 13172 (62.09%) genomes.

Process(`[4mcomparem[24m [4maai_wf[24m [4m--cpus[24m [4m2[24m [4m/home/jupyter-cjprybol/2021-12-20-phylogenetic-determination-AAI/fasta_list.txt[24m [4m/home/jupyter-cjprybol/2021-12-20-phylogenetic-determination-AAI/comparem_aai_count-905[24m`, ProcessExited(0))