In [1]:
TODAY="2021-10-09"
TASK = "NCBI-taxonomy"
DIR = "$(homedir())/$(TODAY)-$(TASK)"
if !isdir(DIR)
    mkdir(DIR)
end
cd(DIR)

In [2]:
import Pkg
pkgs = [
    "BioAlignments",
    "BioSequences",
    "Clustering",
    "CodecZlib",
#     "Colors",
#     "Combinatorics",
    "DataFrames",
    "DataStructures",
    "Dates",
#     "DelimitedFiles",
    "Distances",
    "Distributions",
#     "EzXML",
    "FASTX",
#     "GFF3",
    "GLM",
    "GraphPlot",
#     "HTTP",
#     "Impute",
#     "JSON",
    "Graphs",
    "LsqFit",
#     "LSHFunctions",
#     "Measures",
    "MetaGraphs",
    "https://github.com/cjprybol/Mycelia.git",
    "MultivariateStats",
#     "NumericIO",
    "OnlineStats",
#     "PlotlyJS",
#     "Plots",
    "Primes",
#     "Printf",
    "ProgressMeter",
    "Random",
    "Revise",
    "SparseArrays",
    "Statistics",
    "StatsBase",
    "StatsPlots",
#     "StringDistances",
    "uCSV",
#     "XLSX",
    "RollingFunctions",
]

unregistered_packages = filter(pkg -> occursin(r"(^https|git$)", pkg), pkgs)
registered_packages = setdiff(pkgs, unregistered_packages)

for pkg in registered_packages
    try
        eval(Meta.parse("import $(pkg)"))
    catch
        Pkg.add(pkg)
        Pkg.build(pkg)
        eval(Meta.parse("import $(pkg)"))
    end
end

for pkg_url in unregistered_packages
    pkg_name = replace(basename(pkg_url), ".git" => "")
    try
        eval(Meta.parse("import $(pkg_name)"))
    catch
        Pkg.develop(url=pkg_url)
        Pkg.build(pkg_name)
        eval(Meta.parse("import $(pkg_name)"))
    end
end

In [None]:
# 2021-05-12T16:16:17 jovyan@jupyter-cameron-2eprybol:~/2021-05-12-staph-phage-pangenome/taxdump
# $ ll
# total 373M
# -rw-r--r-- 1 jovyan  18M May 12 15:28 citations.dmp
# -rw-r--r-- 1 jovyan 4.1M May 12 15:26 delnodes.dmp
# -rw-r--r-- 1 jovyan  452 May 12 15:20 division.dmp
# -rw-r--r-- 1 jovyan  17K May 12 15:28 gc.prt
# -rw-r--r-- 1 jovyan 4.9K May 12 15:20 gencode.dmp
# -rw-r--r-- 1 jovyan 1.2M May 12 15:26 merged.dmp
# -rw-r--r-- 1 jovyan 198M May 12 15:28 names.dmp
# -rw-r--r-- 1 jovyan 153M May 12 15:28 nodes.dmp
# -rw-r----- 1 jovyan 2.7K Sep 11  2019 readme.txt

In [3]:
taxdump_url = "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
taxdump_local_tarball = "$(DIR)/$(basename(taxdump_url))"
if !isfile(taxdump_local_tarball)
    download(taxdump_url, taxdump_local_tarball)
end

"/home/jovyan/2021-10-09-NCBI-taxonomy/taxdump.tar.gz"

In [4]:
taxdump_out = replace(taxdump_local_tarball, ".tar.gz" => "")
if !isdir(taxdump_out)
    mkpath(taxdump_out)
    run(`tar -xvzf $(taxdump_local_tarball) -C $(taxdump_out)`)
end

citations.dmp
delnodes.dmp
division.dmp
gencode.dmp
merged.dmp
names.dmp
nodes.dmp
gc.prt
readme.txt


Process(`[4mtar[24m [4m-xvzf[24m [4m/home/jovyan/2021-10-09-NCBI-taxonomy/taxdump.tar.gz[24m [4m-C[24m [4m/home/jovyan/2021-10-09-NCBI-taxonomy/taxdump[24m`, ProcessExited(0))

In [5]:
readdir(taxdump_out)

9-element Vector{String}:
 "citations.dmp"
 "delnodes.dmp"
 "division.dmp"
 "gc.prt"
 "gencode.dmp"
 "merged.dmp"
 "names.dmp"
 "nodes.dmp"
 "readme.txt"

In [6]:
# Taxonomy names file (names.dmp):
# 	tax_id					-- the id of node associated with this name
# 	name_txt				-- name itself
# 	unique name				-- the unique variant of this name if name not unique
# 	name class				-- (synonym, common name, ...)

names_dmp = DataFrames.DataFrame(
    tax_id = Int[],
    name_txt = String[],
    unique_name = String[],
    name_class = String[]
)
ProgressMeter.@showprogress for line in split(read(open("$(taxdump_out)/names.dmp"), String), "\t|\n")
    if isempty(line)
        continue
    else
        (tax_id_string, name_txt, unique_name, name_class) = split(line, "\t|\t")
        tax_id = parse(Int, tax_id_string)
        row = (;tax_id, name_txt, unique_name, name_class)
        push!(names_dmp, row)
    end
end
names_dmp

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:27[39m39m


Unnamed: 0_level_0,tax_id,name_txt
Unnamed: 0_level_1,Int64,String
1,1,all
2,1,root
3,2,Bacteria
4,2,bacteria
5,2,eubacteria
6,2,Monera
7,2,Procaryotae
8,2,Prokaryotae
9,2,Prokaryota
10,2,prokaryote


In [7]:
unique_tax_ids = unique(names_dmp[!, "tax_id"])

2368832-element Vector{Int64}:
       1
       2
       6
       7
       9
      10
      11
      13
      14
      16
      17
      18
      19
       ⋮
 2881377
 2881378
 2881379
 2881380
 2881381
 2881383
 2881384
 2881385
 2881426
 2881427
 2881428
 2883098

In [8]:
ncbi_taxonomy = MetaGraphs.MetaDiGraph(length(unique_tax_ids))
ProgressMeter.@showprogress for (index, group) in enumerate(collect(DataFrames.groupby(names_dmp, "tax_id")))
    MetaGraphs.set_prop!(ncbi_taxonomy, index, :tax_id, group[1, "tax_id"])
    for row in DataFrames.eachrow(group)
        unique_name = isempty(row["unique_name"]) ? row["name_txt"] : row["unique_name"]
        # remove quotes since neo4j doesn't like them
        unique_name = replace(unique_name, '"' => "")
        # replace spaces and dashes with underscores
        name_class = Symbol(replace(replace(row["name_class"], r"\s+" => "-"), "-" => "_"))
#         name_class = Symbol(row["name_class"])
        if haskey(MetaGraphs.props(ncbi_taxonomy, index), name_class)
            current_value = MetaGraphs.get_prop(ncbi_taxonomy, index, name_class)
            if (current_value isa Array) && !(unique_name in current_value)
                new_value = [current_value..., unique_name]
                MetaGraphs.set_prop!(ncbi_taxonomy, index, name_class, new_value)
            elseif !(current_value isa Array) && (current_value != unique_name)
                new_value = [current_value, unique_name]
                MetaGraphs.set_prop!(ncbi_taxonomy, index, name_class, new_value)
            else
                continue
            end
        else
            MetaGraphs.set_prop!(ncbi_taxonomy, index, name_class, unique_name)
        end
    end
end

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:48[39m39m


In [13]:
divisions = Dict()
for line in split(read(open("$(taxdump_out)/division.dmp"), String), "\t|\n")
    if !isempty(line)
        (id_string, shorthand, full_name, notes) = split(line, "\t|\t")
        id = parse(Int, id_string)
        divisions[id] = Dict(:division_cde => shorthand, :division_name => full_name)
    end
end
divisions

Dict{Any, Any} with 12 entries:
  5  => Dict{Symbol, SubString{String}}(:division_name=>"Primates", :division_c…
  8  => Dict{Symbol, SubString{String}}(:division_name=>"Unassigned", :division…
  1  => Dict{Symbol, SubString{String}}(:division_name=>"Invertebrates", :divis…
  0  => Dict{Symbol, SubString{String}}(:division_name=>"Bacteria", :division_c…
  6  => Dict{Symbol, SubString{String}}(:division_name=>"Rodents", :division_cd…
  11 => Dict{Symbol, SubString{String}}(:division_name=>"Environmental samples"…
  9  => Dict{Symbol, SubString{String}}(:division_name=>"Viruses", :division_cd…
  3  => Dict{Symbol, SubString{String}}(:division_name=>"Phages", :division_cde…
  7  => Dict{Symbol, SubString{String}}(:division_name=>"Synthetic and Chimeric…
  4  => Dict{Symbol, SubString{String}}(:division_name=>"Plants and Fungi", :di…
  2  => Dict{Symbol, SubString{String}}(:division_name=>"Mammals", :division_cd…
  10 => Dict{Symbol, SubString{String}}(:division_name=>"Vertebrates", :divis

In [14]:
node_2_taxid_map = map(index -> ncbi_taxonomy.vprops[index][:tax_id], Graphs.vertices(ncbi_taxonomy))

2368832-element Vector{Int64}:
       1
       2
       6
       7
       9
      10
      11
      13
      14
      16
      17
      18
      19
       ⋮
 2881377
 2881378
 2881379
 2881380
 2881381
 2881383
 2881384
 2881385
 2881426
 2881427
 2881428
 2883098

In [15]:
ProgressMeter.@showprogress for line in split(read(open("$(taxdump_out)/nodes.dmp"), String), "\t|\n")
    if isempty(line)
        continue
    else
        (tax_id_string, parent_tax_id_string, rank, embl_code, division_id_string) = split(line, "\t|\t")
        
        
        division_id = parse(Int, division_id_string)
        
        tax_id = parse(Int, tax_id_string)
        lightgraphs_tax_ids = searchsorted(node_2_taxid_map, tax_id)
        @assert length(lightgraphs_tax_ids) == 1
        lightgraphs_tax_id = first(lightgraphs_tax_ids)
        
        parent_tax_id = parse(Int, parent_tax_id_string)
        lightgraphs_parent_tax_ids = searchsorted(node_2_taxid_map, parent_tax_id)
        @assert length(lightgraphs_parent_tax_ids) == 1
        lightgraphs_parent_tax_id = first(lightgraphs_parent_tax_ids)
        
        Graphs.add_edge!(ncbi_taxonomy, lightgraphs_tax_id, lightgraphs_parent_tax_id)
        MetaGraphs.set_prop!(ncbi_taxonomy, lightgraphs_tax_id, :rank, rank)
        # these should probably be broken out as independent nodes!
        MetaGraphs.set_prop!(ncbi_taxonomy, lightgraphs_tax_id, :division_id, division_id)
        MetaGraphs.set_prop!(ncbi_taxonomy, lightgraphs_tax_id, :division_cde, divisions[division_id][:division_cde])
        MetaGraphs.set_prop!(ncbi_taxonomy, lightgraphs_tax_id, :division_name, divisions[division_id][:division_name])
    end
end

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:01:16[39m


In [18]:
ncbi_taxonomy.vprops[3]

Dict{Symbol, Any} with 7 entries:
  :tax_id          => 6
  :division_id     => 0
  :division_name   => "Bacteria"
  :scientific_name => "Azorhizobium"
  :rank            => "genus"
  :authority       => "Azorhizobium Dreyfus et al. 1988 emend. Lang et al. 2013"
  :division_cde    => "BCT"

In [29]:
# this file got to 50Gb before I killed the job
# don't write out, just always rebuild from disk
# MetaGraphs.savemg("ncbi-taxonomy.metagraph-jl", ncbi_taxonomy)

In [26]:
rank_counts =
    sort(
        collect(
            StatsBase.countmap(
                ncbi_taxonomy.vprops[v][:rank] for v in Graphs.vertices(ncbi_taxonomy))),
        by=x->x[2])
for rank_count in rank_counts
    println(rank_count)
end

Pair{Any, Int64}("subkingdom", 1)
Pair{Any, Int64}("superphylum", 1)
Pair{Any, Int64}("subcohort", 3)
Pair{Any, Int64}("superkingdom", 4)
Pair{Any, Int64}("pathogroup", 5)
Pair{Any, Int64}("cohort", 5)
Pair{Any, Int64}("superclass", 6)
Pair{Any, Int64}("series", 9)
Pair{Any, Int64}("morph", 12)
Pair{Any, Int64}("kingdom", 13)
Pair{Any, Int64}("biotype", 17)
Pair{Any, Int64}("infraclass", 18)
Pair{Any, Int64}("genotype", 20)
Pair{Any, Int64}("subsection", 21)
Pair{Any, Int64}("parvorder", 26)
Pair{Any, Int64}("subphylum", 32)
Pair{Any, Int64}("superorder", 54)
Pair{Any, Int64}("species subgroup", 127)
Pair{Any, Int64}("infraorder", 130)
Pair{Any, Int64}("serogroup", 140)
Pair{Any, Int64}("subclass", 163)
Pair{Any, Int64}("phylum", 290)
Pair{Any, Int64}("species group", 339)
Pair{Any, Int64}("suborder", 374)
Pair{Any, Int64}("class", 447)
Pair{Any, Int64}("section", 476)
Pair{Any, Int64}("subtribe", 574)
Pair{Any, Int64}("forma", 597)
Pair{Any, Int64}("forma specialis", 737)
Pair{Any, In

In [None]:
Pair{Any, Int64}("kingdom", 13)
Pair{Any, Int64}("phylum", 290)
Pair{Any, Int64}("order", 1701)
Pair{Any, Int64}("family", 9654)
Pair{Any, Int64}("genus", 100737)
Pair{Any, Int64}("species", 1933524)

In [None]:
# k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Clostridiaceae|g__Clostridium|s__Clostridium_leptum 0.30871

In [None]:
# sorted list of kingdoms
# sorted list of phylums
# sorted list of orders
# families
# genera
# species

In [None]:
# go from sorted list and n samples to sparse matrices for each level