In [1]:
# Identifier for this analysis: an ISO-style date followed by a task label.
DATE_TASK = "2022-02-25-pseudomonas-phylogeny-AA"

# Create the per-task working directory if it is missing, then work from inside it.
DIR = mkpath("$(homedir())/workspace/$DATE_TASK")
cd(DIR)

# Split the identifier into its date prefix and the remaining task label.
date_task_match = match(r"^(\d{4}-\d{2}-\d{2})-(.*)$", DATE_TASK)
DATE, TASK = date_task_match.captures

2-element Vector{Union{Nothing, SubString{String}}}:
 "2022-02-25"
 "pseudomonas-phylogeny-AA"

In [2]:
import Pkg

# NOTE: this upgrades every dependency of the active project on each run, so results
# may change between runs as package versions move.
Pkg.update()

# Packages this notebook needs. Plain entries are registry package names; entries
# containing "://" are Git URLs, optionally followed by "#<rev>" (branch, tag or commit).
pkgs = [
"JSON",
"HTTP",
"Dates",
"uCSV",
"DelimitedFiles",
"DataFrames",
"ProgressMeter",
"BioSequences",
"FASTX",
"Distances",
"Plots",
"StatsPlots",
"StatsBase",
"Statistics",
"Mmap",
"MultivariateStats",
"PyCall",
"Random",
"Primes",
"SparseArrays",
"SHA",
"https://github.com/cjprybol/Mycelia.git#master",
"GenomicAnnotations",
"BioFetch",
"Combinatorics",
"StaticArrays",
"BioSymbols",
"RollingFunctions",
"OrderedCollections",
"Downloads",
"Clustering"
]

"""
    package_spec(entry) -> (name, add_kwargs)

Translate one entry of `pkgs` into the module name to `import` and the keyword
arguments to hand to `Pkg.add` if that import fails.

- A registry name such as `"JSON"` yields `(name = "JSON", add_kwargs = (name = "JSON",))`.
- A Git URL such as `"https://host/org/Foo.git#master"` yields the module name `"Foo"`
  (repository basename without a trailing `.jl` and/or `.git`) together with
  `(url = "https://host/org/Foo.git", rev = "master")`. The `#<rev>` part is split off
  so that it is passed as `rev` instead of being left inside the URL.
"""
function package_spec(entry::AbstractString)
    if !occursin("://", entry)
        return (name = String(entry), add_kwargs = (name = String(entry),))
    end
    parts = split(entry, '#'; limit = 2)
    url = String(parts[1])
    name = String(replace(basename(url), r"(\.jl)?(\.git)?$" => ""))
    add_kwargs = length(parts) == 2 ? (url = url, rev = String(parts[2])) : (url = url,)
    return (name = name, add_kwargs = add_kwargs)
end

# Import every package, installing it first only when the import fails. Resolving the
# module name up front means a URL-based package that is already installed is simply
# imported instead of being re-added on every run.
for pkg in pkgs
    spec = package_spec(pkg)
    import_expr = Meta.parse("import $(spec.name)")
    try
        eval(import_expr)
    catch
        # Best effort: assume the package is missing, install it, and retry. If the
        # import failed for another reason, the retry raises the real error.
        Pkg.add(; spec.add_kwargs...)
        eval(import_expr)
    end
end

[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m    Updating[22m[39m git-repo `https://github.com/cjprybol/Mycelia.git#master`
[32m[1m    Updating[22m[39m `~/git/Mycelia/docs/Project.toml`
 [90m [453d265d] [39m[93m~ Mycelia v0.1.0 `https://github.com/cjprybol/Mycelia.git#master#master` ⇒ v0.1.0 `https://github.com/cjprybol/Mycelia.git#master#master`[39m
[32m[1m    Updating[22m[39m `~/git/Mycelia/docs/Manifest.toml`
 [90m [453d265d] [39m[93m~ Mycelia v0.1.0 `https://github.com/cjprybol/Mycelia.git#master#master` ⇒ v0.1.0 `https://github.com/cjprybol/Mycelia.git#master#master`[39m
[32m[1mPrecompiling[22m[39m project...
[32m  ✓ [39mMycelia
  1 dependency successfully precompiled in 21 seconds (260 already precompiled)
[32m[1m    Updating[22m[39m git-repo `https://github.com/cjprybol/Mycelia.git#master`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/git/Mycelia/docs/Project.toml`
[32m

In [3]:
# NCBI Taxonomy ID used as the root of this analysis. A record can be viewed at
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?&id=$(tax_id)
# after substituting the ID for `$(tax_id)`.
# NOTE(review): 287 looks like the species-level ID for Pseudomonas aeruginosa rather
# than the genus — confirm that this is the intended scope.
root_tax_id = 287

287

In [4]:
# Look up the taxonomy IDs below the root via Mycelia, then append the root ID itself;
# the combined list is used later to filter RefSeq rows by `taxid`.
descendant_tax_ids = Mycelia.taxonomic_id_to_children(root_tax_id)
child_tax_ids = vcat(descendant_tax_ids, root_tax_id)

MATCH (n)<-[*]-(n2) WHERE n.tax_id IS NOT NULL AND n.tax_id = "287" RETURN DISTINCT n2.tax_id AS tax_id


383-element Vector{Int64}:
 1089456
 1084724
 1081927
 1078464
 1051005
 1051003
 1051004
 1009714
 1000561
  990330
  990327
  990321
  990333
       ⋮
 1454223
 1454221
 1454219
 1454222
 1454220
 1454218
 1454216
 1454217
 1457392
 1457194
 1093787
     287

In [5]:
# Load the RefSeq metadata table through Mycelia.
# NOTE(review): the output saved with this cell is
# `LoadError: UndefVarError: DataFrames not defined`, and no later cell has an
# execution count. The cause is not visible from this notebook — confirm whether the
# fix belongs in Mycelia before relying on a fresh run.
refseq_metadata = Mycelia.load_refseq_metadata()

LoadError: UndefVarError: DataFrames not defined

In [None]:
# Print just the first record with every column visible, to inspect the table's columns.
first_record = refseq_metadata[1:1, :]
show(first_record; allcols=true)

In [None]:
# Build one boolean mask per criterion over the rows of `refseq_metadata`, then
# combine them; a row is kept only when every criterion holds.

# Rows whose `genome_rep` is "Full" and whose `excluded_from_refseq` field is empty.
is_full = refseq_metadata[!, "genome_rep"] .== "Full"
not_excluded = refseq_metadata[!, "excluded_from_refseq"] .== ""

# Accepted assembly levels. To widen the selection, append further levels in this
# order: "Chromosome", "Scaffold", "Contig".
assembly_levels = ["Complete Genome"]
assembly_level_filter = map(level -> level in assembly_levels, refseq_metadata[!, "assembly_level"])

# Membership is tested once per metadata row, so look IDs up in a Set instead of
# scanning the whole `child_tax_ids` vector for every row.
child_tax_id_set = Set(child_tax_ids)
tax_id_filter = map(taxid -> taxid in child_tax_id_set, refseq_metadata[!, "taxid"])

full_filter = is_full .& not_excluded .& assembly_level_filter .& tax_id_filter

# Number of rows that pass every criterion.
count(full_filter)

In [None]:
# Keep only the metadata rows selected by the combined boolean mask `full_filter`.
refseq_metadata_of_interest = refseq_metadata[full_filter, :]

In [None]:
# For each file type, download the matching file for every selected metadata row into
# a directory named after the extension (DIR/<extension>/). Files that are already
# present are skipped, so the cell can be re-run to resume.
for extension in ("genomic.fna.gz", "genomic.gff.gz", "protein.faa.gz")
    outdir = mkpath(joinpath(DIR, extension))
    ProgressMeter.@showprogress for row in DataFrames.eachrow(refseq_metadata_of_interest)
        # NOTE(review): `ncbi_ftp_path_to_url` is not defined in any visible cell and,
        # unlike the other helpers used here, is not qualified with `Mycelia.` —
        # confirm where it comes from so the notebook runs on a fresh kernel.
        url = ncbi_ftp_path_to_url(row["ftp_path"], extension)
        outfile = joinpath(outdir, basename(url))
        if !isfile(outfile)
            # Download to a staging path under DIR and rename only once the transfer
            # has finished. An interrupted download then never leaves a truncated file
            # at `outfile` that the `isfile` check would treat as complete, and no
            # partial file ever appears inside `outdir`.
            staging_file = tempname(DIR)
            Downloads.download(url, staging_file)
            mv(staging_file, outfile; force=true)
        end
    end
end

In [None]:
# Select the protein FASTA file type and its directory under DIR for the cells below;
# `mkpath` creates the directory if it does not exist and returns its path.
extension = "protein.faa.gz"
outdir = mkpath(joinpath(DIR, extension))

In [None]:
# Take the first entry of the directory listing (full paths) as a sample file.
downloaded_paths = readdir(outdir; join=true)
fastx_file = first(downloaded_paths)

In [None]:
# k-mer sizes. `aa_k` is the `k` passed to the amino-acid counts table below;
# `dna_k` is not used by any cell visible in this notebook.
# Author's note: these values are too small — all of the within vs. between
# comparisons show some disagreement.
dna_k = 5
aa_k = 2
# Larger candidate values under consideration:
# dna_k = 7
# aa_k = 3

In [None]:
# Full paths of everything in `outdir`, skipping Jupyter's checkpoint directory.
fastx_files = [path for path in readdir(outdir; join=true) if !occursin(".ipynb_checkpoints", path)]

In [None]:
# Build the amino-acid k-mer counts table for all listed files. The call returns the
# table together with the path it was given for `outfile`'s keyword (rebinding `outfile`).
counts_table, outfile = Mycelia.fasta_list_to_counts_table(
    fasta_list = fastx_files,
    k = aa_k,
    alphabet = :AA,
    outfile = "$(outdir).$(aa_k).counts.bin",
)

In [None]:
# Symmetric pairwise distance between the columns of `counts_table`.
# For each pair of columns two components are computed:
#   * size_dist   = 1 - min(total_a, total_b) / max(total_a, total_b), from the column sums
#   * cosine_dist = Distances.cosine_dist between the two columns
# The final distance is the product of whichever components are strictly positive,
# or 0.0 when neither is. A NaN component (e.g. 0/0 when both column sums are zero)
# fails the `> 0` test and is therefore dropped as well.
n_columns = size(counts_table, 2)

# Column sums do not depend on the pairing, so compute each one once up front.
column_totals = [sum(counts_table[:, i]) for i in 1:n_columns]

distance_matrix = zeros(n_columns, n_columns)
for i1 in 1:n_columns
    # The first column of the pair is fixed for the whole inner loop.
    a = counts_table[:, i1]
    sa = column_totals[i1]
    for i2 in i1+1:n_columns
        b = counts_table[:, i2]
        sb = column_totals[i2]
        size_dist = 1 - (min(sa, sb) / max(sa, sb))
        cosine_dist = Distances.cosine_dist(a, b)
        distances = filter(x -> x > 0, (size_dist, cosine_dist))
        dist = isempty(distances) ? 0.0 : prod(distances)
        # Fill both triangles; the diagonal keeps its initial 0.0.
        distance_matrix[i1, i2] = distance_matrix[i2, i1] = dist
    end
end

In [None]:
# Display the pairwise distance matrix for inspection.
distance_matrix

In [None]:
# Hand the distance matrix, one label per input file (its base name), and an output
# path derived from the counts file to Mycelia's Newick helper.
leaf_labels = map(basename, fastx_files)
newick_tree_file = string(outfile, ".newick")
Mycelia.distance_matrix_to_newick(distance_matrix, leaf_labels, newick_tree_file)