# Initialize workspace

In [1]:
DATE_TASK = "2022-05-14-ncbi-viral-pangenome"
DIR = mkpath("$(homedir())/workspace/$DATE_TASK")
cd(DIR)
DATE, TASK = match(r"^(\d{4}-\d{2}-\d{2})-(.*)$", DATE_TASK).captures

2-element Vector{Union{Nothing, SubString{String}}}:
 "2022-05-14"
 "ncbi-viral-pangenome"

# Import packages

In [2]:
import Pkg
Pkg.update()
pkgs = [
"JSON",
"HTTP",
"Dates",
"uCSV",
"DelimitedFiles",
"DataFrames",
"ProgressMeter",
"BioSequences",
"FASTX",
"Distances",
"StatsPlots",
"StatsBase",
"Statistics",
"MultivariateStats",
"Random",
"Primes",
"SparseArrays",
"SHA",
"GenomicAnnotations",
"Combinatorics",
"OrderedCollections",
"Downloads",
"Clustering",
"Revise",
"Mmap",
"Graphs",
"MetaGraphs",
"FileIO",
"SHA",
"MD5"
]

for pkg in pkgs
    try
        eval(Meta.parse("import $pkg"))
    catch
        Pkg.add(pkg)
        eval(Meta.parse("import $pkg"))
    end
end

# works but can't update locally, need to push and restart kernel to activate changes
# "https://github.com/cjprybol/Mycelia.git#master",
# didn't work
# "$(homedir())/git/Mycelia#master",
pkg_path = "$(homedir())/git/Mycelia"
try
    eval(Meta.parse("import $(basename(pkg_path))"))
catch
    # Pkg.add(url=pkg)
    Pkg.develop(path=pkg_path)
    # pkg = replace(basename(pkg), ".git#master" => "")
    # pkg = replace(basename(pkg), "#master" => "")
    eval(Meta.parse("import $(basename(pkg_path))"))
end

[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m    Updating[22m[39m git-repo `https://github.com/cjprybol/Mycelia.git#master`
[32m[1m   Installed[22m[39m StatsPlots ───────── v0.14.34
[32m[1m   Installed[22m[39m DataStructures ───── v0.18.12
[32m[1m   Installed[22m[39m FileIO ───────────── v1.14.0
[32m[1m   Installed[22m[39m Plots ────────────── v1.29.0
[32m[1m   Installed[22m[39m Distributions ────── v0.25.58
[32m[1m   Installed[22m[39m InverseFunctions ─── v0.1.4
[32m[1m   Installed[22m[39m StructArrays ─────── v0.6.7
[32m[1m   Installed[22m[39m LogExpFunctions ──── v0.3.15
[32m[1m   Installed[22m[39m DataFrames ───────── v1.3.4
[32m[1m   Installed[22m[39m GraphPlot ────────── v0.5.1
[32m[1m   Installed[22m[39m ChangesOfVariables ─ v0.1.3
[32m[1m   Installed[22m[39m Documenter ───────── v0.27.17
[32m[1m   Installed[22m[39m OffsetArrays ─────── v1.11.0
[32m[1m   Installed[22m[39m Widgets ────

# Declare global parameters

In [3]:
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?&id=$(tax_id)
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?lvl=0&amp;id=2733124
# root_tax_id = 2733124

root_tax_id = 10239
# host = "Escherichia"
database_id = "41a45c7d"

"41a45c7d"

In [None]:
# NEO4J_BIN_DIR = "/home/jupyter-cjprybol/software/neo4j-community-4.4.3/bin"
# if !occursin(NEO4J_BIN_DIR, ENV["PATH"])
#     ENV["PATH"] = "$(NEO4J_BIN_DIR):" * ENV["PATH"]
# end
USERNAME="neo4j"
# PASSWORD=readline(joinpath(homedir(), ".config", "neo4j", "$(database_id).pass"));
PASSWORD="PtvnGnnS58elLLVsGwfeovOLi5xdfPe0hTMrwa_WtbM"
ADDRESS="neo4j+s://$(database_id).databases.neo4j.io:7687"
DATABASE = "neo4j"

"neo4j"

In [12]:
function taxonomic_id_to_children(tax_id; DATABASE_ID, USERNAME, PASSWORD)
    DATABASE = "neo4j"
    ADDRESS="neo4j+s://$(database_id).databases.neo4j.io:7687"
    
    # NOTE! *, or 0 distance (e.g. [*0..2]) step range will include source node!!!!
    # cmd = "MATCH (n)<-[*]-(n2) WHERE n.tax_id IS NOT NULL AND n.tax_id = \"$(tax_id)\" RETURN DISTINCT n2.tax_id AS tax_id"
    cmd = "MATCH (n)-[*]->(n2) WHERE n.tax_id IS NOT NULL AND n.tax_id = \"$(tax_id)\" RETURN DISTINCT n2.tax_id AS tax_id"
    println(cmd)
    
    cypher = Mycelia.cypher(cmd, address=ADDRESS, username = USERNAME, password = PASSWORD, database = DATABASE)
    tax_ids = readlines(open(cypher))[2:end]
    tax_ids = strip.(tax_ids, '"')
    tax_ids = parse.(Int, tax_ids)
    return unique(tax_ids)
end

taxonomic_id_to_children (generic function with 1 method)

In [13]:
# taxonomic_id_to_children(root_tax_id, DATABASE_ID=database_id, USERNAME=USERNAME, PASSWORD=PASSWORD)

MATCH (n)-[*]->(n2) WHERE n.tax_id IS NOT NULL AND n.tax_id = "10239" RETURN DISTINCT n2.tax_id AS tax_id


227386-element Vector{Int64}:
 2840056
 2840070
 1511852
 2842693
 2509616
 1529056
 2842638
 1546257
 1511854
   29250
 1128424
 1511853
 2844104
       ⋮
   12404
   12403
   12392
   12388
   12386
   12375
   12374
   12371
   12366
   12347
   12340
   28368

In [23]:
# TODO
# here is where we should apply a filter where host == Escherichia
# need to load host information into neo4j taxonomy

child_tax_ids = vcat(taxonomic_id_to_children(root_tax_id, DATABASE_ID=database_id, USERNAME=USERNAME, PASSWORD=PASSWORD), root_tax_id)
# child_tax_ids = vcat(child_tax_ids, root_tax_id)

# # refseq_metadata = Mycelia.load_refseq_metadata()
ncbi_metadata = Mycelia.load_genbank_metadata()
# ncbi_metadata = Mycelia.load_refseq_metadata()
# show(ncbi_metadata[1:1, :], allcols=true)
tax_id_filter = map(taxid -> taxid in child_tax_ids, ncbi_metadata[!, "taxid"])
# is_right_host = map(x -> occursin(Regex(host, "i"), x), ncbi_metadata[!, "organism_name"])

MATCH (n)-[*]->(n2) WHERE n.tax_id IS NOT NULL AND n.tax_id = "10239" RETURN DISTINCT n2.tax_id AS tax_id


1273151-element Vector{Bool}:
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 ⋮
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0

In [24]:
not_excluded = ncbi_metadata[!, "excluded_from_refseq"] .== ""
is_full = ncbi_metadata[!, "genome_rep"] .== "Full"
# assembly_levels = ["Complete Genome"]
assembly_levels = ["Complete Genome", "Chromosome"]
# assembly_levels = ["Complete Genome", "Chromosome", "Scaffold"]
# assembly_levels = ["Complete Genome", "Chromosome", "Scaffold", "Contig"]
assembly_level_filter = map(x -> x in assembly_levels, ncbi_metadata[!, "assembly_level"])
full_filter = is_full .& not_excluded .& assembly_level_filter .& tax_id_filter
@show count(full_filter)

# 11403 for refseq
# 43440 for genbank

count(full_filter) = 43440


43440

In [25]:
indices = findall(full_filter)
subset_n=0
if subset_n != 0
    indices = StatsBase.sample(indices, subset_n)
end

ncbi_metadata_of_interest = ncbi_metadata[indices, :]

Unnamed: 0_level_0,# assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid
Unnamed: 0_level_1,String,String,String,String,String,Int64
1,GCA_000529295.1,PRJEB4178,SAMEA3138851,,na,10454
2,GCA_000529585.1,PRJEB4181,SAMEA3138859,,na,10454
3,GCA_000530085.1,PRJEB4176,SAMEA3138873,,na,10454
4,GCA_000530135.1,PRJEB4177,SAMEA3138875,,na,10454
5,GCA_000530235.2,PRJEB4180,SAMEA3138878,,na,10454
6,GCA_000530255.2,PRJEB4182,SAMEA3138879,,na,10454
7,GCA_000689295.1,PRJEB4179,SAMEA3139003,,na,10454
8,GCA_000819615.1,,,,na,2886930
9,GCA_000820355.1,,,,na,191289
10,GCA_000820495.2,,,,na,518987


## Acquire pangenome input files

In [26]:
# can I also get genbank record?????
# for extension in ["genomic.fna.gz", "protein.faa.gz"]
# for extension in ["genomic.fna.gz", "protein.faa.gz", "genomic.gbff.gz"]
# for extension in ["genomic.gbff.gz"]
# for extension in ["protein.faa.gz"]
# for extension in ["genomic.fna.gz", "genomic.gbff.gz"]
for extension in ["genomic.fna.gz"]
    outdir = mkpath(joinpath(DIR, extension))
    ProgressMeter.@showprogress for row in DataFrames.eachrow(ncbi_metadata_of_interest)
        url = Mycelia.ncbi_ftp_path_to_url(row["ftp_path"], extension)
        outfile = joinpath(outdir, basename(url))
        if !isfile(outfile)
            try
                Downloads.download(url, outfile)
            catch e
                # @show e
                showerror(stdout, e)
                # @assert extension == "protein.faa.gz"
                # here is where we should call prodigal to fill in protein annotations if we don't otherwise see them
            end
        end
    end
end

[32mProgress:  83%|█████████████████████████████████▉       |  ETA: 0:07:42[39m

Could not resolve host: na while requesting na/na_genomic.fna.gz

[32mProgress:  99%|████████████████████████████████████████▌|  ETA: 0:00:33[39m

Could not resolve host: na while requesting na/na_genomic.fna.gz

[32mProgress:  99%|████████████████████████████████████████▌|  ETA: 0:00:33[39m

Could not resolve host: na while requesting na/na_genomic.fna.gz

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:43:20[39m
