In [None]:
import Pkg

# Work inside the project environment located in the current directory.
Pkg.activate(".")

# Upgrading every dependency at once is disruptive; enable only on purpose.
# Pkg.update()

# Packages that are added to the active environment and then imported by name.
pkgs = [
    "ArgParse",
    "Base64",
    "BioSequences",
    "DataFrames",
    "Dates",
    "DelimitedFiles",
    "FASTX",
    "GLM",
    "HTTP",
    "JSON",
    "Graphs",
    "MetaGraphs",
    "MD5",
    "Statistics",
    "StatsPlots",
    "uCSV",
    "CodecZlib",
    "YAML",
    "Revise",
    "Kmers",
    "StatsBase",
]
Pkg.add(pkgs)

# `import` (not `using`) each package, so every call below stays module-qualified.
# The import expression is built from a Symbol instead of parsing a string.
for package_name in pkgs
    @eval import $(Symbol(package_name))
end

# Mycelia is tracked from a local checkout during development;
# point this at the remote repository upon deployment.
Pkg.develop(path="$(homedir())/workspace/Mycelia")
# A specific git revision can be pinned instead:
# Pkg.add(url="https://github.com/cjprybol/Mycelia", rev="master")
import Mycelia

In [None]:
# Load the run settings from `config.yaml` (path is relative to the working directory).
# Later cells read and add entries on this `config` object by string key.
config = YAML.load_file("config.yaml")

In [None]:
# The start time is pinned to a fixed value here rather than taken from the clock.
# The commented-out line below shows how a fresh timestamp would be generated:
# the current date-time in ISO format with all non-word characters stripped.
config["start time"] = "20230215T175842538"
# config["start time"] = replace(Dates.format(Dates.now(), Dates.ISODateTimeFormat), r"[^\w]" => "")

In [None]:
# The git hash is pinned to a fixed 8-character value here.
# The commented-out line below shows how it would be derived instead:
# the first 8 characters of `git rev-parse HEAD` for the current checkout.
config["githash"] = "3bd2daa8"
# config["githash"] = rstrip(read(`git rev-parse HEAD`, String))[1:8]

In [None]:
# The annotation run identifier is the assembly run identifier, the start time,
# and the git hash, glued together with a double underscore.
config["annotation run identifier"] = join(
    map(key -> config[key], ("assembly run identifier", "start time", "githash")),
    "__",
)

In [None]:
# Create the output directory named after the annotation run identifier
# (no error if it already exists); `mkpath` returns the path, which is kept in OUT_DIR.
OUT_DIR = mkpath(config["annotation run identifier"])

In [None]:
# Pull the assembly tarball for this run and unpack it into the working directory.
#
# The assembly run identifier is split on "__" into four fields:
#   sequencing run, sample, assembly start time, assembly hash.
# The first two fields select the remote folder the tarball is stored in.
sequencing_run_identifier, sample_identifier, assembly_start_time, assembly_hash = split(config["assembly run identifier"], "__")
remote_assembly_tarball = "$(config["remote"])/Assemblies/$(sequencing_run_identifier)/$(sample_identifier)/$(config["assembly run identifier"]).tar.gz"
local_assembly_tarball = basename(remote_assembly_tarball)
# Later cells look for assembly files inside a folder named like the tarball minus ".tar.gz".
local_assembly_folder = replace(local_assembly_tarball, ".tar.gz" => "")
if !isfile(local_assembly_tarball)
    # Nothing cached locally: download, then unpack.
    run(`rclone copy $(remote_assembly_tarball) .`)
    run(`tar -xf $(local_assembly_tarball)`)
elseif !isdir(local_assembly_folder)
    # The tarball is already here but was never unpacked (e.g. an earlier run was
    # interrupted, or the folder was deleted): unpack it instead of silently skipping.
    run(`tar -xf $(local_assembly_tarball)`)
end

In [None]:
# consider using GFA to review topology of each contig and then annotating circular vs linear contigs in separate prodigal batches

In [None]:
# Path to the `.final.contigs.fastg.gfa.fna` file expected inside the unpacked assembly folder.
assembled_fasta = joinpath(
    local_assembly_folder,
    string(config["assembly run identifier"], ".final.contigs.fastg.gfa.fna"),
)

In [None]:
# @info "normalizing fasta file"
# normalized_fasta_file = normalize_fasta("$OUT_DIR/raw_fasta/$ID.fasta", OUT_DIR)

# ORF calling

In [None]:
# @info "running prodigal"
# Run prodigal on the assembled contigs; the call returns the directory holding its outputs.
prodigal_directory = Mycelia.run_prodigal(out_dir=OUT_DIR, fasta_file=assembled_fasta)
# Build the three expected output paths (nucleotide FASTA, protein FASTA, GFF) in one pass.
# NOTE(review): assumes run_prodigal names its outputs after the annotation run identifier — confirm.
nucleic_acid_fasta, amino_acid_fasta, gff = map(
    extension -> joinpath(prodigal_directory, config["annotation run identifier"] * ".prodigal." * extension),
    ("fna", "faa", "gff"),
)

# General Annotation

In [None]:
# refseq_protein -> blastp
# https://blast.ncbi.nlm.nih.gov/Blast.cgi
@info "running blastp"
blastdb_dir = "$(homedir())/blastdb"
blast_db = "refseq_protein"
# Make sure the database is available before searching. Three cases:
#   1. no blastdb folder at all    -> download from GCP
#   2. folder holds a matching db  -> reuse it
#   3. folder exists, db missing   -> download from NCBI into the folder
if !isdir(blastdb_dir)
    # we're probably on a cloud build
    Mycelia.download_blast_db(db=blast_db, outdir=blastdb_dir, source="gcp")
elseif any(entry -> occursin(blast_db, entry), readdir(blastdb_dir))
    @info "blast db detected, using existing"
else
    @info "blast db not detected but folder found, using NCBI download to add to existing database"
    # ~ 1.5hours to download from ncbi
    Mycelia.download_blast_db(db=blast_db, outdir=blastdb_dir, source="ncbi")
end
# Recorded timing of a previous blastp run:
# 14332.578486 seconds (435.48 k allocations: 13.843 MiB)
# 233 minutes = 3.8 hours
blast_dir = Mycelia.run_blast(
    out_dir=OUT_DIR,
    fasta=amino_acid_fasta,
    blast_db="$(blastdb_dir)/$(blast_db)",
    blast_command="blastp",
)

In [None]:
# consider exporting refseq protein to diamond and then running diamond to save runtime
#
# Shell commands for preparing and querying a diamond database, kept for reference:
#
# ./diamond prepdb -d nr
#
# wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.FULL.gz
# blastdbcmd -db refseq_protein -entry all | diamond makedb --db refseq_protein.diamond --taxonmap prot.accession2taxid.FULL.gz
#
# ./diamond blastp -d nr -q queries.fasta -o matches.tsv

# Diamond database path inside the blast database folder.
diamond_db = joinpath(blastdb_dir, "refseq_protein.diamond")

# Search the predicted proteins against the diamond database; results go under OUT_DIR.
Mycelia.run_diamond(
    identifier=config["annotation run identifier"],
    out_dir=OUT_DIR,
    protein_fasta=amino_acid_fasta,
    diamond_db=diamond_db,
)

In [None]:
# Mmseqs +
# https://github.com/soedinglab/MMseqs2
# run(`mamba install -c bioconda mmseqs2`)

In [None]:
# Mycelia.download_mmseqs_db(db="NR")
# Mycelia.download_mmseqs_db(db="NT")

In [None]:
# amino acid
# ~ 5 hour download time may not be tolerable?
# processing
# 7930.350543 seconds (710.71 k allocations: 40.309 MiB, 0.00% gc time, 0.00% compilation time)
# 2 hours 12m
# 1.5 hours for iterative search with varying sensitivity
let db_name = "UniRef100"
    # Fetch the database, then search the predicted proteins against it.
    Mycelia.download_mmseqs_db(db=db_name)
    # mmseqs databases --compressed 1 --remove-tmp-files 1 --force-reuse 1 UniRef100 $HOME/mmseqs/UniRef100 $HOME/mmseqs/tmp

    Mycelia.run_mmseqs_easy_search(
        out_dir=OUT_DIR,
        query_fasta=amino_acid_fasta,
        target_database="$(homedir())/mmseqs/$(db_name)",
        outfile=config["annotation run identifier"],
    )
end
# add taxonomic information to uniref classifications above!

In [None]:
# 42.0M
# done
let db_name = "PDB"
    # Fetch the database, then search the predicted proteins against it.
    Mycelia.download_mmseqs_db(db=db_name)
    # mmseqs databases --compressed 1 --remove-tmp-files 1 PDB $HOME/mmseqs/PDB $HOME/mmseqs/tmp

    # good names!
    # This search is the only one that passes force=true.
    Mycelia.run_mmseqs_easy_search(
        out_dir=OUT_DIR,
        query_fasta=amino_acid_fasta,
        target_database="$(homedir())/mmseqs/$(db_name)",
        outfile=config["annotation run identifier"],
        force=true,
    )
end

In [None]:
# 334M
# done
let db_name = "CDD"
    # Fetch the database, then search the predicted proteins against it.
    Mycelia.download_mmseqs_db(db=db_name)
    # mmseqs databases --compressed 1 --remove-tmp-files 1 CDD $HOME/mmseqs/CDD $HOME/mmseqs/tmp

    # not useful names
    Mycelia.run_mmseqs_easy_search(
        out_dir=OUT_DIR,
        query_fasta=amino_acid_fasta,
        target_database="$(homedir())/mmseqs/$(db_name)",
        outfile=config["annotation run identifier"],
    )
end

# wget https://ftp.ncbi.nih.gov/pub/mmdb/cdd/cddid_all.tbl.gz
# gzip -d cddid_all.tbl.gz
# 

# ===============================================================================
# cddid_all.tbl.gz
# =============================================================================== 

# "cddid_all.tbl.gz" contains summary information about all CD models in
# this distribution. This is a tab-delimited text file, with a single row per CD 
# model and the following columns:

#  PSSM-Id (unique numerical identifier)
#  CD accession (starting with 'cd', 'pfam', 'smart', 'COG', 'PRK', 'CHL', 'KOG',
#                or 'LOAD')
#  CD "short name"
#  CD description
#  PSSM-Length (number of columns, the size of the search model)

#         (Scope A: this file includes data from ALL CD models; 
#         see section on "SCOPE OF DATA in FTP FILES" for details)