In [None]:
import Pkg
Pkg.activate(".")

# not for the faint of heart!
# Pkg.update()

# Every package this notebook relies on. Each one is added to the active
# project and then imported by name in the loop below.
pkgs = [
    "ArgParse",
    "Base64",
    "BioSequences",
    "DataFrames",
    "Dates",
    "DelimitedFiles",
    "FASTX",
    "GLM",
    "HTTP",
    "JSON",
    "Graphs",
    "MetaGraphs",
    "MD5",
    "Statistics",
    "StatsPlots",
    "uCSV",
    "CodecZlib",
    "YAML",
    "Revise",
    "Kmers",
    "StatsBase",
]
Pkg.add(pkgs)

# `import` needs a literal module name, so splice each name in as a Symbol and
# evaluate the resulting statement at top level.
for name in pkgs
    @eval import $(Symbol(name))
end

import Mycelia

In [None]:
# Run configuration, read from config.yaml in the working directory.
config = "config.yaml" |> YAML.load_file

In [None]:
# Stamp the run with the current time, keeping only word characters of the ISO
# timestamp. A fixed value can be assigned instead, e.g.:
# config["start time"] = "20230206T171658393"
config["start time"] = let stamp = Dates.format(Dates.now(), Dates.ISODateTimeFormat)
    replace(stamp, r"[^\w]" => "")
end

In [None]:
# Record the first 8 characters of the current git HEAD commit hash.
# A fixed value can be assigned instead, e.g.:
# config["githash"] = "de06764d"
config["githash"] = let head = read(`git rev-parse HEAD`, String)
    rstrip(head)[1:8]
end

In [None]:
# Run identifier: sequencing run, sample, start time and git hash joined by "__".
config["assembly run identifier"] = join(
    [
        config[key] for key in
        ("sequencing run identifier", "sample identifier", "start time", "githash")
    ],
    "__",
)

In [None]:
# Output directory for this run, named after the assembly run identifier and
# created if it does not exist yet.
OUT_DIR = config["assembly run identifier"] |> mkpath

In [None]:
# Location of the sample's read archive under the configured remote:
# <remote>/<sequencing run identifier>/<sample identifier>.tar.gz
remote_fastq_tar = joinpath(
    config["remote"],
    config["sequencing run identifier"],
    config["sample identifier"] * ".tar.gz",
)
# File name the archive has once copied into the working directory.
local_fastq_tar = basename(remote_fastq_tar)

In [None]:
# Collects the key results of the run; later cells add entries under string keys.
results = Dict{Any,Any}()

In [None]:
# Paths of the trimmed paired-end read files:
# <sample identifier>/<sample identifier>_L001_R{1,2}_001_val_{1,2}.fq.gz
TRIMMED_FORWARD, TRIMMED_REVERSE = let sample = config["sample identifier"]
    (
        joinpath(sample, sample * "_L001_R1_001_val_1.fq.gz"),
        joinpath(sample, sample * "_L001_R2_001_val_2.fq.gz"),
    )
end

In [None]:
# Fetch and unpack the sample's read archive unless BOTH trimmed read files are
# already on disk. The check must use `||`: with `&&` a single missing file
# (e.g. after an interrupted extraction) skipped the download and the next cell
# then failed trying to read it.
if !isfile(TRIMMED_FORWARD) || !isfile(TRIMMED_REVERSE)
    run(`rclone --retries 5 copy $(remote_fastq_tar) .`)
    extracted_tar_directory = replace(local_fastq_tar, ".tar.gz" => "")
    # The whole archive is extracted. To extract only the two read files instead:
    # run(`tar vzxf $(local_fastq_tar) $(TRIMMED_FORWARD) $(TRIMMED_REVERSE)`)
    run(`tar vzxf $(local_fastq_tar)`)
end

In [None]:
# Median read length across the forward and reverse trimmed reads combined.
median_read_length = Statistics.median(
    mapreduce(Mycelia.determine_read_lengths, vcat, (TRIMMED_FORWARD, TRIMMED_REVERSE)),
)
# Stored rounded down to a whole number.
results["median read length"] = floor(Int, median_read_length)

In [None]:
# K-mer spectra analysis. The three files below are expected in the output
# directory afterwards; the analysis is skipped when all of them already exist.
kmer_spectra_dir = mkpath("$OUT_DIR/kmer-spectra")

downsample_file = joinpath(kmer_spectra_dir, "downsampling-rate.txt")
kmer_spectra_png = joinpath(kmer_spectra_dir, "peak-detected.png")
kmer_spectra_svg = joinpath(kmer_spectra_dir, "peak-detected.svg")

if !all(isfile, (downsample_file, kmer_spectra_png, kmer_spectra_svg))
    Mycelia.analyze_kmer_spectra(;
        out_directory=kmer_spectra_dir,
        forward_reads=TRIMMED_FORWARD,
        reverse_reads=TRIMMED_REVERSE,
        k=17,
        target_coverage=300,
    )
end

# The downsampling rate is the first line of the rate file, kept as a string.
results["downsampling rate"] = readline(downsample_file)

In [None]:
# Produce the (possibly downsampled) read files used for assembly.
mkpath("$OUT_DIR/downsampled")
DOWNSAMPLED_FORWARD = "$OUT_DIR/downsampled/forward.downsampled.fastq.gz"
DOWNSAMPLED_REVERSE = "$OUT_DIR/downsampled/reverse.downsampled.fastq.gz"
if !isfile(DOWNSAMPLED_FORWARD) || !isfile(DOWNSAMPLED_REVERSE)
    if results["downsampling rate"] == "1.0"
        # coverage too low to justify downsampling
        # `force=true`: this branch also runs when only ONE of the outputs is
        # missing, and `cp` otherwise throws if the destination already exists.
        cp(TRIMMED_FORWARD, DOWNSAMPLED_FORWARD; force=true)
        cp(TRIMMED_REVERSE, DOWNSAMPLED_REVERSE; force=true)
    else
        # should actually downsample
        # Each read file is decompressed to a scratch FASTQ, sampled with seqtk
        # at the stored rate, re-compressed, and the scratch file is removed.
        run(pipeline(`gzip -dc $(TRIMMED_FORWARD)`, "$OUT_DIR/downsampled/forward.fastq"))
        run(pipeline(
            `seqtk sample $OUT_DIR/downsampled/forward.fastq $(results["downsampling rate"])`,
            `gzip`,
            "$OUT_DIR/downsampled/forward.downsampled.fastq.gz"
        ))
        rm("$OUT_DIR/downsampled/forward.fastq")

        run(pipeline(`gzip -dc $(TRIMMED_REVERSE)`, "$OUT_DIR/downsampled/reverse.fastq"))
        run(pipeline(
            `seqtk sample $OUT_DIR/downsampled/reverse.fastq $(results["downsampling rate"])`,
            `gzip`,
            "$OUT_DIR/downsampled/reverse.downsampled.fastq.gz"
        ))
        rm("$OUT_DIR/downsampled/reverse.fastq")
    end
end

In [None]:
# megahit: MEGAHIT v1.2.9

# contact: Dinghua Li <voutcn@gmail.com>

# Usage:
#   megahit [options] {-1 <pe1> -2 <pe2> | --12 <pe12> | -r <se>} [-o <out_dir>]

#   Input options that can be specified for multiple times (supporting plain text and gz/bz2 extensions)
#     -1                       <pe1>          comma-separated list of fasta/q paired-end #1 files, paired with files in <pe2>
#     -2                       <pe2>          comma-separated list of fasta/q paired-end #2 files, paired with files in <pe1>
#     --12                     <pe12>         comma-separated list of interleaved fasta/q paired-end files
#     -r/--read                <se>           comma-separated list of fasta/q single-end files

# Optional Arguments:
#   Basic assembly options:
#     --min-count              <int>          minimum multiplicity for filtering (k_min+1)-mers [2]
#     --k-list                 <int,int,..>   comma-separated list of kmer size
#                                             all must be odd, in the range 15-255, increment <= 28)
#                                             [21,29,39,59,79,99,119,141]

#   Another way to set --k-list (overrides --k-list if one of them set):
#     --k-min                  <int>          minimum kmer size (<= 255), must be odd number [21]
#     --k-max                  <int>          maximum kmer size (<= 255), must be odd number [141]
#     --k-step                 <int>          increment of kmer size of each iteration (<= 28), must be even number [12]

#   Advanced assembly options:
#     --no-mercy                              do not add mercy kmers
#     --bubble-level           <int>          intensity of bubble merging (0-2), 0 to disable [2]
#     --merge-level            <l,s>          merge complex bubbles of length <= l*kmer_size and similarity >= s [20,0.95]
#     --prune-level            <int>          strength of low depth pruning (0-3) [2]
#     --prune-depth            <int>          remove unitigs with avg kmer depth less than this value [2]
#     --disconnect-ratio       <float>        disconnect unitigs if its depth is less than this ratio times 
#                                             the total depth of itself and its siblings [0.1]  
#     --low-local-ratio        <float>        remove unitigs if its depth is less than this ratio times
#                                             the average depth of the neighborhoods [0.2]
#     --max-tip-len            <int>          remove tips less than this value [2*k]
#     --cleaning-rounds        <int>          number of rounds for graph cleanning [5]
#     --no-local                              disable local assembly
#     --kmin-1pass                            use 1pass mode to build SdBG of k_min

#   Presets parameters:
#     --presets                <str>          override a group of parameters; possible values:
#                                             meta-sensitive: '--min-count 1 --k-list 21,29,39,49,...,129,141'
#                                             meta-large: '--k-min 27 --k-max 127 --k-step 10'
#                                             (large & complex metagenomes, like soil)

#   Hardware options:
#     -t/--num-cpu-threads     <int>          number of CPU threads [# of logical processors]

#   Output options:
#     -o/--out-dir             <string>       output directory [./megahit_out]
#     --out-prefix             <string>       output prefix (the contig file will be OUT_DIR/OUT_PREFIX.contigs.fa)
#     --min-contig-len         <int>          minimum length of contigs to output [200]

# k_lengths = [i for i in [21,29,39,59,79,99,119,141] if i < joint_median_read_length]
# k_lengths_string = join(string.(k_lengths), ',')

# Assemble the downsampled reads with megahit, convert the contigs to
# FASTG -> GFA, and write the GFA segments back out as a FASTA file.
results["assembly method"] = "megahit"

assembly_dir = "$OUT_DIR/megahit"
initial_assembled_fasta = "$(assembly_dir)/final.contigs.fa"
# Swap only the trailing extension. A plain ".fa" replacement would also
# rewrite any ".fa" inside the directory part of the path, which is built from
# config-provided identifiers.
assembled_fastg = replace(initial_assembled_fasta, r"\.fa$" => ".fastg")

if !isfile(initial_assembled_fasta)
    run(`megahit -1 $DOWNSAMPLED_FORWARD -2 $DOWNSAMPLED_REVERSE -o $(assembly_dir)`)
end

# read in the assembled fasta file and parse contig identifiers to get final k length
# (the text before the first '_' of each identifier, with a leading "k" removed)
final_k_lengths = unique([replace(first(split(FASTX.identifier(record), '_')), r"^k" => "") for record in Mycelia.open_fastx(initial_assembled_fasta)])
# Explicit check rather than `@assert`, which may be disabled and is not meant
# for validating data.
if length(final_k_lengths) != 1
    error("expected exactly one k length among the contig identifiers of $(initial_assembled_fasta), found: $(final_k_lengths)")
end
final_k_length = parse(Int, first(final_k_lengths))
if !isfile(assembled_fastg)
    run(pipeline(`megahit_toolkit contig2fastg $(final_k_length) $(initial_assembled_fasta)`, assembled_fastg))
end

assembled_gfa = "$(assembled_fastg).gfa"
if !isfile(assembled_gfa)
    run(`Bandage reduce $(assembled_fastg) $(assembled_gfa)`)
end

# Write one FASTA record per vertex of the parsed GFA graph, using the vertex's
# :identifier and :sequence properties. This file is rewritten on every run.
assembled_fasta = assembled_gfa * ".fna"
open(assembled_fasta, "w") do io
    fastx_io = FASTX.FASTA.Writer(io)
    gfa_graph = Mycelia.parse_gfa(assembled_gfa)
    for v in Graphs.vertices(gfa_graph)
        record = FASTX.FASTA.Record(gfa_graph.vprops[v][:identifier], gfa_graph.vprops[v][:sequence])
        write(fastx_io, record)
    end
    close(fastx_io)
end

In [None]:
# Render the assembly graph to a JPG with Bandage, unless the image already exists.
bandage_outfile = "$(assembled_gfa).bandage.jpg"
isfile(bandage_outfile) || run(`Bandage image $(assembled_gfa) $bandage_outfile`)

In [None]:
# Align the trimmed reads to the assembled contigs with bwa, post-process the
# alignments with samtools, then run qualimap bamqc on the resulting BAM.
# Every step is skipped when its output file already exists.

# bwa index (its .bwt output is used as the marker that the index exists)
bwt_index = "$(assembled_fasta).bwt"
isfile(bwt_index) || run(`bwa index $(assembled_fasta)`)

# bwa mem piped through samtools collate, fixmate, sort, markdup and view
mapped_reads_bam = "$(assembled_fasta).bwa.bam"
isfile(mapped_reads_bam) || run(pipeline(
    `bwa mem -R "@RG\tID:$(config["sample identifier"])\tSM:bar" -t $(Sys.CPU_THREADS) $(assembled_fasta) $(TRIMMED_FORWARD) $(TRIMMED_REVERSE)`,
    `samtools collate -O - -`,
    `samtools fixmate -m - -`,
    `samtools sort`,
    `samtools markdup - -`,
    `samtools view -buh`,
    mapped_reads_bam,
))

# BAM index
isfile("$(mapped_reads_bam).bai") || run(`samtools index $(mapped_reads_bam)`)

qualimap_report_pdf = "$(assembly_dir)/qualimap/report.pdf"
qualimap_report_txt = "$(assembly_dir)/qualimap/genome_results.txt"

# qualimap is (re)run unless both of its report files are present
if !(isfile(qualimap_report_pdf) && isfile(qualimap_report_txt))
    run(`
        qualimap bamqc
        -nt $(Sys.CPU_THREADS)
        -bam $(mapped_reads_bam)
        -outdir $(assembly_dir)/qualimap
        -outformat PDF:HTML
        --output-genome-coverage $(mapped_reads_bam).genome_coverage.txt
        `)
end

# Per-contig coverage table, plus each contig's share of all mapped bases as a
# percentage rounded to 3 digits.
qualimap_contig_coverage_table = Mycelia.parse_qualimap_contig_coverage(qualimap_report_txt)
qualimap_contig_coverage_table[!, "% Mapped bases"] = let mapped = qualimap_contig_coverage_table[!, "Mapped bases"]
    round.(mapped ./ sum(mapped) .* 100, digits=3)
end;

In [None]:
# BLAST the assembled contigs against a local copy of the "nt" database kept in
# ~/blastdb.
blastdb_dir = string(homedir(), "/blastdb")
blast_db = "nt"
if !isdir(blastdb_dir) || isempty(readdir(blastdb_dir))
    # we're probably on a cloud build
    Mycelia.download_blast_db(db=blast_db, outdir=blastdb_dir, source="gcp")
else
    @info "blast db detected, using existing"
    # Alternative download calls, left disabled:
    # Mycelia.download_blast_db(db=blast_db, outdir=blastdb_dir, source="ncbi")
    # Mycelia.download_blast_db(db=blast_db, outdir=blastdb_dir)
end

db_path = joinpath(blastdb_dir, blast_db)
# blast contigs against NCBI
ncbi_blast_outfile = "$(assembled_fasta).blastn.$(blast_db).txt"

# Previously noted timings:
# >= 1-2 hours to run remotely
# 1679.701928 seconds
# 3497.275545
# 28m 0.44s
if !isfile(ncbi_blast_outfile)
    @time run(
    `blastn
        -num_threads $(Sys.CPU_THREADS)
        -db $(db_path)
        -outfmt '7 qseqid qtitle sseqid sacc saccver stitle qlen slen qstart qend sstart send evalue bitscore length pident nident mismatch staxid'
        -query $(assembled_fasta)
        -out $(ncbi_blast_outfile)`)
end

In [None]:
# Build the per-contig summary table: BLAST hits annotated with taxonomic
# lineage, joined onto the qualimap coverage table, reduced to the best hits
# per contig, and written out as CSV and TSV.

# join the blast results to generate the contig info file
ncbi_blast_results = Mycelia.parse_blast_report(ncbi_blast_outfile)

# Download and unpack the NCBI taxonomy dump unless already present locally.
if !isfile("taxdump.tar.gz")
    run(`wget https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz`)
end
if !isdir("taxdump")
    mkdir("taxdump")
    run(`tar -xvzf taxdump.tar.gz --directory taxdump`)
end

# Write the distinct, non-missing subject tax ids, one per line, as taxonkit input.
detected_tax_id_file = "$(OUT_DIR)/$(config["assembly run identifier"]).detected_tax_ids.txt"
open(detected_tax_id_file, "w") do io
    for taxid in unique(filter(!ismissing, ncbi_blast_results[!, "subject tax id"]))
        println(io, taxid)
    end
end

# Each taxonkit output line is split on tabs: first field is the tax id (parsed
# as Int), second is the lineage.
taxid_to_lineage_map = Dict(parse(Int, split_line[1]) => split_line[2] for split_line in split.(readlines(`taxonkit lineage --data-dir taxdump $(detected_tax_id_file)`), '\t'))

# Tax ids absent from the map (including missing values) get an empty lineage.
ncbi_blast_results[!, "lineage"] = map(x -> get(taxid_to_lineage_map, x, ""), ncbi_blast_results[!, "subject tax id"])

ncbi_blast_results[!, "% of subject length"] = round.(ncbi_blast_results[!, "query length"] ./ ncbi_blast_results[!, "subject length"] * 100, digits=3)
contig_info_table = DataFrames.leftjoin(qualimap_contig_coverage_table, ncbi_blast_results, on="Contig" => "query id")

# get top 10 hits for each contig
# This step had been commented out while `contig_info_table_top_hits` was still
# used below, so a fresh run failed with an UndefVarError.
contig_info_table_top_hits =
    DataFrames.combine(DataFrames.groupby(contig_info_table, "Contig")) do gdf
        first(sort(gdf, "bit score", rev=true), 10)
    end

# re-order columns based on utility
reordered_columns = [
    "Contig",
    "Length",
    "Mapped bases",
    "Mean coverage",
    "Standard Deviation",
    "% Mapped bases",
    "subject id",
    "subject acc.",
    "subject title",
    "subject tax id",
    "lineage",
    "% identity",
    "% of subject length",
    "evalue",
    "bit score",
    "query length",
    "subject length",
    "alignment length",
    "q. start",
    "q. end",
    "s. start",
    "s. end",
    "identical",
    "mismatches"
]
contig_info_table_top_hits = contig_info_table_top_hits[!, reordered_columns]
sort!(contig_info_table_top_hits, ["% Mapped bases", "bit score"], rev=true)

# NOTE(review): the file names say "config_info" although the content is contig
# info — looks like a typo; left unchanged in case something consumes these names.
contig_info_csv = "$(OUT_DIR)/$(config["assembly run identifier"]).config_info.csv"
contig_info_tsv = "$(OUT_DIR)/$(config["assembly run identifier"]).config_info.tsv"
uCSV.write(contig_info_csv, contig_info_table_top_hits, quotes='"')
uCSV.write(contig_info_tsv, contig_info_table_top_hits)
results["contig info"] = contig_info_csv