In [None]:
# Blank out LD_LIBRARY_PATH for this Julia process; external commands started
# later with `run` inherit the process environment, so they see the empty value too.
# NOTE(review): presumably this avoids shared-library conflicts between the host
# environment and the tools invoked by this notebook — confirm before removing.
ENV["LD_LIBRARY_PATH"] = ""

In [None]:
import Pkg
Pkg.activate(".")

# Packages this notebook relies on. Each one is added to the active project and
# then brought in with `import` (not `using`), so calls in later cells stay
# qualified with the module name.
pkgs = [
    "BioSequences",
    "DataFrames",
    "Dates",
    "DelimitedFiles",
    "FASTX",
    "GLM",
    "HTTP",
    "JSON",
    "Graphs",
    "MetaGraphs",
    "MD5",
    "Statistics",
    "StatsPlots",
    "uCSV",
    "CodecZlib",
    "YAML",
    "Revise",
    "Kmers",
    "StatsBase",
    "ProgressMeter",
]
Pkg.add(pkgs)
# Evaluate `import <Package>` at top level for every name in the list.
for pkg_name in pkgs
    @eval import $(Symbol(pkg_name))
end

import Mycelia

In [None]:
# Data root: the `data` directory inside the parent of the current working
# directory, so the result depends on where the kernel was started.
data_dir = joinpath(dirname(pwd()), "data")

In [None]:
# Per-run directories under `<data_dir>/SRA` that already hold a `trim_galore`
# entry (i.e. are ready for assembly). Jupyter checkpoint folders are skipped.
SRR_paths = filter(readdir(joinpath(data_dir, "SRA"), join=true)) do path
    # `isdir` guards the inner `readdir`: a stray regular file in the SRA folder
    # would otherwise make `readdir(path)` throw and abort the whole cell.
    !occursin(".ipynb_checkpoints", path) && isdir(path) && "trim_galore" in readdir(path)
end

In [None]:
# Thread count passed to the external tools in the assembly loop
# (megahit, bwa, samtools, qualimap). All logical CPU threads are used;
# the commented-out line below is an alternative that uses half of them (at least 1).
# n_cores = max(Int(round(Sys.CPU_THREADS / 2)), 1)
n_cores = Sys.CPU_THREADS

In [None]:
# Per-sample pipeline. For every directory in `SRR_paths`:
#   1. assemble the trimmed paired reads with MEGAHIT,
#   2. convert the contigs to FASTG (megahit_toolkit), reduce to GFA (Bandage)
#      and export the graph segments as FASTA,
#   3. render a Bandage image of the graph,
#   4. map the reads back with bwa / samtools,
#   5. run qualimap bamqc on the alignment.
# Each step is skipped when its output file already exists, so the cell can be
# re-run after an interruption. Outputs produced by this cell through stdout
# redirection or Julia file handles are first written to a `.tmp` sibling and
# renamed on success, so a failed step does not leave a file that looks complete.
ProgressMeter.@showprogress for SRR_path in SRR_paths
    SRR = basename(SRR_path)

    out_dir = joinpath(SRR_path, "megahit")

    trimmed_forward_reads = joinpath(SRR_path, "trim_galore", "$(SRR)_1_val_1.fq.gz")
    trimmed_reverse_reads = joinpath(SRR_path, "trim_galore", "$(SRR)_2_val_2.fq.gz")

    # megahit: MEGAHIT v1.2.9

    # contact: Dinghua Li <voutcn@gmail.com>

    # Usage:
    #   megahit [options] {-1 <pe1> -2 <pe2> | --12 <pe12> | -r <se>} [-o <out_dir>]

    #   Input options that can be specified for multiple times (supporting plain text and gz/bz2 extensions)
    #     -1                       <pe1>          comma-separated list of fasta/q paired-end #1 files, paired with files in <pe2>
    #     -2                       <pe2>          comma-separated list of fasta/q paired-end #2 files, paired with files in <pe1>
    #     --12                     <pe12>         comma-separated list of interleaved fasta/q paired-end files
    #     -r/--read                <se>           comma-separated list of fasta/q single-end files

    # Optional Arguments:
    #   Basic assembly options:
    #     --min-count              <int>          minimum multiplicity for filtering (k_min+1)-mers [2]
    #     --k-list                 <int,int,..>   comma-separated list of kmer size
    #                                             all must be odd, in the range 15-255, increment <= 28)
    #                                             [21,29,39,59,79,99,119,141]

    #   Another way to set --k-list (overrides --k-list if one of them set):
    #     --k-min                  <int>          minimum kmer size (<= 255), must be odd number [21]
    #     --k-max                  <int>          maximum kmer size (<= 255), must be odd number [141]
    #     --k-step                 <int>          increment of kmer size of each iteration (<= 28), must be even number [12]

    #   Advanced assembly options:
    #     --no-mercy                              do not add mercy kmers
    #     --bubble-level           <int>          intensity of bubble merging (0-2), 0 to disable [2]
    #     --merge-level            <l,s>          merge complex bubbles of length <= l*kmer_size and similarity >= s [20,0.95]
    #     --prune-level            <int>          strength of low depth pruning (0-3) [2]
    #     --prune-depth            <int>          remove unitigs with avg kmer depth less than this value [2]
    #     --disconnect-ratio       <float>        disconnect unitigs if its depth is less than this ratio times
    #                                             the total depth of itself and its siblings [0.1]
    #     --low-local-ratio        <float>        remove unitigs if its depth is less than this ratio times
    #                                             the average depth of the neighborhoods [0.2]
    #     --max-tip-len            <int>          remove tips less than this value [2*k]
    #     --cleaning-rounds        <int>          number of rounds for graph cleanning [5]
    #     --no-local                              disable local assembly
    #     --kmin-1pass                            use 1pass mode to build SdBG of k_min

    #   Presets parameters:
    #     --presets                <str>          override a group of parameters; possible values:
    #                                             meta-sensitive: '--min-count 1 --k-list 21,29,39,49,...,129,141'
    #                                             meta-large: '--k-min 27 --k-max 127 --k-step 10'
    #                                             (large & complex metagenomes, like soil)

    #   Hardware options:
    #     -t/--num-cpu-threads     <int>          number of CPU threads [# of logical processors]

    #   Output options:
    #     -o/--out-dir             <string>       output directory [./megahit_out]
    #     --out-prefix             <string>       output prefix (the contig file will be OUT_DIR/OUT_PREFIX.contigs.fa)
    #     --min-contig-len         <int>          minimum length of contigs to output [200]

    # Disabled experiment: restrict the k list to values below the median read length.
    # median_read_length = Statistics.median(vcat(Mycelia.determine_read_lengths(trimmed_forward_reads), Mycelia.determine_read_lengths(trimmed_reverse_reads)))

    # k_lengths = [i for i in [21,29,39,59,79,99,119,141] if i < joint_median_read_length]
    # k_lengths_string = join(string.(k_lengths), ',')

    # Timing from a logged MEGAHIT run:
    # 2023-02-19 00:51:46 - ALL DONE. Time elapsed: 2031.380950 seconds

    initial_assembled_fasta = "$(out_dir)/final.contigs.fa"
    # Built from `out_dir` directly rather than by substituting ".fa" in the full
    # path, which would also rewrite any ".fa" that happens to occur in a directory name.
    assembled_fastg = "$(out_dir)/final.contigs.fastg"

    # NOTE(review): if `out_dir` is left over from an interrupted run without
    # final.contigs.fa, MEGAHIT may refuse to reuse the directory — confirm and
    # clean up or use its continue option if that happens.
    if !isfile(initial_assembled_fasta)
        run(`megahit --num-cpu-threads $(n_cores) -1 $trimmed_forward_reads -2 $trimmed_reverse_reads -o $(out_dir)`)
        # run(`conda run --live-stream --no-capture-output -n viral-pangenome-discovery megahit --num-cpu-threads $(n_cores) -1 $trimmed_forward_reads -2 $trimmed_reverse_reads -o $(out_dir)`)
    end

    # Convert contigs to FASTG. The k value is only needed for this conversion, so
    # the contig file is scanned only when the FASTG has to be (re)built. An empty
    # FASTG is treated as missing: stdout redirection creates the file even when
    # the command fails.
    if !isfile(assembled_fastg) || filesize(assembled_fastg) == 0
        # Contig identifiers are split on '_' and the leading "k" is stripped from
        # the first field; all contigs are expected to share one k value.
        final_k_lengths = unique([replace(first(split(FASTX.identifier(record), '_')), r"^k" => "") for record in Mycelia.open_fastx(initial_assembled_fasta)])
        @assert length(final_k_lengths) == 1 "expected exactly one k prefix among contig identifiers in $(initial_assembled_fasta), found $(length(final_k_lengths))"
        final_k_length = parse(Int, first(final_k_lengths))
        fastg_tmp = assembled_fastg * ".tmp"
        run(pipeline(`megahit_toolkit contig2fastg $(final_k_length) $(initial_assembled_fasta)`, fastg_tmp))
        mv(fastg_tmp, assembled_fastg, force=true)
    end

    assembled_gfa = "$(assembled_fastg).gfa"
    if !isfile(assembled_gfa)
        run(`Bandage reduce $(assembled_fastg) $(assembled_gfa)`)
    end

    # Export every graph vertex (identifier + sequence) as a FASTA record.
    assembled_fasta = assembled_gfa * ".fna"
    if !isfile(assembled_fasta) || filesize(assembled_fasta) == 0
        # Parse before creating any output so a parse failure leaves no file behind.
        gfa_graph = Mycelia.parse_gfa(assembled_gfa)
        fasta_tmp = assembled_fasta * ".tmp"
        open(fasta_tmp, "w") do io
            fastx_io = FASTX.FASTA.Writer(io)
            for v in Graphs.vertices(gfa_graph)
                record = FASTX.FASTA.Record(gfa_graph.vprops[v][:identifier], gfa_graph.vprops[v][:sequence])
                write(fastx_io, record)
            end
            close(fastx_io)
        end
        mv(fasta_tmp, assembled_fasta, force=true)
    end

    # generate a bandage plot of the assembly graph
    bandage_outfile = "$(assembled_gfa).bandage.jpg"
    if !isfile(bandage_outfile)
        run(`Bandage image $(assembled_gfa) $bandage_outfile`)
    end

    # map reads to the assembly and run qualimap QC
    bwt_index = "$(assembled_fasta).bwt"
    if !isfile(bwt_index)
        run(`bwa index $(assembled_fasta)`)
    end

    # mapped_reads_bam = "$(assembled_fasta).bwa.sorted.marked_duplicates.bam"
    mapped_reads_bam = "$(assembled_fasta).bwa.bam"
    # sorting and deduping (disabled alternative)
    # if !isfile(mapped_reads_bam)
    #     run(pipeline(
    #         `bwa mem -R "@RG\tID:$(SRR)\tSM:bar" -t $(n_cores) $(assembled_fasta) $(trimmed_forward_reads) $(trimmed_reverse_reads)`,
    #         `samtools collate --threads $(n_cores) -O - -`,
    #         `samtools fixmate --threads $(n_cores) -m - -`,
    #         `samtools sort --threads $(n_cores)`,
    #         `samtools markdup --threads $(n_cores) - -`,
    #         `samtools view --threads $(n_cores) -buh`,
    #         mapped_reads_bams))
    # end
    # sorting - 50Gb - 1594.823 sec
    # NOTE(review): the read-group sample name is the literal "bar"; it looks like
    # a placeholder — confirm whether it should be the run accession instead.
    if !isfile(mapped_reads_bam) || (filesize(mapped_reads_bam) == 0)
        # Stream into a temporary file and rename afterwards, so an aborted
        # pipeline cannot leave a truncated BAM under the final name.
        bam_tmp = mapped_reads_bam * ".tmp"
        run(pipeline(
            `bwa mem -R "@RG\tID:$(SRR)\tSM:bar" -t $(n_cores) $(assembled_fasta) $(trimmed_forward_reads) $(trimmed_reverse_reads)`,
            `samtools sort --threads $(n_cores) -u -m 100M`,
            `samtools view --threads $(n_cores) -bh`,
            bam_tmp))
        mv(bam_tmp, mapped_reads_bam, force=true)
    end

    # if !isfile("$(mapped_reads_bam).bai")
    #     run(`samtools index $(mapped_reads_bam)`)
    # end

    qualimap_report_pdf = "$(out_dir)/qualimap/report.pdf"
    qualimap_report_txt = "$(out_dir)/qualimap/genome_results.txt"

    # Re-run qualimap unless both the PDF and the text summary are present.
    if !isfile(qualimap_report_pdf) || !isfile(qualimap_report_txt)
        run(`
            conda run --live-stream -n viral-pangenome-discovery
            qualimap bamqc
            -nt $(n_cores)
            -bam $(mapped_reads_bam)
            -outdir $(out_dir)/qualimap
            -outformat PDF:HTML
            --output-genome-coverage $(mapped_reads_bam).genome_coverage.txt
            --java-mem-size=4G
            `)
    end
end

# qualimap_contig_coverage_table = Mycelia.parse_qualimap_contig_coverage(qualimap_report_txt)
# qualimap_contig_coverage_table[!, "% Mapped bases"] = round.(qualimap_contig_coverage_table[!, "Mapped bases"] ./ sum(qualimap_contig_coverage_table[!, "Mapped bases"]) .* 100, digits=3);