In [1]:
import os
import sys
import shutil
from pathlib import Path
from multiprocessing import Pool
from concurrent.futures import ProcessPoolExecutor

In [2]:
# Make the project's helper modules importable. The entry is a relative path, so it is
# resolved against the working directory of the running kernel, not this notebook's location.
sys.path.append('../src')
# run_cmd is the helper used to launch every external command in this notebook.
# NOTE(review): the commands passed to it contain pipes and redirects, so it presumably
# executes the string through a shell — confirm in ../src/utils.py.
from utils import run_cmd

In [3]:
# All homopolish resources live in the same installation directory.
_HOMOPOLISH_HOME = '/media/GenomicResearch/Tools/homopolish'

# Script run with `python ... polish`, plus the model (-m) and mash sketch (-s) it is given.
program = f'{_HOMOPOLISH_HOME}/homopolish.py'
polish_model = f'{_HOMOPOLISH_HOME}/R9.4.pkl'
mash_sketch = f'{_HOMOPOLISH_HOME}/bacteria.msh'

# Model name handed to `medaka_consensus -m`.
medaka_model = 'r941_min_hac_g507'

In [4]:
def fn(reads, outdir, medaka_model, polish_model, mash_sketch, threads=8, nofilter=False, minreadlen=500, minquality=7, opts=''):
    """Assemble reads with flye, then polish with racon, medaka and homopolish.

    The reads are first staged as ``<outdir>/READS.fastq.gz``: either filtered with
    nanoq and recompressed with pigz, or (when filtering is disabled) symlinked.
    On success the following files are left in ``outdir``:

    * ``flye_info.txt``       - copy of flye's ``assembly_info.txt``
    * ``flye-unpolished.gfa`` - copy of flye's ``assembly_graph.gfa``
    * ``contigs.fasta``       - the homopolish output

    The ``flye`` and ``homopolish`` working directories are deleted at the end, and
    the staged reads file is always removed, even when a step raises.

    Parameters
    ----------
    reads : str or os.PathLike
        Input reads file.
    outdir : str or os.PathLike
        Output directory; created if it does not exist.
    medaka_model : str
        Value passed to ``medaka_consensus -m``.
    polish_model : str
        Value passed to homopolish ``-m``.
    mash_sketch : str
        Value passed to homopolish ``-s``.
    threads : int
        Thread count passed to every tool.
    nofilter : bool
        Filtering runs only when this is exactly ``False``; any other value
        symlinks the reads unfiltered.
    minreadlen, minquality : int
        nanoq ``-l`` / ``-q`` thresholds (used only when filtering).
    opts : str
        Extra flye options, appended verbatim (not quoted, so it may hold
        several space-separated flags).

    Returns
    -------
    None
        Also returns early, without polishing, when flye produced no ``assembly.fasta``.
    """
    import shlex  # local import: only this function needs it

    def q(value):
        # Quote a path for the shell; a no-op for paths without special characters.
        return shlex.quote(str(value))

    raw = os.path.join(outdir, 'READS.fastq.gz')
    flye_dirname = os.path.join(outdir, 'flye')
    flye_asm = os.path.join(flye_dirname, 'assembly.fasta')
    racon_dirname = os.path.join(flye_dirname, 'polish', 'racon')
    racon_asm = os.path.join(racon_dirname, 'consensus.fasta')
    paf_filename = os.path.join(racon_dirname, 'aligments.paf')
    medaka_dirname = os.path.join(flye_dirname, 'polish', 'medaka')
    medaka_asm = os.path.join(medaka_dirname, 'consensus.fasta')
    homopolish_dirname = os.path.join(outdir, 'homopolish')
    homopolish_asm = os.path.join(homopolish_dirname, 'consensus_homopolished.fasta')
    for dirname in (racon_dirname, medaka_dirname):
        os.makedirs(dirname, exist_ok=True)

    # Drop any staged reads left by an interrupted run. A stale symlink here would
    # make os.symlink fail, or worse, make the `>` redirect below write through the
    # link and overwrite the original reads file.
    if os.path.lexists(raw):
        os.remove(raw)

    if nofilter is False:
        run_cmd(
            f"nanoq -i {q(reads)} -l {minreadlen} -q {minquality}"
            f" | pigz -1 -p {threads} > {q(raw)}"
        )
    else:
        # Link to the absolute path: a relative target would be resolved against
        # outdir rather than the current directory, leaving a dangling link.
        os.symlink(os.path.abspath(reads), raw)

    try:
        run_cmd(f"conda run -n dragonflye flye --nano-raw {q(raw)} -o {q(flye_dirname)} -t {threads} {opts}")
        if not os.path.exists(flye_asm):
            # flye produced no assembly; nothing to polish.
            return
        shutil.copyfile(
            os.path.join(flye_dirname, 'assembly_info.txt'),
            os.path.join(outdir, 'flye_info.txt')
        )
        shutil.copyfile(
            os.path.join(flye_dirname, 'assembly_graph.gfa'),
            os.path.join(outdir, 'flye-unpolished.gfa')
        )
        run_cmd(f"minimap2 -t {threads} -x map-ont {q(flye_asm)} {q(raw)} > {q(paf_filename)}")
        run_cmd(f"racon -t {threads} {q(raw)} {q(paf_filename)} {q(flye_asm)} > {q(racon_asm)}")
        run_cmd(
            f"conda run -n dragonflye medaka_consensus -i {q(raw)} -d {q(racon_asm)} "
            f"-o {q(medaka_dirname)} -m {q(medaka_model)} -t {threads}"
        )
    finally:
        # The staged reads are not needed after medaka; remove them on every exit path.
        if os.path.lexists(raw):
            os.remove(raw)

    run_cmd(
        f"conda run -n homopolish python {q(program)} polish -a {q(medaka_asm)} "
        f"-s {q(mash_sketch)} -m {q(polish_model)} -o {q(homopolish_dirname)} -t {threads}"
    )
    shutil.copyfile(homopolish_asm, os.path.join(outdir, 'contigs.fasta'))
    # Only the three files copied into outdir are kept.
    for dirname in (flye_dirname, homopolish_dirname):
        shutil.rmtree(dirname)

In [None]:
# Single-sample run: assemble one barcode from the 20220105_ICU20_23 run.
run_root = '/media/Central_Lab_Storage/MinION/mNGS/20220105_ICU20_23'
raw = f'{run_root}/fastq/Barcode19.fastq.gz'
out = f'{run_root}/denovo/Barcode19'
fn(
    raw,
    out,
    medaka_model,
    polish_model,
    mash_sketch,
    threads=32,
    nofilter=True,   # reads are symlinked, not filtered
    opts='--meta',   # extra option forwarded to flye
)

In [5]:
dirpath = Path('/media/Central_Lab_Storage/MinION/mNGS/20220216_ICU31_34_strain/fastq')
outpath = Path('/media/Central_Lab_Storage/MinION/mNGS/20220216_ICU31_34_strain/denovo')

# Run up to 4 samples concurrently, each fn call being given threads=16.
# Every future is kept, keyed to its input file: a discarded future would silently
# swallow any exception raised inside the worker process.
jobs = {}
with ProcessPoolExecutor(4) as executor:
    for reads_path in dirpath.iterdir():
        outdir = outpath/(reads_path.name.replace('.fastq.gz', ''))
        future = executor.submit(fn, reads_path, outdir, medaka_model, polish_model, mash_sketch, 16, nofilter=True, opts='--meta')
        jobs[future] = reads_path

# Leaving the `with` block waits for all jobs, so exception() returns immediately here.
failed = {}
for future, reads_path in jobs.items():
    error = future.exception()
    if error is not None:
        failed[reads_path] = error
        print(f"FAILED {reads_path}: {error!r}", file=sys.stderr)

# Raise only after every sample has had its chance to run.
if failed:
    raise RuntimeError(
        f"{len(failed)} of {len(jobs)} samples failed: "
        + ', '.join(path.name for path in failed)
    )