In [1]:
import os
import sys
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor

In [2]:
# Extend the module search path with ../src so the `utils` import below can be
# resolved (presumably utils.py lives there — confirm). The entry is relative,
# so it depends on the kernel's current working directory.
sys.path.append('../src')

# run_cmd is the helper every later cell uses to execute a shell command string.
from utils import run_cmd

In [3]:
def build_index(file, index_file=None):
    """Create a minimap2 index for a reference sequence file.

    minimap2's ``-d`` option takes the *output* index path and still needs the
    reference as the positional target, i.e. ``minimap2 -d ref.mmi ref.fa``.

    Args:
        file: Reference sequence file (e.g. FASTA) to index.
        index_file: Destination of the index. Defaults to ``file`` with
            ``.mmi`` appended.

    Returns:
        The path of the index file, as a string.
    """
    import shlex  # local import so this cell does not depend on the import cell

    file = str(file)
    index_file = f'{file}.mmi' if index_file is None else str(index_file)
    # Quote both paths so names containing spaces survive the shell.
    run_cmd(f'minimap2 -d {shlex.quote(index_file)} {shlex.quote(file)}')
    return index_file

In [4]:
def long_reads_filter(input_files, output_file, min_length=500, min_qual=7, exclude_genome='', threads=8):
    """Filter long reads by length/quality and optionally drop reads mapping to a genome.

    A single shell pipeline is assembled and handed to ``run_cmd``:
    ``cat`` -> ``nanoq`` -> (optional) ``minimap2`` + ``samtools`` -> (optional) ``pigz``.

    Args:
        input_files: Path or shell glob passed verbatim to ``cat``. It is
            deliberately left unquoted so the shell can expand wildcards.
        output_file: Destination path. When the name ends in ``gz`` the
            stream is compressed with ``pigz``; otherwise it is written as-is.
        min_length: Minimum read length, forwarded to ``nanoq -l``.
        min_qual: Minimum read quality, forwarded to ``nanoq -q``.
        exclude_genome: Reference FASTA. When non-empty, reads are aligned to
            it and only reads flagged unmapped (``samtools view -f 4``) are
            kept. A minimap2 index ``<exclude_genome>.mmi`` is created on
            first use and reused afterwards.
        threads: Thread count forwarded to minimap2, samtools and pigz.
    """
    import shlex  # local import so this cell does not depend on the import cell

    output_file = os.fspath(output_file)
    out_arg = shlex.quote(output_file)

    stages = [
        f"cat {input_files}",
        f"nanoq -q {min_qual} -l {min_length}",
    ]

    if exclude_genome:
        exclude_genome = os.fspath(exclude_genome)
        # The alignment target must be a minimap2 index (.mmi) or the FASTA
        # itself; a samtools .fai file is only a faidx table and cannot be
        # aligned against.
        index_file = exclude_genome + '.mmi'
        ref_arg = shlex.quote(exclude_genome)
        index_arg = shlex.quote(index_file)
        if not os.path.exists(index_file):
            # Built inline so the index path created here and the one consumed
            # below are guaranteed to agree.
            run_cmd(f'minimap2 -t {threads} -x map-ont -d {index_arg} {ref_arg}')
        stages += [
            f"minimap2 -t {threads} -ax map-ont {index_arg} -",
            f"samtools sort -@ {threads} --reference {ref_arg} -O BAM -",
            # -f 4 keeps only records with the "unmapped" flag set.
            f"samtools view -@ {threads} --reference {ref_arg} -f 4 -O BAM -",
            f"samtools fastq -@ {threads} --reference {ref_arg} -",
        ]

    if output_file.endswith('gz'):
        stages.append(f"pigz -9 -p {threads} > {out_arg}")
    else:
        # No compression stage: redirect the last stage straight to the file.
        stages[-1] += f" > {out_arg}"

    run_cmd(' | '.join(stages))

In [5]:
# Locations for run 20220216_ICU31_34_strain. The following cell looks up the
# per-barcode input folders under `dirpath` and writes one FASTQ per barcode
# into `outpath`.
dirpath = Path('/media/Central_Lab_Storage/MinION/mNGS/20220216_ICU31_34_strain/Guppy')
outpath = Path('/media/Central_Lab_Storage/MinION/mNGS/20220216_ICU31_34_strain/fastq')

In [6]:
# Reference whose matching reads are removed from every barcode below.
exclude_genome = '/media/Central_Lab_Storage/NcbiASM/GRCh38_latest_genomic.fna'

# One filtering run per barcode: everything inside the barcode folder goes in,
# a single gzip-compressed FASTQ named after the barcode comes out.
for idx in (64, 65, 66, 67, 72, 73, 74):
    barcode = f"barcode{idx:02}"
    barcode_dir = str(Path(dirpath, barcode, '*'))
    fastq_file = str(Path(outpath, barcode + '.fastq.gz'))
    long_reads_filter(barcode_dir, fastq_file, exclude_genome=exclude_genome, threads=32)

In [20]:
# Locations for run 20211220_Lis_WGA_2. Note that this rebinds the same
# `dirpath` / `outpath` names used earlier; the cells below read their
# per-barcode input folders from `dirpath` and write FASTQ files to `outpath`.
dirpath = Path('/media/ONT/20211220_Lis_WGA_2/guppy_hac')
outpath = Path('/media/Central_Lab_Storage/MinION/20211220_Lis_WGA_2/fastq')

In [21]:
# No filtering here: every file in a barcode folder is concatenated and
# compressed into a single <barcode>.fastq.gz.
for idx in (5, 7):
    barcode = f"barcode{idx:02}"
    barcode_dir = str(Path(dirpath, barcode, '*'))
    fastq_file = str(Path(outpath, barcode + '.fastq.gz'))
    run_cmd(f"cat {barcode_dir} | pigz -9 -p 64 > {fastq_file}")

In [15]:
# Filter several barcodes concurrently (up to 4 pipelines at once, 16 threads each).
with ProcessPoolExecutor(4) as executor:
    futures = []
    for idx in (9, 10, 11):
        barcode = f"barcode{idx:02}"
        barcode_dir = os.path.join(dirpath, barcode, '*')
        fastq_file = os.path.join(outpath, (barcode + '.fastq.gz'))
        futures.append(
            executor.submit(long_reads_filter, barcode_dir, fastq_file, threads=16, min_length=1000)
        )
    # submit() stores a worker's exception on its Future instead of raising it;
    # result() re-raises it here so a failed barcode is not silently skipped.
    for future in futures:
        future.result()

In [38]:
# Folder holding the per-barcode FASTQ files of run 20220106_Sal; the
# subsampled copies are written next to their inputs.
dirpath = Path('/media/Central_Lab_Storage/MinION_2022/20220106_Sal/fastq')

# Collect one "rasusa | pigz" shell pipeline per barcode; nothing is executed
# in this cell, the commands are only assembled.
cmds = []
for idx in range(5, 9):
    barcode = f"barcode{idx:02}"
    fastq_file = str(Path(dirpath, barcode + '.fastq.gz'))
    subsample_file = str(Path(dirpath, barcode + '_100x.fastq.gz'))
    cmds += [f"rasusa -c 100 -g 5MB -i {fastq_file} | pigz -9 -p 8 > {subsample_file}"]

In [39]:
# Execute the prepared shell pipelines, at most four at a time.
with ProcessPoolExecutor(4) as executor:
    # executor.map() only re-raises a worker's exception when its result is
    # retrieved from the returned iterator; drain it so failures surface here.
    list(executor.map(run_cmd, cmds))