In [1]:
import os
from pathlib import Path
from multiprocessing import Pool
from Bio import SeqIO

import sys
sys.path.append('../src')
from utils import run_cmd

In [12]:
# Reference assembly; parsed as FASTA further down to derive the genome size.
asm = '/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/amplification/20211028_R20-0088H/reference.fa'
# Full read set handed to `rasusa -i` for subsampling (gzipped FASTQ, going by the extension).
raw = '/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/amplification/20211028_R20-0088H/reads.fastq.gz'
# Target coverages passed to `rasusa -c`; each value also names the per-depth output files ("<depth>x...").
depths = (0.5, 1, 2, 4, 6, 8, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400)

In [13]:
# Classifier databases: the value given to `kraken2 --db` and the value
# given to `centrifuge -x` / `centrifuge-kreport -x`.
krk_db = '/media/GenomicResearch/Issue/kraken2_db'
cf_db = '/media/GenomicResearch/Issue/centrifuge/12-Sept-2020-centrifuge-ref-db/centrifuge-ref-db'

# Root directory for all classification results of this run.
outdir = Path('/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/amplification/20211028_R20-0088H/classify')

# One sub-directory per classifier. Both are created up front (parents
# included) and an already existing directory is not an error, so the cell
# can be re-run safely.
krk_outdir, cf_outdir = outdir/'kraken2', outdir/'centrifuge'
for tool_dir in (krk_outdir, cf_outdir):
    tool_dir.mkdir(parents=True, exist_ok=True)

In [14]:
# Genome size = combined length of every sequence record in the reference
# FASTA; it is later passed to `rasusa -g`.
gsize = sum(map(len, SeqIO.parse(asm, 'fasta')))
gsize

3009037

In [15]:
# Build two shell pipelines per target depth: subsample -> Kraken2 and
# subsample -> Centrifuge (+ Kraken-style report).
#
# rasusa draws a random subsample. Without a seed the two classifiers would
# be fed *different* reads at the same nominal depth, and re-running the
# notebook would give different results. A fixed seed makes both pipelines
# of a depth classify the same subsample and makes the run reproducible.
subsample_seed = 42

cmds = []
for depth in depths:
    # Shared subsampling stage; identical for both classifiers at this depth.
    subsample = f"rasusa -i {raw} -g {gsize} -c {depth} -s {subsample_seed}"

    # Kraken2 reads the subsampled reads from stdin via /dev/fd/0.
    krk_output = krk_outdir/f"{depth}x.txt"
    krk_report = krk_outdir/f"{depth}x_report.txt"
    cmd = f"{subsample} | kraken2 --db {krk_db} --threads 8 --report {krk_report} --output {krk_output} --memory-mapping /dev/fd/0"
    cmds.append(cmd)

    # Centrifuge reads from stdin (`-U -`); `tee` keeps the raw classification
    # while the same stream is summarised by centrifuge-kreport.
    cf_output = cf_outdir/f"{depth}x.txt"
    cf_report = cf_outdir/f"{depth}x_kreport.txt"
    cmd = f"{subsample} | centrifuge --mm -p 8 -x {cf_db} -U - | tee {cf_output} | centrifuge-kreport -x {cf_db} > {cf_report}"
    cmds.append(cmd)

In [16]:
# Run every pipeline in `cmds` through run_cmd on a pool of 8 worker processes.
# NOTE(review): each pipeline itself requests 8 threads (`--threads 8` / `-p 8`),
# so up to 64 threads may be active at once -- confirm the host can take it.
with Pool(8) as p:
    try:
        p.map(run_cmd, cmds)
        p.close()
        p.join()
    except KeyboardInterrupt:
        # Stop the workers, wait for them to exit, then re-raise so the cell
        # is reported as interrupted. Swallowing the interrupt would let a
        # queued "Run All" continue against partial classifier outputs.
        p.terminate()
        p.join()
        raise