In [1]:
import os
import sys
from pathlib import Path
from itertools import product, chain
from multiprocessing import Pool
from Bio import SeqIO

sys.path.append('../src')
from utils import run_cmd

In [81]:
# Base directory for this analysis; every path below is derived from it.
basepath = Path('/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/wgs-2/20211209_Lis_barcode10_R20-0148')

# Input files: the reference FASTA and the gzipped FASTQ holding all reads.
asm = basepath.joinpath('reference.fa')
raw = basepath.joinpath('reads.fastq.gz')

# Folder that will hold FASTQ files; created (with any missing parents) if it
# does not exist yet, and left alone if it does.
samples_dir = basepath.joinpath('fastq')
samples_dir.mkdir(parents=True, exist_ok=True)

In [82]:
# Genome size: summed length of every record parsed from the reference FASTA.
gsize = sum(map(len, SeqIO.parse(asm, 'fasta')))

# Total bases in the read set: the second whitespace-separated field of the
# `nanoq -s` output (stderr is merged into the pipe), picked out with awk.
nanoq_cmd = f"nanoq -s -i {raw} 2>&1 | awk '{{print $2}}'"
stdout, stderr = run_cmd(nanoq_cmd)
total_bases = int(stdout.strip())

# Depth of the complete, un-subsampled read set.
origin_depth = total_bases / gsize

In [83]:
# Report genome size, total bases and the resulting depth, one per line.
# Sizes are scaled by 10e5 (i.e. one million) before formatting.
per_million = 10e5
summary_lines = [
    f'Genome Size: {gsize/per_million:.1f}MB',
    f"Total Bases: {total_bases/per_million:.1f}MB",
    f"Origin Depth: {origin_depth:.1f}x",
]
print('\n'.join(summary_lines))

Genome Size: 2.9MB
Total Bases: 589.2MB
Origin Depth: 202.3x


In [84]:
# Target depths: the perfect squares 3**2 through 20**2 (9 ... 400).
depths = [n ** 2 for n in range(3, 21)]
# Alternative fixed grid (disabled):
# depths = (50, 100, 200, 300)
# Replicate numbers generated for each target depth.
repeats = tuple(range(1, 6))

In [85]:
# Build one shell pipeline per (depth, replicate) pair: rasusa subsamples the
# raw reads to the target depth and pigz compresses the result into
# samples_dir/<depth>x-<repeat>.fastq.gz.
#
# Each pipeline gets its own fixed random seed so that every replicate can be
# regenerated exactly; without -s, rasusa draws a fresh random sample on every
# run and the replicates are not reproducible.
BASE_SEED = 42
cmds = []
for depth, repeat in product(depths, repeats):
    # Only target depths strictly below the depth of the full read set are built.
    if depth < origin_depth:
        outfile = samples_dir/f"{depth:03}x-{repeat}.fastq.gz"
        # Deterministic and distinct per (depth, repeat) as long as repeat < 1000,
        # so replicates differ from each other and depths do not share a seed.
        seed = BASE_SEED + depth * 1000 + repeat
        cmd = f"rasusa -i {raw} -g {gsize} -c {depth} -s {seed} | pigz -9 -p 4 > {outfile}"
        cmds.append(cmd)
len(cmds)

60

In [86]:
# Run every queued command through run_cmd on a pool of 16 worker processes.
pool = Pool(16)
try:
    # map() blocks until every command has been handed to run_cmd and returned.
    pool.map(run_cmd, cmds)
    pool.close()
    pool.join()
except KeyboardInterrupt:
    # Interrupted by the user: stop the workers right away.
    pool.terminate()
finally:
    # Same cleanup the `with Pool(...)` context manager performs on exit.
    pool.terminate()