In [1]:
import subprocess
from pathlib import Path
from multiprocessing import Pool
from itertools import product
from concurrent.futures import ProcessPoolExecutor
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def simulate_process(reference, quantity, qscore_model, error_model, mean_length, stdev_length, threads=8):
    """Start ``rustyread simulate`` and return the running process.

    Parameters
    ----------
    reference : value passed to ``--reference``.
    quantity : value passed to ``--quantity`` (callers in this notebook pass e.g. ``"4x"``).
    qscore_model : value passed to ``--qscore_model``.
    error_model : value passed to ``--error_model``.
    mean_length, stdev_length : joined as ``"<mean>,<stdev>"`` for ``--length``.
    threads : value passed to the global ``--threads`` option (default 8).

    Returns
    -------
    subprocess.Popen
        The child process, with its standard output connected to a pipe
        (``.stdout``) so the caller can stream it into another program.
        The caller is responsible for waiting on the process.
    """
    length_spec = f"{mean_length},{stdev_length}"

    # Options that precede the sub-command name.
    global_opts = ['--threads', str(threads)]
    # Options that belong to the `simulate` sub-command.
    simulate_opts = [
        '--reference', reference,
        '--quantity', quantity,
        '--qscore_model', qscore_model,
        '--length', length_spec,
        '--error_model', error_model,
        '--number_base_store', '500x',
    ]

    argv = ['rustyread', *global_opts, 'simulate', *simulate_opts]
    return subprocess.Popen(argv, stdout=subprocess.PIPE)

In [9]:
def classify(classifier, database, output, report_output, threads, stdin):
    """Run a taxonomic classifier on reads streamed through ``stdin``.

    Parameters
    ----------
    classifier : ``'kraken2'`` or ``'centrifuge'``; also used as the executable name.
    database : database (kraken2 ``--db``) or index prefix (centrifuge ``-x``).
    output : file that receives the per-read classification results.
    report_output : file that receives the summary report.
    threads : number of worker threads handed to the classifier.
    stdin : file object (e.g. another process's ``stdout`` pipe) from which
        the classifier reads its input.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names.
    subprocess.CalledProcessError
        If the classifier exits with a non-zero status.
    """
    # Paths may arrive as pathlib.Path objects; normalise them to plain strings.
    database, output, report_output = str(database), str(output), str(report_output)

    if classifier == 'kraken2':
        cmd = [
            classifier, '--db', database, '--memory-mapping', '--threads', str(threads),
            '--output', output, '--report', report_output,
            '/dev/fd/0',  # read the sequences from the inherited standard input
        ]
    elif classifier == 'centrifuge':
        cmd = [
            classifier, '--mm', '-p', str(threads), '-x', database,
            # Without these two options the caller-supplied output paths were
            # ignored and the results went to stdout / a default report file.
            '-S', output, '--report-file', report_output,
            '-U', '-',
        ]
    else:
        raise ValueError(
            f"{classifier} is not a supported classifier (expected 'kraken2' or 'centrifuge')."
        )

    # check=True so that a failed classification is not mistaken for a result.
    subprocess.run(cmd, stdin=stdin, check=True)

In [4]:
# Classifier database; handed to kraken2 as its `--db` argument.
database = '/media/GenomicResearch/Issue/kraken2_db/k2_pluspf_20210517'
# rustyread model files, passed as `--qscore_model` and `--error_model`.
# NOTE(review): the two file names differ in style ("nanopore_2020" vs
# "nanopore2020") — confirm both exist under these exact names.
qscore_model = '/media/GenomicResearch/Tools/rustyread_model/qscore_models/nanopore_2020'
error_model = '/media/GenomicResearch/Tools/rustyread_model/error_models/nanopore2020'

In [5]:
# Reference FASTA from which rustyread simulates reads (its `--reference` argument).
reference = '/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/sim/LT2/reference.fa'

In [6]:
# Simulated depths: the squares of 1..4, i.e. 1x, 4x, 9x and 16x.
depths = [n * n for n in range(1, 5)]

In [7]:
# Mean read lengths to simulate: 1000 to 5000 in steps of 1000.
lengths = [k * 1000 for k in range(1, 6)]

In [8]:
# Directory that receives the per-run classifier output (`.out`) and report (`.txt`) files.
outpath = Path('/media/GenomicResearch/Issue/classification_benchmark/test')

In [12]:
def fn(depth, length, output, report, threads=6):
    """Simulate reads at one depth/length setting and classify them with kraken2.

    The simulator's standard output is piped straight into the classifier, so
    no intermediate read file is written.

    Parameters
    ----------
    depth : simulated depth; passed to the simulator as ``"<depth>x"``.
    length : mean read length; the standard deviation is ``int(length * 1.4)``.
    output : file for the per-read classification results.
    report : file for the classification report.
    threads : thread count given to both the simulator and the classifier.

    Raises
    ------
    subprocess.CalledProcessError
        If the simulator exits with a non-zero status.
    """
    stdev = int(length * 1.4)
    # Using the Popen object as a context manager closes the pipe and waits for
    # the simulator on exit, so no zombie process or open descriptor is left
    # behind in the long-lived worker.
    with simulate_process(reference, f"{depth}x", qscore_model, error_model, length, stdev, threads) as p:
        classify('kraken2', database, output, report, threads, p.stdout)
    # A failed simulation would otherwise leave a report built from truncated input.
    if p.returncode != 0:
        raise subprocess.CalledProcessError(p.returncode, p.args)
    

# The classifier writes into this directory; make sure it exists up front.
outpath.mkdir(parents=True, exist_ok=True)

# Run every depth x read-length combination, four jobs at a time.
with ProcessPoolExecutor(4) as executor:
    futures = []
    for depth, length in product(depths, lengths):
        output = outpath/f"{depth}x_{length}bp.out"
        report = outpath/f"{depth}x_{length}bp.txt"
        futures.append(executor.submit(fn, depth, length, output, report, 16))

# An exception raised inside a worker is stored on its future and stays
# invisible unless the result is requested; re-raise the first failure here.
for future in futures:
    future.result()

In [None]:
# Taxon name looked up (after whitespace stripping) in column 5 of each report.
organism = 'Salmonella enterica'

In [None]:
dirpath = Path('/media/GenomicResearch/Issue/classification_benchmark/kraken2')

# Collect one (depth, read_length, abundance) tuple per report file.
# NOTE(review): every regular file in the directory is parsed as a report —
# presumably it holds only the tab-separated kraken2 report files; confirm.
data = []
for report_path in sorted(dirpath.iterdir()):  # sorted: reproducible row order
    if not report_path.is_file():
        # Skip sub-directories (e.g. notebook checkpoint folders).
        continue
    df = pd.read_csv(report_path, sep='\t', header=None)
    # Column 5 holds the (indented) taxon name, column 0 the value used as abundance.
    hits = df.loc[df[5].str.strip() == organism, 0]
    # A taxon with no row in the report received no reads: record 0 instead of
    # failing with IndexError.
    abundance = hits.iloc[0] if not hits.empty else 0.0
    # File stems look like "<depth>x_<length>bp".
    depth, length = report_path.stem.split('_')
    data.append((depth, length, abundance))

In [None]:
# One row per report file. At this point `depth` and `read_length` are still the
# raw strings split from the file stem; `abundance` is the value read from column 0.
result = pd.DataFrame(data, columns=['depth', 'read_length', 'abundance'])

In [None]:
# Strip the unit suffixes and convert to integers. The leading astype(str)
# makes this cell safe to re-run after the columns have already been converted.
depth_values = result['depth'].astype(str).str.replace('x', '', regex=False).astype(int)
read_lengths = result['read_length'].astype(str).str.replace('bp', '', regex=False).astype(int)

# Sort on the numeric depth (a zero-padded string sort breaks for depths with
# more than three digits), then store depth as a string label again so the
# plot keeps treating it as a categorical axis.
result = (
    result
    .assign(depth=depth_values, read_length=read_lengths)
    .sort_values('depth', kind='mergesort')
    .assign(depth=lambda frame: frame['depth'].astype(str))
)

In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# removed the old names; use whichever spelling this installation provides.
whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid') if name in plt.style.available),
    'seaborn-whitegrid',
)

# Abundance of the target organism against simulated depth, one line per read length.
with plt.style.context(whitegrid_style):
    fig, ax = plt.subplots(figsize=(12, 4), dpi=87)
    sns.lineplot(x='depth', y='abundance', data=result, hue='read_length', palette='Set1', ax=ax)
    ax.set_xlabel('Depth')
    ax.set_ylabel('Abundance(%)')
    ax.set_ylim(0, 100)
    # Centre the legend on a point just outside the right edge of the axes.
    ax.legend(bbox_to_anchor=(1.1, 0.5), title='read length', loc='center')

In [None]:
# Write the figure to disk with a white background and a tight bounding box, at 150 dpi.
fig.savefig('/media/GenomicResearch/Issue/classification_benchmark/results.png', facecolor='w', bbox_inches='tight', dpi=150)