In [6]:
import os
import subprocess
from pathlib import Path
from io import BytesIO, StringIO
from collections import defaultdict
from multiprocessing import Pool
import pandas as pd

from Bio.Blast.Applications import NcbiblastnCommandline

import sys
sys.path.append('../src')

from utils import run_cmd

In [2]:
dirpath = Path('/media/NAS/MiSeqOutput/210329_M04855_0070_000000000-JDN5K/Data/Intensities/BaseCalls')
outpath = Path('/media/NAS/MiSeqOutput/210329_M04855_0070_000000000-JDN5K/shigatyper')

In [3]:
pairwise = defaultdict(list)
for i in dirpath.iterdir():
    prefix = i.stem.split('_')[0]
    pairwise[prefix].append(i.as_posix())

In [5]:
prog_name = '/home/chen1i6c04/Tools/shigatyper/shigatyper/shigatyper.py'

cmds = []
for prefix, seqfiles in pairwise.items():
    outfile = outpath/prefix
    cmd = f'{prog_name} -n {outfile} {seqfiles[0]} {seqfiles[1]}'
    cmds.append(cmd)

In [6]:
with Pool(16) as p:
    results = p.map(run_cmd, cmds)

In [7]:
summaries = []
for stdout, stderr in results:
    summary = pd.read_csv(BytesIO(stdout), sep='\t')
    summaries.append(summary)

In [8]:
summaries = pd.concat(summaries, ignore_index=True)

In [9]:
summaries['sample'] = [os.path.basename(i) for i in summaries['sample']]

In [10]:
summaries.to_csv('/media/NAS/MiSeqOutput/210329_M04855_0070_000000000-JDN5K/shigatyper.tsv', sep='\t', index=False)

In [62]:
query = '/media/Central_Lab_Storage/bo-han/Database/Escherichia_and_Shigella/Dereplication/ERR1953702.fa'
subject = '/media/GenomicResearch/Tools/shigatyper/shigatyper/resources/ShigellaRef5.fasta'

In [63]:
cline = NcbiblastnCommandline(
    query=query,
    subject=subject,
    outfmt='6 qseqid sseqid pident length qstart qend qlen sstart send slen bitscore',
    perc_identity=95,
)
stdout, stderr = cline()

In [64]:
df = pd.read_csv(StringIO(stdout), sep='\t', names=['qseqid', 'sseqid', 'pident', 'length', 'qstart', 'qend', 'qlen', 'sstart', 'send', 'slen', 'bitscore'])

In [65]:
df.loc[:, 'scov'] = df['length']/df['slen']*100

In [66]:
df.sort_values('bitscore', ascending=False).drop_duplicates('sseqid')

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,qlen,sstart,send,slen,bitscore,scov
6,NODE_204_length_7110_cov_7.237845,cadA,99.069,2148,419,2566,7110,1,2143,2143,3851.0,100.233318
3,NODE_30_length_38185_cov_37.308183,ipaB,99.254,1743,23386,25128,38185,1,1743,1743,3147.0,100.0
8,NODE_448_length_2058_cov_105.824264,gtrI,100.0,1521,131,1651,2058,1,1521,1521,2809.0,100.0
7,NODE_264_length_5486_cov_19.098140,gtrII,99.932,1461,3893,5353,5486,1461,1,1461,2693.0,100.0
1,NODE_27_length_38889_cov_95.095921,Sf_wzx,99.841,1257,18144,19400,38889,1257,1,1257,2311.0,100.0
4,NODE_59_length_23264_cov_4.503770,EclacY,99.681,1254,20586,21839,23264,1254,1,1254,2294.0,100.0
2,NODE_27_length_38889_cov_95.095921,Sf_wzy,99.913,1149,15158,16306,38889,1149,1,1149,2117.0,100.0
5,NODE_101_length_14578_cov_78.422227,Oac1b,100.0,1002,13460,14461,14578,1002,1,1002,1851.0,100.0
11,NODE_1132_length_717_cov_734.240181,ipaH_c,99.721,717,1,717,717,717,1,780,1314.0,91.923077
12,NODE_1198_length_672_cov_1.474878,Sd1_wzy,96.093,691,1,672,672,215,905,1143,1109.0,60.454943
