In [1]:
import os
import sys
import json
import subprocess
from pathlib import Path
from tempfile import TemporaryDirectory
from concurrent.futures import ProcessPoolExecutor
import pandas as pd

In [2]:
# Make ../src importable; presumably this is where the project-local
# `application` module imported below lives — TODO confirm.
# The path is relative, so it resolves against the kernel's working directory.
sys.path.append('../src')

In [3]:
from application import SerotypefinderCommandline

In [4]:
def run_serotypefinder(infile, outdir, db_path):
    """Build and invoke a ``SerotypefinderCommandline`` for one input file.

    The output directory is created if it does not exist yet. A scratch
    directory is created under ``/tmp/`` for the duration of the call and is
    removed when the call returns or raises.

    Parameters
    ----------
    infile
        Forwarded to the command line wrapper as ``infile``.
    outdir
        Directory for results; created (including parents) if missing and
        forwarded as ``outdir``.
    db_path
        Forwarded to the command line wrapper as ``db_path``.
    """
    os.makedirs(outdir, exist_ok=True)
    with TemporaryDirectory(dir='/tmp/') as scratch_dir:
        # NOTE(review): `extented_output` is spelled exactly as the wrapper is
        # called in this notebook — confirm it matches the wrapper's keyword.
        options = dict(
            infile=infile,
            outdir=outdir,
            db_path=db_path,
            tmp=scratch_dir,
            extented_output=True,
        )
        SerotypefinderCommandline(**options)()

In [3]:
# Locations used by the batch run in the next cell.
# Database location, handed to every job as `db_path`.
database = '/media/NGS/Data_Analysis/CGE/serotypefinder_db'
# Parent of the per-input output directories (one sub-directory per input stem).
outpath = Path('/media/NGS/Data_Analysis/20210401/serotypefinder')
# Every entry of this directory is submitted as one input file.
dirpath = Path('/media/NGS/Data_Analysis/20210401/Contigs')

In [4]:
# Run one job per entry of `dirpath`, using up to 48 worker processes.
with ProcessPoolExecutor(48) as executor:
    futures = {}
    for filepath in dirpath.iterdir():
        # Each input gets its own output directory named after the file stem.
        outdir = outpath/filepath.stem
        # Submit `run_serotypefinder` (the function defined in this notebook);
        # no function called `run` exists here.
        futures[filepath.name] = executor.submit(run_serotypefinder, filepath, outdir, database)

# Leaving the `with` block waits for every job to finish. Exceptions raised
# inside workers are stored on their futures and would otherwise go unnoticed,
# so collect and report them instead of silently dropping them.
failed = {
    name: future.exception()
    for name, future in futures.items()
    if future.exception() is not None
}
for name, error in failed.items():
    print(f'serotypefinder failed for {name}: {error!r}', file=sys.stderr)

In [24]:
# Load one sample's results table and keep, for each value of `Database`,
# the row that comes first after sorting by `Identity` in descending order.
df = (
    pd.read_csv(
        '/media/NGS/Data_Analysis/20210401_NHRI_Ecoli/Analysis/eco-037/serotypefinder/results_tab.tsv',
        sep='\t',
    )
    .sort_values('Identity', ascending=False)
    .drop_duplicates('Database')
)
df

Unnamed: 0,Database,Gene,Serotype,Identity,Template / HSP length,Contig,Position in contig,Accession number
0,H_type,fliC,H7,100.0,1758 / 1758,NODE_3_length_518691_cov_14.001030_pilon,152753..154510,AF228492
1,O_type,wzy,O18,99.92,1191 / 1191,NODE_18_length_104213_cov_14.989634_pilon,57788..58978,GU299793


In [19]:
def serotrype_summary(filepath):
    """Summarise the serotype calls of one tab-separated results table.

    Rows are de-duplicated on ``Serotype`` and grouped by ``Database``. Within
    each group the serotype strings are split on ``/``, de-duplicated, sorted
    as plain strings and joined with ``' or '``.

    Parameters
    ----------
    filepath
        Anything ``pandas.read_csv`` accepts; the table must contain the
        columns ``Database`` and ``Serotype``.

    Returns
    -------
    dict
        Maps each ``Database`` value to its joined serotype string, e.g.
        ``{'H_type': 'H14', 'O_type': 'O129 or O13 or O135'}``.
    """
    def _split_candidates(serotypes):
        # Glue all entries together first so that slash-separated entries
        # ("O13/O135") and plain entries are split the same way.
        return serotypes.str.cat(sep='/').split('/')

    def _join_unique(candidates):
        return ' or '.join(sorted(set(candidates)))

    table = pd.read_csv(filepath, sep='\t').drop_duplicates('Serotype')
    per_database = table.groupby('Database')['Serotype'].agg(_split_candidates)
    return per_database.apply(_join_unique).to_dict()

In [20]:
# Spot-check the summary function on a single results table.
serotrype_summary('/media/NGS/MiSeqAnalysis/Shigella/Analysis/R20.0084/serotypefinder/results_tab.tsv')

{'H_type': 'H14', 'O_type': 'O129 or O13 or O135'}

In [55]:
# Rebinds `dirpath` (earlier the Contigs input directory) to the directory
# that holds one output sub-directory per input file.
dirpath = Path('/media/NGS/Data_Analysis/20210401/serotypefinder')

In [56]:
# Summarise the results table of every sub-directory, keyed by directory name.
summaries = {
    sample_dir.name: serotrype_summary(sample_dir/'results_tab.tsv')
    for sample_dir in dirpath.iterdir()
}

In [57]:
# The outer dict keys (directory names) become columns, so transpose to get one row per key.
# NOTE(review): this rebinds `summaries` from a dict to a DataFrame, so
# re-running this cell on its own transposes the frame again.
summaries = pd.DataFrame(summaries).T

In [58]:
# Display the summary table: one row per outer key, one column per summary key.
summaries

Unnamed: 0,H_type,O_type
eco-053,H5,O75
eco-081,H4,O25
eco-123,H4,O38
eco-073,H5,O170
eco-069,H7,O174
...,...,...
eco-131,H7,O83
eco-105,H4,O129 or O13 or O135
eco-025,H31,O6
eco-016,H6,


In [61]:
# Label the index as 'ID' and order the rows by it.
summaries = summaries.rename_axis('ID').sort_index()

In [62]:
# Write the table, including its index, as a tab-separated file.
summaries.to_csv('/media/NGS/Data_Analysis/20210401/serotypefinder.tsv', sep='\t')