In [1]:
import os
from pathlib import Path
from multiprocessing import Pool
from tempfile import TemporaryDirectory

In [2]:
import sys
sys.path.append('../src')
from application import VirulencefinderCommandline

In [3]:
def run_virulencefinder(infile, outdir, db_path, database, mincov=0.6, threshold=0.9,
                        script='/media/GenomicResearch/Tools/CGE/virulencefinder/virulencefinder.py',
                        tmp_root='/tmp/'):
    """Run VirulenceFinder on a single input file.

    Creates ``outdir`` if it does not exist, then builds and invokes a
    ``VirulencefinderCommandline`` with a scratch directory that is removed
    when the call finishes.

    Parameters
    ----------
    infile : input file handed to the tool (``infile`` option).
    outdir : directory the results are written to; created on demand.
    db_path : location of the VirulenceFinder database directory.
    database : name of the database to search (e.g. ``'listeria'``).
    mincov : value forwarded as the ``mincov`` option (default 0.6).
    threshold : value forwarded as the ``threshold`` option (default 0.9).
    script : path of the ``virulencefinder.py`` script to execute.
    tmp_root : directory under which the per-run scratch directory is created.

    Returns
    -------
    None. Whatever the command-line object returns is discarded.
    """
    os.makedirs(outdir, exist_ok=True)
    # Each run gets its own scratch directory so parallel runs do not share temp files.
    with TemporaryDirectory(dir=tmp_root) as tempdir:
        # NOTE(review): the 'extented_output' spelling is kept exactly as before;
        # presumably it matches the option name the wrapper expects - confirm.
        cline = VirulencefinderCommandline(
            script,
            infile=infile,
            outdir=outdir,
            db_path=db_path,
            database=database,
            tmp=tempdir,
            extented_output=True,
            mincov=mincov,
            threshold=threshold,
        )
        cline()

db_prefix|name|description
---------|----|-------------
virulence_ecoli|Escherichia coli|Virulence genes for Escherichia coli
virulence_ent|Enterococcus|Virulence genes for Enterococcus
listeria|Listeria|Virulence genes for Listeria
s.aureus_exoenzyme|S. aureus|Exoenzyme genes for S. aureus   
s.aureus_hostimm|S. aureus|Hostimm genes for S. aureus
s.aureus_toxin|S. aureus|Toxin genes for S. aureus
stx|Escherichia coli|Shiga-toxin genes

In [7]:
# Input contigs and per-sample analysis output live side by side under one project folder.
dirpath, outpath = (
    Path('/media/GenomicResearch/MiSeq/Listeria_monocytogenes') / subdir
    for subdir in ('Contigs', 'Analysis')
)
# VirulenceFinder database location and the database to search.
db_path, database = '/media/GenomicResearch/Tools/CGE/virulencefinder_db', 'listeria'

In [11]:
# Single-sample run of VirulenceFinder for isolate R21.1268.
run_virulencefinder(
    infile='/media/GenomicResearch/MiSeq/Listeria_monocytogenes/Contigs/R21.1268.fa',
    outdir='/media/GenomicResearch/MiSeq/Listeria_monocytogenes/Analysis/R21.1268/virulencefinder/',
    db_path=db_path,
    database=database,
)

In [10]:
# Run VirulenceFinder for every entry in `dirpath`, one pool task per input.
with Pool(32) as p:
    pending = {}
    try:
        for i in dirpath.iterdir():
            outdir = outpath/i.stem/'virulencefinder'
            # Keep the AsyncResult: it is the only place a worker exception is stored.
            pending[i.name] = p.apply_async(run_virulencefinder, args=(i, outdir, db_path, database))
        p.close()
        p.join()
    except KeyboardInterrupt:
        p.terminate()
    else:
        # All tasks have finished after join(), so get() returns (or re-raises) immediately.
        # Failures are reported per input instead of aborting the whole batch.
        failed = {}
        for name, result in pending.items():
            try:
                result.get()
            except Exception as exc:
                failed[name] = exc
        for name, exc in failed.items():
            print('virulencefinder failed for', name, '->', repr(exc))

In [None]:
import pandas as pd

In [None]:
def virulence_summary(filepath):
    """Collect the best-scoring virulence factor(s) per contig from a results table.

    The tab-separated file at ``filepath`` must provide the columns
    ``'Query / Template length'``, ``'Contig'``, ``'Identity'`` and
    ``'Virulence factor'``. Coverage is computed as query length divided by
    template length, times 100. For each contig, the rows with the highest
    identity are kept and, among those, the rows with the highest coverage;
    ties are all retained.

    Returns a ``set`` of the selected ``'Virulence factor'`` values.
    """
    hits = pd.read_csv(filepath, sep='\t')

    # "<query> / <template>" -> percentage of the template covered by the query.
    coverage = []
    for lengths in hits['Query / Template length'].str.split(' / '):
        query_len, template_len = lengths
        coverage.append(int(query_len) / int(template_len) * 100)
    hits['Coverage'] = coverage

    best_factors = set()
    for _, contig_hits in hits.groupby('Contig'):
        # Highest identity first, then highest coverage among those rows.
        top_identity = contig_hits['Identity'].max()
        candidates = contig_hits[contig_hits['Identity'] == top_identity]
        top_coverage = candidates['Coverage'].max()
        winners = candidates[candidates['Coverage'] == top_coverage]
        best_factors.update(winners['Virulence factor'])
    return best_factors

In [None]:
# One sub-directory per sample; each is expected to hold a results_tab.tsv.
dirpath = Path('/media/NGS/Data_Analysis/20210401/virulencefinder')
# Map sample directory name -> alphabetically sorted, comma-separated gene list.
summaries = {
    sample_dir.name: ', '.join(sorted(virulence_summary(sample_dir / 'results_tab.tsv')))
    for sample_dir in dirpath.iterdir()
}

In [None]:
# Series of gene lists, named 'virulence gene', with the sample names as an index called 'Key'.
s = pd.Series(summaries, name='virulence gene').rename_axis('Key')

In [None]:
# Save the per-sample summary as a tab-separated file.
s.to_csv(
    '/media/NGS/Data_Analysis/20210401/virulencefinder.tsv',
    sep='\t',
)

In [None]:
# Load the tab-separated summary table as a DataFrame.
df = pd.read_csv(
    '/media/NGS/Data_Analysis/20210401/virulencefinder.tsv',
    sep='\t',
)

In [None]:
# Split each sample's comma-separated gene list into its first 15 entries and the remainder.
# An empty gene list is loaded by read_csv as NaN, which cannot be sliced, so missing
# values are replaced with '' before splitting; such samples yield ('', '').
split_data = [
    (', '.join(i[:15]), ', '.join(i[15:]))
    for i in df['virulence gene'].fillna('').str.split(', ')
]

In [None]:
# Unpack the (first part, remainder) pairs into two separate columns.
df['virulence gene 1'] = [first for first, _ in split_data]
df['virulence gene 2'] = [rest for _, rest in split_data]

In [None]:
# Write the table as a tab-separated file; index=False leaves the row index out.
df.to_csv(
    '/media/NGS/Data_Analysis/20210401/virulencefinder.tsv',
    sep='\t',
    index=False,
)