In [55]:
import os
import sys
import json
import shutil
from pathlib import Path
from collections import defaultdict
from tempfile import TemporaryDirectory
from concurrent.futures import ProcessPoolExecutor

import pandas as pd

sys.path.append('../src')
from command import run_cmd

In [2]:
def run_amrfinder(infile, outfile, database, threads=2):
    """Predict proteins with prodigal, then screen them with amrfinder.

    Parameters
    ----------
    infile : path-like
        Input passed to ``prodigal -i``.
    outfile : path-like
        Report path passed to ``amrfinder -o``.
    database : path-like
        Database location passed to ``amrfinder -d``.
    threads : int, default 2
        Value passed to ``amrfinder --threads``.

    Notes
    -----
    Whatever ``run_cmd`` raises on failure propagates to the caller.
    NOTE(review): arguments are interpolated into the command string
    unquoted, so paths containing whitespace may break the command --
    confirm how ``run_cmd`` tokenises its argument.
    """
    # The context manager removes the scratch directory as soon as both
    # commands have finished (or one of them has raised), instead of leaving
    # cleanup to garbage collection of the TemporaryDirectory object.
    with TemporaryDirectory(dir='/tmp') as tmp_dir:
        tmp_file = os.path.join(tmp_dir, 'locus.faa')
        run_cmd(f'prodigal -i {infile} -a {tmp_file}')
        run_cmd(f'amrfinder -p {tmp_file} -o {outfile} -d {database} --threads {threads}')
    
def run_mlst(infile, outdir, database, species):
    """Run ``mlst.py`` on ``infile`` and write its results into ``outdir``.

    Parameters
    ----------
    infile : path-like
        Input passed to ``mlst.py -i``.
    outdir : path-like
        Output directory passed to ``-o``; created if it does not exist.
    database : path-like
        Database location passed to ``-p``.
    species : str
        Scheme name passed to ``-s``.

    Notes
    -----
    Whatever ``run_cmd`` raises on failure propagates to the caller.
    """
    os.makedirs(outdir, exist_ok=True)
    # Scratch space for the tool (``-t``); the context manager removes it
    # deterministically, even when the command fails.
    with TemporaryDirectory(dir='/tmp') as tmp_dir:
        run_cmd(f'mlst.py -i {infile} -o {outdir} -p {database} -s {species} -t {tmp_dir} -x')
    
def run_plasmidfinder(infile, outdir, database):
    """Run ``plasmidfinder.py`` on ``infile`` and write its results into ``outdir``.

    Parameters
    ----------
    infile : path-like
        Input passed to ``plasmidfinder.py -i``.
    outdir : path-like
        Output directory passed to ``-o``; created if it does not exist.
    database : path-like
        Database location passed to ``-p``.

    Notes
    -----
    Whatever ``run_cmd`` raises on failure propagates to the caller.
    """
    os.makedirs(outdir, exist_ok=True)
    # Scratch space for the tool (``-tmp``); the context manager removes it
    # deterministically, even when the command fails.
    with TemporaryDirectory(dir='/tmp') as tmp_dir:
        run_cmd(f'plasmidfinder.py -i {infile} -o {outdir} -p {database} -tmp {tmp_dir} -x')
    
def run_pointfinder(infile, outdir, database, species, blastn_path='/usr/bin/blastn'):
    """Run ``pointfinder.py`` (blastn mode) on ``infile``, writing into ``outdir``.

    Parameters
    ----------
    infile : path-like
        Input passed to ``pointfinder.py -i``.
    outdir : path-like
        Output directory passed to ``-o``; created if it does not exist.
    database : path-like
        Database location passed to ``-p``.
    species : str
        Organism name passed to ``-s``.
    blastn_path : str, default '/usr/bin/blastn'
        Executable passed to ``-m_p``.

    Notes
    -----
    Whatever ``run_cmd`` raises on failure propagates to the caller; in that
    case ``outdir/tmp`` is left untouched.
    """
    os.makedirs(outdir, exist_ok=True)
    run_cmd(f'pointfinder.py -i {infile} -o {outdir} -p {database} -s {species} -m blastn -m_p {blastn_path}')
    # Remove the ``tmp`` sub-directory found inside outdir after the run
    # (presumably scratch output of pointfinder.py -- TODO confirm). Guarded
    # so that a run which did not create it does not fail with
    # FileNotFoundError after the analysis itself succeeded.
    scratch = os.path.join(outdir, 'tmp')
    if os.path.isdir(scratch):
        shutil.rmtree(scratch)

In [6]:
# Database locations handed to the external tools by `fn` below.
# NOTE(review): these are absolute, machine-specific paths -- adjust them when
# running the notebook on another host.
AMRFINDER_DB = '/home/chen1i6c04/Tools/amr/database/latest'
# MLST_DB and POINTFINDER_DB are each expected to contain a tab-separated
# `config` file; the check_* helpers below read it to validate names.
MLST_DB = '/media/NGS/Data_Analysis/CGE/mlst_db'
PLASMIDFINDER_DB = '/media/NGS/Data_Analysis/CGE/plasmidfinder_db/'
POINTFINDER_DB = '/media/NGS/Data_Analysis/CGE/pointfinder_db/'


def check_mlst_species(species):
    """Validate ``species`` against the ``config`` file of the MLST database.

    The file ``<MLST_DB>/config`` is read as tab-separated text whose header
    is row 3 (0-based); ``species`` must appear in its ``# species_db`` column.

    Parameters
    ----------
    species : str
        Scheme name to look up.

    Raises
    ------
    ValueError
        If ``species`` is not listed in the config file.
    """
    config_path = os.path.join(MLST_DB, 'config')
    config = pd.read_csv(config_path, sep='\t', header=3)
    if species not in set(config['# species_db']):
        # A bare `raise` here would only produce "RuntimeError: No active
        # exception to reraise", which hides what actually went wrong.
        raise ValueError(
            f'Unknown MLST species {species!r}: not listed in {config_path}'
        )
        
def check_pointfinder_organism(organism):
    """Validate ``organism`` against the ``config`` file of the PointFinder database.

    The file ``<POINTFINDER_DB>/config`` is read as tab-separated text whose
    header is row 3 (0-based); ``organism`` must appear in its ``#db_prefix``
    column.

    Parameters
    ----------
    organism : str
        Organism name to look up.

    Raises
    ------
    ValueError
        If ``organism`` is not listed in the config file.
    """
    config_path = os.path.join(POINTFINDER_DB, 'config')
    config = pd.read_csv(config_path, sep='\t', header=3)
    if organism not in set(config['#db_prefix']):
        # A bare `raise` here would only produce "RuntimeError: No active
        # exception to reraise", which hides what actually went wrong.
        raise ValueError(
            f'Unknown PointFinder organism {organism!r}: not listed in {config_path}'
        )

def fn(infile, outdir, mlst_species=None, pointfinder_organism=None):
    """Run the per-sample analyses on ``infile`` and collect results in ``outdir``.

    AMRFinder and PlasmidFinder always run; MLST and PointFinder run only when
    the corresponding name is given.

    Parameters
    ----------
    infile : path-like
        Input file handed to every tool.
    outdir : path-like
        Directory to create for this sample. Results go to ``mlst/``,
        ``pointfinder/``, ``amrfinder.txt`` and ``plasmidfinder/`` inside it.
    mlst_species : str, optional
        Name checked with ``check_mlst_species``; falsy skips MLST.
    pointfinder_organism : str, optional
        Name checked with ``check_pointfinder_organism``; falsy skips PointFinder.

    Raises
    ------
    FileExistsError
        If ``outdir`` already exists (existing results are never overwritten).
    """
    # Validate both names up front: a typo must fail before the output
    # directory is created, otherwise the leftover empty directory makes the
    # corrected re-run fail with FileExistsError.
    if mlst_species:
        check_mlst_species(mlst_species)
    if pointfinder_organism:
        check_pointfinder_organism(pointfinder_organism)

    os.makedirs(outdir)
    if mlst_species:
        run_mlst(infile, os.path.join(outdir, 'mlst'), MLST_DB, mlst_species)
    if pointfinder_organism:
        run_pointfinder(infile, os.path.join(outdir, 'pointfinder'), POINTFINDER_DB, pointfinder_organism)
    run_amrfinder(infile=infile, outfile=os.path.join(outdir, 'amrfinder.txt'), database=AMRFINDER_DB)
    run_plasmidfinder(infile, os.path.join(outdir, 'plasmidfinder'), PLASMIDFINDER_DB)

In [7]:
dirpath = Path('/media/NGS/Data_Analysis/20191128_CDC_MiSeq/Campylobacter/Contigs')
outpath = Path('/media/NGS/Data_Analysis/20191128_CDC_MiSeq/Campylobacter/Analyze')

# One task per entry in `dirpath`; results land in `outpath/<file stem>`.
# Each task also starts amrfinder with its default of 2 threads, so the
# machine load is higher than the 32 worker processes alone suggest.
with ProcessPoolExecutor(32) as executor:
    futures = {
        executor.submit(fn, contig, outpath/contig.stem, 'campylobacter', 'campylobacter'): contig
        for contig in sorted(dirpath.iterdir())
    }

# Leaving the `with` block waits for every task. Exceptions raised in a worker
# are stored on the future and stay invisible unless they are asked for, so
# report them here instead of silently producing incomplete output.
failed = {}
for future, contig in futures.items():
    error = future.exception()
    if error is not None:
        failed[contig.name] = error
        print(f'{contig.name}: {error!r}', file=sys.stderr)
if failed:
    print(f'{len(failed)} of {len(futures)} samples failed', file=sys.stderr)

In [8]:
def amr_profile_summary(source):
    """Summarise the AMR genes of one tab-separated report, grouped by class.

    Parameters
    ----------
    source : path-like or file-like
        Tab-separated table with at least the columns ``Gene symbol``,
        ``Element type`` and ``Class``.

    Returns
    -------
    pandas.Series
        Indexed by ``Class``; each value is the ``', '``-joined list of the
        gene symbols in that class.

    Notes
    -----
    Duplicate gene symbols are dropped (first row kept) *before* the
    ``Element type == 'AMR'`` filter is applied.
    """
    table = pd.read_csv(source, sep='\t').drop_duplicates('Gene symbol')
    amr_rows = table.loc[table['Element type'] == 'AMR']
    genes_by_class = amr_rows.groupby('Class')['Gene symbol']
    return genes_by_class.apply(', '.join)

In [16]:
# One AMR profile (a Series indexed by class) per sample directory. The
# directories are sorted because iterdir() yields them in arbitrary order,
# which would make the row order of the exported table irreproducible.
amr_profiles = [
    amr_profile_summary(sample_dir/'amrfinder.txt').rename(sample_dir.name)
    for sample_dir in sorted(outpath.iterdir())
]
# Samples become rows, classes become columns (title-cased for the report).
summaries = pd.concat(amr_profiles, axis=1).T
summaries.columns = summaries.columns.str.title()
summaries.index.name = 'Key'

In [17]:
# Export the current `summaries` table as AMR.csv. The name `summaries` is
# reused by later cells, so this must run right after the AMR summary cell.
summaries.to_csv('/media/NGS/Data_Analysis/20191128_CDC_MiSeq/Campylobacter/AMR.csv')

In [18]:
def MlstParser(source):
    """Flatten the MLST result stored in a JSON file into a single dict.

    Parameters
    ----------
    source : path-like
        JSON file containing ``mlst -> results`` with the keys
        ``sequence_type`` and ``allele_profile``.

    Returns
    -------
    dict
        ``'ST'`` mapped to the sequence type, followed by one entry per locus
        of ``allele_profile`` mapped to that locus's ``allele`` value.
    """
    with open(source) as handle:
        results = json.load(handle)['mlst']['results']
    profile = {'ST': results['sequence_type']}
    profile.update(
        (locus, details['allele'])
        for locus, details in results['allele_profile'].items()
    )
    return profile

In [21]:
# One MLST profile per sample directory, keyed by directory name. Sorted
# because iterdir() yields entries in arbitrary order, which would make the
# row order of the exported table irreproducible.
summaries = {
    sample_dir.name: MlstParser(sample_dir/'mlst'/'data.json')
    for sample_dir in sorted(outpath.iterdir())
}
# Samples become rows; 'ST' and the loci become columns.
df = pd.DataFrame(summaries).T
df.index.name = 'Key'

In [23]:
# Export the MLST table held in `df` as MLST.csv.
df.to_csv('/media/NGS/Data_Analysis/20191128_CDC_MiSeq/Campylobacter/MLST.csv')

In [32]:
# One comma-separated list of distinct plasmid names per sample directory.
# Sorted because iterdir() yields entries in arbitrary order.
summaries = dict()
for sample_dir in sorted(outpath.iterdir()):
    hits = pd.read_csv(sample_dir/'plasmidfinder'/'results_tab.tsv', sep='\t')
    # dropna() guards ', '.join, which raises TypeError on a missing value.
    plasmids = hits['Plasmid'].dropna().drop_duplicates()
    summaries[sample_dir.name] = ', '.join(plasmids)
s = pd.Series(summaries, name='Plasmid')
s.index.name = 'Key'

In [33]:
# Export the plasmid summary held in `s` as Plasmid.csv.
s.to_csv('/media/NGS/Data_Analysis/20191128_CDC_MiSeq/Campylobacter/Plasmid.csv')

In [78]:
# For every sample directory, gather the entries of the 'Mutation' column of
# all pointfinder/*.tsv files, grouped by gene. Each entry is expected to be
# '<gene> <position>' separated by a single space. Directories and files are
# sorted because iterdir()/glob() yield them in arbitrary order, which would
# make the exported table irreproducible.
mutations_by_sample = dict()
for sample_dir in sorted(outpath.iterdir()):
    positions_by_gene = defaultdict(list)
    for results_file in sorted((sample_dir/'pointfinder').glob('*.tsv')):
        pf = pd.read_csv(results_file, sep='\t')
        for gene, position in pf['Mutation'].str.split(' '):
            positions_by_gene[gene].append(position)
    mutations_by_sample[sample_dir.name] = {
        gene: ', '.join(positions)
        for gene, positions in positions_by_gene.items()
    }

# Samples become rows, genes become columns.
summaries = pd.DataFrame(mutations_by_sample).T
summaries.index.name = 'Key'

In [79]:
# Export the current `summaries` table as Point-mutations.csv. The name
# `summaries` is reused across cells, so this must run right after the
# point-mutation summary cell.
summaries.to_csv('/media/NGS/Data_Analysis/20191128_CDC_MiSeq/Campylobacter/Point-mutations.csv')