In [1]:
import os
import sys
import json
import shutil
from pathlib import Path
from tempfile import TemporaryDirectory
from multiprocessing import Pool
import pandas as pd

In [2]:
# Add the project's source directory to the import search path. The path is
# relative, so it is resolved against the kernel's current working directory.
sys.path.append('../../src')

# Project-local helpers (presumably located via the path added above — confirm).
from run import run_amrfinder, run_mlst, run_plasmidfinder
from utils import run_cmd

In [4]:
def pipeline(
    infile,
    outdir,
    threads=4,
    amrfinder_db='/media/GenomicResearch/Tools/amrfinder_database/latest/',
    mlst_db='/media/GenomicResearch/Tools/CGE/mlst_db',
    plasmidfinder_db='/media/GenomicResearch/Tools/CGE/plasmidfinder_db',
    amrfinder_organism='Vibrio_cholerae',
    mlst_scheme='vcholerae',
):
    """Run the AMRFinder, MLST and PlasmidFinder wrappers on one input file.

    The output directory is created if needed and each tool gets its own
    target inside it: ``amrfinder.txt``, ``mlst`` and ``plasmidfinder``.

    Parameters
    ----------
    infile : path-like
        Input file handed unchanged to all three wrappers (presumably an
        assembled-contigs FASTA — confirm against the wrappers).
    outdir : path-like
        Directory that receives the results; created with ``exist_ok=True``.
    threads : int, default 4
        Forwarded to ``run_amrfinder`` only.
    amrfinder_db, mlst_db, plasmidfinder_db : str
        Database locations passed to the respective wrapper. The defaults are
        the paths this notebook was originally written against.
    amrfinder_organism : str, default 'Vibrio_cholerae'
        Forwarded as ``organism`` to ``run_amrfinder``.
    mlst_scheme : str, default 'vcholerae'
        Forwarded as the fourth positional argument of ``run_mlst``
        (presumably the species/scheme name — confirm).

    Returns
    -------
    None
        Whatever the wrappers return is discarded.
    """
    os.makedirs(outdir, exist_ok=True)

    amrfinder_filename = os.path.join(outdir, 'amrfinder.txt')
    mlst_dirname = os.path.join(outdir, 'mlst')
    plasmidfinder_dirname = os.path.join(outdir, 'plasmidfinder')

    run_amrfinder(
        infile,
        amrfinder_filename,
        database=amrfinder_db,
        threads=threads,
        organism=amrfinder_organism,
        plus=True,
    )
    run_mlst(infile, mlst_dirname, mlst_db, mlst_scheme)
    run_plasmidfinder(infile, plasmidfinder_dirname, plasmidfinder_db)

In [6]:
# Directory iterated by the pipeline-launch cell; every entry in it is
# submitted as one input file.
dirpath = Path('/media/GenomicResearch/MiSeq/Vibrio_cholerae/Contigs')
# Root under which one result sub-directory per input (named by file stem) is written.
outpath = Path('/media/GenomicResearch/MiSeq/Vibrio_cholerae/Analysis')

In [8]:
# Submit one pipeline job per entry in `dirpath` to a pool of 32 worker
# processes (each job is asked to use 4 threads).
#
# The AsyncResult handles are kept: apply_async never raises in the parent by
# itself, so without them an exception inside a worker would vanish silently.
with Pool(32) as p:
    pending = {}
    try:
        for filepath in dirpath.iterdir():
            outdir = outpath/filepath.stem
            pending[filepath.name] = p.apply_async(
                pipeline, (filepath, outdir), {'threads': 4}
            )
        p.close()
        p.join()
    except BaseException:
        # Stop the workers on any failure or interrupt, but let the error
        # propagate instead of hiding it.
        p.terminate()
        raise

# All jobs have finished after join(); get() now returns immediately or
# re-raises the exception captured in the worker. Failures are reported per
# sample so that one bad input does not hide the others.
failed = {}
for name, result in pending.items():
    try:
        result.get()
    except Exception as exc:
        failed[name] = exc
        print(f'{name}: pipeline failed: {exc!r}', file=sys.stderr)

In [9]:
from parse import parse_amrfinder_result, parse_mlst_result, parse_plasmidfinder_result

In [10]:
# Build one summary row per sample from the result files written by the
# pipeline. The results directory gets its own name so that `dirpath` (the
# contigs directory used by the pipeline-launch cell) is not silently rebound
# when cells are re-run out of order.
analysis_dir = Path('/media/GenomicResearch/MiSeq/Vibrio_cholerae/Analysis')

summaries = dict()
for sample_dir in analysis_dir.iterdir():
    # Only sub-directories hold per-sample results; skip stray files.
    if not sample_dir.is_dir():
        continue

    amrfinder_filename = os.path.join(sample_dir, 'amrfinder.txt')
    mlst_filename = os.path.join(sample_dir, 'mlst', 'data.json')
    plasmidfinder_filename = os.path.join(sample_dir, 'plasmidfinder', 'results_tab.tsv')

    # Materialise the records: they are scanned once per subtype below, and a
    # one-shot iterator would leave the second scan (POINT) empty.
    amr_records = list(parse_amrfinder_result(amrfinder_filename))

    summary = dict()
    for subtype in ('AMR', 'POINT'):
        summary[subtype] = ', '.join(sorted(
            record['gene_symbol']
            for record in amr_records
            if record['element_subtype'] == subtype
        ))
    # Merge whatever key/value pairs the MLST parser returns into the row.
    summary.update(parse_mlst_result(mlst_filename))
    plasmid_records = parse_plasmidfinder_result(plasmidfinder_filename)
    summary['Inc type'] = ', '.join(sorted(plasmid_records))

    # Rows are keyed by the sample directory name.
    summaries[sample_dir.name] = summary

In [11]:
# Samples become rows (transpose of the dict-of-dicts frame), the index is
# labelled 'Key', and rows are ordered by sample name.
df = (
    pd.DataFrame(summaries)
    .T
    .rename_axis('Key')
    .sort_index()
)
df.head()

Unnamed: 0_level_0,AMR,POINT,ST,purM,pyrC,mdh,pntA,metE,adk,gyrB,Inc type
Key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
569B,"almG, varG",,73,1,38,4,12,9,7,11,
C12.0861,"almG, qnrVC, varG",,75,1,38,4,12,37,7,2,
C12.0862,"almG, qnrVC, varG",,75,1,38,4,12,37,7,2,
C12.0876,"almG, qnrVC, varG",,75,1,38,4,12,37,7,2,
C12.0927,"almG, qnrVC, varG",,75,1,38,4,12,37,7,2,


In [12]:
# Save the summary table as tab-separated text (the index is written too,
# which is the pandas default).
df.to_csv('/media/GenomicResearch/MiSeq/Vibrio_cholerae/summaries.txt', sep='\t')