In [1]:
import os
import sys
import json
import shutil
from pathlib import Path
from tempfile import TemporaryDirectory
from multiprocessing import Pool
import pandas as pd

In [2]:
sys.path.append('../src')

from run import run_amrfinder, run_mlst, run_plasmidfinder, run_resfinder
from utils import run_cmd

In [3]:
def pipeline(
    infile,
    outdir,
    prefix,
    threads=4,
    organism='Streptococcus_agalactiae',
    mlst_scheme='sagalactiae',
    amrfinder_db='/media/GenomicResearch/Tools/amrfinder_database/latest/',
    mlst_db='/media/GenomicResearch/Tools/CGE/mlst_db',
    plasmidfinder_db='/media/GenomicResearch/Tools/CGE/plasmidfinder_db',
):
    """Run the AMRFinder, MLST and PlasmidFinder steps on one input file.

    ``outdir`` is created if it does not exist, then each runner is called in
    turn with a location inside it:

    * ``<outdir>/amrfinder.txt``  -> ``run_amrfinder``
    * ``<outdir>/mlst``           -> ``run_mlst``
    * ``<outdir>/plasmidfinder``  -> ``run_plasmidfinder``

    Parameters
    ----------
    infile : str or path-like
        Input file handed unchanged to every runner
        (presumably a genome assembly -- confirm against ``run``).
    outdir : str or path-like
        Directory that receives the per-tool output locations listed above.
    prefix : str
        Currently unused; kept so existing callers keep working.
    threads : int, default 4
        Forwarded to ``run_amrfinder`` only.
    organism : str
        ``organism`` argument for ``run_amrfinder``.
    mlst_scheme : str
        Fourth positional argument for ``run_mlst``.
    amrfinder_db, mlst_db, plasmidfinder_db : str
        Database locations forwarded to the respective runner.

    Returns
    -------
    None
    """
    os.makedirs(outdir, exist_ok=True)

    amrfinder_filename = os.path.join(outdir, 'amrfinder.txt')
    mlst_dirname = os.path.join(outdir, 'mlst')
    plasmidfinder_dirname = os.path.join(outdir, 'plasmidfinder')

    run_amrfinder(
        infile,
        amrfinder_filename,
        database=amrfinder_db,
        threads=threads,
        organism=organism,
    )
    run_mlst(infile, mlst_dirname, mlst_db, mlst_scheme)
    run_plasmidfinder(infile, plasmidfinder_dirname, plasmidfinder_db)
    # NOTE: the ResFinder step (run_resfinder) is not part of this pipeline.

In [4]:
# Input directory (one file per sample) and the root for per-sample results.
dirpath, outpath = (
    Path('/media/GenomicResearch/MiSeq/Streptococcus_agalactiae/NEW'),
    Path('/media/GenomicResearch/MiSeq/Streptococcus_agalactiae/Analysis'),
)

In [5]:
# Submit one `pipeline` task per entry in `dirpath` and wait for all of them.
# NOTE: 20 worker processes x threads=4 per task can keep up to 80 threads busy.
with Pool(20) as p:
    jobs = []
    try:
        for filepath in dirpath.iterdir():
            sample_name = filepath.stem
            outdir = outpath/sample_name
            job = p.apply_async(pipeline, (filepath, outdir, sample_name), {'threads': 4})
            jobs.append((sample_name, job))
        p.close()
        p.join()
    except BaseException:
        # Stop the workers on any failure (including a kernel interrupt), then
        # re-raise so the error is visible instead of being silently swallowed.
        p.terminate()
        raise

    # An exception raised inside a worker is stored on its AsyncResult and only
    # surfaces through .get(); without this loop failed samples go unnoticed.
    failed_samples = []
    for sample_name, job in jobs:
        try:
            job.get()
        except Exception as exc:
            # Best effort: report and continue so one bad sample does not hide
            # the status of the others.
            failed_samples.append((sample_name, exc))
            print(sample_name, 'pipeline failed:', repr(exc), file=sys.stderr)

In [6]:
from parse import parse_amrfinder_result, parse_mlst_result, parse_plasmidfinder_result

In [7]:
dirpath = Path('/media/GenomicResearch/MiSeq/Streptococcus_agalactiae/Analysis')

# Build one summary dict per sample directory, keyed by the directory name.
summaries = dict()
for sample_dir in dirpath.iterdir():
    if not sample_dir.is_dir():
        # Anything that is not a directory cannot hold per-sample results.
        continue

    amrfinder_filename = os.path.join(sample_dir, 'amrfinder.txt')
    mlst_filename = os.path.join(sample_dir, 'mlst', 'data.json')
    plasmidfinder_filename = os.path.join(sample_dir, 'plasmidfinder', 'results_tab.tsv')

    # The records are scanned twice below. Materialise them first so the
    # second scan still sees every record even if the parser returns a
    # one-shot iterator (which would leave 'POINT' silently empty).
    amr_records = list(parse_amrfinder_result(amrfinder_filename))

    summary = dict()
    summary['AMR'] = ', '.join(sorted(
        record['gene_symbol'] for record in amr_records if record['element_subtype'] == 'AMR'
    ))
    summary['POINT'] = ', '.join(sorted(
        record['gene_symbol'] for record in amr_records if record['element_subtype'] == 'POINT'
    ))
    # Merge whatever key/value pairs the MLST parser returns into the summary.
    summary.update(parse_mlst_result(mlst_filename))
    plasmid_records = parse_plasmidfinder_result(plasmidfinder_filename)
    summary['Inc type'] = ', '.join(sorted(plasmid_records))
    summaries[sample_dir.name] = summary

In [8]:
# One row per sample: transpose the dict-of-dicts, label the index 'Key',
# and order the rows by sample name.
df = (
    pd.DataFrame(summaries)
    .T
    .rename_axis('Key')
    .sort_index()
)
df.head()

Unnamed: 0_level_0,AMR,POINT,ST,pheS,glnA,sdhA,atr,glcK,tkt,adhP,Inc type
Key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0422GBSPC,tet(M),,110,1,2,2,3,2,9,1,repUS43
R21.1103,"lsa(C), tet(M)",,23,4,3,2,6,1,3,5,
R21.1104,"erm(A), tet(M)",,1,1,1,1,2,2,2,1,repUS43
R21.1105,,,1,1,1,1,2,2,2,1,
R21.1106,tet(M),,1,1,1,1,2,2,2,1,repUS43


In [9]:
# Write the summary table as a tab-separated file.
df.to_csv(
    path_or_buf='/media/GenomicResearch/MiSeq/Streptococcus_agalactiae/summaries.txt',
    sep='\t',
)