In [1]:
import os
import sys
import json
import shutil
import subprocess
from pathlib import Path
from tempfile import TemporaryDirectory
from multiprocessing import Pool
import pandas as pd

In [2]:
# Extend the import search path so the project-local `run` module can be
# imported below (presumably it lives in ../../src -- confirm).
# The entry is relative, so it is resolved against the kernel's current working
# directory: start the notebook from its own folder for this to work.
sys.path.append('../../src')

from run import run_amrfinder, run_mlst, run_plasmidfinder

In [3]:
def gbs_serotype(infile, outfile, program='/media/GenomicResearch/Tools/GBS-SBG/GBS-SBG.pl'):
    """Run the GBS-SBG serotyping script on one assembly and save its report.

    The script is invoked as ``program infile -best`` and everything it prints
    to standard output is written to ``outfile`` (overwriting any existing file).

    Args:
        infile: Path of the assembly to serotype.
        outfile: Path of the report file to create.
        program: Executable to run; defaults to the GBS-SBG installation used
            by this notebook.

    Raises:
        subprocess.CalledProcessError: If the program exits with a non-zero
            status. Without this check a failed run would silently leave an
            empty or partial report behind.
    """
    cmd = [program, infile, '-best']
    with open(outfile, 'w') as handle:
        # check=True turns a non-zero exit status into an exception instead of
        # letting the failure go unnoticed.
        subprocess.run(cmd, stdout=handle, check=True)
        
def parse_gbs_serotype(infile):
    """Return the serotype from the first row of a GBS-SBG report.

    The report is read as a tab-separated table; only its ``Serotype`` column
    is loaded. The value in the first row is cut at its last ``:`` and the
    part after it is returned (the whole value when it contains no ``:``).
    """
    table = pd.read_csv(infile, sep='\t', usecols=['Serotype'])
    first_call = table.loc[0, 'Serotype']
    _prefix, _sep, serotype = first_call.rpartition(':')
    return serotype

In [4]:
def pipeline(infile, outdir, prefix, threads=4):
    """Run the typing tools on one assembly and collect their outputs in ``outdir``.

    The following are executed in order, each writing inside ``outdir``:

    * ``run_amrfinder``     -> ``amrfinder.txt`` (organism ``Streptococcus_agalactiae``)
    * ``run_mlst``          -> ``mlst/`` (scheme ``sagalactiae``)
    * ``run_plasmidfinder`` -> ``plasmidfinder/``
    * ``gbs_serotype``      -> ``GBS-SBG.txt``

    ResFinder is not part of this pipeline.

    Args:
        infile: Path of the assembly to analyse.
        outdir: Directory for all results; created if it does not exist.
        prefix: Sample name. Currently unused; kept so existing callers that
            pass it keep working.
        threads: Thread count forwarded to ``run_amrfinder``.
    """
    os.makedirs(outdir, exist_ok=True)

    amrfinder_filename = os.path.join(outdir, 'amrfinder.txt')
    mlst_dirname = os.path.join(outdir, 'mlst')
    plasmidfinder_dirname = os.path.join(outdir, 'plasmidfinder')
    gbs_sbg_filename = os.path.join(outdir, 'GBS-SBG.txt')

    run_amrfinder(
        infile,
        amrfinder_filename,
        database='/media/GenomicResearch/Tools/amrfinder_database/latest/',
        threads=threads,
        organism='Streptococcus_agalactiae',
        plus=True,
        report_common=True
    )
    run_mlst(
        infile, mlst_dirname, '/media/GenomicResearch/Tools/CGE/mlst_db', 'sagalactiae'
    )
    run_plasmidfinder(
        infile, plasmidfinder_dirname, '/media/GenomicResearch/Tools/CGE/plasmidfinder_db'
    )
    gbs_serotype(infile, gbs_sbg_filename)

In [7]:
# Input: directory whose entries are the assemblies to analyse (one per sample).
dirpath = Path('/media/GenomicResearch/Issue/20220127_GBS_outbreaks_cgmlst_test/Contigs')
# Output: parent directory that receives one result sub-directory per sample.
outpath = Path('/media/GenomicResearch/Issue/20220127_GBS_outbreaks_cgmlst_test/Analysis')

In [8]:
# Run the pipeline for every entry of `dirpath`: up to 20 samples at a time,
# each asked to use 4 threads. Results go to `outpath/<file stem>`.
with Pool(20) as p:
    # Keep every AsyncResult: apply_async stores an exception raised in the
    # worker inside the result and only re-raises it on .get(), so discarding
    # the results would hide failed samples.
    jobs = []
    for filepath in dirpath.iterdir():
        sample_name = filepath.stem
        outdir = outpath/sample_name
        job = p.apply_async(pipeline, (filepath, outdir, sample_name), {'threads':4})
        jobs.append((sample_name, job))
    p.close()

    # Wait for each sample and remember the ones that failed, so one bad
    # sample does not prevent the others from finishing.
    failures = []
    for sample_name, job in jobs:
        try:
            job.get()
        except Exception as exc:
            failures.append((sample_name, exc))
    p.join()
    # Any other exception (e.g. KeyboardInterrupt) propagates out of the
    # `with` block, whose exit terminates the pool.

if failures:
    for sample_name, exc in failures:
        print('{}: {!r}'.format(sample_name, exc), file=sys.stderr)
    raise RuntimeError(
        'pipeline failed for {} of {} samples'.format(len(failures), len(jobs))
    )

In [9]:
from parse import parse_amrfinder_result, parse_mlst_result, parse_plasmidfinder_result

In [10]:
# Directory holding one result sub-directory per sample. A dedicated name is
# used so that `dirpath` (the contigs input directory) is not rebound to the
# results directory, which would make re-running the pipeline cell process the
# wrong files.
analysis_path = Path('/media/GenomicResearch/Issue/20220127_GBS_outbreaks_cgmlst_test/Analysis')

# Maps sample name (result directory name) -> dict of summary fields.
summaries = dict()
for sample_dir in analysis_path.iterdir():
    # Per-sample results live in sub-directories; ignore stray files.
    if not sample_dir.is_dir():
        continue

    amrfinder_filename = os.path.join(sample_dir, 'amrfinder.txt')
    mlst_filename = os.path.join(sample_dir, 'mlst', 'data.json')
    plasmidfinder_filename = os.path.join(sample_dir, 'plasmidfinder', 'results_tab.tsv')
    gbs_sbg_filename = os.path.join(sample_dir, 'GBS-SBG.txt')

    # Materialise the records: they are scanned twice below, which would
    # yield nothing on the second pass if the parser returned a one-shot iterator.
    amr_records = list(parse_amrfinder_result(amrfinder_filename))
    summary = dict()
    for subtype in ('AMR', 'POINT'):
        summary[subtype] = ', '.join(sorted(
            record['gene_symbol']
            for record in amr_records
            if record['element_subtype'] == subtype
        ))
    summary.update(parse_mlst_result(mlst_filename))
    summary['Inc type'] = ', '.join(sorted(parse_plasmidfinder_result(plasmidfinder_filename)))
    summary['Serotype'] = parse_gbs_serotype(gbs_sbg_filename)
    summaries[sample_dir.name] = summary

In [11]:
# `summaries` maps sample -> fields, so the frame is built with samples as
# columns and then transposed to get one row per sample, ordered by sample key.
df = (
    pd.DataFrame(summaries)
    .T
    .rename_axis('Key')
    .sort_index()
)
df.head()

Unnamed: 0_level_0,AMR,POINT,ST,pheS,glnA,atr,glcK,tkt,sdhA,adhP,Inc type,Serotype
Key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
PHEGBS0041,tet(M),,1,1,1,2,2,2,1,1,repUS43,V
PHEGBS0055,tet(M),,1,1,1,2,2,2,1,1,repUS43,V
PHEGBS0081,tet(M),,1,1,1,2,2,2,1,1,repUS43,V
PHEGBS0082,"erm(A), tet(M)",,1,1,1,2,2,2,1,1,repUS43,V
PHEGBS0098,"erm(A), tet(M)",,1,1,1,2,2,2,1,1,repUS43,V


In [12]:
# Save the per-sample summary as a tab-separated table (index column included).
df.to_csv(
    '/media/GenomicResearch/Issue/20220127_GBS_outbreaks_cgmlst_test/summaries.txt',
    sep='\t',
)