In [13]:
import os
from pathlib import Path
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor
import pandas as pd
from Bio.Application import AbstractCommandline, _Option, _Switch

In [14]:
class KmerFinderCommandline(AbstractCommandline):
    """Biopython command-line wrapper describing the options of ``kmerfinder.py``.

    Keyword arguments map to command-line flags as follows:

    * ``infile`` -> ``-i`` (required)
    * ``outdir`` -> ``-o``
    * ``database`` -> ``-db``
    * ``taxonomy_file`` -> ``-tax``
    * ``kma_path`` -> ``-kp``
    * ``extented_output`` -> ``-x`` (switch)
    * ``quiet`` -> ``-q`` (switch)

    The spelling ``extented_output`` is kept as-is because callers pass it
    as a keyword argument.
    """

    def __init__(self, cmd='kmerfinder.py', **kwargs):
        """Declare the supported parameters, then let the base class apply ``kwargs``.

        ``cmd`` is the executable (or script path) to invoke.
        """
        # (flag, python name, is_required) for every option that takes a value.
        valued = (
            ('-i', 'infile', True),
            ('-o', 'outdir', False),
            ('-db', 'database', False),
            ('-tax', 'taxonomy_file', False),
            ('-kp', 'kma_path', False),
        )
        # (flag, python name) for every boolean switch.
        flags = (
            ('-x', 'extented_output'),
            ('-q', 'quiet'),
        )
        options = [
            _Option([flag, name], '', equate=False, is_required=required)
            for flag, name, required in valued
        ]
        switches = [_Switch([flag, name], '') for flag, name in flags]
        # Parameter order is preserved: valued options first, switches last.
        self.parameters = options + switches
        super().__init__(cmd, **kwargs)

In [15]:
# Database and taxonomy paths handed to KmerFinder (its -db and -tax options).
database = '/dev/shm/bacteria/bacteria.ATG'
taxonomy_file = '/dev/shm/bacteria/bacteria.tax'

In [16]:
def fn(infile, outdir, database, taxonomy_file, kma_path='/usr/bin/'):
    """Run the KmerFinder script on ``infile`` and write results to ``outdir``.

    Parameters
    ----------
    infile : str
        Value passed to the ``-i`` option.
    outdir : str or path-like
        Output directory; created (including parents) if it does not exist.
    database : str
        Value passed to the ``-db`` option.
    taxonomy_file : str
        Value passed to the ``-tax`` option.
    kma_path : str
        Accepted but not used in the body, so ``-kp`` is never put on the
        command line.
        NOTE(review): confirm whether this should be forwarded.

    Returns
    -------
    None
        Whatever the command-line call returns is discarded.
    """
    os.makedirs(outdir, exist_ok=True)
    options = dict(
        infile=infile,
        outdir=outdir,
        database=database,
        taxonomy_file=taxonomy_file,
        extented_output=True,
    )
    run_kmerfinder = KmerFinderCommandline(
        '/media/NGS/Data_Analysis/CGE/kmerfinder/kmerfinder.py',
        **options,
    )
    run_kmerfinder()

In [86]:
# One-off KmerFinder run on a single contig file (GCA_000818515.1).
fn(
    infile='/media/NGS/Data_Analysis/20200214_Incomplete_Neisseria_genomes/Contigs/GCA_000818515.1.fa',
    outdir='/media/NGS/Data_Analysis/20200214_Incomplete_Neisseria_genomes/Kmerfinder/GCA_000818515.1',
    database=database,
    taxonomy_file=taxonomy_file,
)

In [17]:
# dirpath: directory whose entries are grouped by sample prefix below.
# outpath: root under which one KmerFinder output folder per prefix is created.
dirpath = Path('/media/NGS/MiSeqAnalysis/Listeria_monocytogenes/Contigs')
outpath = Path('/media/NGS/MiSeqAnalysis/Listeria_monocytogenes/Kmerfinder')

In [18]:
# Group the entries of `dirpath` by sample prefix, i.e. the file stem up to
# the first underscore. Path.iterdir() yields entries in arbitrary order, so
# they are visited in sorted order: each prefix's file list (later joined
# into a single command-line argument) is then identical on every run.
pairwise = defaultdict(list)
for seqfile in sorted(dirpath.iterdir()):
    prefix = seqfile.stem.split('_')[0]
    pairwise[prefix].append(seqfile.as_posix())

In [20]:
# Sanity check: print every prefix that is not backed by exactly one file.
for sample, files in pairwise.items():
    if len(files) == 1:
        continue
    print(sample)

In [21]:
# Run KmerFinder for every sample prefix using up to 12 worker processes.
# The futures are kept so that failures inside the workers are not lost:
# an exception raised in a worker is only stored on its Future and would
# otherwise never be seen.
with ProcessPoolExecutor(12) as executor:
    futures = {}
    for prefix, seqfiles in pairwise.items():
        outdir = outpath/prefix
        futures[prefix] = executor.submit(fn, ' '.join(seqfiles), outdir, database, taxonomy_file, '/usr/bin/')

# Leaving the `with` block waits for all jobs, so every future is finished here.
# Failures are reported per prefix instead of aborting, so one bad sample does
# not hide the outcome of the others.
for prefix, future in futures.items():
    error = future.exception()
    if error is not None:
        print(f'{prefix}: KmerFinder failed: {error!r}')

In [22]:
def kmerfinder_summary(infile):
    """Summarise a tab-separated KmerFinder result table per species.

    The table must contain the columns ``Species``, ``Query_Coverage`` and
    ``Template_Coverage``. For each species the query coverage is summed over
    its rows and the template coverage is the maximum over its rows.

    Parameters
    ----------
    infile : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts.

    Returns
    -------
    dict
        Maps rank (1-based, ordered by summed query coverage, highest first)
        to a string ``'<species> | <query>% / <template>%'`` with both
        numbers formatted to two decimals.
    """
    table = pd.read_csv(infile, sep='\t')
    by_species = table.groupby('Species')
    summed_query = by_species['Query_Coverage'].sum().to_dict()
    best_template = by_species['Template_Coverage'].max().to_dict()

    # Stable descending sort: species with equal totals keep the groupby order.
    ranked = sorted(summed_query.items(), key=lambda pair: pair[1], reverse=True)

    summary = {}
    for rank, (species, qcov) in enumerate(ranked, 1):
        coverage = f'{qcov:.2f}% / {best_template[species]:.2f}%'
        summary[rank] = f'{species} | {coverage}'
    return summary

In [None]:
# Summarise the 'kmerfinder.txt' found in every entry of the run's Analysis
# directory, then build a table with one row per entry, sorted by entry name.
dirpath = Path('/media/NAS/MiSeqOutput/181112_M04855_0032_000000000-C5YTF/Analysis')
abundance = {}
for subpath in dirpath.iterdir():
    abundance[subpath.name] = kmerfinder_summary(subpath/'kmerfinder.txt')

df = pd.DataFrame(abundance).T.sort_index()

In [50]:
# Save the summary table as TSV, omitting the header row.
df.to_csv(
    '/media/NAS/MiSeqOutput/181112_M04855_0032_000000000-C5YTF/KmerFinder.tsv',
    sep='\t',
    header=False,
)

In [23]:
# Root directory whose entries are the per-sample KmerFinder output folders
# summarised in the next cell.
dirpath = Path('/media/NGS/MiSeqAnalysis/Listeria_monocytogenes/Kmerfinder')

In [24]:
# Summarise the 'results.txt' found in every entry of `dirpath`, then build a
# table with one row per entry, sorted by entry name.
abundance = {
    sample_dir.name: kmerfinder_summary(sample_dir/'results.txt')
    for sample_dir in dirpath.iterdir()
}
df = pd.DataFrame(abundance).T.sort_index()

In [25]:
# Save the summary table as TSV, omitting the header row.
df.to_csv(
    '/media/NGS/MiSeqAnalysis/Listeria_monocytogenes/Kmerfinder.tsv',
    sep='\t',
    header=False,
)