In [1]:
import os
import shutil
from pathlib import Path
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor

import pandas as pd
from Bio.Application import AbstractCommandline, _Option, _Switch

In [2]:
class PointfinderCommandline(AbstractCommandline):
    """Command-line wrapper for the ``pointfinder.py`` script.

    Each keyword argument maps to one flag of the script (for example
    ``infile`` -> ``-i``).  The meaning of the individual flags is defined by
    ``pointfinder.py`` itself; consult its ``--help`` output.
    """

    def __init__(self, cmd='pointfinder.py', **kwargs):
        # (flag, keyword) pairs, listed in the order they are registered.
        required_options = (
            ('-i', 'infile'),
            ('-o', 'outdir'),
            ('-p', 'database'),
            ('-s', 'species'),
        )
        optional_options = (
            ('-t', 'threshold'),
            ('-l', 'min_cov'),
            ('-m', 'method'),
            ('-m_p', 'method_path'),
        )
        switches = (
            ('-n', 'no_Ns'),
            ('-u', 'unknown_mut'),
            ('-r', 'stop_codons'),
        )
        self.parameters = (
            [_Option(list(names), '', equate=False, is_required=True) for names in required_options]
            + [_Option(list(names), '', equate=False) for names in optional_options]
            + [_Switch(list(names), '') for names in switches]
        )
        super().__init__(cmd, **kwargs)

In [3]:
def run_pointfinder(src, outdir, database, species, threshold=0.8, unknown_mut=False):
    """Run PointFinder (blastn mode) on one assembly and tidy up afterwards.

    The input is exposed to the tool as ``<outdir>/contigs.fa`` through a
    temporary symlink, so the result files get a uniform ``contigs_*`` prefix.

    Parameters
    ----------
    src : path-like
        Assembly (FASTA) file to analyse.
    outdir : path-like
        Output directory; created when missing.
    database : str
        Path to the PointFinder database directory.
    species : str
        Species sub-database name passed to ``-s``.
    threshold : float
        Value passed to the ``-t`` option.
    unknown_mut : bool
        Whether to set the ``-u`` switch.

    Raises
    ------
    Whatever ``PointfinderCommandline.__call__`` raises when the tool fails;
    the temporary symlink and ``tmp`` directory are removed either way.
    """
    os.makedirs(outdir, exist_ok=True)
    dst = os.path.join(outdir, 'contigs.fa')
    # A symlink left behind by an interrupted run would make os.symlink fail.
    # Only a link is removed here; a real file is never deleted silently.
    if os.path.islink(dst):
        os.remove(dst)
    os.symlink(src, dst)
    try:
        # `threshold` used to be accepted but never forwarded to the tool.
        cline = PointfinderCommandline(infile=dst, outdir=outdir, database=database, species=species,
                                       threshold=threshold, method='blastn', method_path='/usr/bin/blastn',
                                       unknown_mut=unknown_mut)
        cline()
    finally:
        os.remove(dst)
        # `tmp` may not exist when the tool failed early; do not mask that error.
        shutil.rmtree(os.path.join(outdir, 'tmp'), ignore_errors=True)

In [4]:
# Inputs for the PointFinder batch run.
# `dirpath` is iterated by the batch-run cell (one entry per input file);
# `outpath` is the parent under which each input gets `<stem>/pointfinder`.
dirpath = Path('/media/NGS/Data_Analysis/20210401_NHRI_Ecoli/Contigs')
outpath = Path('/media/NGS/Data_Analysis/20210401_NHRI_Ecoli/Analysis')
# `database` and `species` are passed unchanged to run_pointfinder.
database='/media/NGS/Data_Analysis/CGE/pointfinder_db/'
species='escherichia_coli'

In [5]:
# Run PointFinder on every entry of `dirpath` in parallel worker processes.
# The futures are kept so that a failing worker is reported instead of being
# silently dropped (an unchecked Future swallows its exception).
with ProcessPoolExecutor(64) as executor:
    futures = {}
    for filepath in dirpath.iterdir():
        outdir = outpath/filepath.stem/'pointfinder'
        future = executor.submit(run_pointfinder, filepath, outdir, database, species, threshold=0.6, unknown_mut=False)
        futures[future] = filepath
    for future, filepath in futures.items():
        # .exception() waits for the job and returns None on success.
        error = future.exception()
        if error is not None:
            print('{}: PointFinder failed: {!r}'.format(filepath.name, error))

In [None]:
# Directory whose sub-directories each hold a `PointFinder_results.txt`.
# NOTE: this rebinds `dirpath`, which the batch-run cell above also reads;
# re-running that cell after this one would iterate this directory instead.
dirpath = Path('/media/NAS/Synology_222/3_個人儲存區/陳柏翰/佑文資料/Pointfinder')

In [None]:
# Build {sample name: {gene: "change1, change2, ..."}} from every
# `<sample>/PointFinder_results.txt` under `dirpath`.
summaries = dict()
for subpath in dirpath.iterdir():
    filepath = subpath/'PointFinder_results.txt'
    df = pd.read_csv(filepath, sep='\t')
    changes_by_gene = defaultdict(list)
    # Each Mutation value is split at its first space into gene and change.
    # str.partition is used instead of `.str.split(' ', 1)`: the positional
    # `n` argument is rejected by pandas 2.x, and partition also tolerates a
    # value without a space (the change is then an empty string).
    for mutation in df['Mutation'].dropna():
        gene, _, change = mutation.partition(' ')
        changes_by_gene[gene].append(change)
    summaries[subpath.name] = {gene: ', '.join(changes) for gene, changes in changes_by_gene.items()}

In [None]:
# Turn the {sample: {gene: changes}} mapping into a table with one row per
# sample and one column per gene, ordered by sample name.
# NOTE: this rebinds `summaries`; running the cell twice transposes the
# table again, so re-run the cell that builds the mapping first.
summaries = pd.DataFrame(summaries).transpose().sort_index()
summaries.index.name = 'ID'

In [None]:
# Write the per-sample summary table as a tab-separated file (index column 'ID' included).
summaries.to_csv('/media/NAS/Synology_222/3_個人儲存區/陳柏翰/佑文資料/Pointfinder.tsv', sep='\t')

In [None]:
import re

In [None]:
# Query for specific_mutation: gene prefix and position to look for.
# `position` is text because it is compared with digits extracted from the
# `Mutation` column.
gene_name = 'gyrA'
position = '86'

In [None]:
def specific_mutation(pointfinder_output, gene_name, position):
    """Return the rows of a PointFinder table that hit one position of a gene.

    Parameters
    ----------
    pointfinder_output : path-like
        Tab-separated file containing a ``Mutation`` column.
    gene_name : str
        Rows are kept when ``Mutation`` starts with this prefix.
    position : str or int
        Compared, as text, with the last run of digits in ``Mutation``.

    Returns
    -------
    pandas.DataFrame or None
        The matching rows, or ``None`` when no row matches.
    """
    df = pd.read_csv(pointfinder_output, sep='\t')
    # na=False: a missing Mutation cell counts as "no match" instead of
    # putting NaN into the boolean mask.
    df = df[df['Mutation'].str.startswith(gene_name, na=False)]
    # Last digit run of each row.  Rows without any digit give NaN, which
    # never compares equal (indexing the findall result directly raised
    # IndexError for such rows).
    last_number = df['Mutation'].str.findall('[0-9]+').str[-1]
    position_match = last_number == str(position)
    if position_match.any():
        return df[position_match]
    return None

In [None]:
# Collect, for every sample directory, the rows matching (gene_name, position).
dirpath = Path('/media/NGS/MiSeqAnalysis/Campylobacter_coli/pointfinder_all')

results = []
for fp in dirpath.iterdir():
    pointfinder_output = fp/'contigs_blastn_results.tsv'
    result = specific_mutation(pointfinder_output, gene_name, position)
    if result is not None:
        # Label every matching row with the sample name.  One sample can have
        # several matching rows, so the label is repeated to the row count
        # (a single-element list raised a length mismatch in that case).
        result.index = [fp.name] * len(result)
        results.append(result)

In [None]:
# One table of all hits; pd.concat raises ValueError on an empty list, so
# show an empty frame when no sample matched.
pd.concat(results) if results else pd.DataFrame()