In [17]:
from pathlib import Path
from multiprocessing import Pool

In [2]:
import sys
# Extend the module search path so the local import below can be resolved.
# The entry is relative, so it is interpreted against the kernel's working
# directory — presumably the notebook's own folder; verify if the notebook moves.
sys.path.append('../src')

# Project-local helper; must be imported after the sys.path tweak above.
from utils import run_cmd

In [3]:
def gene_annotation(seqfile, prefix, outdir, training_file=None, cpus=2):
    """Build a ``prokka`` command line for one sequence file and hand it to ``run_cmd``.

    Parameters
    ----------
    seqfile : path-like or str
        Input sequence file, placed last on the command line.
    prefix : str
        Value for prokka's ``--prefix`` option.
    outdir : path-like or str
        Value for prokka's ``--outdir`` option.
    training_file : path-like or str, optional
        When truthy, forwarded through ``--prodigaltf``; otherwise the option
        is left out entirely.
    cpus : int, default 2
        Value for prokka's ``--cpus`` option.

    Returns
    -------
    None
        Whatever ``run_cmd`` returns is discarded.

    Notes
    -----
    Arguments are interpolated into the command string verbatim (no quoting),
    so values containing whitespace will not survive as single arguments
    unless ``run_cmd`` handles that — TODO confirm against ``utils.run_cmd``.
    """
    # Optional fragment: empty when no training file was supplied.
    training_opt = f' --prodigaltf {training_file}' if training_file else ''
    run_cmd(f'prokka --prefix {prefix}{training_opt} --cpus {cpus} --outdir {outdir} {seqfile}')

In [4]:
# Directory whose entries are fed to the annotation step, one job per entry
# (presumably assembled contig FASTA files — confirm).
dirpath = Path('/media/NGS/MiSeqAnalysis/Neisseria_meningitidis/Contigs')
# Root for annotation results; each input gets its own sub-directory
# `outpath/<input file stem>`, which is passed to prokka as --outdir.
outpath = Path('/media/NGS/MiSeqAnalysis/Neisseria_meningitidis/Annotation')

In [5]:
# Annotate every sequence file under `dirpath` in parallel.
#
# 16 worker processes are used and each job asks prokka for 4 CPUs, so up to
# 16 * 4 = 64 threads can be requested at once.
#
# The AsyncResult of every submitted job is kept: apply_async never raises in
# the parent by itself, so without calling .get() a failing job would go
# completely unnoticed.
with Pool(16) as p:
    submitted = []  # (prefix, AsyncResult) pairs, in submission order
    try:
        # sorted() makes the submission order reproducible between runs.
        for filepath in sorted(dirpath.iterdir()):
            if not filepath.is_file():
                # Sub-directories and other non-file entries are not sequence files.
                continue
            prefix = filepath.stem
            outdir = outpath/prefix
            submitted.append(
                (prefix, p.apply_async(gene_annotation, args=(filepath, prefix, outdir), kwds={'cpus': 4}))
            )
        p.close()
        p.join()
    except KeyboardInterrupt:
        # Manual interrupt: stop the workers immediately. Results are not
        # inspected in this case because unfinished jobs have none.
        p.terminate()
    else:
        # All jobs have finished; .get() re-raises whatever a worker raised.
        annotation_failures = {}
        for job_prefix, job_result in submitted:
            try:
                job_result.get()
            except Exception as exc:
                annotation_failures[job_prefix] = exc
                print(f'annotation failed for {job_prefix}: {exc!r}')
        print(f'{len(submitted) - len(annotation_failures)} of {len(submitted)} annotation jobs finished without raising')

In [12]:
import pandas as pd
from Bio import SeqIO

In [19]:
# NOTE: this rebinds `dirpath` and `outpath`, which an earlier cell pointed at
# the annotation input/output. Re-running the annotation cell after this one
# would therefore iterate over the Annotation directory instead.
# Input for the extraction step: one sub-directory per sample is expected here.
dirpath = Path('/media/NGS/MiSeqAnalysis/Neisseria_meningitidis/Annotation')
# Destination directory for the per-sample `<sample>.fna` FASTA files.
outpath = Path('/media/NGS/MiSeqAnalysis/Neisseria_meningitidis/penA')

In [21]:
# For every per-sample directory under `dirpath`, pull out the sequences whose
# annotated product mentions 'Penicillin-binding protein' and write them to
# `outpath/<sample>.fna`.
#
# Inputs read per sample (named after the directory itself):
#   <sample>/<sample>.tsv  tab-separated feature table with 'product' and
#                          'locus_tag' columns
#   <sample>/<sample>.ffn  FASTA file whose record ids are matched against
#                          the selected locus tags
#
# NOTE(review): the output folder is called penA, but the filter keeps every
# product containing 'Penicillin-binding protein'. Each kept record's id is
# replaced by the sample name, so a sample with several matches produces
# several records sharing one id — confirm this is intended.

# The FASTA writer does not create missing parent directories.
outpath.mkdir(parents=True, exist_ok=True)

for fp in sorted(dirpath.iterdir()):
    if not fp.is_dir():
        # Stray files next to the sample directories carry no annotation.
        continue

    features = pd.read_csv(fp/(fp.name + '.tsv'), sep='\t')
    # Literal substring match; na=False drops rows without a product, which
    # is what the separate notna() filter used to do.
    is_pbp = features['product'].str.contains('Penicillin-binding protein', regex=False, na=False)
    features = features[is_pbp]
    locus_tags = set(features['locus_tag'])

    records = SeqIO.parse(fp/(fp.name + '.ffn'), 'fasta')
    specific_records = []
    for record in records:
        if record.id in locus_tags:
            # Relabel with the sample name so downstream files are keyed by sample.
            record.id = fp.name
            specific_records.append(record)
    # Written even when nothing matched, so every sample has an output file.
    SeqIO.write(specific_records, outpath/(fp.name + '.fna'), 'fasta')