In [1]:
import re
import os
import sys
import shutil
from pathlib import Path
from tempfile import TemporaryDirectory
from multiprocessing import Pool

import pandas as pd

In [2]:
# Put the project-local ../src directory (a relative path) on the import path
# so that `run_cmd` can be imported from its `utils` module.
sys.path.append('../src')
from utils import run_cmd

genus          |database
---------------|-------
unknown        |bacteria_odb10
Shigella       |enterobacterales_odb10
Neisseria      |neisseriales_odb10
Listeria       |bacillales_odb10
Salmonella     |enterobacterales_odb10
Mycobacterium  |corynebacteriales_odb10
Campylobacter  |campylobacterales_odb10
Vibrio         |vibrionales_odb10
Acinetobacter  |pseudomonadales_odb10
Pseudomonas    |pseudomonadales_odb10

## Sequence from NCBI Assembly Database

In [7]:
def run_busco(infile, outfile, lineage, db_path, threads=1):
    """Run BUSCO on one input file with a fixed lineage and keep the short summary.

    The command line is handed to ``run_cmd`` and BUSCO writes into a
    temporary directory created under ``/tmp``. Only the short-summary text
    file is copied to ``outfile``; everything else is discarded when the
    temporary directory is removed.

    Args:
        infile: Sequence file passed to ``busco -i``.
        outfile: Destination path for the copied short summary.
        lineage: Lineage dataset name passed to ``busco -l``; it is also part
            of the summary file name that is looked up afterwards.
        db_path: Directory passed to ``--download_path`` (the run uses
            ``--offline``).
        threads: Value passed to ``busco -c``.
    """
    with TemporaryDirectory(dir='/tmp') as tmpdir:
        run_cmd(
            f'conda run -n busco busco -i {infile} -c {threads} -o busco --out_path {tmpdir} -l {lineage} -m geno --offline --download_path {db_path}'
        )
        # The run is always named "busco" (-o busco), so the summary location is fixed.
        summary_name = f'short_summary.specific.{lineage}.busco.txt'
        shutil.copyfile(Path(tmpdir) / 'busco' / summary_name, outfile)

In [None]:
# Input contigs, output directory and BUSCO dataset location for the
# NCBI_Neisseria_meningitidis_SRA dataset.
dirpath = Path('/media/Central_Lab_Storage/NcbiSRA/NCBI_Neisseria_meningitidis_SRA/Contigs')
outpath = Path('/media/Central_Lab_Storage/NcbiSRA/NCBI_Neisseria_meningitidis_SRA/Busco')
database_path = '/media/GenomicResearch/Tools/busco_downloads'
# Lineage for Neisseria according to the genus/database table at the top of
# this notebook (previously 'lactobacillales_odb10', which does not match it).
lineage = 'neisseriales_odb10'

In [8]:
# Run BUSCO, in a pool of 72 worker processes, on every entry of `dirpath`
# whose summary file does not exist yet in `outpath`.
with Pool(72) as p:
    try:
        for filepath in dirpath.iterdir():
            outfile = outpath/f"short_summary.specific.{lineage}.{filepath.stem}.txt"
            if not outfile.exists():
                p.apply_async(
                    run_busco,
                    (filepath, outfile, lineage, database_path),
                    # apply_async drops worker exceptions unless the result is
                    # fetched; report them so failed samples are visible.
                    # `failed=filepath` binds the current path at submit time.
                    error_callback=lambda exc, failed=filepath: print(
                        f'run_busco failed for {failed}: {exc!r}', file=sys.stderr
                    ),
                )
        p.close()
        p.join()
    except KeyboardInterrupt:
        # Stop outstanding workers immediately when the cell is interrupted.
        p.terminate()

In [7]:
# Single-sample run with 8 threads. The run_busco defined in this section takes
# (infile, outfile, lineage, db_path, threads), so build the output file name
# the same way the pool cell does instead of passing a sample name and an
# output directory. `filepath` is the last entry visited by the loop above.
outfile = outpath/f"short_summary.specific.{lineage}.{filepath.stem}.txt"
run_busco(filepath, outfile, lineage, database_path, 8)

## Sequence from NCBI SRA Database

In [3]:
def run_busco(seqfile, sample_name, outdir, db_path, threads=1):
    """Run BUSCO with prokaryote auto-lineage and collect the short summaries.

    NOTE: this redefines ``run_busco`` from the "Sequence from NCBI Assembly
    Database" section with a different parameter order.

    The command line is handed to ``run_cmd`` and BUSCO writes into a
    temporary directory created under ``/tmp``. Every file matching
    ``short_summary.specific.*.<sample_name>.txt`` in the run directory is
    copied into ``outdir`` before the temporary directory is removed.

    Args:
        seqfile: Sequence file passed to ``busco -i``.
        sample_name: Run name passed to ``busco -o``; also the suffix of the
            summary files that are collected.
        outdir: Destination passed to ``shutil.copy`` for each summary file.
        db_path: Directory passed to ``--download_path`` (the run uses
            ``--offline``).
        threads: Value passed to ``busco -c``.
    """
    with TemporaryDirectory(dir='/tmp') as tmpdir:
        run_cmd(
            f'conda run -n busco busco -i {seqfile} -c {threads} -o {sample_name} --out_path {tmpdir} --auto-lineage-prok -m geno --offline --download_path {db_path}'
        )
        run_dir = Path(tmpdir, sample_name)
        wanted = f'short_summary.specific.*.{sample_name}.txt'
        for summary_file in run_dir.glob(wanted):
            shutil.copy(summary_file, outdir)

In [4]:
# Locations for the NCBI_Kp_SRA dataset: input contigs, BUSCO output
# directory, and the BUSCO dataset directory.
kp_root = Path('/media/Central_Lab_Storage/NcbiSRA/NCBI_Kp_SRA')
dirpath = kp_root / 'Contigs'
outpath = kp_root / 'Busco'
database_path = '/media/GenomicResearch/Tools/busco_downloads'

In [7]:
# Load the headerless, tab-separated ANI table and collect the column-0 values
# of rows whose column 2 is at least 95 (presumably accession and ANI percent
# respectively — TODO confirm against the file).
ani = pd.read_csv(
    '/media/Central_Lab_Storage/NcbiSRA/NCBI_Kp_SRA/ANI.tsv',
    sep='\t',
    header=None,
)
accs = set(ani.loc[ani[2] >= 95, 0])

In [9]:
# Run BUSCO, in a pool of 88 worker processes (one BUSCO thread each), on
# every entry of `dirpath` whose stem is in `accs`.
with Pool(88) as p:
    try:
        for filepath in dirpath.iterdir():
            if filepath.stem in accs:
                p.apply_async(
                    run_busco,
                    (filepath, filepath.stem, outpath, database_path, 1),
                    # apply_async drops worker exceptions unless the result is
                    # fetched; report them so failed samples are visible.
                    # `failed=filepath` binds the current path at submit time.
                    error_callback=lambda exc, failed=filepath: print(
                        f'run_busco failed for {failed}: {exc!r}', file=sys.stderr
                    ),
                )
        p.close()
        p.join()
    except KeyboardInterrupt:
        # Stop outstanding workers immediately when the cell is interrupted.
        p.terminate()

In [10]:
# BUSCO one-line score notation, e.g. "C:98.7%[S:98.2%,D:0.5%],F:0.2%,M:1.1%".
# Raw strings keep "\[" and "\." as regex escapes; the decimal point is
# escaped so it no longer matches an arbitrary character.
_BUSCO_SCORE_RE = re.compile(
    r'C:([0-9]+\.[0-9]+)%\[S:([0-9]+\.[0-9]+)%,D:([0-9]+\.[0-9]+)%\],'
    r'F:([0-9]+\.[0-9]+)%,M:([0-9]+\.[0-9]+)%'
)


def busco_summary(file):
    """Extract the BUSCO score percentages from a short-summary file.

    Args:
        file: Path of the text file to scan.

    Returns:
        A 5-tuple of strings ``(complete, single, duplicated, fragmented,
        missing)`` taken from the first line containing the
        ``C:..%[S:..%,D:..%],F:..%,M:..%`` notation, or ``None`` when no line
        of the file contains it.
    """
    with open(file) as handle:
        # Iterate lazily; the score line is usually near the top of the file.
        for line in handle:
            match = _BUSCO_SCORE_RE.search(line)
            if match:
                return match.groups()
    return None

In [11]:
# From here on `dirpath` points at the directory of collected BUSCO short
# summaries for NCBI_Kp_SRA (it no longer refers to the contigs directory).
dirpath = Path('/media/Central_Lab_Storage/NcbiSRA/NCBI_Kp_SRA') / 'Busco'

In [12]:
# Parse every BUSCO short summary in `dirpath` into one record per file.
# Names look like short_summary.specific.<lineage>.<sample>.txt, so the last
# two of the (at most four) dot-separated stem fields are lineage and sample.
data = []
for i in dirpath.iterdir():
    if not i.is_file():
        # Sub-directories cannot be opened as summary files.
        continue
    lineage, sample_name = i.stem.split('.', 3)[-2:]
    scores = busco_summary(i)
    if scores is None:
        # No score line in the file (e.g. truncated output): report it instead
        # of failing with an opaque TypeError when unpacking None.
        print(f'no BUSCO score line found in {i}; skipped', file=sys.stderr)
        continue
    complete, single, duplicated, fragmented, missing = scores
    data.append((sample_name, lineage, complete, single, duplicated, fragmented, missing))

In [13]:
# One row per parsed summary file; column order matches the tuples in `data`.
summary_columns = [
    'Key',
    'lineage',
    'complete',
    'single',
    'duplicated',
    'fragmented',
    'missing',
]
df = pd.DataFrame(data, columns=summary_columns)
df.head(15)

Unnamed: 0,Key,lineage,complete,single,duplicated,fragmented,missing
0,ERR1852957,enterobacterales_odb10,98.7,98.2,0.5,0.2,1.1
1,SRR6892745,enterobacterales_odb10,94.7,94.5,0.2,2.5,2.8
2,ERR1640653,enterobacterales_odb10,98.7,98.2,0.5,0.2,1.1
3,ERR1289820,enterobacterales_odb10,98.6,97.7,0.9,0.2,1.2
4,SRR5385935,enterobacterales_odb10,98.5,98.0,0.5,0.2,1.3
5,ERR1217299,enterobacterales_odb10,98.9,98.4,0.5,0.2,0.9
6,SRR6958726,enterobacterales_odb10,98.7,98.2,0.5,0.2,1.1
7,SRR5886490,enterobacterales_odb10,98.7,98.2,0.5,0.2,1.1
8,SRR5386082,enterobacterales_odb10,98.7,98.0,0.7,0.2,1.1
9,ERR1218735,enterobacterales_odb10,98.7,98.2,0.5,0.2,1.1


In [25]:
# `df` holds the rows parsed from NCBI_Kp_SRA/Busco, so save the table inside
# the NCBI_Kp_SRA dataset rather than in the NCBI_Neisseria_meningitidis_SRA
# directory, where it would be mislabelled and could overwrite that dataset's
# own Busco.tsv.
df.to_csv('/media/Central_Lab_Storage/NcbiSRA/NCBI_Kp_SRA/Busco.tsv', sep='\t', index=False)