In [1]:
import os
import shutil
import datetime
import subprocess
from pathlib import Path
from collections import defaultdict
from multiprocessing import Pool

import sys
sys.path.append('../src')
from utils import run_cmd

In [2]:
# Run folder; the input and output locations used below are resolved from it.
basepath = Path('/media/MiSeqOutput/220121_M04855_0094_000000000-K424V')
# Folder whose files are grouped into read pairs by the next cell.
dirpath = basepath.joinpath('Alignment_1', '20220123_211538', 'Fastq')
# Parent folder of the per-sample output directories.
outpath = basepath.joinpath('Analysis')

In [3]:
# Group every file in dirpath by the part of its name before the first underscore.
pairwise = defaultdict(list)
for fastq in dirpath.iterdir():
    sample_id = fastq.stem.partition('_')[0]
    pairwise[sample_id].append(fastq.as_posix())

# Keep only groups of exactly two files; sorting fixes the order inside each pair.
paired = {}
for sample_id, paths in pairwise.items():
    if len(paths) == 2:
        paired[sample_id] = sorted(paths)
pairwise = paired

In [4]:
# Script to run and the database handed to it through --tax_db.
program = '/media/Central_Lab_Storage/bo-han/Projects/MiAW/miaw.py'
kmerfinder_db = '/media/GenomicResearch/Tools/CGE/kmerfinder_db/bacteria/bacteria'

# One command per read pair; pairs whose output folder already exists are left out,
# so re-running the notebook only queues what is still missing.
cmds = []
for prefix, (reads_1, reads_2) in pairwise.items():
    outdir = outpath/prefix
    if outdir.exists():
        continue
    cmds.append(
        f'conda run -n miaw python {program} -1 {reads_1} -2 {reads_2} '
        f'-o {outdir} -t 8 --qc --tax_db {kmerfinder_db}'
    )
len(cmds)

64

In [5]:
# Execute the queued commands on 10 worker processes.
with Pool(processes=10) as pool:
    try:
        pool.map(run_cmd, cmds)
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        # An interrupt from the user stops the workers instead of waiting for them.
        pool.terminate()

# Collect data

In [6]:
import glob
import shutil
from io import StringIO, BytesIO
from zipfile import ZipFile
import pandas as pd

In [7]:
def basic_info(source):
    """Read the first module table from the ``fastqc_data.txt`` of a FastQC archive.

    Parameters
    ----------
    source : path or file-like object
        Zip archive to open; anything accepted by ``zipfile.ZipFile``.

    Returns
    -------
    pandas.DataFrame
        The rows between the first line starting with ``>>`` and the next one.
        The line right after the first ``>>`` line supplies the column names,
        and the ``#Measure`` column becomes the index. All values are strings.

    Raises
    ------
    ValueError
        If the archive has no ``fastqc_data.txt`` member, or that file has no
        ``>>`` line followed by a header line.
    """
    # The context manager closes the archive even if reading a member fails.
    with ZipFile(source) as archive:
        members = [
            name for name in archive.namelist()
            if os.path.basename(name) == 'fastqc_data.txt'
        ]
        if not members:
            raise ValueError(f'no fastqc_data.txt found in {source}')
        # When several members match, the last one is used.
        raw = archive.read(members[-1])

    lines = iter(raw.decode('utf8').splitlines())

    # Skip ahead to the first module marker; the line after it is the table header.
    column = None
    for line in lines:
        if line.startswith('>>'):
            header = next(lines, None)
            if header is not None:
                column = header.split('\t')
            break
    if column is None:
        raise ValueError(f'no module table found in fastqc_data.txt of {source}')

    # Collect table rows until the next module marker.
    data = []
    for line in lines:
        if line.startswith('>>'):
            break
        data.append(line.split('\t'))

    stats = pd.DataFrame(data, columns=column)
    return stats.set_index('#Measure')

In [8]:
# Copy each sample's assembly.fasta into one shared folder as <sample>.fa.
contigs_dir = basepath/'Assembly'
contigs_dir.mkdir(exist_ok=True)
for sample_dir in outpath.iterdir():
    source_fasta = sample_dir.joinpath('assembly.fasta')
    target_fasta = contigs_dir.joinpath(sample_dir.name + '.fa')
    try:
        shutil.copy(source_fasta, target_fasta)
    except FileNotFoundError:
        # Nothing to copy for this sample: print its name and carry on.
        print(sample_dir.name)

In [9]:
# Table of seqkit statistics for the collected assemblies.
outfile = basepath/'contigs-stats.tsv'

# NOTE(review): the `*` pattern presumably relies on run_cmd running through a shell — confirm.
assembly_glob = contigs_dir/"*"
stdout, stderr = run_cmd(f'seqkit stats -a -T {assembly_glob}')
df = pd.read_csv(StringIO(stdout), sep='\t')
# Reduce the reported file path to its stem.
df['file'] = df['file'].map(lambda name: Path(name).stem)
df.to_csv(outfile, sep='\t', index=False)

In [10]:
# Parse every zip archive found anywhere below outpath and stack the results:
# after the transpose each archive contributes one row.
reports = [basic_info(archive_path) for archive_path in outpath.glob("**/*.zip")]
summarys = pd.concat(reports, axis=1).T
summarys = summarys.reset_index(drop=True)
summarys.index.name = 'Key'
# The index is not written to the file (index=False).
summarys.to_csv(basepath/"FastQC.tsv", index=False, sep='\t')

In [11]:
def kmerfinder_summary(infile):
    """Rank the species of a tab-separated KmerFinder table by query coverage.

    For each value of the ``Species`` column, ``Query_Coverage`` is summed and
    the maximum ``Template_Coverage`` is taken. Species are ordered by the
    summed query coverage, largest first.

    Returns a dict mapping the 1-based rank to
    ``'<species> | <query sum>% / <template max>%'`` with two decimals each.
    """
    table = pd.read_csv(infile, sep='\t')
    by_species = table.groupby('Species')
    query_total = by_species['Query_Coverage'].sum().to_dict()
    template_best = by_species['Template_Coverage'].max().to_dict()

    ranked = sorted(query_total.items(), key=lambda pair: pair[1], reverse=True)

    summary = {}
    for rank, (species, qcov) in enumerate(ranked, 1):
        coverage = f'{qcov:.2f}% / {template_best[species]:.2f}%'
        summary[rank] = f'{species} | {coverage}'
    return summary

# Build one row per sample directory from its kmerfinder.txt table.
summaries = dict()
for i in outpath.iterdir():
    result = i/'kmerfinder.txt'
    if not result.is_file():
        # A sample without a result table is reported by name and skipped, so one
        # failed sample does not abort the collection of all the others.
        print(i.name)
        continue
    summaries[i.name] = kmerfinder_summary(result)
# Samples become rows; ranks become columns (absent ranks stay empty).
summaries = pd.DataFrame(summaries).T
summaries.to_csv(basepath/'KmerFinder.tsv', sep='\t', header=False)

In [12]:
# Run MultiQC over everything below outpath; `-o` places its report in basepath.
run_cmd(f"multiqc -o {basepath} {outpath}")

('|         searching | ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 704/704  ',
 "\n  /// MultiQC 🔍 | v1.11\n\n|           multiqc | Search path : /media/MiSeqOutput/220121_M04855_0094_000000000-K424V/Analysis\n|             fastp | Found 64 reports\n|            fastqc | Found 128 reports\n|           multiqc | Compressing plot data\n|           multiqc | Report      : ../../../../media/MiSeqOutput/220121_M04855_0094_000000000-K424V/multiqc_report.html\n|           multiqc | Data        : ../../../../media/MiSeqOutput/220121_M04855_0094_000000000-K424V/multiqc_data\n|           multiqc | MultiQC complete\n|           multiqc | 7 flat-image plots used in the report due to large sample numbers\n|           multiqc | To force interactive plots, use the '--interactive' flag. \nSee the documentation.\n")

In [13]:
import re
from tempfile import TemporaryDirectory

def busco_summary(file):
    """Extract the percentages from the one-line score of a BUSCO summary file.

    The file is scanned for the first line containing a string of the form
    ``C:98.4%[S:97.6%,D:0.8%],F:0.8%,M:0.8%``.

    Parameters
    ----------
    file : str or path-like
        Text file to scan.

    Returns
    -------
    tuple of str or None
        ``(complete, single, duplicated, fragmented, missing)`` as the matched
        strings, without the percent sign; ``None`` when no line matches.
    """
    # Raw strings keep `\[` and `\.` as regex escapes rather than (invalid) string
    # escapes; the escaped dot only accepts a literal decimal point.
    prog = re.compile(
        r'C:([0-9]+\.[0-9])%\[S:([0-9]+\.[0-9])%,D:([0-9]+\.[0-9])%\],'
        r'F:([0-9]+\.[0-9])%,M:([0-9]+\.[0-9])%'
    )
    with open(file) as handle:
        # Iterate lazily instead of loading every line with readlines().
        for line in handle:
            match = prog.search(line)
            if match:
                return match.groups()
    return None

            
def run_busco(seqfile, sample_name, outdir, db_path, threads=1):
    """Run BUSCO on one sequence file and keep only its short summary file(s).

    The BUSCO command writes into a temporary directory created under ``/tmp``.
    Files matching ``short_summary.specific.*.<sample_name>.txt`` inside the
    ``<sample_name>`` sub-folder are copied to ``outdir`` before the temporary
    directory is removed on leaving the ``with`` block.

    Parameters
    ----------
    seqfile : passed to BUSCO as ``-i``.
    sample_name : passed to BUSCO as ``-o``; also used to locate the summary.
    outdir : destination for the copied summary files.
    db_path : passed to BUSCO as ``--download_path``.
    threads : passed to BUSCO as ``-c`` (default 1).
    """
    with TemporaryDirectory(dir='/tmp') as workdir:
        command = (
            f'conda run -n busco busco -i {seqfile} -c {threads} -o {sample_name} '
            f'--out_path {workdir} --auto-lineage-prok -m geno --offline --download_path {db_path}'
        )
        run_cmd(command)
        pattern = f'short_summary.specific.*.{sample_name}.txt'
        for found in Path(workdir, sample_name).glob(pattern):
            shutil.copy(found, outdir)

In [14]:
# Handed to run_busco as db_path, which forwards it to BUSCO's --download_path.
database_path = '/media/GenomicResearch/Tools/busco_downloads'

In [15]:
# Folder that receives the BUSCO short-summary files; created if missing.
busco_dir = basepath.joinpath('busco')
busco_dir.mkdir(exist_ok=True)

In [16]:
# Run BUSCO for every file in contigs_dir: 20 worker processes, each BUSCO call with 4 threads.
with Pool(20) as p:
    try:
        # Keep the AsyncResult handles. An exception raised inside a worker is only
        # re-raised by .get(); without the handles it would vanish silently.
        pending = []
        for filepath in contigs_dir.iterdir():
            job = p.apply_async(run_busco, (filepath, filepath.stem, busco_dir, database_path, 4))
            pending.append((filepath.stem, job))
        p.close()
        p.join()
        # All jobs are finished here, so .get() returns or raises immediately.
        for job_name, job in pending:
            try:
                job.get()
            except Exception as error:
                # Report the failed sample and keep going with the rest.
                print(f'{job_name}: {error!r}')
    except KeyboardInterrupt:
        p.terminate()

In [17]:
# Collect one record per summary file found in busco_dir.
data = []
for summary_file in busco_dir.iterdir():
    # The two last dot-separated fields of the stem are read as lineage and sample name.
    lineage, sample_name = summary_file.stem.rsplit('.', 2)[1:]
    scores = busco_summary(summary_file)
    if scores is None:
        # busco_summary found no score line: report the file instead of failing
        # with an opaque "cannot unpack NoneType" error.
        print(summary_file.name)
        continue
    complete, single, duplicated, fragmented, missing = scores
    data.append((sample_name, lineage, complete, single, duplicated, fragmented, missing))
busco_results = pd.DataFrame(data, columns=['sample_name', 'lineage', 'complete', 'single', 'duplicated', 'fragmented', 'missing'])

In [18]:
# Order the rows by sample name before writing them out.
busco_results = busco_results.sort_values('sample_name')

In [19]:
# Write the BUSCO table as tab-separated text, without the index column.
busco_results.to_csv(basepath/'busco.tsv', sep='\t', index=False)