In [1]:
import gzip
import os
import re
import shlex
import shutil
import subprocess
import sys
import xml.etree.cElementTree as ET
from multiprocessing import Pool
from pathlib import Path
from tempfile import TemporaryDirectory

import numpy as np
import pandas as pd
from Bio.Application import AbstractCommandline, _Option, _Switch
from Bio.SeqIO.QualityIO import FastqGeneralIterator

In [2]:
def run_cmd(cmd):
    """Execute ``cmd`` through the shell and hand back its captured output.

    Parameters
    ----------
    cmd : str
        Command line passed to the shell (``shell=True``), so pipes inside
        the string are interpreted by the shell.

    Returns
    -------
    tuple of (bytes, bytes)
        Captured standard output and standard error.

    Raises
    ------
    subprocess.CalledProcessError
        When the command exits with a non-zero status; both captured streams
        are attached to the exception.
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if completed.returncode != 0:
        raise subprocess.CalledProcessError(
            returncode=completed.returncode,
            cmd=cmd,
            output=completed.stdout,
            stderr=completed.stderr,
        )
    return completed.stdout, completed.stderr

def count_bases(reads):
    """Return the integer in the second column of the ``ALL`` line of ``seqtk fqchk``.

    Parameters
    ----------
    reads : str or path-like
        FASTQ file handed to ``seqtk fqchk -q3``.

    Returns
    -------
    int
        Second whitespace-separated field of the ``ALL`` summary line
        (presumably the total base count -- confirm against the seqtk
        output format).

    Raises
    ------
    subprocess.CalledProcessError
        If the shell pipeline exits with a non-zero status.
    """
    # Quote the path so file names with spaces or shell metacharacters
    # are passed to seqtk as a single argument.
    quoted_reads = shlex.quote(str(reads))
    stdout, _stderr = run_cmd(f"seqtk fqchk -q3 {quoted_reads} | grep ALL")
    return int(stdout.decode().split()[1])

def parse_genome_size(gsize):
    """Convert a genome-size string such as ``'4M'`` or ``'1.5G'`` to a number of bases.

    Parameters
    ----------
    gsize : str
        A decimal number optionally followed by one of the suffixes
        ``G`` (1e9), ``M`` (1e6) or ``K`` (1e3). A string without a suffix
        is taken as a plain number of bases.

    Returns
    -------
    float
        The size in bases.

    Raises
    ------
    ValueError
        If ``gsize`` is not a number with an optional G/M/K suffix.
    """
    # None covers the "no suffix" case: the optional regex group is then
    # unmatched and .group(2) is None.
    multipliers = {'G': 1e9, 'M': 1e6, 'K': 1e3, None: 1}
    # Raw string avoids invalid-escape warnings; the number part accepts
    # "4", "4.", "4.5" and ".5" but rejects things like "1.2.3".
    match = re.fullmatch(r'(\d+(?:\.\d*)?|\.\d+)([GMK])?', gsize)
    if match is None:
        raise ValueError(f"Couldn't parse {gsize}")
    number, suffix = match.groups()
    return float(number) * multipliers[suffix]

class InputError(Exception):
    """Error raised when an input file does not have the expected format."""

    def __init__(self, *_args, **_kwargs):
        # Deliberately does nothing: any positional or keyword arguments are
        # accepted without further processing here.
        return None


class SequenceReadArchive:
    """Wrapper around one on-disk SRA file and the external tools run on it.

    The constructor checks that ``source`` is an existing regular file whose
    first eight bytes are ``NCBI.sra``. The remaining members shell out to
    ``sra-stat``, ``vdb-validate`` and ``fastq-dump`` through ``run_cmd``.

    Parameters
    ----------
    source : str or path-like
        Path to the ``.sra`` file.

    Raises
    ------
    FileNotFoundError
        If ``source`` does not exist or is not a regular file.
    InputError
        If the file does not start with the ``NCBI.sra`` signature.
    """

    def __init__(self, source):
        if not self.is_exists(source):
            raise FileNotFoundError(f'File {source} does not exist.')
        if not self.is_sra(source):
            raise InputError(f'File {source} is not SRA format.')
        self.source = source

    @staticmethod
    def is_exists(source):
        """Return True when ``source`` exists and is a regular file."""
        return os.access(source, os.F_OK) and os.path.isfile(source)

    @staticmethod
    def is_sra(source):
        """Return True when the file starts with the bytes ``NCBI.sra``.

        Only the first eight bytes are read and compared as bytes, so an
        empty file or a header that is not valid UTF-8 yields False instead
        of raising.
        """
        with open(source, 'rb') as handle:
            return handle.read(8) == b'NCBI.sra'

    def _quoted_source(self):
        """Return the source path quoted for safe use in a shell command."""
        return shlex.quote(str(self.source))

    def _sra_stat(self):
        """Run ``sra-stat`` on the file and return the parsed XML root element."""
        cmd = f'sra-stat -x -s -b 1 -e 2 {self._quoted_source()}'
        stdout, _stderr = run_cmd(cmd)
        return ET.fromstring(stdout)

    @property
    def layout(self):
        """The ``nreads`` attribute (a string) of the ``Statistics`` element."""
        return self._sra_stat().find('Statistics').attrib['nreads']

    @property
    def total_bases(self):
        """Sum of the ``count`` attributes of every ``*/Quality`` element.

        NOTE(review): the sra-stat call passes ``-b 1 -e 2``; confirm that
        the Quality counts it reports still describe the whole run.
        """
        return sum(
            int(quality.attrib['count'])
            for quality in self._sra_stat().findall('*/Quality')
        )

    def validate(self):
        """Run ``vdb-validate``; True when its stderr text ends with ``consistent``.

        Raises ``subprocess.CalledProcessError`` (from ``run_cmd``) if the
        tool exits with a non-zero status.
        """
        cmd = f'vdb-validate {self._quoted_source()}'
        _stdout, stderr = run_cmd(cmd)
        return stderr.decode('utf8').strip().endswith('consistent')

    def dump_fastq(self, outdir):
        """Run ``fastq-dump --split-files`` on the file, writing into ``outdir``."""
        quoted_outdir = shlex.quote(str(outdir))
        cmd = f'fastq-dump --split-files --outdir {quoted_outdir} {self._quoted_source()}'
        run_cmd(cmd)

# ASCII code that corresponds to a quality score of 0.
SANGER_SCORE_OFFSET = 33
# Lookup table from a quality character to its integer score:
# chr(33) -> 0 up to chr(126) -> 93.
q_mapping = dict(
    (chr(code), code - SANGER_SCORE_OFFSET)
    for code in range(SANGER_SCORE_OFFSET, SANGER_SCORE_OFFSET + 94)
)


def sequences_quality_check(source, q_score=30, proportion=0.8):
    """Decide whether enough reads in a gzipped FASTQ have a high mean quality.

    The mean quality of every read is computed from ``q_mapping``, the means
    are binned into unit-wide bins starting at 0, and the reads falling in
    bins whose lower edge is at least ``q_score`` are counted.

    Parameters
    ----------
    source : str or path-like
        Gzip-compressed FASTQ file.
    q_score : int or float, optional
        Lower bin edge a read's mean quality bin must reach to count as passing.
    proportion : float, optional
        Minimum fraction of passing reads required.

    Returns
    -------
    bool
        A plain Python ``bool`` (not ``numpy.bool_``), so identity checks such
        as ``result is False`` behave as expected. False when the file holds
        no scorable reads.

    Raises
    ------
    KeyError
        If a quality character is not a key of ``q_mapping``.
    """
    per_sequence_quality = []
    # The context manager closes the file even if parsing raises.
    with gzip.open(source, 'rt') as handle:
        for _title, _sequence, quality_string in FastqGeneralIterator(handle):
            if not quality_string:
                # A zero-length read has no mean quality; skip it rather
                # than divide by zero.
                continue
            scores = [q_mapping[letter] for letter in quality_string]
            per_sequence_quality.append(sum(scores) / len(scores))

    if not per_sequence_quality:
        # Nothing to evaluate: treat as failing instead of computing 0/0.
        return False

    hist, bin_edges = np.histogram(per_sequence_quality, bins=range(0, 94))
    passing = int(sum(
        count for count, lower_edge in zip(hist, bin_edges[:-1])
        if lower_edge >= q_score
    ))
    total = int(hist.sum())
    return bool(passing / total >= proportion)

# Adapter FASTA referenced by the ILLUMINACLIP step.
ADAPTERS = '/home/chen1i6c04/Tools/shovill/db/trimmomatic.fa'
# Quality threshold used for both the LEADING and TRAILING steps.
MIN_BQ = 3
# Space-separated list of steps appended to the trimmomatic command line.
TRIM_OPT = ' '.join((
    f"ILLUMINACLIP:{ADAPTERS}:2:30:10",
    f"LEADING:{MIN_BQ}",
    f"TRAILING:{MIN_BQ}",
    "SLIDINGWINDOW:4:20",
    "MINLEN:36",
    "TOPHRED33",
))

def trimming(input_reads, outdir, threads=2):
    """Run single-end trimmomatic on ``input_reads`` and return the output path.

    Parameters
    ----------
    input_reads : str or path-like
        FASTQ file to trim.
    outdir : str or path-like
        Directory that receives ``clean_reads.fq.gz``.
    threads : int, optional
        Value passed to trimmomatic's ``-threads`` option.

    Returns
    -------
    str
        Path of the trimmed reads, ``<outdir>/clean_reads.fq.gz``.

    Raises
    ------
    subprocess.CalledProcessError
        If trimmomatic exits with a non-zero status.
    """
    output_reads = os.path.join(outdir, 'clean_reads.fq.gz')
    # Quote both paths: the command goes through a shell, so unquoted spaces
    # or metacharacters in a path would split or alter the command.
    cmd = (
        f"trimmomatic SE -threads {threads} "
        f"{shlex.quote(str(input_reads))} {shlex.quote(str(output_reads))} {TRIM_OPT}"
    )
    run_cmd(cmd)
    return output_reads

In [3]:
class SpadesCommandline(AbstractCommandline):
    """Command-line builder for ``spades.py``.

    Keyword arguments map to flags as follows: ``r1`` -> ``-1``, ``r2`` ->
    ``-2``, ``s`` -> ``-s``, ``outdir`` -> ``-o`` (required), ``tmpdir`` ->
    ``--tmp-dir``, ``threads`` -> ``-t``; the boolean switches ``careful``,
    ``isolate`` and ``disable_gzip_output`` map to ``--careful``,
    ``--isolate`` and ``--disable-gzip-output``.
    """

    def __init__(self, cmd='spades.py', **kwargs):
        # (flag, keyword) pairs for options that carry a value; the flag and
        # its value are emitted as separate tokens (equate=False).
        valued = (
            ('-1', 'r1'),
            ('-2', 'r2'),
            ('-s', 's'),
            ('-o', 'outdir'),
            ('--tmp-dir', 'tmpdir'),
            ('-t', 'threads'),
        )
        # (flag, keyword) pairs for on/off switches.
        switches = (
            ('--careful', 'careful'),
            ('--isolate', 'isolate'),
            ('--disable-gzip-output', 'disable_gzip_output'),
        )
        # Only the output directory is mandatory.
        self.parameters = [
            _Option([flag, keyword], '', equate=False, is_required=(keyword == 'outdir'))
            for flag, keyword in valued
        ]
        self.parameters.extend(_Switch([flag, keyword], '') for flag, keyword in switches)
        super().__init__(cmd, **kwargs)

In [4]:
def worker(sra_file, outdir, gsize, threads):
    """Assemble one SRA run with SPAdes and store the contigs in ``outdir``.

    Pipeline: dump FASTQ from the SRA file, trim with ``trimming``, apply
    ``sequences_quality_check``, then run SPAdes in a scratch directory under
    ``/dev/shm/``. On success ``outdir/spades.log`` holds SPAdes' stdout and
    ``outdir/spades.fa`` holds the contigs.

    Parameters
    ----------
    sra_file : str or path-like
        Path to the ``.sra`` file.
    outdir : str or path-like
        Output directory (created if missing).
    gsize : str
        Genome size understood by ``parse_genome_size`` (e.g. ``'4M'``).
    threads : int
        Thread count handed to trimmomatic and SPAdes.

    Returns
    -------
    str or None
        Path of the written contig file, or None when the run was skipped
        (layout other than ``'1'`` or failed quality check).
    """
    prefix = os.path.splitext(os.path.basename(sra_file))[0]
    os.makedirs(outdir, exist_ok=True)

    sra = SequenceReadArchive(sra_file)
    # Only runs whose layout (sra-stat ``nreads``) is '1' are handled.
    # Return instead of sys.exit(): SystemExit would kill the pool worker
    # process and the task's result would never be delivered.
    if sra.layout != '1':
        return None

    # Context managers guarantee both scratch directories are removed,
    # including on early return or error.
    with TemporaryDirectory(dir=outdir) as tmp_dir:
        sra.dump_fastq(tmp_dir)
        single_reads = os.path.join(tmp_dir, prefix + '_1.fastq')
        trim_reads = trimming(single_reads, tmp_dir, threads)
        # Truthiness test rather than ``is False``: the check may return a
        # numpy bool, which is never identical to Python's False.
        if not sequences_quality_check(trim_reads):
            return None

        # Estimated depth decides the SPAdes mode: --isolate above 100x,
        # --careful otherwise.
        depth = sra.total_bases / parse_genome_size(gsize)
        isolate = depth > 100
        careful = not isolate

        with TemporaryDirectory(prefix='spades', dir='/dev/shm/') as spades_outdir:
            cline = SpadesCommandline(s=trim_reads, outdir=spades_outdir, tmpdir='/dev/shm/',
                                      threads=threads, careful=careful, isolate=isolate,
                                      disable_gzip_output=True)
            stdout, _stderr = cline()
            with open(os.path.join(outdir, 'spades.log'), 'w') as handle:
                handle.write(stdout)
            contigs = os.path.join(outdir, 'spades.fa')
            # shutil.move copies when source and destination are on different
            # filesystems (/dev/shm vs. outdir), where os.rename would fail.
            shutil.move(os.path.join(spades_outdir, 'contigs.fasta'), contigs)
    return contigs

In [5]:
# dirpath: directory iterated for per-run sub-directories containing <name>.sra
# outpath: root under which one output directory per run is created
dirpath, outpath = (
    Path('/media/Synology_49/Vibrio_cholerae_SE_SRA'),
    Path('/media/NGS/SRA_1/NCBI_Vibrio_cholerae_SE_SRA'),
)

In [6]:
def _run_worker(sra_file, outdir, gsize, threads):
    """Call ``worker`` and turn a ``SystemExit`` raised inside it into a None result.

    Pool's task wrapper only catches ``Exception``; a ``SystemExit`` would
    end the worker process without ever delivering the task's result, and
    waiting on that result would then block forever.
    """
    try:
        return worker(sra_file, outdir, gsize, threads)
    except SystemExit:
        return None


if __name__ == '__main__':
    # (run name, exception) for every run whose worker raised.
    failures = []
    with Pool(4) as p:
        try:
            jobs = []
            for i in dirpath.iterdir():
                sra_file = i/(i.name + '.sra')
                jobs.append((i.name, p.apply_async(_run_worker, (sra_file, outpath/i.name, '4M', 8))))
            p.close()
            # Collect every result so exceptions raised in workers are
            # surfaced instead of being silently dropped.
            for name, job in jobs:
                try:
                    job.get()
                except Exception as exc:
                    failures.append((name, exc))
            p.join()
        except KeyboardInterrupt:
            # Manual interruption: stop all workers, as before, but say so.
            p.terminate()
            print('Interrupted; worker pool terminated.', file=sys.stderr)
        except Exception:
            p.terminate()
            raise
    for name, exc in failures:
        print(f'{name}: {exc!r}', file=sys.stderr)

Process ForkPoolWorker-5:
Process ForkPoolWorker-6:
Process ForkPoolWorker-7:
Process ForkPoolWorker-4:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/chen1i6c04/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/chen1i6c04/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/chen1i6c04/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/chen1i6c04/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/chen1i6c04/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/chen1i6c04/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", line

In [7]:
# Collect every per-run assembly (<run>/spades.fa) into one directory as
# <run>.fa. NOTE: this rebinds ``dirpath`` and ``outpath``; re-running an
# earlier cell that reads them afterwards will see these values.
dirpath = Path('/media/NGS/SRA_1/NCBI_Vibrio_cholerae_SE_SRA/SPAdes')
outpath = Path('/media/NGS/SRA_1/NCBI_Vibrio_cholerae_SE_SRA/Contigs')

for i in dirpath.iterdir():
    src_file = i/'spades.fa'
    dst_file = outpath/(i.name + '.fa')
    try:
        shutil.copy(src_file, dst_file)
    except OSError:
        # Only file-system failures (missing spades.fa, permissions, ...)
        # are reported by run name; a bare ``except`` would also hide
        # programming errors and KeyboardInterrupt.
        print(i.name)

SRR1944522
SRR1944515
SRR1944210
SRR1944492
SRR1944526
