-
Notifications
You must be signed in to change notification settings - Fork 295
/
extract-paired-reads.py
executable file
·105 lines (82 loc) · 3.32 KB
/
extract-paired-reads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#! /usr/bin/env python2
#
# This script is part of khmer, http://github.com/ged-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see doc/LICENSE.txt.
# Contact: khmer-project@idyll.org
#
# pylint: disable=invalid-name,missing-docstring
"""
Take a file containing a mixture of interleaved and orphaned reads, and
extract them into separate files (.pe and .se).
% scripts/extract-paired-reads.py <infile>
Reads FASTQ and FASTA input, retains format for output.
"""
import screed
import sys
import os.path
import textwrap
import argparse
import khmer
from khmer.kfile import check_file_status, check_space
from khmer.khmer_args import info
from khmer.utils import broken_paired_reader, write_record, write_record_pair
def get_parser():
epilog = """
The output is two files, <input file>.pe and <input file>.se, placed in the
current directory. The .pe file contains interleaved and properly paired
sequences, while the .se file contains orphan sequences.
Many assemblers (e.g. Velvet) require that you give them either perfectly
interleaved files, or files containing only single reads. This script takes
files that were originally interleaved but where reads may have been
orphaned via error filtering, application of abundance filtering, digital
normalization in non-paired mode, or partitioning.
Example::
extract-paired-reads.py tests/test-data/paired.fq
"""
parser = argparse.ArgumentParser(
description='Take a mixture of reads and split into pairs and '
'orphans.', epilog=textwrap.dedent(epilog))
parser.add_argument('infile')
parser.add_argument('--version', action='version', version='%(prog)s ' +
khmer.__version__)
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
return parser
def main():
info('extract-paired-reads.py')
args = get_parser().parse_args()
check_file_status(args.infile, args.force)
infiles = [args.infile]
check_space(infiles, args.force)
outfile = os.path.basename(args.infile)
if len(sys.argv) > 2:
outfile = sys.argv[2]
single_fp = open(outfile + '.se', 'w')
paired_fp = open(outfile + '.pe', 'w')
print >>sys.stderr, 'reading file "%s"' % args.infile
print >>sys.stderr, 'outputting interleaved pairs to "%s.pe"' % outfile
print >>sys.stderr, 'outputting orphans to "%s.se"' % outfile
n_pe = 0
n_se = 0
screed_iter = screed.open(args.infile, parse_description=False)
for index, is_pair, read1, read2 in broken_paired_reader(screed_iter):
if index % 100000 == 0 and index > 0:
print >>sys.stderr, '...', index
if is_pair:
write_record_pair(read1, read2, paired_fp)
n_pe += 1
else:
write_record(read1, single_fp)
n_se += 1
single_fp.close()
paired_fp.close()
if n_pe == 0:
raise Exception("no paired reads!? check file formats...")
print >>sys.stderr, 'DONE; read %d sequences,' \
' %d pairs and %d singletons' % \
(n_pe * 2 + n_se, n_pe, n_se)
print >> sys.stderr, 'wrote to: ' + outfile \
+ '.se' + ' and ' + outfile + '.pe'
if __name__ == '__main__':
main()