-
Notifications
You must be signed in to change notification settings - Fork 15
/
fastq-pe-random.py
executable file
·51 lines (40 loc) · 1.41 KB
/
fastq-pe-random.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/python
"""
take paired end files and generate a new
set of paired end files with only a random
subset of reads.
Usage:
python %s reads_1.fastq reads_2.fastq 100
will take 100 random reads (still paired) from each file
and create the new files reads_1.fastq.subset and reads_2.fastq.subset
"""
import random
import sys
print "Usage: python fastq-pe-random.py reads_1.fastq reads_2.fastq 100"
def write_random_records(fqa, fqb, N=100000):
""" get N random headers from a fastq file without reading the
whole thing into memory"""
records = sum(1 for _ in open(fqa)) / 4
rand_records = sorted([random.randint(0, records - 1) for _ in xrange(N)])
fha, fhb = open(fqa), open(fqb)
suba, subb = open(fqa + ".subset", "w"), open(fqb + ".subset", "w")
rec_no = -1
written = 0
for rr in rand_records:
while rec_no < rr:
for i in range(4): fha.readline()
for i in range(4): fhb.readline()
rec_no += 1
for i in range(4):
suba.write(fha.readline())
subb.write(fhb.readline())
rec_no += 1
written += 1
assert written == N
print >>sys.stderr, "wrote to %s, %s" % (suba.name, subb.name)
if __name__ == "__main__":
if len(sys.argv) < 3:
print __doc__ % sys.argv[0]
sys.exit()
N = 100 if len(sys.argv) < 4 else int(sys.argv[3])
write_random_records(sys.argv[1], sys.argv[2], N)