forked from voidptr/Caper
/
tests.py
302 lines (232 loc) · 12.9 KB
/
tests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
# @CTB to test: do our own mapping parsing & verify against caper, too.
# @CTB refactor test_bridge_nlmsa_equiv to use the new pygr ival construction
# lib code.
import os.path
try:
import screed
except ImportError:
raise Exception, "you need to install screed!"
import caper
from caper_pygr_bridge import CaperBridge
from screed.pygr_api import ScreedSequenceDB
from pygr import cnestedlist, seqdb
sequence_path = 'data/REL606.gmc.fa'
sequence_fastq = 'data/REL606-seqs.fastq'
contig = 'rel606'
#genome_index = 'data/cache/REL606.gmc.fa.genomeindex'
#sequence_index = 'data/cache/REL606.gmc.fa.indexed'
#map_path = 'data/REL606-maq-map.txt'
#map_index = 'data/cache/REL606-maq-map.txt.mappingindex'
map_bundle = 'data/cache/REL606-maq-map.txt.bundle'
def setup():
# assert os.path.exists(map_index), "run data/build-indexes.sh, please!"
# assert os.path.exists(sequence_index), "run data/build-indexes.sh, please!"
assert os.path.exists(map_bundle), "run data/build-indexes.sh, please!"
###
"""
The REL606 genome, from index 0-76, along with reads mapped to indexes from 2(1) to 42(41), looks as follows:
1111111111222222222233333333334444444444555555555566666666667777777
01234567890123456789012345678901234567890123456789012345678901234567890123456
^ ^ ^ ^ ^ ^ ^ ^
agcttttcattctgactgcaacgggcaatatgtctctgtgtggattaaaaaaagagtgtctgatagcagcttctgaa
HWI-EAS_4_PE-FC20GCB:1:146:930:514/1 2 (1) gcttttcattctgactgcaacgggcaatatgtctct
HWI-EAS_4_PE-FC20GCB:1:210:846:698/1 4 (3) ttttcattctgactgcaacgggcaatatgtctctgt
HWI-EAS_4_PE-FC20GCB:1:285:772:762/1 5 (4) tttcattctgactgcaacgggcaatatgtctctgtg
HWI-EAS_4_PE-FC20GCB:1:172:430:569/1 11 (10) tctgactgcaacgggcaatatgtctctgtgtggatt
HWI-EAS_4_PE-FC20GCB:1:1:563:116/1 12 (11) ctgactgcaacgggcaatatgtctctgtgcagaata
HWI-EAS_4_PE-FC20GCB:1:113:162:700/1 13 (12) tgactgcaacgggcaatatgtctctgtgtggattaa
HWI-EAS_4_PE-FC20GCB:1:212:153:213/1 14 (13) gactgcaacgggcaatatgtctctgtgtggattaaa
HWI-EAS_4_PE-FC20GCB:1:246:99:950/1 14 (13) gactgcaacgggcaatatgtctctgcgtggattcaa
HWI-EAS_4_PE-FC20GCB:1:278:188:572/1 19 (18) caacgggcaatatgtctctgtgtggattaaaaaaag
HWI-EAS_4_PE-FC20GCB:1:267:373:737/1 20 (19) aacgggcaatatgtctctgtgtggattaaaaaaaga
HWI-EAS_4_PE-FC20GCB:1:306:568:350/1 26 (25) aaatatgtctctgtgtggattaaaaaaagagtgtct
HWI-EAS_4_PE-FC20GCB:1:14:877:706/1 27 (26) aatatgtctctgtgtggattaaaaaaagagtgtctg
HWI-EAS_4_PE-FC20GCB:1:65:981:916/1 28 (27) atatgtctctgtgtggattaaaaaaagagtgtctga
HWI-EAS_4_PE-FC20GCB:1:163:985:583/1 30 (29) atgtctctgtgtggattaaaaaaagagtgtctgata
HWI-EAS_4_PE-FC20GCB:1:180:581:274/1 34 (33) ctctgtatggattaaaaaaagagtgtctgatagcag
HWI-EAS_4_PE-FC20GCB:1:160:122:568/1 38 (37) gtgtggattaaaaaaagagtgtctgatagcagcttc
HWI-EAS_4_PE-FC20GCB:1:23:564:736/1 39 (38) tgtggattaaaaaaagagtgtctgatagcagcttct
HWI-EAS_4_PE-FC20GCB:1:166:896:40/1 39 (38) tgtggattaaaaaaagagtgtctgatagcagcttct
HWI-EAS_4_PE-FC20GCB:1:133:865:549/1 42 (41) ggattaaaaaaagagtgtctgatagcagtttttgaa
A single read 36bp in length:
11111111112222222222333333
012345678901234567890123456789012345
ggattaaaaaaagagtgtctgatagcagtttttgaa
111111111111111111111111111111111111
333333333333333333333333333333333333
777777777777777777777888888888888888
788888888889999999999 11111
901234567890123456789012345678901234
ccatctgctgcacagcacccgtaagcaggttgaaga
"""
class MappingContainer_Test(object):
def setup(self):
self.cont = caper.mapping_container(map_bundle)
self.db = seqdb.SequenceFileDB(sequence_path)
self.seq = self.db[contig]
def test_read_length(self):
"""Test the read length"""
length = self.cont.get_read_length()
assert length == 36
def test_intersect_borders(self):
"""Test that the intersect returns different results when a read drops off the front, and hops on the back"""
inter36 = self.cont.get_intersect(contig, 36)
inter37 = self.cont.get_intersect(contig, 37)
## the first results shouldn't be the same, since we drop the read at index 1
assert inter36[0][4][0][0] != inter37[0][4][0][0]
## the last result shouldn't be the same, since we pick up a new read at index 37
assert inter36[13][4][0][0] != inter37[13][4][0][0]
def test_slice_borders(self):
"""Test that the slice returns different results when a read drops off the front, and hops on the back"""
inter36 = self.cont.get_slice(contig, 36, 36)
inter37 = self.cont.get_slice(contig, 37, 37)
## the first results shouldn't be the same, since we drop the read at index 1
assert inter36[0][4][0][0] != inter37[0][4][0][0]
## the last result shouldn't be the same, since we pick up a new read at index 37
assert inter36[13][4][0][0] != inter37[13][4][0][0]
def test_intersect_edge_case(self):
"""Test that at the 0th interval, that intersect doesn't return anything weird"""
inter0 = self.cont.get_intersect(contig, 0)
assert len(inter0) == 0
def test_slice_edge_cases(self):
"""Test that at the edges, that intersect doesn't return anything weird"""
slice_minus_1 = self.cont.get_slice(contig, -1, -1) ## this should just zero out
assert len(slice_minus_1) == 0
slice0 = self.cont.get_slice(contig, 0, 0) ## nothing at this index, but ok.
assert len(slice0) == 0
slice_last_read_indexed_front = self.cont.get_slice(contig, 13779, 13779) ## the last read starts here
assert len(slice_last_read_indexed_front) == 18
slice_last_read_indexed_back = self.cont.get_slice(contig, 13814, 13814) ## the last read ends here
print len(slice_last_read_indexed_back)
assert len(slice_last_read_indexed_back) == 1
slice_just_past_last_read_indexed = self.cont.get_slice(contig, 13815, 13815) ## nothing here
print len(slice_just_past_last_read_indexed)
assert len(slice_just_past_last_read_indexed) == 0
slice_just_past_last_read_indexed_to_the_next_window = self.cont.get_slice(contig, 13815, 14815) ## nothing here
print len(slice_just_past_last_read_indexed_to_the_next_window)
assert len(slice_just_past_last_read_indexed_to_the_next_window) == 0
## this causes a segfault. Fix!
slice_start_over_one_window_past_last_read_indexed = self.cont.get_slice(contig, 14815, 14815) ## nothing here
print len(slice_start_over_one_window_past_last_read_indexed)
assert len(slice_start_over_one_window_past_last_read_indexed) == 0
def test_get_reads_retrieve(self):
"""Test retrieval of reads indexed at a particular index"""
reads = self.cont.get_reads(contig, 13)
assert len(reads) == 2
reads_result = list()
for index, name, sequence, orientation in reads:
thing = (index, name, sequence, orientation)
reads_result.append(thing)
assert reads_result == \
[(13, 'HWI-EAS_4_PE-FC20GCB:1:212:153:213/1', 'gactgcaacgggcaatatgtctctgtgtggattaaa', 1),
(13, 'HWI-EAS_4_PE-FC20GCB:1:246:99:950/1', 'gactgcaacgggcaatatgtctctgcgtggattcaa', 1)], reads_result
def test_slice_retrieve(self):
"""Test retrieval of slices"""
slice = self.cont.get_slice(contig, 0, 5)
slice_result = list()
for ival_start, ival_stop, start, stop, matches in slice:
for index, name, sequence, orientation in matches:
thing = (ival_start, ival_stop, start, stop, name, orientation);
slice_result.append(thing)
assert len(slice) == 3
assert slice_result == \
[(1, 5, 0, 4, 'HWI-EAS_4_PE-FC20GCB:1:146:930:514/1', -1),
(3, 5, 0, 2, 'HWI-EAS_4_PE-FC20GCB:1:210:846:698/1', 1),
(4, 5, 0, 1, 'HWI-EAS_4_PE-FC20GCB:1:285:772:762/1', 1)], slice_result
# check lengths
for (tstart, tstop, bstart, bstop, _, o) in slice_result:
assert tstop - tstart == bstop - bstart
def test_intesect_retrieve(self):
"""Test retrieval of intersections"""
intersect = self.cont.get_intersect(contig, 5)
intersect_result = list()
for ival_start, ival_stop, start, stop, matches in intersect:
for index, name, sequence, orientation in matches:
thing = (ival_start, ival_stop, start, stop, name, orientation);
intersect_result.append(thing)
assert len(intersect) == 3
assert intersect_result == \
[(5, 5, 4, 4, 'HWI-EAS_4_PE-FC20GCB:1:146:930:514/1', -1),
(5, 5, 2, 2, 'HWI-EAS_4_PE-FC20GCB:1:210:846:698/1', 1),
(5, 5, 1, 1, 'HWI-EAS_4_PE-FC20GCB:1:285:772:762/1', 1)], intersect_result
# check lengths
for (tstart, tstop, bstart, bstop, _, o) in intersect_result:
assert tstop - tstart == bstop - bstart
def test_restricted_slice_and_intersect_equivalency(self):
"""Test that the values returned by intersect at a particular index, and slice restricted to that index are the same"""
slice = self.cont.get_slice(contig, 5, 5)
slice_result = list()
for ival_start, ival_stop, start, stop, matches in slice:
for index, name, sequence, orientation in matches:
thing = (ival_start, ival_stop, start, stop, name, orientation);
slice_result.append(thing)
assert len(slice) == 3
intersect = self.cont.get_intersect(contig, 5)
intersect_result = list()
for ival_start, ival_stop, start, stop, matches in intersect:
for index, name, sequence, orientation in matches:
thing = (ival_start, ival_stop, start, stop, name, orientation);
intersect_result.append(thing)
assert len(intersect) == 3
assert slice_result == intersect_result
def test_empty_slice_retrieve(self):
"""Test that slice returns an emtpy set at the right time"""
matches = self.cont.get_slice(contig, 14000, 14999)
assert len(matches) == 0
def test_slice_was_broken_boundary(self):
"""Test that slice returns the proper set of reads, even if we overrun the edge of the known reads for a given contig"""
matches = self.cont.get_slice(contig, 0, 20000)
count = 0;
for index in matches:
count = count + len(index[4])
assert count == 10000
def test_retrieve_all(self):
"""Test that slice retrieves all the reads in the whole genome"""
matches = self.cont.get_slice(contig, 0, len(self.seq))
count = 0;
for index in matches:
count = count + len(index[4])
assert count == 10000
class PygrBridge_Test(object):
def setup(self):
self.cont = caper.mapping_container(map_bundle)
self.db = seqdb.SequenceFileDB(sequence_path) #ScreedSequenceDB('data/REL606.gmc.fa')
self.db = ScreedSequenceDB(sequence_path)
self.seq = self.db[contig]
self.reads_db = ScreedSequenceDB(sequence_fastq)
self.al = CaperBridge(self.cont, self.db, self.reads_db)
def test_basic_slice(self):
ival = self.seq[0:50]
slice = self.al[ival]
assert len(slice) == 22
def test_bridge_nlmsa_equiv(self):
nlmsa = cnestedlist.NLMSA('foo', 'memory', pairwiseMode=True)
nlmsa += self.seq
# select across a wider window then we'll actually slice, to get
# an inclusive set of reads
m = self.cont.get_slice('rel606', 0, 200)
# construct nlmsa
N=0
for ival_start, ival_stop, read_start, read_stop, match in m:
for index, read_name, sequence, orientation in match:
N += 1
if ival_start == ival_stop:
continue
ival = self.seq[ival_start:ival_stop]
if orientation == -1:
match_seq = -self.reads_db[read_name]
else:
match_seq = self.reads_db[read_name]
match_ival = match_seq[read_start:read_stop]
nlmsa[ival] += match_ival
nlmsa.build()
### now, slice!
ival = self.seq[75:80]
nlmsa_slice = nlmsa[ival]
caper_slice = self.al[ival]
s = set(nlmsa_slice.keys())
t = set(caper_slice.keys())
assert s == t
for k, v, e in nlmsa[ival].edges():
print repr(k), repr(v), e.pIdentity(), v.orientation
assert e.pIdentity() >= 0.8, (repr(k), repr(v), e.pIdentity())