-
Notifications
You must be signed in to change notification settings - Fork 294
/
parsing.pyx
529 lines (412 loc) · 15.7 KB
/
parsing.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
# -*- coding: UTF-8 -*-
from cython.operator cimport dereference as deref
cimport cython
from libcpp cimport bool
from libcpp.string cimport string
import sys
from khmer._oxli.utils cimport _bstring, _ustring
cdef class Alphabets:
@staticmethod
def get(name):
cdef unicode alphabet = _ustring(Alphabets._get(_bstring(name)))
if not alphabet:
raise ValueError('No alphabet with name {0}'.format(name))
return alphabet
@staticmethod
cdef string _get(string name):
if name == b'DNA_SIMPLE':
return DNA_SIMPLE
elif name == b'DNAN_SIMPLE':
return DNAN_SIMPLE
elif name == b'RNA_SIMPLE':
return RNA_SIMPLE
elif name == b'RNAN_SIMPLE':
return RNAN_SIMPLE
elif name == b'IUPAC_NUCL':
return IUPAC_NUCL
elif name == b'IUPAC_AA':
return IUPAC_AA
else:
return string()
@cython.freelist(100)
cdef class Sequence:
def __cinit__(self, name=None, sequence=None,
quality=None, description=None,
cleaned_seq=None):
if name is not None and sequence is not None:
self._obj.sequence = _bstring(sequence)
self._obj.name = _bstring(name)
if description is not None:
self._obj.description = _bstring(description)
if quality is not None:
self._obj.quality = _bstring(quality)
if cleaned_seq is not None:
self._obj.cleaned_seq = _bstring(cleaned_seq)
else:
self._obj.cleaned_seq = self._obj.sequence
def __str__(self):
return repr(self)
def __repr__(self):
return 'Sequence(name="{0}", sequence="{1}")'.format(self.name, self.sequence)
def __len__(self):
return self._obj.sequence.length()
def __richcmp__(x, y, op):
if op == 2:
return x.name == y.name and x.sequence == y.sequence
else:
raise NotImplementedError('Operator not available')
def kmers(self, int K):
cdef int i = 0
cdef unicode sequence = self.sequence
for i in range(0, len(self)-K+1):
yield sequence[i:i+K]
def __getitem__(self, x):
# Definitely optimize this.
return self.sequence[x]
@property
def name(self):
cdef unicode name = self._obj.name
return self._obj.name if name else None
@property
def sequence(self):
cdef unicode sequence = self._obj.sequence
return self._obj.sequence if sequence else None
@property
def description(self):
cdef unicode description = self._obj.description
return description if description else None
@property
def quality(self):
cdef unicode quality = self._obj.quality
return quality if quality else None
@property
def cleaned_seq(self):
cdef unicode cleaned_seq = self._obj.cleaned_seq
return cleaned_seq if cleaned_seq else None
@staticmethod
def from_screed_record(record):
cdef Sequence seq = Sequence(name=record.name,
sequence=record.sequence)
if hasattr(record, 'quality'):
seq._obj.quality = _bstring(record.quality)
for attr in ('annotations', 'description'):
if hasattr(record, attr):
seq._obj.description = _bstring(getattr(record, attr))
return seq
@staticmethod
cdef Sequence _wrap(CpSequence cseq):
cdef Sequence seq = Sequence()
seq._obj = cseq
return seq
cdef class ReadBundle:
def __cinit__(self, *raw_records):
self.reads = [r for r in raw_records if r]
@property
def num_reads(self):
return len(self.reads)
@property
def total_length(self):
return sum([len(r.sequence) for r in self.reads])
def print_error(msg):
"""Print the given message to 'stderr'."""
print(msg, file=sys.stderr)
class UnpairedReadsError(ValueError):
"""ValueError with refs to the read pair in question."""
def __init__(self, msg, r1, r2):
r1_name = "<no read>"
r2_name = "<no read>"
if r1:
r1_name = r1.name
if r2:
r2_name = r2.name
msg = msg + '\n"{0}"\n"{1}"'.format(r1_name, r2_name)
ValueError.__init__(self, msg)
self.read1 = r1
self.read2 = r2
cdef inline bool is_valid(const char base, string& alphabet):
cdef char b
for b in alphabet:
if b == base:
return True
return False
cdef inline bool sanitize_sequence(string& sequence,
string& alphabet,
bool convert_n):
cdef int i = 0
for i in range(sequence.length()):
sequence[i] &= 0xdf
if not is_valid(sequence[i], alphabet):
return False
if convert_n and sequence[i] == b'N':
sequence[i] = b'A'
return True
cdef class FastxParser:
def __cinit__(self, filename, *args, **kwargs):
self._this = get_parser[CpFastxReader](_bstring(filename))
cdef Sequence _next(self):
if not self.is_complete():
return Sequence._wrap(deref(self._this).get_next_read())
else:
return None
cpdef bool is_complete(self):
return deref(self._this).is_complete()
def __iter__(self):
cdef Sequence seq
while not self.is_complete():
seq = self._next()
yield seq
cdef class SanitizedFastxParser(FastxParser):
def __cinit__(self, filename, alphabet='DNAN_SIMPLE',
bool convert_n=True):
self.n_bad = 0
self.convert_n = convert_n
self._alphabet = Alphabets._get(_bstring(alphabet))
cdef Sequence _next(self):
cdef Sequence seq
cdef bool good
if not self.is_complete():
seq = FastxParser._next(self)
good = sanitize_sequence(seq._obj.sequence,
self._alphabet,
self.convert_n)
if not good:
self.n_bad += 1
return None
else:
return seq
else:
return None
def __iter__(self):
cdef Sequence seq
while not self.is_complete():
seq = self._next()
if seq is not None:
yield seq
cdef class SplitPairedReader:
def __cinit__(self, FastxParser left_parser,
FastxParser right_parser,
int min_length=-1,
bool force_name_match=False):
self.left_parser = left_parser
self.right_parser = right_parser
self.min_length = min_length
self.force_name_match = force_name_match
def __iter__(self):
cdef Sequence first, second
cdef object err
cdef read_num = 0
cdef int found
found, first, second, err = self._next()
while found != 0:
if err is not None:
raise err
if self.min_length > 0:
if len(first) >= self.min_length and \
len(second) >= self.min_length:
yield read_num, True, first, second
else:
yield read_num, True, first, second
read_num += found
found, first, second, err = self._next()
cdef tuple _next(self):
cdef Sequence first = self.left_parser._next()
cdef bool first_complete = self.left_parser.is_complete()
cdef Sequence second = self.right_parser._next()
cdef bool second_complete = self.right_parser.is_complete()
if first_complete is not second_complete:
err = UnpairedReadsError('Differing lengths of left '\
'and right files!')
return -1, None, None, err
if first_complete:
return 0, None, None, None
if first is None or second is None:
return 1, first, second, None
if self.force_name_match:
if _check_is_pair(first, second):
return 2, first, second, None
else:
err = UnpairedReadsError('', first, second)
return -1, None, None, err
else:
return 2, first, second, None
cdef class BrokenPairedReader:
def __cinit__(self, FastxParser parser,
int min_length=-1,
bool force_single=False,
bool require_paired=False):
if force_single and require_paired:
raise ValueError("force_single and require_paired cannot both be set!")
self.parser = parser
self.min_length = min_length
self.force_single = force_single
self.require_paired = require_paired
self.record = None
def __iter__(self):
cdef Sequence first
cdef Sequence second
cdef object err
cdef int found
cdef int read_num = 0
found, first, second, err = self._next()
while (found != 0):
if err is not None:
raise err
if self.min_length > 0:
if first is not None and len(first) < self.min_length:
first = None
found -= 1
if second is not None and len(second) < self.min_length:
second = None
found -= 1
if self.force_single:
if first is not None:
yield read_num, found == 2, first, None
read_num += found
if second is not None:
yield read_num, found == 2, second, None
read_num += found
elif self.require_paired:
if first is not None and second is not None:
yield read_num, found == 2, first, second
read_num += found
else:
if first is not None or second is not None:
yield read_num, found == 2, first, second
read_num += found
found, first, second, err = self._next()
cdef tuple _next(self):
cdef Sequence first, second
cdef int is_pair
if self.record is None:
first = self.parser._next()
if first is None:
if self.parser.is_complete():
return 0, None, None, None
else:
if self.require_paired:
err = UnpairedReadsError(
"Uneven number of reads when require_paired is set!",
first)
return -1, None, None, err
return 1, first, None, None
else:
first = self.record
second = self.parser._next()
# check if paired
if second is not None and first is not None:
is_pair = _check_is_pair(first, second)
if is_pair == -1:
err = ValueError("records must be same type (FASTA or FASTQ)")
return -1, None, None, err
if is_pair and not self.force_single:
self.record = None
return 2, first, second, None # found 2 proper records
else: # orphan.
if self.require_paired:
err = UnpairedReadsError(
"Unpaired reads when require_paired is set!",
first, second)
return -1, None, None, err
self.record = second
return 1, first, None, None
elif self.parser.is_complete():
# ran out of reads getting second, handle last record
if self.require_paired:
err = UnpairedReadsError("Unpaired reads when require_paired "
"is set!", first, None)
return -1, None, None, err
self.record = None
return 1, first, second, None
else: # one read was invalid, but that doesn't mean they were unpaired
return 1, first, second, None
cpdef tuple _split_left_right(unicode s):
cdef string cppstr = s.encode('UTF-8')
return _cppstring_split_left_right(cppstr)
cdef tuple _cppstring_split_left_right(string& s):
"""Split record name at the first whitespace and return both parts.
RHS is set to an empty string if not present.
"""
cdef unsigned int i
cdef const char * c_str = s.c_str()
cdef unicode lhs, rhs
lhs = u''
rhs = u''
for i in range(len(s)):
if lhs == u'':
if (c_str[i] == b' ' or c_str[i] == b'\t'):
lhs = _ustring(c_str[0:i])
else:
if c_str[i] != b' ' or c_str[i] != b'\t':
rhs = _ustring(c_str[i:len(s)])
break
lhs = _ustring(c_str[0:len(s)]) if lhs == u'' else lhs
return lhs, rhs
cdef int _check_is_pair(Sequence first, Sequence second):
"""Check if the two sequence records belong to the same fragment.
In an matching pair the records are left and right pairs
of each other, respectively. Returns True or False as appropriate.
Handles both Casava formats: seq/1 and seq/2, and 'seq::... 1::...'
and 'seq::... 2::...'.
Also handles the default format of the SRA toolkit's fastq-dump:
'Accession seq/1'
"""
if first.quality is None or second.quality is None:
if first.quality is not second.quality:
return -1
cdef unicode lhs1, rhs1, lhs2, rhs2
lhs1, rhs1 = _cppstring_split_left_right(first._obj.name)
lhs2, rhs2 = _cppstring_split_left_right(second._obj.name)
# handle 'name/1'
cdef unicode subpart1, subpart2
if lhs1.endswith('/1') and lhs2.endswith('/2'):
subpart1 = lhs1.split('/', 1)[0]
subpart2 = lhs2.split('/', 1)[0]
if subpart1 and subpart1 == subpart2:
return 1
# handle '@name 1:rst'
elif lhs1 == lhs2 and rhs1.startswith('1:') and rhs2.startswith('2:'):
return 1
# handle @name seq/1
elif lhs1 == lhs2 and rhs1.endswith('/1') and rhs2.endswith('/2'):
subpart1 = rhs1.split('/', 1)[0]
subpart2 = rhs2.split('/', 1)[0]
if subpart1 and subpart1 == subpart2:
return 1
return 0
def check_is_pair(first, second):
if type(first) is not Sequence:
first = Sequence.from_screed_record(first)
if type(second) is not Sequence:
second = Sequence.from_screed_record(second)
cdef int ret = _check_is_pair(first, second)
if ret == -1:
raise ValueError("both records must be same type (FASTA or FASTQ)")
return ret == 1
cpdef bool check_is_left(s):
"""Check if the name belongs to a 'left' sequence (/1).
Returns True or False.
Handles both Casava formats: seq/1 and 'seq::... 1::...'
"""
cdef unicode lhs, rhs
lhs, rhs = _split_left_right(_ustring(s))
if lhs.endswith('/1'): # handle 'name/1'
return True
elif rhs.startswith('1:'): # handle '@name 1:rst'
return True
elif rhs.endswith('/1'): # handles '@name seq/1'
return True
return False
cpdef bool check_is_right(s):
"""Check if the name belongs to a 'right' sequence (/2).
Returns True or False.
Handles both Casava formats: seq/2 and 'seq::... 2::...'
"""
cdef unicode lhs, rhs
lhs, rhs = _split_left_right(_ustring(s))
if lhs.endswith('/2'): # handle 'name/2'
return True
elif rhs.startswith('2:'): # handle '@name 2:rst'
return True
elif rhs.endswith('/2'): # handles '@name seq/2'
return True
return False