-
Notifications
You must be signed in to change notification settings - Fork 8
/
strip_masked.py
executable file
·70 lines (65 loc) · 2.17 KB
/
strip_masked.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python3
"""
script for stripping out masked sequences if the masked sequence
is above a specified length
"""
import sys
import os
from ctbBio.fasta import iterate_fasta as parse_fasta
import argparse
def parse_masked(seq, min_len):
"""
parse masked sequence into non-masked and masked regions
"""
nm, masked = [], [[]]
prev = None
for base in seq[1]:
if base.isupper():
nm.append(base)
if masked != [[]] and len(masked[-1]) < min_len:
nm.extend(masked[-1])
del masked[-1]
prev = False
elif base.islower():
if prev is False:
masked.append([])
masked[-1].append(base)
prev = True
return nm, masked
def strip_masked(fasta, min_len, print_masked):
"""
remove masked regions from fasta file as long as
they are longer than min_len
"""
for seq in parse_fasta(fasta):
nm, masked = parse_masked(seq, min_len)
nm = ['%s removed_masked >=%s' % (seq[0], min_len), ''.join(nm)]
yield [0, nm]
if print_masked is True:
for i, m in enumerate([i for i in masked if i != []], 1):
m = ['%s insertion:%s' % (seq[0], i), ''.join(m)]
yield [1, m]
if __name__ == '__main__':
parser = argparse.ArgumentParser(description = '# remove masked portion of sequences in fasta file')
parser.add_argument(\
'-f', required = True, \
help = 'fasta file')
parser.add_argument(\
'-l', default = 0, \
type = int, \
help = 'minimum length of masked region required for removal')
parser.add_argument(\
'--print-masked', action = 'store_true', \
help = 'print masked sequences to stderr')
args = vars(parser.parse_args())
fasta, min_len, print_masked = \
args['f'], args['l'], args['print_masked']
if fasta == '-':
fasta = sys.stdin
else:
fasta = open(fasta)
for i in strip_masked(fasta, min_len, print_masked):
if i[0] == 0:
print('\n'.join(i[1]))
else:
print('\n'.join(i[1]), file=sys.stderr)