-
Notifications
You must be signed in to change notification settings - Fork 59
/
sequence_utilities.py
162 lines (143 loc) · 5.1 KB
/
sequence_utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python
#
# Sequence related utilitiesa - written during development
# of the test project of GFang
#
# Emre Guney, 20/12/2012, distributed under GNU-GPL.
from bisect import bisect_left, bisect_right
def read_fasta_file(fasta_file):
f = open(fasta_file)
for i, line in enumerate(f):
line = line.strip()
if line == "":
continue
# Only consider fasta header lines
if line[0] == ">":
gene, positions, strand = parse_fasta_header(line)
# For this example inter-genic regions are skipped and the gene is assumed to span whole region
start = positions[0][0]
end = positions[-1][1]
f.close()
return
def get_closest_distances(positions, start, end, k):
"""
Returns the k closest distances to start and end within positions array.
>>> get_closest_distances([ 3, 6, 12, 14 ], 7, 9, 2)
[-1, 3]
>>> get_closest_distances([ 3, 6, 8, 9, 10, 12, 14 ], 7, 9, 3)
[0, 0, -1]
"""
distances = []
zeros = []
# Trick to only use sublist (positions are sorted)
lower = bisect_left(positions, start)
upper = bisect_right(positions, end)
lower -= 10
if lower < 0: lower = 0
upper += 10
positions = positions[lower:upper]
for pos in positions:
if pos <= start:
distances.append(pos-start)
elif pos > start and pos <= end:
zeros.append(0)
elif pos > end:
distances.append(pos-end)
else:
raise ValueError("Logical error")
distances.sort(lambda x,y: cmp(abs(x), abs(y)))
zeros.extend(distances)
return zeros[:k]
def parse_fasta_header(line):
"""
Returns gene_name, [(start, end), ..], strand for a given fasta header line.
>>> parse_fasta_header(">lcl|NC_000913.2_cdsid_NP_417358.2 [gene=xanQ] [protein=xanthine permease] [protein_id=NP_417358.2] [location=3022373..3023773]")
('xanQ', [(3022373, 3023773)], '+')
>>> parse_fasta_header(">lcl|NC_000913.2_cdsid_NP_414616.1 [gene=leuA] [protein=2-isopropylmalate synthase] [protein_id=NP_414616.1] [location=complement(81958..83529)]")
('leuA', [(81958, 83529)], '-')
>>> parse_fasta_header(">lcl|NC_000913.2_cdsid_NP_417367.1 [gene=prfB] [protein=peptide chain release factor RF-2] [protein_id=NP_417367.1] [location=complement(join(3033206..3034228,3034230..3034304))]")
('prfB', [(3033206, 3034228), (3034230, 3034304)], '-')
"""
import re
# Regular expressions to match id and location
#exp_id = re.compile("\[protein_id=([a-zA-Z0-9_\.]+)\]")
exp_id = re.compile("\[gene=([a-zA-Z0-9]+)\]")
exp_loc = re.compile("\[location=([a-zA-Z0-9_\.(),]+)\]")
positions = []
strand = '+'
protein_id = None
m = exp_id.search(line)
if m:
protein_id = m.group(1)
start, end = None, None
m = exp_loc.search(line)
if m:
loc_str = m.group(1)
if loc_str.startswith("complement"):
strand = '-'
loc_str = loc_str[11:-1]
if loc_str.startswith("join"):
loc_str = loc_str[5:-1]
for pair in loc_str.split(","):
start, end = map(int, pair.split(".."))
positions.append((start, end))
else:
start, end = map(int, loc_str.split(".."))
positions.append((start, end))
return protein_id, positions, strand
#def flip(c):
# d = { 'A': 'T', 'G': 'C', 'C': 'G', 'T': 'A' }
# return d[c]
def find_all_occurrences_of_motif(motif, fasta_file):
"""
Returns all occurrences of the motif in a fasta file.
>>> find_all_occurrences_of_motif("GATC", "mock_genome.fasta")
[61, 68, 138]
"""
f = open(fasta_file)
# Skip fasta header
f.readline()
# For remainder motifs coming from previous line
previous_line = "XXX"
all_occurrences = []
for i, line in enumerate(f):
line = line.strip()
if line == "":
continue
# All indices will be shifted by 3 due to consideration of remainders
sequence = previous_line[-3:] + line
previous_line = line
# Assumes properly formatted FASTA
if len(line) != 70:
print "Line %d in FASTA file does not contain 70 characters" % (i+2)
base_position = -3 + i*70
occurrences = search_motif(motif, sequence)
for k in occurrences:
all_occurrences.append(base_position+k)
f.close()
return all_occurrences
def search_motif(motif, sequence):
"""
Returns occurrences (start indices) of the motif in the sequence.
>>> search_motif("GATC", "ACAACGATCGCCAGCAGATCGTGAGGC")
[5, 16]
>>> search_motif("GATC", "GATCGCCAGCAGATC")
[0, 11]
>>> search_motif("GATC", "GCTCGCCAGCATATC")
[]
>>> search_motif("GATC", "GAT")
[]
"""
occurrences = []
i_limit = len(sequence) - 4
for i, c in enumerate(sequence):
if c == motif[0]:
if i > i_limit:
continue
if sequence[i:i+4] == motif:
occurrences.append(i)
return occurrences
if __name__ == "__main__":
import doctest
doctest.testmod()
main()