-
Notifications
You must be signed in to change notification settings - Fork 8
/
seqbag.go
462 lines (415 loc) · 12 KB
/
seqbag.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
package align
import (
"errors"
"fmt"
"log"
"math"
"math/rand"
"regexp"
"sort"
"strings"
"unicode"
"github.com/fredericlemoine/goalign/io"
)
type SeqBag interface {
AddSequence(name string, sequence string, comment string) error
AddSequenceChar(name string, sequence []rune, comment string) error
AppendSeqIdentifier(identifier string, right bool)
Alphabet() int
AlphabetStr() string
AlphabetCharacters() []rune
AutoAlphabet() // detects and sets alphabet automatically for all the sequences
CharStats() map[rune]int64
CleanNames() // Clean sequence names (newick special char)
GetSequence(name string) (string, bool) // Get a sequence by names
GetSequenceById(ith int) (string, bool)
GetSequenceChar(name string) ([]rune, bool)
GetSequenceCharById(ith int) ([]rune, bool)
GetSequenceNameById(ith int) (string, bool)
SetSequenceChar(ithAlign, ithSite int, char rune) error
Identical(SeqBag) bool
Iterate(it func(name string, sequence string))
IterateChar(it func(name string, sequence []rune))
IterateAll(it func(name string, sequence []rune, comment string))
NbSequences() int
Rename(namemap map[string]string)
RenameRegexp(regex, replace string, namemap map[string]string) error
ShuffleSequences() // Shuffle sequence order
TrimNames(namemap map[string]string, size int) error
TrimNamesAuto(namemap map[string]string, curid *int) error
Sort() // Sorts the sequences by name
}
type seqbag struct {
seqmap map[string]*seq // Map of sequences
seqs []*seq // Set of sequences (to preserve order)
alphabet int // AMINOACIDS , NUCLEOTIDS or UNKOWN
}
func NewSeqBag(alphabet int) *seqbag {
switch alphabet {
case AMINOACIDS, NUCLEOTIDS, UNKNOWN:
// OK
default:
io.ExitWithMessage(errors.New("Unexpected sequence alphabet type"))
}
return &seqbag{
make(map[string]*seq),
make([]*seq, 0, 100),
alphabet,
}
}
// Adds a sequence to this alignment
func (sb *seqbag) AddSequence(name string, sequence string, comment string) error {
err := sb.AddSequenceChar(name, []rune(sequence), comment)
return err
}
func (sb *seqbag) AddSequenceChar(name string, sequence []rune, comment string) error {
_, ok := sb.seqmap[name]
idx := 0
tmpname := name
/* If the sequence name already exists, we add a 4 digit index at the end and print a warning on stderr */
for ok {
idx++
log.Print(fmt.Sprintf("Warning: sequence \"%s\" already exists in alignment, renamed in \"%s_%04d\"", tmpname, name, idx))
tmpname = fmt.Sprintf("%s_%04d", name, idx)
_, ok = sb.seqmap[tmpname]
}
seq := NewSequence(tmpname, sequence, comment)
sb.seqmap[tmpname] = seq
sb.seqs = append(sb.seqs, seq)
return nil
}
// Append a string to all sequence names of the alignment
// If right is true, then append it to the right of each names,
// otherwise, appends it to the left
func (sb *seqbag) AppendSeqIdentifier(identifier string, right bool) {
if len(identifier) != 0 {
for _, seq := range sb.seqs {
if right {
seq.name = seq.name + identifier
} else {
seq.name = identifier + seq.name
}
}
}
}
func (sb *seqbag) Alphabet() int {
return sb.alphabet
}
func (sb *seqbag) AlphabetStr() string {
switch sb.Alphabet() {
case NUCLEOTIDS:
return "nucleotide"
case AMINOACIDS:
return "protein"
default:
return "unknown"
}
}
func (sb *seqbag) AlphabetCharacters() (alphabet []rune) {
if sb.Alphabet() == AMINOACIDS {
return stdaminoacid
} else {
return stdnucleotides
}
}
// Removes spaces and tabs at beginning and end of sequence names
// and replaces newick special characters \s\t()[];,.: by "-"
func (sb *seqbag) CleanNames() {
firstlast := regexp.MustCompile("(^[\\s\\t]+|[\\s\\t]+$)")
inside := regexp.MustCompile("[\\s\\t,\\[\\]\\(\\),;\\.:]+")
for _, seq := range sb.seqs {
seq.name = firstlast.ReplaceAllString(seq.name, "")
seq.name = inside.ReplaceAllString(seq.name, "-")
}
}
// If sequence exists in alignment, return true,sequence
// Otherwise, return false,nil
func (sb *seqbag) GetSequence(name string) (string, bool) {
seq, ok := sb.seqmap[name]
if ok {
return seq.Sequence(), ok
}
return "", ok
}
// If sequence exists in alignment, return sequence,true
// Otherwise, return "",false
func (sb *seqbag) GetSequenceById(ith int) (string, bool) {
if ith >= 0 && ith < sb.NbSequences() {
return sb.seqs[ith].Sequence(), true
}
return "", false
}
// If ith >=0 && i < nbSequences() return sequence,true
// Otherwise, return nil, false
func (sb *seqbag) GetSequenceCharById(ith int) ([]rune, bool) {
if ith >= 0 && ith < sb.NbSequences() {
return sb.seqs[ith].SequenceChar(), true
}
return nil, false
}
// If sequence exists in alignment, return sequence, true
// Otherwise, return nil,false
func (sb *seqbag) GetSequenceChar(name string) ([]rune, bool) {
seq, ok := sb.seqmap[name]
if ok {
return seq.SequenceChar(), ok
}
return nil, false
}
// If ith >=0 && i < nbSequences() return name,true
// Otherwise, return "", false
func (sb *seqbag) GetSequenceNameById(ith int) (string, bool) {
if ith >= 0 && ith < sb.NbSequences() {
return sb.seqs[ith].Name(), true
}
return "", false
}
// If sequence exists in alignment, return true,sequence
// Otherwise, return false,nil
func (sb *seqbag) NbSequences() int {
return len(sb.seqs)
}
func (sb *seqbag) SetSequenceChar(ithAlign, ithSite int, char rune) error {
if ithAlign < 0 || ithAlign >= sb.NbSequences() {
return errors.New("Sequence index is > number of sequences")
}
if ithSite < 0 || ithSite >= sb.seqs[ithAlign].Length() {
return errors.New("Site index is outside sequence length")
}
sb.seqs[ithAlign].sequence[ithSite] = char
return nil
}
// Returns true if:
//
// - sb and comp have the same number of sequences &&
// - each sequence in sb have a sequence in comp with the same name
// and the same sequence
//
// Identical seqbags may have sequences in different order
func (sb *seqbag) Identical(comp SeqBag) bool {
if sb.NbSequences() != comp.NbSequences() {
return false
}
for _, seq := range sb.seqs {
seq2, ok := comp.GetSequence(seq.name)
if !ok {
return false
}
if string(seq.sequence) != seq2 {
return false
}
}
return true
}
func (sb *seqbag) Iterate(it func(name string, sequence string)) {
for _, seq := range sb.seqs {
it(seq.name, string(seq.sequence))
}
}
func (sb *seqbag) IterateChar(it func(name string, sequence []rune)) {
for _, seq := range sb.seqs {
it(seq.name, seq.sequence)
}
}
func (sb *seqbag) IterateAll(it func(name string, sequence []rune, comment string)) {
for _, seq := range sb.seqs {
it(seq.name, seq.sequence, seq.comment)
}
}
/* It appends the given sequence to the sequence having given name */
func (sb *seqbag) appendToSequence(name string, sequence []rune) error {
seq, ok := sb.seqmap[name]
if !ok {
return errors.New(fmt.Sprintf("Sequence with name %s does not exist in alignment", name))
}
seq.sequence = append(seq.sequence, sequence...)
return nil
}
func (sb *seqbag) AutoAlphabet() {
isaa := true
isnt := true
sb.IterateChar(func(name string, seq []rune) {
for _, nt := range seq {
nt = unicode.ToUpper(nt)
couldbent := false
couldbeaa := false
switch nt {
case 'A', 'C', 'B', 'R', 'G', '?', GAP, POINT, OTHER, 'D', 'K', 'S', 'H', 'M', 'N', 'V', 'X', 'T', 'W', 'Y':
couldbent = true
couldbeaa = true
case 'U', 'O':
couldbent = true
case 'Q', 'E', 'I', 'L', 'F', 'P', 'Z':
couldbeaa = true
}
isaa = isaa && couldbeaa
isnt = isnt && couldbent
}
})
if isnt {
sb.alphabet = NUCLEOTIDS
} else if isaa {
sb.alphabet = AMINOACIDS
} else {
sb.alphabet = UNKNOWN
}
}
// Returns the distribution of all characters
func (sb *seqbag) CharStats() map[rune]int64 {
outmap := make(map[rune]int64)
for _, seq := range sb.seqs {
for _, r := range seq.sequence {
outmap[unicode.ToUpper(r)]++
}
}
return outmap
}
func DetectAlphabet(seq string) int {
isaa := true
isnt := true
for _, nt := range strings.ToUpper(seq) {
couldbent := false
couldbeaa := false
switch nt {
case 'A', 'C', 'B', 'R', 'G', '?', GAP, POINT, OTHER, 'D', 'K', 'S', 'H', 'M', 'N', 'V', 'X', 'T', 'W', 'Y':
couldbent = true
couldbeaa = true
case 'U', 'O':
couldbent = true
case 'Q', 'E', 'I', 'L', 'F', 'P', 'Z':
couldbeaa = true
}
isaa = isaa && couldbeaa
isnt = isnt && couldbent
if !(isaa || isnt) {
return UNKNOWN
}
}
if isnt {
return NUCLEOTIDS
} else {
return AMINOACIDS
}
}
// This function renames sequences of the alignment based on the map in argument
// If a name in the map does not exist in the alignment, does nothing
// If a sequence in the alignment does not have a name in the map: does nothing
func (sb *seqbag) Rename(namemap map[string]string) {
for seq := 0; seq < sb.NbSequences(); seq++ {
newname, ok := namemap[sb.seqs[seq].name]
if ok {
sb.seqs[seq].name = newname
}
// else {
// io.PrintMessage("Sequence " + a.seqs[seq].name + " not present in the map file")
// }
}
}
// Shuffle the order of the sequences in the alignment
// Does not change biological information
func (sb *seqbag) ShuffleSequences() {
var temp *seq
var n int = sb.NbSequences()
for n > 1 {
r := rand.Intn(n)
n--
temp = sb.seqs[n]
sb.seqs[n] = sb.seqs[r]
sb.seqs[r] = temp
}
}
// This function renames sequences of the alignment based on the given regex and replace strings
func (sb *seqbag) RenameRegexp(regex, replace string, namemap map[string]string) error {
r, err := regexp.Compile(regex)
if err != nil {
return err
}
for seq := 0; seq < sb.NbSequences(); seq++ {
newname := r.ReplaceAllString(sb.seqs[seq].name, replace)
namemap[sb.seqs[seq].name] = newname
sb.seqs[seq].name = newname
}
return nil
}
// Sorts sequences by name
func (sb *seqbag) Sort() {
names := make([]string, len(sb.seqs))
// Get sequence names
for i, seq := range sb.seqs {
names[i] = seq.Name()
}
// Sort names
sort.Strings(names)
for i, n := range names {
s, _ := sb.seqmap[n]
sb.seqs[i] = s
}
}
// Shorten sequence names to the given size. If duplicates are generated
// then add an identifier.
//
// The map in argument is updated with new oldname=>newname key values
func (sb *seqbag) TrimNames(namemap map[string]string, size int) error {
shortmap := make(map[string]bool)
if math.Pow10(size-2) < float64(sb.NbSequences()) {
return fmt.Errorf("New name size (%d) does not allow to identify that amount of sequences (%d)",
size-2, sb.NbSequences())
}
// If previous short names, we take them into account for uniqueness
for _, v := range namemap {
shortmap[v] = true
}
for _, seq := range sb.seqs {
newname, ok := namemap[seq.Name()]
if !ok {
newname = seq.Name()
newname = strings.Replace(newname, ":", "", -1)
newname = strings.Replace(newname, "_", "", -1)
if len(newname) >= size-2 {
newname = newname[0 : size-2]
} else {
for m := 0; m < (size - 2 - len(newname)); m++ {
newname = newname + "x"
}
}
id := 1
_, ok2 := shortmap[fmt.Sprintf("%s%02d", newname, id)]
for ok2 {
id++
if id > 99 {
return errors.New("More than 100 identical short names (" + newname + "), cannot shorten the names")
}
_, ok2 = shortmap[fmt.Sprintf("%s%02d", newname, id)]
}
newname = fmt.Sprintf("%s%02d", newname, id)
shortmap[newname] = true
namemap[seq.Name()] = newname
}
delete(sb.seqmap, seq.name)
seq.name = newname
sb.seqmap[seq.name] = seq
}
return nil
}
// namemap : map that is updated during the process
// curid : pointer to the current identifier to use for next seq names
// it is incremented in the function.
func (sb *seqbag) TrimNamesAuto(namemap map[string]string, curid *int) (err error) {
length := int(math.Ceil(math.Log10(float64(sb.NbSequences() + 1))))
for _, seq := range sb.seqs {
newname, ok := namemap[seq.Name()]
if !ok {
newname = fmt.Sprintf(fmt.Sprintf("S%%0%dd", (length)), *curid)
namemap[seq.Name()] = newname
(*curid)++
// In case of several alignments to rename,
// The number of necessary digits may be updated
newlength := int(math.Ceil(math.Log10(float64(*curid + 1))))
if newlength > length {
length = newlength
}
}
seq.name = newname
}
return
}