/
phaser.go
441 lines (408 loc) · 12.1 KB
/
phaser.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
package align
import (
"fmt"
"sync"
)
// * If SetTranslate(true):
//
// align all sequences to the given ORF and trims sequences to the start
// position
// If orf is nil, searches for the longest ORF (in 3 or 6 phases depending on reverse arg) in all sequences
//
// To do so, Phase() will:
//
// 1. Translate the given ORF in aminoacids;
// 2. For each sequence of the dataset: translate it in the 3 phases (forward) if reverse is false or 6
// phases (forward and reverse) if reverse is true, align it with the translated orf, and take the phase
// giving the best alignment; If no phase gives a good alignment (>lencutoff * orf length, >matchcutoff
// matches over the align length and starting at first position of the ORF), then the sequence is discarded;
// 3. For each sequence, take the Start corresponding to the Start of the ORF, and remove
// nucleotides before;
// 4. Return the trimmed nucleotidic sequences (phased), the corresponding amino-acid sequences (phasedaa)
// the positions of starts in the nucleotidic sequences, and the removed sequence names.
//
// If cutend is true, then also remove the end of sequences that do not align with orf
//
// It does not modify the input object
//
//
// * If SetTranslate(false):
//
// align all sequences to the given ORF and trims sequences to the start
// position, it does not take into account protein information
//
// If orf is nil, searches for the longest ORF (in forward only or both strands depending on reverse arg) in all sequences
//
// To do so:
//
// 1. If alignment is bad (>lencutoff * orf length, >matchcutoff matches over the align length and starting at first position of the ORF), then the sequence is discarded;
// 3. For each sequence, take the Start corresponding to the Start of the ORF, and remove nucleotides before;
// 4. Return the trimmed nucleotidic sequences (phased), the positions of starts in the nucleotidic sequences, and the removed sequence names.
// If cutend is true, then also remove the end of sequences that do not align with orf
// It does not modify the input object
type Phaser interface {
Phase(orfs, seqs SeqBag) (chan PhasedSequence, error)
SetLenCutoff(cutoff float64)
SetMatchCutoff(cutoff float64)
SetReverse(reverse bool)
SetCutEnd(cutend bool)
SetCpus(cpus int)
SetTranslate(translate bool, geneticcode int) (err error)
SetAlignScores(match, mismatch float64)
SetGapOpen(float64)
SetGapExtend(float64)
}
type phaser struct {
// Alignment length cutoff for sequences to orfs alignment
lencutoff float64
// #Matches cutoff for sequences to orfs alignment
matchcutoff float64
// If sequences must be analyzed also in reverse strand
reverse bool
// Cut the end of sequences
cutend bool
// Number of CPUs for computation
cpus int
// Translate orf (1st phase) & input sequences in 3 or 6 phases
// to search for the best phase. If false, just align sequences
// as is
translate bool
geneticcode int
//
// For Pairwise alignment
changedscores bool
matchscore float64
mismatchscore float64
gapopen float64
gapextend float64
}
type PhasedSequence struct {
Err error
Removed bool
Position int
// phased nt sequence
NtSeq Sequence
// phased nt sequence
// with first nt corresponding
// first position of aa codon
CodonSeq Sequence
// phased aa sequence
AaSeq Sequence
// Aligned sequences
// 1st: best found orf
// 2nd: sequence
Ali Alignment
}
func NewPhaser() Phaser {
return &phaser{
lencutoff: .8,
matchcutoff: .5,
reverse: false,
cutend: false,
cpus: 1,
translate: true,
geneticcode: GENETIC_CODE_STANDARD,
changedscores: false,
matchscore: -1,
mismatchscore: -1,
gapopen: -10,
gapextend: -0.5,
}
}
func (p *phaser) SetLenCutoff(cutoff float64) {
p.lencutoff = cutoff
}
func (p *phaser) SetMatchCutoff(cutoff float64) {
p.matchcutoff = cutoff
}
func (p *phaser) SetReverse(reverse bool) {
p.reverse = reverse
}
func (p *phaser) SetCutEnd(cutend bool) {
p.cutend = cutend
}
func (p *phaser) SetCpus(cpus int) {
p.cpus = cpus
}
func (p *phaser) SetTranslate(translate bool, geneticcode int) (err error) {
if _, err = geneticCode(geneticcode); err != nil {
return
}
p.translate = translate
p.geneticcode = geneticcode
return
}
func (p *phaser) SetAlignScores(match, mismatch float64) {
p.matchscore = match
p.mismatchscore = mismatch
p.changedscores = true
}
func (p *phaser) SetGapOpen(gapopen float64) {
p.gapopen = gapopen
}
func (p *phaser) SetGapExtend(gapextend float64) {
p.gapextend = gapextend
}
// orfs: Reference sequences/ORFs to phase sequences with
// seqs: Sequences to phase
func (p *phaser) Phase(orfs, seqs SeqBag) (phased chan PhasedSequence, err error) {
phased = make(chan PhasedSequence, 50)
var orf Sequence
var alphabet int
var orfsaa SeqBag
// Channels for concurrency
var seqchan <-chan Sequence
if seqs.Alphabet() != NUCLEOTIDS {
err = fmt.Errorf("wrong alphabet for phase : %s", seqs.AlphabetStr())
return
}
// If no orf given, then we find the longest among the sequences
if orfs == nil {
if orf, err = seqs.LongestORF(p.reverse); err != nil {
return
}
orfs = NewSeqBag(UNKNOWN)
orfs.AddSequenceChar(orf.Name(), orf.SequenceChar(), orf.Comment())
orfs.AutoAlphabet()
}
// We translate the longest ORF in AA if it is nucleotides
alphabet = orfs.Alphabet()
if orfsaa, err = orfs.CloneSeqBag(); err != nil {
return
}
if alphabet == NUCLEOTIDS {
if err = orfsaa.Translate(0, p.geneticcode); err != nil {
return
}
}
// Now we align all sequences against this longest orf aa sequence with Modified Smith/Waterman
// We use n threads
// Fill the sequence channel
seqchan = seqs.SequencesChan()
// All threads consuming sequences
var wg sync.WaitGroup
for cpu := 0; cpu < p.cpus; cpu++ {
wg.Add(1)
go func() {
var inerr error
defer wg.Done()
var ph PhasedSequence
for seq := range seqchan {
if p.translate {
ph, inerr = p.alignAgainstRefsAA(seq, orfsaa.Sequences())
} else {
ph, inerr = p.alignAgainstRefsNT(seq, orfs.Sequences())
}
if ph.Err != nil {
err = inerr
phased <- ph
return
} else if inerr != nil {
err = inerr
ph.Err = inerr
phased <- ph
return
}
if err != nil {
return
}
phased <- ph
}
}()
}
go func() {
wg.Wait()
close(phased)
// In case an error occured
// we must finish to read the seqchan
for range seqchan {
}
}()
return
}
func (p *phaser) alignAgainstRefsAA(seq Sequence, orfsaa []Sequence) (ph PhasedSequence, err error) {
var bestscore float64 = .0
var bestratematches, bestlen float64 = .0, .0
var beststart, bestend int = 0, 0
var beststartaa, bestendaa int = 0, 0
var bestseq, bestseqaa Sequence = nil, nil
var bestali Alignment = nil
var phases int = 3 // Number of phases 3 or 6
var phase int
var seqaa Sequence
var tmpseq, revcomp Sequence
var aligner PairwiseAligner
var al Alignment
// We translate the sequence in the 3 phases to search for the best
// alignment
tmpseq = seq
revcomp = seq
if p.reverse {
phases = 6
revcomp = seq.Clone()
revcomp.Reverse()
revcomp.Complement()
}
// We search for the best score among all references
// and all phases
for _, orfaa := range orfsaa {
for phase = 0; phase < phases; phase++ {
if phase < 3 {
tmpseq = seq
} else {
tmpseq = revcomp
}
if seqaa, err = tmpseq.Translate(phase%3, p.geneticcode); err != nil {
ph = PhasedSequence{Err: fmt.Errorf("error while translating %s : %v", seq.Name(), err)}
return
}
aligner = NewPwAligner(orfaa, seqaa, ALIGN_ALGO_ATG)
aligner.SetGapOpenScore(p.gapopen)
aligner.SetGapExtendScore(p.gapextend)
if p.changedscores {
aligner.SetScore(p.matchscore, p.mismatchscore)
}
if al, err = aligner.Alignment(); err != nil {
ph = PhasedSequence{Err: fmt.Errorf("error while aligning %s with %s : %v", orfaa.Name(), seqaa.Name(), err)}
return
}
_, seqstart := aligner.AlignStarts()
_, seqend := aligner.AlignEnds()
if aligner.MaxScore() > bestscore {
bestscore = aligner.MaxScore()
// Alignment start in nucleotidic sequence
beststart = (phase % 3) + (seqstart * 3)
// Alignment start in proteic sequence
beststartaa = seqstart
bestseqaa = seqaa
bestseq = tmpseq
bestali = al
bestratematches = float64(aligner.NbMatches()) / float64(aligner.Length())
bestlen = float64(aligner.Length()) / float64(orfaa.Length())
bestend = bestseq.Length()
bestendaa = bestseqaa.Length()
if p.cutend {
bestend = (phase % 3) + ((seqend + 1) * 3)
bestendaa = seqend + 1
}
}
}
}
ph = PhasedSequence{
Err: nil,
Removed: false,
Position: beststart,
NtSeq: NewSequence(bestseq.Name(),
bestseq.SequenceChar()[beststart:bestend],
bestseq.Comment()),
CodonSeq: NewSequence(bestseq.Name(),
bestseq.SequenceChar()[beststart:bestend],
bestseq.Comment()),
AaSeq: NewSequence(bestseqaa.Name(),
bestseqaa.SequenceChar()[beststartaa:bestendaa],
bestseqaa.Comment()),
Ali: bestali,
} // We set a threshold for matches over the alignment length...
if (p.matchcutoff > .0 && bestratematches <= p.matchcutoff) ||
(p.lencutoff > .0 && bestlen <= p.lencutoff) {
ph.Removed = true
}
return
}
// orfs: Reference sequences/ORFs to phase sequences with
// seqs: Sequences to phase
func (p *phaser) alignAgainstRefsNT(seq Sequence, orfs []Sequence) (ph PhasedSequence, err error) {
var bestscore float64 = .0
var bestratematches, bestlen float64 = .0, .0
var beststart, bestend int = 0, 0
var bestseq Sequence = nil
var phases int = 1 // Number of phases 1 or 2 (+/+-)
var nbgapstart int = 0 // Number of gaps at the start of compared seq
var phase int
var tmpseq, revcomp Sequence
var aligner PairwiseAligner
var al, bestali Alignment
// We translate the sequence in the 3 phases to search for the best
// alignment
phases = 1
tmpseq = seq
revcomp = seq
if p.reverse {
phases = 2
revcomp = seq.Clone()
revcomp.Reverse()
revcomp.Complement()
}
// We search for the best score among all references
// and all phases
for _, orf := range orfs {
for phase = 0; phase < phases; phase++ {
if phase < 1 {
tmpseq = seq
} else {
tmpseq = revcomp
}
aligner = NewPwAligner(orf, tmpseq, ALIGN_ALGO_ATG)
aligner.SetGapOpenScore(p.gapopen)
aligner.SetGapExtendScore(p.gapextend)
if p.changedscores {
aligner.SetScore(p.matchscore, p.mismatchscore)
}
if al, err = aligner.Alignment(); err != nil {
ph = PhasedSequence{Err: fmt.Errorf("error while aligning %s with %s : %v", orf.Name(), tmpseq.Name(), err)}
return
}
_, seqstart := aligner.AlignStarts()
_, seqend := aligner.AlignEnds()
if aligner.MaxScore() > bestscore {
bestscore = aligner.MaxScore()
// Alignment start in nucleotidic sequence
beststart = seqstart
bestseq = tmpseq
bestali = al
// Number of gaps at the start of sequence
// Allows to translate in the right phase
nbgapstart = 0
for i := 0; aligner.Seq2Ali()[i] == '-'; i++ {
nbgapstart++
}
bestratematches = float64(aligner.NbMatches()) / float64(aligner.Length())
bestlen = float64(aligner.Length()) / float64(orf.Length())
bestend = bestseq.Length()
if p.cutend {
bestend = seqend + 1
}
}
}
}
phase = (3 - nbgapstart%3) % 3
ph = PhasedSequence{
Err: nil,
Removed: false,
Position: beststart,
NtSeq: NewSequence(bestseq.Name(),
bestseq.SequenceChar()[beststart:bestend],
bestseq.Comment()),
// For two next sequences we take into account the right phase
// (taking into account initial gaps)
// Ex:
// NNN NNN NNN => phase 0
// -NN NNN NNN => phase 2
// --N NNN NNN => phase 1
// --- NNN NNN => phase 0
CodonSeq: NewSequence(bestseq.Name(),
bestseq.SequenceChar()[beststart+phase:bestend],
bestseq.Comment()),
AaSeq: NewSequence(bestseq.Name(),
bestseq.SequenceChar()[beststart+phase:bestend],
bestseq.Comment()),
Ali: bestali,
}
ph.AaSeq, err = ph.AaSeq.Translate(0, p.geneticcode)
// We set a threshold for matches over the alignment length...
if (p.matchcutoff > .0 && bestratematches <= p.matchcutoff) ||
(p.lencutoff > .0 && bestlen <= p.lencutoff) {
ph.Removed = true
}
return
}