/
model.go
230 lines (205 loc) · 6.3 KB
/
model.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
package protein
import (
"fmt"
"math"
"github.com/evolbioinfo/goalign/align"
"github.com/evolbioinfo/goalign/models/protein"
"gonum.org/v1/gonum/mat"
)
const (
PROT_DIST_MAX = 20.00
)
type ProtDistModel struct {
model *protein.ProtModel
modelfreqs bool // amino acid frequency: If true we use model frequencies, else empirical data frequencies
removegaps bool
pij *mat.Dense // Matrix of Pij
stepsize int
}
// Initialize a new protein model, given the name of the model as const int:
// MODEL_DAYHOFF, MODEL_JTT, MODEL_MTREV, MODEL_LG or MODEL_WAG
func NewProtDistModel(model int, modelfreqs bool, usegamma bool, alpha float64, removegaps bool) (*ProtDistModel, error) {
m, err := protein.NewProtModel(model, usegamma, alpha)
if err != nil {
return nil, err
}
return &ProtDistModel{
m,
modelfreqs,
removegaps,
nil,
1,
}, nil
}
func (model *ProtDistModel) InitModel(a align.Alignment, weights []float64) (err error) {
var pi []float64
ns := model.Ns()
// Count equilibrium frequencies from input alignment (do not use model frequencies)
if !model.modelfreqs {
if ns = len(a.AlphabetCharacters()); ns != model.Ns() {
err = fmt.Errorf("Alphabet has not a length of 20")
return
}
if weights == nil {
weights = make([]float64, a.Length())
for i, _ := range weights {
weights[i] = 1.0
}
}
_, selected := selectedSites(a, weights, model.removegaps)
if pi, err = aaFrequency(a, weights, selected); err != nil {
return
}
}
model.pij = mat.NewDense(ns, ns, nil)
model.model.InitModel(pi)
return nil
}
// Basic JC69 Protein Distance Matrix
func (model *ProtDistModel) JC69Dist(a align.Alignment, weights []float64, selected []bool) (p *mat.Dense, q *mat.Dense, dist *mat.Dense) {
var site, i, j, k int
var len *mat.Dense
ns := model.Ns()
len = mat.NewDense(a.NbSequences(), a.NbSequences(), nil)
p = mat.NewDense(a.NbSequences(), a.NbSequences(), nil)
q = mat.NewDense(a.NbSequences(), a.NbSequences(), nil)
dist = mat.NewDense(a.NbSequences(), a.NbSequences(), nil)
for site = 0; site < a.Length(); site += model.stepsize {
if selected[site] {
for j = 0; j < a.NbSequences()-1; j++ {
s1, _ := a.GetSequenceCharById(j)
for k = j + 1; k < a.NbSequences(); k++ {
s2, _ := a.GetSequenceCharById(k)
if (!isAmbigu(s1[site])) && (!isAmbigu(s2[site])) {
len.Set(j, k, len.At(j, k)+weights[site])
len.Set(k, j, weights[site])
for n, c1 := range s1[site : site+model.stepsize] {
if c1 != s2[site+n] {
p.Set(j, k, p.At(j, k)+weights[site])
break
}
}
}
}
}
}
}
for i = 0; i < a.NbSequences()-1; i++ {
for j = i + 1; j < a.NbSequences(); j++ {
if len.At(i, j) > 0 {
p.Set(i, j, p.At(i, j)/len.At(i, j))
} else {
p.Set(i, j, 1.)
}
p.Set(j, i, p.At(i, j))
if (1. - float64(ns)/float64(ns-1.)*p.At(i, j)) < .0 {
dist.Set(i, j, PROT_DIST_MAX)
} else {
dist.Set(i, j, -float64(ns-1.)/float64(ns)*math.Log(1.-float64(ns)/float64(ns-1.)*p.At(i, j)))
}
if dist.At(i, j) > PROT_DIST_MAX {
dist.Set(i, j, PROT_DIST_MAX)
}
dist.Set(j, i, dist.At(i, j))
}
}
len = nil
return p, q, dist
}
func (model *ProtDistModel) Ns() int {
return 20
}
func (model *ProtDistModel) pMat(l float64) {
if l < BL_MIN {
model.pMatZeroBrLen()
} else {
model.pMatEmpirical(l)
}
}
func (model *ProtDistModel) pMatZeroBrLen() {
model.pij.Apply(func(i, j int, v float64) float64 {
if i == j {
return 1.0
}
return 0.0
}, model.pij)
}
/********************************************************************/
/* Code taken from FastME */
/* Computes the substitution probability matrix
* from the initial substitution rate matrix and frequency vector
* and one specific branch length
*
* input : l , branch length
* input : mod , choosen model parameters, qmat and pi
* ouput : Pij , substitution probability matrix
*
* matrix P(l) is computed as follows :
* P(l) = exp(Q*t) , where :
*
* Q = substitution rate matrix = Vr*D*inverse(Vr) , where :
*
* Vr = right eigenvector matrix for Q
* D = diagonal matrix of eigenvalues for Q
*
* t = time interval = l / mr , where :
*
* mr = mean rate = branch length/time interval
* = sum(i)(pi[i]*p(i->j)) , where :
*
* pi = state frequency vector
* p(i->j) = subst. probability from i to a different state
* = -Q[ii] , as sum(j)(Q[ij]) +Q[ii] = 0
*
* the Taylor development of exp(Q*t) gives :
* P(l) = Vr*exp(D*t) *inverse(Vr)
* = Vr*pow(exp(D/mr),l)*inverse(Vr)
*
* for performance we compute only once the following matrices :
* Vr, inverse(Vr), exp(D/mr)
* thus each time we compute P(l) we only have to :
* make 20 times the operation pow()
* make 2 20x20 matrix multiplications, that is :
* 16000 = 2x20x20x20 times the operation *
* 16000 = 2x20x20x20 times the operation +
* which can be reduced to (the central matrix being diagonal) :
* 8400 = 20x20 + 20x20x20 times the operation *
* 8000 = 20x20x20 times the operation + */
func (model *ProtDistModel) pMatEmpirical(len float64) {
var i, k int
var U, V *mat.Dense
var R []float64
var expt []float64
var uexpt *mat.Dense
ns := model.Ns()
U = model.model.ReigenVects() //mod->eigen->r_e_vect;
R = model.model.Eval() //mod->eigen->e_val;// To take only real part from that vector /* eigen value matrix */
V = model.model.LeigenVects()
expt = make([]float64, ns) //model.eigen.Values(nil) // To take only imaginary part from that vector
uexpt = mat.NewDense(ns, ns, nil) //model.eigen.Vectors() // don't know yet how to handle that // mod->eigen->r_e_vect_im;
model.pij.Zero()
alpha := model.model.Alpha()
if model.model.UseGamma() && (math.Abs(alpha) > DBL_EPSILON) {
// compute pow (alpha / (alpha - e_val[i] * l), alpha)
for i = 0; i < ns; i++ {
expt[i] = math.Pow(alpha/(alpha-(R[i]*len)), alpha)
}
} else {
for i = 0; i < ns; i++ {
expt[i] = float64(math.Exp(R[i] * len))
}
}
// multiply Vr* pow (alpha / (alpha - e_val[i] * l), alpha) *Vi into Pij
uexpt.Apply(func(i, j int, v float64) float64 {
return U.At(i, j) * expt[j]
}, uexpt)
model.pij.Apply(func(i, j int, v float64) float64 {
for k = 0; k < ns; k++ {
v += uexpt.At(i, k) * V.At(k, j)
}
if v < DBL_MIN {
v = DBL_MIN
}
return v
}, model.pij)
}