// Copyright 2020 Grail Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package snp

import (
	"encoding/binary"

	"github.com/grailbio/bio/pileup"
)

// PerReadFeatures stores the per-read fields exported for a single read
// overlapping the current position.
type PerReadFeatures struct {
	// Dist5p is the 0-based distance of the current base from the read's 5'
	// end. (Note that Dist3p := Fraglen - 1 - Dist5p, so we don't need to
	// store it separately.)
	Dist5p  uint16
	Fraglen uint16
	Qual    byte
	Strand  byte
}
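
// dist3p is a hypothetical convenience method, not part of the original
// file; it is sketched here only to spell out the identity stated in the
// Dist5p comment above.
func (f PerReadFeatures) dist3p() uint16 {
	return f.Fraglen - 1 - f.Dist5p
}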

// Field... constants are bit flags for PileupRow.FieldsPresent, indicating
// which parts of PileupPayload are in use.
const (
	FieldCounts = 1 << iota
	FieldPerReadA
	FieldPerReadC
	FieldPerReadG
	FieldPerReadT
	FieldPerReadAny = FieldPerReadA | FieldPerReadC | FieldPerReadG | FieldPerReadT
)
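
// For example, a FieldsPresent value of FieldCounts|FieldPerReadA indicates
// that both Counts and PerRead[pileup.BaseA] are populated, while the other
// PerRead slices are empty.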

// PileupPayload is a container for all types of pileup data which may be
// associated with a single position. It does not store the position itself,
// or a tag indicating which parts of the container are used.
//
// Depth and count values are of type uint32 instead of int to reduce cache
// footprint.
type PileupPayload struct {
	Depth   uint32
	Counts  [pileup.NBaseEnum][2]uint32
	PerRead [pileup.NBase][]PerReadFeatures
}

// PileupRow contains all pileup data associated with a single position, along
// with the position itself and the set of PileupPayload fields used.
//
// The main loop splits the genome into shards, and generates lightly
// compressed (zstd level 1) per-shard PileupRow recordio files. Then, the
// per-shard files are read in sequence and converted to the final requested
// output format. This is a bit inefficient, but we can easily afford it.
type PileupRow struct {
	FieldsPresent uint32 // Field... flags
	RefID         uint32
	Pos           uint32
	Payload       PileupPayload
}
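
// As a rough illustration of the pipeline described above, a per-shard writer
// might be wired up as follows (a minimal sketch assuming the
// github.com/grailbio/base/recordio API; shardFile and rows are hypothetical,
// and none of this appears in the original file):
//
//	wr := recordio.NewWriter(shardFile, recordio.WriterOpts{
//		Marshal:      MarshalPileupRow,
//		Transformers: []string{"zstd 1"},
//	})
//	for _, row := range rows {
//		wr.Append(row)
//	}
//	wr.Finish()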

// cutAndAdvance returns s[offset:offset+pieceLen], and increments offset by
// pieceLen.
//
// This is the lowest-overhead idiom I've found so far for filling a
// preallocated []byte. In particular, despite recent advances in
// bounds-check elimination, the Go 1.12 compiler still does not recognize
// that s[offset:offset+k] has length k when k is a hardcoded integer and
// offset is known to be smaller than (MaxInt - k); it really is necessary to
// write x := s[offset:] followed by x = x[:k] if you want to avoid spurious
// bounds-checking.
//
// Other things I tried:
//   - "splitBefore(s *[]byte, x int) []byte { ... }". This is a slightly
//     simpler interface, but unfortunately it proved to be higher-overhead
//     due to length and capacity requiring separate updates/checks.
//   - Making offset and pieceLen into unsigned integers. That makes no
//     difference to the compiled code; the compiler is able to prove that
//     offset is never negative.
func cutAndAdvance(offset *int, s []byte, pieceLen int) []byte {
	tmpSlice := s[(*offset):]
	*offset += pieceLen
	return tmpSlice[:pieceLen]
}
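
// cutAndAdvanceNaive is a hypothetical counterpart, not in the original file,
// shown purely to illustrate the comment above: per that analysis, the
// one-line slice expression below leaves a spurious bounds check in the Go
// 1.12 compiler's output that the two-step reslice in cutAndAdvance avoids.
func cutAndAdvanceNaive(offset *int, s []byte, pieceLen int) []byte {
	piece := s[*offset : *offset+pieceLen] // length not proven to be pieceLen
	*offset += pieceLen
	return piece
}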

// Serialized format:
//	[0..4): fieldsPresent
//	[4..8): refID
//	[8..12): pos
//	[12..16): depth
//	if counts present: stored in the next 40 bytes
//	if perRead[pileup.BaseA] present: length stored in the next 4 bytes,
//	  then the values stored in the next 6*n bytes
//	if perRead[pileup.BaseC] present: ... etc.
//
// This is essentially the simplest format that can support the
// variable-length per-read feature arrays that are needed. It is not
// difficult to decrease the nominal size of these records by (i) using
// varints instead of uint32s, and (ii) making fieldsPresent indicate which
// counts[][] values are nonzero and only storing those; but I wouldn't expect
// that to be worth the additional complexity, since all uses of this marshal
// function are bundled with the "zstd 1" transformer anyway. (Instead, all
// the 'extra' complexity in this function concerns (i) avoiding extra
// allocations and (ii) avoiding a ridiculous number of spurious
// bounds-checks, in ways that make sense for a wide variety of other
// serialization functions.)
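//
// As a worked example (not in the original comment): a row with FieldCounts
// and FieldPerReadA set, carrying three per-read entries for base A,
// serializes to 16 + 40 + (4 + 6*3) = 78 bytes before compression.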
//
// In the future, we may need to add indel support.
func MarshalPileupRow(scratch []byte, p interface{}) ([]byte, error) {
	pr := p.(*PileupRow)
	fieldsPresent := pr.FieldsPresent
	// Compute length up-front so that, if we need to allocate, we only do so
	// once.
	bytesReq := 16
	if fieldsPresent&FieldCounts != 0 {
		bytesReq += 40
	}
	if fieldsPresent&FieldPerReadAny != 0 {
		for b := range pr.Payload.PerRead {
			// For b in {0,1,2,3}, (FieldPerReadA << b) is the bit indicating
			// that perRead[b] must be stored.
			if fieldsPresent&(FieldPerReadA<<uint(b)) != 0 {
				bytesReq += 4 + 6*len(pr.Payload.PerRead[b])
			}
		}
	}
	t := scratch
	if len(t) < bytesReq {
		t = make([]byte, bytesReq)
	}
	offset := 0
	tStart := cutAndAdvance(&offset, t, 16)
	binary.LittleEndian.PutUint32(tStart[0:4], pr.FieldsPresent)
	binary.LittleEndian.PutUint32(tStart[4:8], pr.RefID)
	binary.LittleEndian.PutUint32(tStart[8:12], pr.Pos)
	binary.LittleEndian.PutUint32(tStart[12:16], pr.Payload.Depth)
	if fieldsPresent&FieldCounts != 0 {
		tCounts := cutAndAdvance(&offset, t, 40)
		// Unfortunately, while the obvious double-loop works fine for reading
		// values from pr.Payload.Counts[], I don't see any way to express the
		// writes to tCounts[] that the Go 1.12 bounds-check eliminator
		// understands.
		binary.LittleEndian.PutUint32(tCounts[:4], pr.Payload.Counts[pileup.BaseA][0])
		binary.LittleEndian.PutUint32(tCounts[4:8], pr.Payload.Counts[pileup.BaseA][1])
		binary.LittleEndian.PutUint32(tCounts[8:12], pr.Payload.Counts[pileup.BaseC][0])
		binary.LittleEndian.PutUint32(tCounts[12:16], pr.Payload.Counts[pileup.BaseC][1])
		binary.LittleEndian.PutUint32(tCounts[16:20], pr.Payload.Counts[pileup.BaseG][0])
		binary.LittleEndian.PutUint32(tCounts[20:24], pr.Payload.Counts[pileup.BaseG][1])
		binary.LittleEndian.PutUint32(tCounts[24:28], pr.Payload.Counts[pileup.BaseT][0])
		binary.LittleEndian.PutUint32(tCounts[28:32], pr.Payload.Counts[pileup.BaseT][1])
		binary.LittleEndian.PutUint32(tCounts[32:36], pr.Payload.Counts[pileup.BaseX][0])
		binary.LittleEndian.PutUint32(tCounts[36:40], pr.Payload.Counts[pileup.BaseX][1])
	}
	if fieldsPresent&FieldPerReadAny != 0 {
		for b := range pr.Payload.PerRead {
			if fieldsPresent&(FieldPerReadA<<uint(b)) != 0 {
				lenSlice := cutAndAdvance(&offset, t, 4)
				binary.LittleEndian.PutUint32(lenSlice, uint32(len(pr.Payload.PerRead[b])))
				for _, src := range pr.Payload.PerRead[b] {
					dst := cutAndAdvance(&offset, t, 6)
					binary.LittleEndian.PutUint16(dst[:2], src.Dist5p)
					binary.LittleEndian.PutUint16(dst[2:4], src.Fraglen)
					dst[4] = src.Qual
					dst[5] = src.Strand
				}
			}
		}
	}
	// Trim to the exact serialized length: scratch may be longer than
	// bytesReq when a previously-used buffer is passed in.
	return t[:bytesReq], nil
}

// unmarshalPileupRow is the inverse of MarshalPileupRow. (I also tried the
// block-unmarshal strategy in grail.com/bio/variants; it actually seemed to
// have worse performance for this use case.)
func unmarshalPileupRow(in []byte) (out interface{}, err error) {
	offset := 0
	inStart := cutAndAdvance(&offset, in, 16)
	pr := &PileupRow{
		FieldsPresent: binary.LittleEndian.Uint32(inStart[:4]),
		RefID:         binary.LittleEndian.Uint32(inStart[4:8]),
		Pos:           binary.LittleEndian.Uint32(inStart[8:12]),
	}
	pr.Payload.Depth = binary.LittleEndian.Uint32(inStart[12:16])
	if pr.FieldsPresent&FieldCounts != 0 {
		inCounts := cutAndAdvance(&offset, in, 40)
		pr.Payload.Counts[pileup.BaseA][0] = binary.LittleEndian.Uint32(inCounts[0:4])
		pr.Payload.Counts[pileup.BaseA][1] = binary.LittleEndian.Uint32(inCounts[4:8])
		pr.Payload.Counts[pileup.BaseC][0] = binary.LittleEndian.Uint32(inCounts[8:12])
		pr.Payload.Counts[pileup.BaseC][1] = binary.LittleEndian.Uint32(inCounts[12:16])
		pr.Payload.Counts[pileup.BaseG][0] = binary.LittleEndian.Uint32(inCounts[16:20])
		pr.Payload.Counts[pileup.BaseG][1] = binary.LittleEndian.Uint32(inCounts[20:24])
		pr.Payload.Counts[pileup.BaseT][0] = binary.LittleEndian.Uint32(inCounts[24:28])
		pr.Payload.Counts[pileup.BaseT][1] = binary.LittleEndian.Uint32(inCounts[28:32])
		pr.Payload.Counts[pileup.BaseX][0] = binary.LittleEndian.Uint32(inCounts[32:36])
		pr.Payload.Counts[pileup.BaseX][1] = binary.LittleEndian.Uint32(inCounts[36:40])
	}
	if pr.FieldsPresent&FieldPerReadAny != 0 {
		for b := range pr.Payload.PerRead {
			if pr.FieldsPresent&(FieldPerReadA<<uint(b)) != 0 {
				lenSlice := cutAndAdvance(&offset, in, 4)
				curLen := binary.LittleEndian.Uint32(lenSlice)
				// If we wanted to further reduce the number of small
				// allocations, we could allocate a single []PerReadFeatures
				// slice outside this loop, and then make the per-base slices
				// point to subslices of the single allocation. (I don't
				// bother since, most of the time, only one newFeatures slice,
				// corresponding to the REF base, is allocated anyway.)
				newFeatures := make([]PerReadFeatures, curLen)
				pr.Payload.PerRead[b] = newFeatures
				for i := range newFeatures {
					src := cutAndAdvance(&offset, in, 6)
					newFeatures[i].Dist5p = binary.LittleEndian.Uint16(src[:2])
					newFeatures[i].Fraglen = binary.LittleEndian.Uint16(src[2:4])
					newFeatures[i].Qual = src[4]
					newFeatures[i].Strand = src[5]
				}
			}
		}
	}
	return pr, nil
}
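
// exampleRoundTrip is an illustrative sketch, not part of the original file:
// it shows how the marshal/unmarshal pair above is meant to be used, here
// without a recordio container. Passing nil scratch forces MarshalPileupRow
// to allocate an exactly-sized buffer.
func exampleRoundTrip(pr *PileupRow) (*PileupRow, error) {
	buf, err := MarshalPileupRow(nil, pr)
	if err != nil {
		return nil, err
	}
	out, err := unmarshalPileupRow(buf)
	if err != nil {
		return nil, err
	}
	return out.(*PileupRow), nil
}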