/
properties.go
450 lines (417 loc) · 17.2 KB
/
properties.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.
package sstable
import (
"bytes"
"encoding/binary"
"fmt"
"math"
"reflect"
"sort"
"unsafe"
"github.com/cockroachdb/pebble/internal/intern"
)
// propertiesBlockRestartInterval effectively disables restart points within
// the properties block: with an interval of MaxInt32 every key shares a
// prefix-compression chain with its predecessor, which maximizes compression
// for the sorted property keys.
const propertiesBlockRestartInterval = math.MaxInt32

// propGlobalSeqnumName is the property key for the global sequence number.
// Its value is decoded specially in load (fixed-width uint64 rather than a
// uvarint) for RocksDB compatibility.
const propGlobalSeqnumName = "rocksdb.external_sst_file.global_seqno"

// propTagMap maps a `prop` struct-tag name to the reflect.StructField of the
// corresponding Properties field. Populated by generateTagMaps in init.
var propTagMap = make(map[string]reflect.StructField)

// propBoolTrue/propBoolFalse are the on-disk encodings of boolean properties.
var propBoolTrue = []byte{'1'}
var propBoolFalse = []byte{'0'}

// propOffsetTagMap is the inverse of propTagMap keyed by field byte offset:
// it maps a field's offset within Properties to its `prop` tag name.
var propOffsetTagMap = make(map[uintptr]string)
// generateTagMaps populates propTagMap and propOffsetTagMap from the `prop`
// struct tags of t's fields. indexPrefix is the reflect field-index path of t
// within the top-level Properties struct (nil when t is Properties itself);
// it is prepended to each field's Index so that FieldByIndex works on a
// Properties value even for fields of the embedded CommonProperties.
func generateTagMaps(t reflect.Type, indexPrefix []int) {
	for i := 0; i < t.NumField(); i++ {
		f := t.Field(i)
		if f.Type.Kind() == reflect.Struct {
			if tag := f.Tag.Get("prop"); i == 0 && tag == "pebble.embbeded_common_properties" {
				// CommonProperties struct embedded in Properties. Note that since
				// CommonProperties is placed at the top of properties we can use
				// the offsets of the fields within CommonProperties to determine
				// the offsets of those fields within Properties.
				generateTagMaps(f.Type, []int{i})
				continue
			}
			panic("pebble: unknown struct type in Properties")
		}
		if tag := f.Tag.Get("prop"); tag != "" {
			// Only these four kinds have encode/decode support in load/save.
			switch f.Type.Kind() {
			case reflect.Bool:
			case reflect.Uint32:
			case reflect.Uint64:
			case reflect.String:
			default:
				panic(fmt.Sprintf("unsupported property field type: %s %s", f.Name, f.Type))
			}
			if len(indexPrefix) > 0 {
				// Prepend the index prefix so that we can use FieldByIndex on the
				// top-level struct. The three-index slice pins the capacity so the
				// append always copies rather than mutating indexPrefix's backing
				// array across loop iterations.
				f.Index = append(indexPrefix[:len(indexPrefix):len(indexPrefix)], f.Index...)
			}
			propTagMap[tag] = f
			propOffsetTagMap[f.Offset] = tag
		}
	}
}
// init builds the tag<->field lookup tables used by Properties.load and
// Properties.save.
func init() {
	generateTagMaps(reflect.TypeOf(Properties{}), nil)
}
// CommonProperties holds properties for either a virtual or a physical sstable. This
// can be used by code which doesn't care to make the distinction between physical
// and virtual sstables properties.
//
// For virtual sstables, fields are constructed through extrapolation upon virtual
// reader construction. See MakeVirtualReader for implementation details.
//
// NB: The values of these properties can affect correctness. For example,
// if NumRangeKeySets == 0, but the sstable actually contains range keys, then
// the iterators will behave incorrectly.
//
// NB: Field order is significant — generateTagMaps relies on the byte offsets
// of these fields matching the offsets of the embedded fields within
// Properties, so do not reorder or insert fields without auditing that code.
type CommonProperties struct {
	// The number of entries in this table.
	NumEntries uint64 `prop:"rocksdb.num.entries"`
	// Total raw key size.
	RawKeySize uint64 `prop:"rocksdb.raw.key.size"`
	// Total raw value size.
	RawValueSize uint64 `prop:"rocksdb.raw.value.size"`
	// Total raw key size of point deletion tombstones. This value is comparable
	// to RawKeySize.
	RawPointTombstoneKeySize uint64 `prop:"pebble.raw.point-tombstone.key.size"`
	// Sum of the raw value sizes carried by point deletion tombstones
	// containing size estimates. See the DeleteSized key kind. This value is
	// comparable to Raw{Key,Value}Size.
	RawPointTombstoneValueSize uint64 `prop:"pebble.raw.point-tombstone.value.size"`
	// The number of point deletion entries ("tombstones") in this table that
	// carry a size hint indicating the size of the value the tombstone deletes.
	NumSizedDeletions uint64 `prop:"pebble.num.deletions.sized"`
	// The number of deletion entries in this table, including both point and
	// range deletions.
	NumDeletions uint64 `prop:"rocksdb.deleted.keys"`
	// The number of range deletions in this table.
	NumRangeDeletions uint64 `prop:"rocksdb.num.range-deletions"`
	// The number of RANGEKEYDELs in this table.
	NumRangeKeyDels uint64 `prop:"pebble.num.range-key-dels"`
	// The number of RANGEKEYSETs in this table.
	NumRangeKeySets uint64 `prop:"pebble.num.range-key-sets"`
	// Total size of value blocks and value index block. Only serialized if > 0.
	ValueBlocksSize uint64 `prop:"pebble.value-blocks.size"`
}
// String returns a "tag: value" line per property. It is only used for
// testing purposes.
func (c *CommonProperties) String() string {
	// An empty loaded set means zero-valued fields are omitted from the
	// output, since nothing was loaded from disk.
	var sb bytes.Buffer
	writeProperties(map[uintptr]struct{}{}, reflect.ValueOf(*c), &sb)
	return sb.String()
}
// NumPointDeletions is the number of point deletions in the sstable. For
// virtual sstables, this is an estimate.
func (c *CommonProperties) NumPointDeletions() uint64 {
	// NumDeletions counts both point and range deletions; subtract the
	// range-deletion count to isolate the point deletions.
	total, ranged := c.NumDeletions, c.NumRangeDeletions
	return total - ranged
}
// Properties holds the sstable property values. The properties are
// automatically populated during sstable creation and load from the properties
// meta block when an sstable is opened.
//
// NB: Field byte offsets serve as the keys of Loaded and propOffsetTagMap, so
// reordering fields changes the on-struct layout that load/save depend on.
type Properties struct {
	// CommonProperties needs to be at the top of the Properties struct so that the
	// offsets of the fields in CommonProperties match the offsets of the embedded
	// fields of CommonProperties in Properties.
	CommonProperties `prop:"pebble.embbeded_common_properties"`
	// The name of the comparer used in this table.
	ComparerName string `prop:"rocksdb.comparator"`
	// The compression algorithm used to compress blocks.
	CompressionName string `prop:"rocksdb.compression"`
	// The compression options used to compress blocks.
	CompressionOptions string `prop:"rocksdb.compression_options"`
	// The total size of all data blocks.
	DataSize uint64 `prop:"rocksdb.data.size"`
	// The external sstable version format. Version 2 is the one RocksDB has been
	// using since 5.13. RocksDB only uses the global sequence number for an
	// sstable if this property has been set.
	ExternalFormatVersion uint32 `prop:"rocksdb.external_sst_file.version"`
	// The name of the filter policy used in this table. Empty if no filter
	// policy is used.
	FilterPolicyName string `prop:"rocksdb.filter.policy"`
	// The size of filter block.
	FilterSize uint64 `prop:"rocksdb.filter.size"`
	// The global sequence number to use for all entries in the table. Present if
	// the table was created externally and ingested whole.
	GlobalSeqNum uint64 `prop:"rocksdb.external_sst_file.global_seqno"`
	// Total number of index partitions if kTwoLevelIndexSearch is used.
	IndexPartitions uint64 `prop:"rocksdb.index.partitions"`
	// The size of index block.
	IndexSize uint64 `prop:"rocksdb.index.size"`
	// The index type. TODO(peter): add a more detailed description.
	IndexType uint32 `prop:"rocksdb.block.based.table.index.type"`
	// For formats >= TableFormatPebblev4, this is set to true if the obsolete
	// bit is strict for all the point keys.
	IsStrictObsolete bool `prop:"pebble.obsolete.is_strict"`
	// The name of the merger used in this table. Empty if no merger is used.
	MergerName string `prop:"rocksdb.merge.operator"`
	// The number of blocks in this table.
	NumDataBlocks uint64 `prop:"rocksdb.num.data.blocks"`
	// The number of merge operands in the table.
	NumMergeOperands uint64 `prop:"rocksdb.merge.operands"`
	// The number of RANGEKEYUNSETs in this table.
	NumRangeKeyUnsets uint64 `prop:"pebble.num.range-key-unsets"`
	// The number of value blocks in this table. Only serialized if > 0.
	NumValueBlocks uint64 `prop:"pebble.num.value-blocks"`
	// The number of values stored in value blocks. Only serialized if > 0.
	NumValuesInValueBlocks uint64 `prop:"pebble.num.values.in.value-blocks"`
	// The name of the prefix extractor used in this table. Empty if no prefix
	// extractor is used.
	PrefixExtractorName string `prop:"rocksdb.prefix.extractor.name"`
	// If filtering is enabled, was the filter created on the key prefix.
	PrefixFiltering bool `prop:"rocksdb.block.based.table.prefix.filtering"`
	// A comma separated list of names of the property collectors used in this
	// table.
	PropertyCollectorNames string `prop:"rocksdb.property.collectors"`
	// Total raw rangekey key size.
	RawRangeKeyKeySize uint64 `prop:"pebble.raw.range-key.key.size"`
	// Total raw rangekey value size.
	RawRangeKeyValueSize uint64 `prop:"pebble.raw.range-key.value.size"`
	// The total number of keys in this table that were pinned by open snapshots.
	SnapshotPinnedKeys uint64 `prop:"pebble.num.snapshot-pinned-keys"`
	// The cumulative bytes of keys in this table that were pinned by
	// open snapshots. This value is comparable to RawKeySize.
	SnapshotPinnedKeySize uint64 `prop:"pebble.raw.snapshot-pinned-keys.size"`
	// The cumulative bytes of values in this table that were pinned by
	// open snapshots. This value is comparable to RawValueSize.
	SnapshotPinnedValueSize uint64 `prop:"pebble.raw.snapshot-pinned-values.size"`
	// Size of the top-level index if kTwoLevelIndexSearch is used.
	TopLevelIndexSize uint64 `prop:"rocksdb.top-level.index.size"`
	// User collected properties. No `prop` tag: these are every key in the
	// properties block that doesn't match a known tag.
	UserProperties map[string]string
	// If filtering is enabled, was the filter created on the whole key.
	WholeKeyFiltering bool `prop:"rocksdb.block.based.table.whole.key.filtering"`
	// Loaded set indicating which fields have been loaded from disk. Indexed by
	// the field's byte offset within the struct
	// (reflect.StructField.Offset). Only set if the properties have been loaded
	// from a file. Only exported for testing purposes.
	Loaded map[uintptr]struct{}
}
// NumPointDeletions returns the number of point deletions in this table.
func (p *Properties) NumPointDeletions() uint64 {
	// NumDeletions includes range deletions, so remove them from the total.
	pointDels := p.NumDeletions - p.NumRangeDeletions
	return pointDels
}
// NumRangeKeys returns a count of the number of range keys in this table,
// summing RANGEKEYDELs, RANGEKEYSETs and RANGEKEYUNSETs.
func (p *Properties) NumRangeKeys() uint64 {
	n := p.NumRangeKeyDels
	n += p.NumRangeKeySets
	n += p.NumRangeKeyUnsets
	return n
}
// writeProperties appends a "tag: value" line to buf for every `prop`-tagged
// field of the struct value v, recursing into embedded structs. Zero-valued
// fields are skipped unless their offset appears in loaded (i.e. the zero was
// explicitly read from disk).
func writeProperties(loaded map[uintptr]struct{}, v reflect.Value, buf *bytes.Buffer) {
	vt := v.Type()
	for i := 0; i < v.NumField(); i++ {
		ft := vt.Field(i)
		if ft.Type.Kind() == reflect.Struct {
			// Embedded struct within the properties.
			writeProperties(loaded, v.Field(i), buf)
			continue
		}
		tag := ft.Tag.Get("prop")
		if tag == "" {
			continue
		}
		f := v.Field(i)
		// TODO(peter): Use f.IsZero() when we can rely on go1.13.
		if zero := reflect.Zero(f.Type()); zero.Interface() == f.Interface() {
			// Skip printing of zero values which were not loaded from disk.
			if _, ok := loaded[ft.Offset]; !ok {
				continue
			}
		}
		fmt.Fprintf(buf, "%s: ", tag)
		// generateTagMaps guarantees tagged fields are one of these four kinds.
		switch ft.Type.Kind() {
		case reflect.Bool:
			fmt.Fprintf(buf, "%t\n", f.Bool())
		case reflect.Uint32:
			fmt.Fprintf(buf, "%d\n", f.Uint())
		case reflect.Uint64:
			fmt.Fprintf(buf, "%d\n", f.Uint())
		case reflect.String:
			fmt.Fprintf(buf, "%s\n", f.String())
		default:
			panic("not reached")
		}
	}
}
// String returns a human-readable listing of the properties: the tagged
// fields first, followed by the user-collected properties in sorted key
// order.
func (p *Properties) String() string {
	var sb bytes.Buffer
	writeProperties(p.Loaded, reflect.ValueOf(*p), &sb)

	// Emit the UserProperties deterministically by sorting their keys.
	userKeys := make([]string, 0, len(p.UserProperties))
	for k := range p.UserProperties {
		userKeys = append(userKeys, k)
	}
	sort.Strings(userKeys)
	for _, k := range userKeys {
		fmt.Fprintf(&sb, "%s: %s\n", k, p.UserProperties[k])
	}
	return sb.String()
}
// load populates p from the raw properties block b. blockOffset is currently
// unused by the body visible here. Keys matching a known `prop` tag are
// decoded into the corresponding struct field and recorded in p.Loaded;
// all other keys become UserProperties, except those present in
// deniedUserProperties, which are dropped.
func (p *Properties) load(
	b block, blockOffset uint64, deniedUserProperties map[string]struct{},
) error {
	i, err := newRawBlockIter(bytes.Compare, b)
	if err != nil {
		return err
	}
	p.Loaded = make(map[uintptr]struct{})
	v := reflect.ValueOf(p).Elem()
	for valid := i.First(); valid; valid = i.Next() {
		if f, ok := propTagMap[string(i.Key().UserKey)]; ok {
			p.Loaded[f.Offset] = struct{}{}
			field := v.FieldByIndex(f.Index)
			switch f.Type.Kind() {
			case reflect.Bool:
				// Booleans are stored as the single byte '1' or '0'.
				field.SetBool(bytes.Equal(i.Value(), propBoolTrue))
			case reflect.Uint32:
				field.SetUint(uint64(binary.LittleEndian.Uint32(i.Value())))
			case reflect.Uint64:
				var n uint64
				if string(i.Key().UserKey) == propGlobalSeqnumName {
					// The global seqno is written by RocksDB as a fixed-width
					// little-endian uint64, unlike the other uint64 properties.
					n = binary.LittleEndian.Uint64(i.Value())
				} else {
					n, _ = binary.Uvarint(i.Value())
				}
				field.SetUint(n)
			case reflect.String:
				// Intern the string: many tables share identical property values.
				field.SetString(intern.Bytes(i.Value()))
			default:
				panic("not reached")
			}
			continue
		}
		if p.UserProperties == nil {
			p.UserProperties = make(map[string]string)
		}
		if _, denied := deniedUserProperties[string(i.Key().UserKey)]; !denied {
			p.UserProperties[intern.Bytes(i.Key().UserKey)] = string(i.Value())
		}
	}
	return nil
}
// saveBool stores value in m under the tag associated with the given field
// offset, encoded as the single byte '1' or '0'.
func (p *Properties) saveBool(m map[string][]byte, offset uintptr, value bool) {
	encoded := propBoolFalse
	if value {
		encoded = propBoolTrue
	}
	m[propOffsetTagMap[offset]] = encoded
}
// saveUint32 stores value in m under the tag associated with the given field
// offset, encoded as a fixed-width little-endian uint32.
func (p *Properties) saveUint32(m map[string][]byte, offset uintptr, value uint32) {
	encoded := make([]byte, 4)
	binary.LittleEndian.PutUint32(encoded, value)
	m[propOffsetTagMap[offset]] = encoded
}
// saveUint64 stores value in m under the tag associated with the given field
// offset, encoded as a fixed-width little-endian uint64.
func (p *Properties) saveUint64(m map[string][]byte, offset uintptr, value uint64) {
	encoded := make([]byte, 8)
	binary.LittleEndian.PutUint64(encoded, value)
	m[propOffsetTagMap[offset]] = encoded
}
// saveUvarint stores value in m under the tag associated with the given field
// offset, encoded as an unsigned varint.
func (p *Properties) saveUvarint(m map[string][]byte, offset uintptr, value uint64) {
	// binary.MaxVarintLen64 (10) is the worst-case size of a uvarint-encoded
	// uint64; use the named constant rather than the magic number 10.
	var buf [binary.MaxVarintLen64]byte
	n := binary.PutUvarint(buf[:], value)
	m[propOffsetTagMap[offset]] = buf[:n]
}
// saveString stores value in m under the tag associated with the given field
// offset, as its raw bytes.
func (p *Properties) saveString(m map[string][]byte, offset uintptr, value string) {
	tag := propOffsetTagMap[offset]
	m[tag] = []byte(value)
}
// save serializes the properties into w as sorted key/value pairs.
// User-collected properties are written first into the same map, then the
// tagged fields; conditionally-serialized fields (those documented as "only
// serialized if > 0" or non-empty) are skipped when zero so the block stays
// compact and, for pre-Pebble formats, byte-compatible with RocksDB.
func (p *Properties) save(tblFormat TableFormat, w *rawBlockWriter) {
	m := make(map[string][]byte)
	for k, v := range p.UserProperties {
		m[k] = []byte(v)
	}
	if p.ComparerName != "" {
		p.saveString(m, unsafe.Offsetof(p.ComparerName), p.ComparerName)
	}
	if p.CompressionName != "" {
		p.saveString(m, unsafe.Offsetof(p.CompressionName), p.CompressionName)
	}
	if p.CompressionOptions != "" {
		p.saveString(m, unsafe.Offsetof(p.CompressionOptions), p.CompressionOptions)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.DataSize), p.DataSize)
	if p.ExternalFormatVersion != 0 {
		// The global seqno is only meaningful (and only written) when the
		// external format version is set; note it is fixed-width, matching the
		// special-case decode in load.
		p.saveUint32(m, unsafe.Offsetof(p.ExternalFormatVersion), p.ExternalFormatVersion)
		p.saveUint64(m, unsafe.Offsetof(p.GlobalSeqNum), p.GlobalSeqNum)
	}
	if p.FilterPolicyName != "" {
		p.saveString(m, unsafe.Offsetof(p.FilterPolicyName), p.FilterPolicyName)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.FilterSize), p.FilterSize)
	if p.IndexPartitions != 0 {
		// Both two-level-index properties are written together.
		p.saveUvarint(m, unsafe.Offsetof(p.IndexPartitions), p.IndexPartitions)
		p.saveUvarint(m, unsafe.Offsetof(p.TopLevelIndexSize), p.TopLevelIndexSize)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.IndexSize), p.IndexSize)
	p.saveUint32(m, unsafe.Offsetof(p.IndexType), p.IndexType)
	if p.IsStrictObsolete {
		p.saveBool(m, unsafe.Offsetof(p.IsStrictObsolete), p.IsStrictObsolete)
	}
	if p.MergerName != "" {
		p.saveString(m, unsafe.Offsetof(p.MergerName), p.MergerName)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.NumDataBlocks), p.NumDataBlocks)
	p.saveUvarint(m, unsafe.Offsetof(p.NumEntries), p.NumEntries)
	p.saveUvarint(m, unsafe.Offsetof(p.NumDeletions), p.NumDeletions)
	if p.NumSizedDeletions > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.NumSizedDeletions), p.NumSizedDeletions)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.NumMergeOperands), p.NumMergeOperands)
	p.saveUvarint(m, unsafe.Offsetof(p.NumRangeDeletions), p.NumRangeDeletions)
	// NB: We only write out some properties for Pebble formats. This isn't
	// strictly necessary because unrecognized properties are interpreted as
	// user-defined properties, however writing them prevents byte-for-byte
	// equivalence with RocksDB files that some of our testing requires.
	//
	// NOTE(review): RawPointTombstoneKeySize is gated on the table format but
	// RawPointTombstoneValueSize below is not — verify this asymmetry is
	// intentional (the value-size property may never be non-zero for RocksDB
	// formats, making the gate redundant there).
	if p.RawPointTombstoneKeySize > 0 && tblFormat >= TableFormatPebblev1 {
		p.saveUvarint(m, unsafe.Offsetof(p.RawPointTombstoneKeySize), p.RawPointTombstoneKeySize)
	}
	if p.RawPointTombstoneValueSize > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.RawPointTombstoneValueSize), p.RawPointTombstoneValueSize)
	}
	if p.NumRangeKeys() > 0 {
		// All range-key properties are written as a group when any range keys
		// are present.
		p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeyDels), p.NumRangeKeyDels)
		p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeySets), p.NumRangeKeySets)
		p.saveUvarint(m, unsafe.Offsetof(p.NumRangeKeyUnsets), p.NumRangeKeyUnsets)
		p.saveUvarint(m, unsafe.Offsetof(p.RawRangeKeyKeySize), p.RawRangeKeyKeySize)
		p.saveUvarint(m, unsafe.Offsetof(p.RawRangeKeyValueSize), p.RawRangeKeyValueSize)
	}
	if p.NumValueBlocks > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.NumValueBlocks), p.NumValueBlocks)
	}
	if p.NumValuesInValueBlocks > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.NumValuesInValueBlocks), p.NumValuesInValueBlocks)
	}
	if p.PrefixExtractorName != "" {
		p.saveString(m, unsafe.Offsetof(p.PrefixExtractorName), p.PrefixExtractorName)
	}
	p.saveBool(m, unsafe.Offsetof(p.PrefixFiltering), p.PrefixFiltering)
	if p.PropertyCollectorNames != "" {
		p.saveString(m, unsafe.Offsetof(p.PropertyCollectorNames), p.PropertyCollectorNames)
	}
	if p.SnapshotPinnedKeys > 0 {
		// The three snapshot-pinned statistics are written as a group.
		p.saveUvarint(m, unsafe.Offsetof(p.SnapshotPinnedKeys), p.SnapshotPinnedKeys)
		p.saveUvarint(m, unsafe.Offsetof(p.SnapshotPinnedKeySize), p.SnapshotPinnedKeySize)
		p.saveUvarint(m, unsafe.Offsetof(p.SnapshotPinnedValueSize), p.SnapshotPinnedValueSize)
	}
	p.saveUvarint(m, unsafe.Offsetof(p.RawKeySize), p.RawKeySize)
	p.saveUvarint(m, unsafe.Offsetof(p.RawValueSize), p.RawValueSize)
	if p.ValueBlocksSize > 0 {
		p.saveUvarint(m, unsafe.Offsetof(p.ValueBlocksSize), p.ValueBlocksSize)
	}
	p.saveBool(m, unsafe.Offsetof(p.WholeKeyFiltering), p.WholeKeyFiltering)
	if tblFormat < TableFormatPebblev1 {
		// RocksDB-compatible formats require these fixed properties for
		// byte-for-byte equivalence with RocksDB-written files.
		m["rocksdb.column.family.id"] = binary.AppendUvarint([]byte(nil), math.MaxInt32)
		m["rocksdb.fixed.key.length"] = []byte{0x00}
		m["rocksdb.index.key.is.user.key"] = []byte{0x00}
		m["rocksdb.index.value.is.delta.encoded"] = []byte{0x00}
		m["rocksdb.oldest.key.time"] = []byte{0x00}
		m["rocksdb.creation.time"] = []byte{0x00}
		m["rocksdb.format.version"] = []byte{0x00}
	}
	// The properties block requires its keys in sorted order.
	keys := make([]string, 0, len(m))
	for key := range m {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	for _, key := range keys {
		w.add(InternalKey{UserKey: []byte(key)}, m[key])
	}
}