Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sstable: add writer option to remove a common prefix #3242

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions sstable/data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ func optsFromArgs(td *datadriven.TestData, writerOpts *WriterOptions) error {
writerOpts.WritingToLowestLevel = true
case "is-strict-obsolete":
writerOpts.IsStrictObsolete = true
case "elide-prefix":
writerOpts.ElidePrefix = []byte(arg.Vals[0])
}
}
return nil
Expand Down
6 changes: 6 additions & 0 deletions sstable/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,12 @@ type WriterOptions struct {
// 750MB sstables -- see
// https://github.com/cockroachdb/cockroach/issues/117113).
DisableValueBlocks bool

// ElidePrefix is a common prefix that will be present in and removed from all
// keys and keyspans passed to the Add* methods of the writer. The writer will
// confirm all passed keys do indeed have the prefix before removing it and
// will return an error if they do not. The elided prefix is recorded in the
// pebble.elided_prefix table property.
ElidePrefix []byte
}

func (o WriterOptions) ensureDefaults() WriterOptions {
Expand Down
7 changes: 6 additions & 1 deletion sstable/properties.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,9 @@ type Properties struct {
CompressionOptions string `prop:"rocksdb.compression_options"`
// The total size of all data blocks.
DataSize uint64 `prop:"rocksdb.data.size"`
// ElidedPrefix is the byte prefix, if any, that was elided from every key and
// key span during the construction of the sstable.
ElidedPrefix string `prop:"pebble.elided_prefix"`
// The external sstable version format. Version 2 is the one RocksDB has been
// using since 5.13. RocksDB only uses the global sequence number for an
// sstable if this property has been set.
Expand Down Expand Up @@ -428,7 +431,9 @@ func (p *Properties) save(tblFormat TableFormat, w *rawBlockWriter) {
p.saveUvarint(m, unsafe.Offsetof(p.ValueBlocksSize), p.ValueBlocksSize)
}
p.saveBool(m, unsafe.Offsetof(p.WholeKeyFiltering), p.WholeKeyFiltering)

if p.ElidedPrefix != "" {
p.saveString(m, unsafe.Offsetof(p.ElidedPrefix), p.ElidedPrefix)
}
if tblFormat < TableFormatPebblev1 {
m["rocksdb.column.family.id"] = binary.AppendUvarint([]byte(nil), math.MaxInt32)
m["rocksdb.fixed.key.length"] = []byte{0x00}
Expand Down
20 changes: 20 additions & 0 deletions sstable/testdata/writer
Original file line number Diff line number Diff line change
Expand Up @@ -368,3 +368,23 @@ layout
759 meta-index (57)
821 footer (53)
874 EOF

build elide-prefix=foo_ props=(elided)
foo_a.SET.1:a
foo_b.DEL.2:
foo_c.MERGE.3:c
foo_d.RANGEDEL.4:foo_e
foo_f.SET.5:f
foo_g.DEL.6:
foo_h.MERGE.7:h
foo_i.RANGEDEL.8:foo_j
rangekey: foo_j-foo_k:{(#9,RANGEKEYDEL)}
rangekey: foo_k-foo_l:{(#10,RANGEKEYUNSET,@t5)}
rangekey: foo_l-foo_m:{(#11,RANGEKEYSET,@t10,foo)}
----
point: [a#1,1-h#7,2]
rangedel: [d#4,15-j#72057594037927935,15]
rangekey: [j#9,19-m#72057594037927935,21]
seqnums: [1-11]
props "elided":
pebble.elided_prefix: foo_
32 changes: 32 additions & 0 deletions sstable/writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ type Writer struct {
indexBlockSizeThreshold int
compare Compare
split Split
elidePrefix []byte
formatKey base.FormatKey
compression Compression
separator Separator
Expand Down Expand Up @@ -926,6 +927,10 @@ func (w *Writer) makeAddPointDecisionV3(
}

func (w *Writer) addPoint(key InternalKey, value []byte, forceObsolete bool) error {
if !bytes.HasPrefix(key.UserKey, w.elidePrefix) {
return errBadElidePrefix
}
key.UserKey = key.UserKey[len(w.elidePrefix):]
if w.isStrictObsolete && key.Kind() == InternalKeyKindMerge {
return errors.Errorf("MERGE not supported in a strict-obsolete sstable")
}
Expand Down Expand Up @@ -1059,6 +1064,16 @@ func (w *Writer) prettyTombstone(k InternalKey, value []byte) fmt.Formatter {
}

func (w *Writer) addTombstone(key InternalKey, value []byte) error {
if !bytes.HasPrefix(key.UserKey, w.elidePrefix) {
return errBadElidePrefix
}
key.UserKey = key.UserKey[len(w.elidePrefix):]

if !bytes.HasPrefix(value, w.elidePrefix) {
return errBadElidePrefix
}
value = value[len(w.elidePrefix):]

if !w.disableKeyOrderChecks && !w.rangeDelV1Format && w.rangeDelBlock.nEntries > 0 {
// Check that tombstones are being added in fragmented order. If the two
// tombstones overlap, their start and end keys must be identical.
Expand Down Expand Up @@ -1199,6 +1214,8 @@ func (w *Writer) RangeKeyDelete(start, end []byte) error {
})
}

var errBadElidePrefix = fmt.Errorf("key does not match writer's common prefix")

// AddRangeKey adds a range key set, unset, or delete key/value pair to the
// table being written.
//
Expand All @@ -1212,10 +1229,23 @@ func (w *Writer) AddRangeKey(key InternalKey, value []byte) error {
if w.err != nil {
return w.err
}
if len(w.elidePrefix) != 0 {
if !bytes.HasPrefix(key.UserKey, w.elidePrefix) {
return errBadElidePrefix
}
key.UserKey = key.UserKey[len(w.elidePrefix):]
}

return w.addRangeKey(key, value)
}

func (w *Writer) addRangeKeySpan(span keyspan.Span) error {
if len(w.elidePrefix) != 0 {
if !bytes.HasPrefix(span.Start, w.elidePrefix) || !bytes.HasPrefix(span.End, w.elidePrefix) {
return errBadElidePrefix
}
span.Start, span.End = span.Start[len(w.elidePrefix):], span.End[len(w.elidePrefix):]
}
if w.compare(span.Start, span.End) >= 0 {
return errors.Errorf(
"pebble: start key must be strictly less than end key",
Expand Down Expand Up @@ -2201,6 +2231,7 @@ func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...Write
writingToLowestLevel: o.WritingToLowestLevel,
cache: o.Cache,
restartInterval: o.BlockRestartInterval,
elidePrefix: o.ElidePrefix,
checksumType: o.Checksum,
indexBlock: newIndexBlockBuf(o.Parallelism),
rangeDelBlock: blockWriter{
Expand Down Expand Up @@ -2272,6 +2303,7 @@ func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...Write
w.props.MergerName = o.MergerName
w.props.PropertyCollectorNames = "[]"
w.props.ExternalFormatVersion = rocksDBExternalFormatVersion
w.props.ElidedPrefix = string(o.ElidePrefix)

if len(o.BlockPropertyCollectors) > 0 || w.tableFormat >= TableFormatPebblev4 {
var buf bytes.Buffer
Expand Down
4 changes: 2 additions & 2 deletions testdata/event_listener
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ Zombie tables: 0 (0B)
Backing tables: 0 (0B)
Virtual tables: 0 (0B)
Block cache: 6 entries (1.1KB) hit rate: 11.1%
Table cache: 1 entries (808B) hit rate: 40.0%
Table cache: 1 entries (824B) hit rate: 40.0%
Secondary cache: 0 entries (0B) hit rate: 0.0%
Snapshots: 0 earliest seq num: 0
Table iters: 0
Expand Down Expand Up @@ -321,7 +321,7 @@ Zombie tables: 0 (0B)
Backing tables: 0 (0B)
Virtual tables: 0 (0B)
Block cache: 12 entries (2.3KB) hit rate: 14.3%
Table cache: 1 entries (808B) hit rate: 50.0%
Table cache: 1 entries (824B) hit rate: 50.0%
Secondary cache: 0 entries (0B) hit rate: 0.0%
Snapshots: 0 earliest seq num: 0
Table iters: 0
Expand Down
2 changes: 1 addition & 1 deletion testdata/ingest
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ Zombie tables: 0 (0B)
Backing tables: 0 (0B)
Virtual tables: 0 (0B)
Block cache: 6 entries (1.2KB) hit rate: 35.7%
Table cache: 1 entries (808B) hit rate: 50.0%
Table cache: 1 entries (824B) hit rate: 50.0%
Secondary cache: 0 entries (0B) hit rate: 0.0%
Snapshots: 0 earliest seq num: 0
Table iters: 0
Expand Down
8 changes: 4 additions & 4 deletions testdata/metrics
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ Zombie tables: 0 (0B)
Backing tables: 0 (0B)
Virtual tables: 0 (0B)
Block cache: 3 entries (556B) hit rate: 0.0%
Table cache: 1 entries (808B) hit rate: 0.0%
Table cache: 1 entries (824B) hit rate: 0.0%
Secondary cache: 0 entries (0B) hit rate: 0.0%
Snapshots: 0 earliest seq num: 0
Table iters: 1
Expand Down Expand Up @@ -201,7 +201,7 @@ Zombie tables: 1 (661B)
Backing tables: 0 (0B)
Virtual tables: 0 (0B)
Block cache: 3 entries (556B) hit rate: 42.9%
Table cache: 1 entries (808B) hit rate: 66.7%
Table cache: 1 entries (824B) hit rate: 66.7%
Secondary cache: 0 entries (0B) hit rate: 0.0%
Snapshots: 0 earliest seq num: 0
Table iters: 1
Expand Down Expand Up @@ -467,7 +467,7 @@ Zombie tables: 0 (0B)
Backing tables: 0 (0B)
Virtual tables: 0 (0B)
Block cache: 12 entries (2.4KB) hit rate: 24.5%
Table cache: 1 entries (808B) hit rate: 60.0%
Table cache: 1 entries (824B) hit rate: 60.0%
Secondary cache: 0 entries (0B) hit rate: 0.0%
Snapshots: 0 earliest seq num: 0
Table iters: 0
Expand Down Expand Up @@ -528,7 +528,7 @@ Zombie tables: 0 (0B)
Backing tables: 0 (0B)
Virtual tables: 0 (0B)
Block cache: 12 entries (2.4KB) hit rate: 24.5%
Table cache: 1 entries (808B) hit rate: 60.0%
Table cache: 1 entries (824B) hit rate: 60.0%
Secondary cache: 0 entries (0B) hit rate: 0.0%
Snapshots: 0 earliest seq num: 0
Table iters: 0
Expand Down
Loading