Skip to content

Commit eec4041

Browse files
committed
objstorageprovider: add dual-write for cold metadata
This commit adds logic to write the metadata portion of cold blob files on both tiers, and to transparently read the metadata from the hot tier. We assume that the metadata is a suffix of the file; the metadata filenames encode the start offset, for example "000001.blobmeta.1234" means that the file contains the data from "00001.blob" starting at offset 1234. We add a `Writable` implementation which writes the metadata portion of a file to both cold and hot storage and a `Readable` implementation which reads data from a cold tier Readable but uses a `vfs.File` for the metadata. The provider keeps track internally of which hot metadata files exist and their start offsets.
1 parent fd11d17 commit eec4041

File tree

13 files changed

+752
-13
lines changed

13 files changed

+752
-13
lines changed

internal/base/filenames.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,15 @@ const (
128128
FileTypeOldTemp
129129
FileTypeTemp
130130
FileTypeBlob
131+
// FileTypeBlobMeta is a file that contains only the metadata portion of a
132+
// blob file (used when the blob file in on cold storage). The filename for
133+
// blobmeta files is of the form `<file-num>.blobmeta.<offset>`, where
134+
// <offset> indicates that the file mirrors the contents of the corresponding
135+
// blob file starting at this offset.
136+
//
137+
// The lifetime of this type of file is managed internally by the objstorage
138+
// provider.
139+
FileTypeBlobMeta
131140
)
132141

133142
var fileTypeStrings = [...]string{
@@ -139,6 +148,7 @@ var fileTypeStrings = [...]string{
139148
FileTypeOldTemp: "old-temp",
140149
FileTypeTemp: "temp",
141150
FileTypeBlob: "blob",
151+
FileTypeBlobMeta: "blobmeta",
142152
}
143153

144154
// FileTypeFromName parses a FileType from its string representation.
@@ -166,6 +176,8 @@ func (ft FileType) String() string {
166176
}
167177

168178
// MakeFilename builds a filename from components.
179+
//
180+
// Note that for FileTypeBlobMeta, ".<offset>" must be appended to the filename.
169181
func MakeFilename(fileType FileType, dfn DiskFileNum) string {
170182
// Make a buffer sufficiently large for most possible filenames, especially
171183
// the common case of a numbered table or blob file.
@@ -192,18 +204,25 @@ func appendFilename(buf []byte, fileType FileType, dfn DiskFileNum) []byte {
192204
buf = fmt.Appendf(buf, "temporary.%06d.dbtmp", uint64(dfn))
193205
case FileTypeBlob:
194206
buf = fmt.Appendf(buf, "%06d.blob", uint64(dfn))
207+
case FileTypeBlobMeta:
208+
buf = fmt.Appendf(buf, "%06d.blobmeta", uint64(dfn))
195209
default:
196210
panic("unreachable")
197211
}
198212
return buf
199213
}
200214

201215
// MakeFilepath builds a filepath from components.
216+
//
217+
// Note that for FileTypeBlobMeta, ".<offset>" must be appended to the filepath.
202218
func MakeFilepath(fs vfs.FS, dirname string, fileType FileType, dfn DiskFileNum) string {
203219
return fs.PathJoin(dirname, MakeFilename(fileType, dfn))
204220
}
205221

206222
// ParseFilename parses the components from a filename.
223+
//
224+
// Note that the offset component of a FileTypeBlobMeta is not parsed by this
225+
// function.
207226
func ParseFilename(fs vfs.FS, filename string) (fileType FileType, dfn DiskFileNum, ok bool) {
208227
filename = fs.PathBase(filename)
209228
switch {
@@ -250,6 +269,9 @@ func ParseFilename(fs vfs.FS, filename string) (fileType FileType, dfn DiskFileN
250269
case "blob":
251270
return FileTypeBlob, dfn, true
252271
}
272+
if strings.HasPrefix(filename[i+1:], "blobmeta.") {
273+
return FileTypeBlobMeta, dfn, true
274+
}
253275
}
254276
return 0, dfn, false
255277
}

internal/base/filenames_test.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package base
77
import (
88
"bytes"
99
"fmt"
10+
"math/rand/v2"
1011
"os"
1112
"testing"
1213

@@ -49,6 +50,7 @@ func TestParseFilename(t *testing.T) {
4950
"000000.blob": true,
5051
"000001.blob": true,
5152
"935203523.blob": true,
53+
"000001.blobmeta.0": true,
5254
}
5355
fs := vfs.NewMem()
5456
for tc, want := range testCases {
@@ -95,6 +97,17 @@ func TestFilenameRoundTrip(t *testing.T) {
9597
}
9698
}
9799

100+
func TestFilenameBlobMeta(t *testing.T) {
101+
fileNum := DiskFileNum(rand.Uint64())
102+
offset := rand.Int64()
103+
fs := vfs.NewMem()
104+
path := fmt.Sprintf("%s.%d", MakeFilepath(fs, "foo", FileTypeBlobMeta, fileNum), offset)
105+
typ, fn, ok := ParseFilename(fs, path)
106+
require.True(t, ok)
107+
require.Equal(t, FileTypeBlobMeta, typ)
108+
require.Equal(t, fileNum, fn)
109+
}
110+
98111
type bufferFataler struct {
99112
buf bytes.Buffer
100113
}

objstorage/objstorage.go

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,10 +109,27 @@ type Writable interface {
109109

110110
// Finish completes the object and makes the data durable.
111111
// No further calls are allowed after calling Finish.
112+
//
113+
// An error during Finish does not mean that the object is deleted and removed
114+
// from the provider. It is expected that the caller will do that separately
115+
// (via Provider.Remove).
112116
Finish() error
113117

114-
// Abort gives up on finishing the object. There is no guarantee about whether
115-
// the object exists after calling Abort.
118+
// StartMetadataPortion signals to the writer that the metadata part of the
119+
// object starts here. If the object is being written to the cold tier, data
120+
// in subsequent Write() calls will also be written to the hot tier.
121+
//
122+
// The function should be called at most one time.
123+
//
124+
// An error means that we won't be able to successfully finish this object.
125+
StartMetadataPortion() error
126+
127+
// Abort gives up on finishing the object.
128+
//
129+
// Aborting does not mean that the object is deleted and removed from the
130+
// provider. It is expected that the caller will do that separately (via
131+
// Provider.Remove).
132+
//
116133
// No further calls are allowed after calling Abort.
117134
Abort()
118135
}
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2+
// of this source code is governed by a BSD-style license that can be found in
3+
// the LICENSE file.
4+
5+
package objstorageprovider
6+
7+
import (
8+
"context"
9+
"sync"
10+
11+
"github.com/cockroachdb/pebble/objstorage"
12+
"github.com/cockroachdb/pebble/vfs"
13+
)
14+
15+
// newColdReadableWithHotMeta returns an objstorage.Readable that reads the main
16+
// data from the wrapped "cold storage" readable, and the metadata from a
17+
// separate file in a local filesystem. The separate file contains a suffix of
18+
// the full file, starting at metaStartOffset.
19+
func newColdReadableWithHotMeta(
20+
cold objstorage.Readable, metaFS vfs.FS, metaFilepath string, metaStartOffset int64,
21+
) *coldReadableWithHotMeta {
22+
r := &coldReadableWithHotMeta{
23+
cold: cold,
24+
}
25+
r.meta.fs = metaFS
26+
r.meta.filepath = metaFilepath
27+
r.meta.startOffset = metaStartOffset
28+
return r
29+
}
30+
31+
type coldReadableWithHotMeta struct {
32+
cold objstorage.Readable
33+
34+
meta struct {
35+
fs vfs.FS
36+
filepath string
37+
startOffset int64
38+
once struct {
39+
sync.Once
40+
file vfs.File
41+
err error
42+
}
43+
}
44+
}
45+
46+
var _ objstorage.Readable = (*coldReadableWithHotMeta)(nil)
47+
48+
// readMetaAt reads from the metadata file at the given offset.
49+
func (r *coldReadableWithHotMeta) readMetaAt(p []byte, off int64) error {
50+
r.meta.once.Do(func() {
51+
r.meta.once.file, r.meta.once.err = r.meta.fs.Open(r.meta.filepath, vfs.RandomReadsOption)
52+
})
53+
if r.meta.once.err != nil {
54+
return r.meta.once.err
55+
}
56+
_, err := r.meta.once.file.ReadAt(p, off)
57+
return err
58+
}
59+
60+
// ReadAt is part of the objstorage.Readable interface.
61+
func (r *coldReadableWithHotMeta) ReadAt(ctx context.Context, p []byte, off int64) error {
62+
// We don't expect reads that span both regions, but in that case it is
63+
// correct to read it all from the cold file (which contains all the data).
64+
if off < r.meta.startOffset {
65+
return r.cold.ReadAt(ctx, p, off)
66+
}
67+
return r.readMetaAt(p, off-r.meta.startOffset)
68+
}
69+
70+
// Close is part of the objstorage.Readable interface.
71+
func (r *coldReadableWithHotMeta) Close() error {
72+
err := r.cold.Close()
73+
if r.meta.once.file != nil {
74+
err = firstError(err, r.meta.once.file.Close())
75+
r.meta.once.file = nil
76+
}
77+
return err
78+
}
79+
80+
// Size is part of the objstorage.Readable interface.
81+
func (r *coldReadableWithHotMeta) Size() int64 {
82+
return r.cold.Size()
83+
}
84+
85+
// NewReadHandle is part of the objstorage.Readable interface.
86+
func (r *coldReadableWithHotMeta) NewReadHandle(
87+
readBeforeSize objstorage.ReadBeforeSize,
88+
) objstorage.ReadHandle {
89+
// The readBeforeSize is used to optimize reading the metadata suffix of a
90+
// file, for cases where small reads are expensive. In our case, the metadata
91+
// is specifically in a hot tier file, for which small reads are cheap (so we
92+
// ignore readBeforeSize, like fileReadable).
93+
//
94+
// Since we are using the local file for metadata, we pass NoReadBefore for
95+
// the cold file.
96+
return &coldReadHandle{
97+
r: r,
98+
cold: r.cold.NewReadHandle(objstorage.NoReadBefore),
99+
}
100+
}
101+
102+
type coldReadHandle struct {
103+
r *coldReadableWithHotMeta
104+
cold objstorage.ReadHandle
105+
}
106+
107+
var _ objstorage.ReadHandle = (*coldReadHandle)(nil)
108+
109+
// ReadAt is part of the objstorage.ReadHandle interface.
110+
func (rh *coldReadHandle) ReadAt(ctx context.Context, p []byte, off int64) error {
111+
if off < rh.r.meta.startOffset {
112+
// Read from cold storage only.
113+
return rh.cold.ReadAt(ctx, p, off)
114+
}
115+
// Read from metadata only.
116+
return rh.r.readMetaAt(p, off-rh.r.meta.startOffset)
117+
}
118+
119+
// Close is part of the objstorage.ReadHandle interface.
120+
func (rh *coldReadHandle) Close() error {
121+
return rh.cold.Close()
122+
}
123+
124+
// SetupForCompaction is part of the objstorage.ReadHandle interface.
125+
func (rh *coldReadHandle) SetupForCompaction() {
126+
rh.cold.SetupForCompaction()
127+
}
128+
129+
// RecordCacheHit is part of the objstorage.ReadHandle interface.
130+
func (rh *coldReadHandle) RecordCacheHit(ctx context.Context, offset, size int64) {
131+
// We don't use prefetching for the metadata portion, so we only need to
132+
// report cache hits to the cold readable.
133+
if offset < rh.r.meta.startOffset {
134+
rh.cold.RecordCacheHit(ctx, offset, min(size, rh.r.meta.startOffset-offset))
135+
}
136+
}

0 commit comments

Comments
 (0)