55package pebble
66
77import (
8+ "io"
9+
810 "github.com/cockroachdb/errors"
911 "github.com/cockroachdb/pebble/internal/base"
12+ "github.com/cockroachdb/pebble/internal/invariants"
13+ "github.com/cockroachdb/pebble/internal/manifest"
1014 "github.com/cockroachdb/pebble/objstorage"
1115 "github.com/cockroachdb/pebble/objstorage/objstorageprovider"
16+ "github.com/cockroachdb/pebble/record"
1217 "github.com/cockroachdb/pebble/vfs"
1318 "github.com/cockroachdb/pebble/vfs/atomicfs"
1419)
@@ -39,10 +44,7 @@ func (rs *recoveredState) init(opts *Options, dirname string) error {
3944 if err != nil {
4045 return errors .Wrapf (err , "pebble: database %q" , dirname )
4146 }
42- rs .manifestMarker , rs .manifestFileNum , rs .manifestExists , err = findCurrentManifest (opts .FS , dirname , rs .ls )
43- if err != nil {
44- return errors .Wrapf (err , "pebble: database %q" , dirname )
45- }
47+
4648 // Open the object storage provider.
4749 providerSettings := opts .MakeObjStorageProviderSettings (dirname )
4850 providerSettings .FSDirInitialListing = rs .ls
@@ -51,6 +53,26 @@ func (rs *recoveredState) init(opts *Options, dirname string) error {
5153 return errors .Wrapf (err , "pebble: database %q" , dirname )
5254 }
5355
56+ // Determine which manifest is current, and if one exists, replay it to
57+ // recover the current Version of the LSM.
58+ var manifestExists bool
59+ rs .manifestMarker , rs .manifestFileNum , manifestExists , err = findCurrentManifest (opts .FS , dirname , rs .ls )
60+ if err != nil {
61+ return errors .Wrapf (err , "pebble: database %q" , dirname )
62+ }
63+ if manifestExists {
64+ recoveredVersion , err := recoverVersion (opts , dirname , rs .objProvider , rs .manifestFileNum )
65+ if err != nil {
66+ return err
67+ }
68+ if ! opts .DisableConsistencyCheck {
69+ if err := checkConsistency (recoveredVersion .version , rs .objProvider ); err != nil {
70+ return err
71+ }
72+ }
73+ rs .recoveredVersion = recoveredVersion
74+ }
75+
5476 // Identify the maximal file number in the directory. We do not want to
5577 // reuse any existing file numbers even if they are obsolete file numbers to
5678 // avoid modifying an ingested sstable's original external file.
@@ -90,11 +112,11 @@ type recoveredState struct {
90112 ls []string
91113 manifestMarker * atomicfs.Marker
92114 manifestFileNum base.DiskFileNum
93- manifestExists bool
94115 maxFilenumUsed base.DiskFileNum
95116 obsoleteTempFilenames []string
96117 objProvider objstorage.Provider
97118 previousOptionsFilename string
119+ recoveredVersion * recoveredVersion
98120}
99121
100122// RemoveObsolete removes obsolete files uncovered during recovery.
@@ -132,3 +154,176 @@ func (rs *recoveredState) Close() error {
132154 }
133155 return err
134156}
157+
158+ // recoveredVersion describes the latest Version of the LSM recovered by
159+ // replaying a manifest file.
160+ type recoveredVersion struct {
161+ manifestFileNum base.DiskFileNum
162+ minUnflushedLogNum base.DiskFileNum
163+ nextFileNum base.DiskFileNum
164+ logSeqNum base.SeqNum
165+ latest * latestVersionState
166+ metrics Metrics
167+ version * manifest.Version
168+ }
169+
170+ // recoverVersion replays the named manifest file to recover the latest version
171+ // of the LSM from persisted state.
172+ func recoverVersion (
173+ opts * Options , dirname string , provider objstorage.Provider , manifestFileNum base.DiskFileNum ,
174+ ) (* recoveredVersion , error ) {
175+ vs := & recoveredVersion {
176+ manifestFileNum : manifestFileNum ,
177+ nextFileNum : 1 ,
178+ logSeqNum : base .SeqNumStart ,
179+ latest : & latestVersionState {
180+ l0Organizer : manifest .NewL0Organizer (opts .Comparer , opts .FlushSplitBytes ),
181+ virtualBackings : manifest .MakeVirtualBackings (),
182+ },
183+ }
184+ manifestPath := base .MakeFilepath (opts .FS , dirname , base .FileTypeManifest , vs .manifestFileNum )
185+ manifestFilename := opts .FS .PathBase (manifestPath )
186+
187+ // Read the versionEdits in the manifest file.
188+ var bve manifest.BulkVersionEdit
189+ bve .AllAddedTables = make (map [base.TableNum ]* manifest.TableMetadata )
190+ manifestFile , err := opts .FS .Open (manifestPath )
191+ if err != nil {
192+ return nil , errors .Wrapf (err , "pebble: could not open manifest file %q for DB %q" ,
193+ errors .Safe (manifestFilename ), dirname )
194+ }
195+ defer manifestFile .Close ()
196+ rr := record .NewReader (manifestFile , 0 /* logNum */ )
197+ for {
198+ r , err := rr .Next ()
199+ if err == io .EOF || record .IsInvalidRecord (err ) {
200+ break
201+ }
202+ if err != nil {
203+ return nil , errors .Wrapf (err , "pebble: error when loading manifest file %q" ,
204+ errors .Safe (manifestFilename ))
205+ }
206+ var ve manifest.VersionEdit
207+ err = ve .Decode (r )
208+ if err != nil {
209+ // Break instead of returning an error if the record is corrupted
210+ // or invalid.
211+ if err == io .EOF || record .IsInvalidRecord (err ) {
212+ break
213+ }
214+ return nil , err
215+ }
216+ if ve .ComparerName != "" {
217+ if ve .ComparerName != opts .Comparer .Name {
218+ return nil , errors .Errorf ("pebble: manifest file %q for DB %q: " +
219+ "comparer name from file %q != comparer name from Options %q" ,
220+ errors .Safe (manifestFilename ), dirname , errors .Safe (ve .ComparerName ), errors .Safe (opts .Comparer .Name ))
221+ }
222+ }
223+ if err := bve .Accumulate (& ve ); err != nil {
224+ return nil , err
225+ }
226+ if ve .MinUnflushedLogNum != 0 {
227+ vs .minUnflushedLogNum = ve .MinUnflushedLogNum
228+ }
229+ if ve .NextFileNum != 0 {
230+ vs .nextFileNum = base .DiskFileNum (ve .NextFileNum )
231+ }
232+ if ve .LastSeqNum != 0 {
233+ // logSeqNum is the _next_ sequence number that will be assigned,
234+ // while LastSeqNum is the last assigned sequence number. Note that
235+ // this behaviour mimics that in RocksDB; the first sequence number
236+ // assigned is one greater than the one present in the manifest
237+ // (assuming no WALs contain higher sequence numbers than the
238+ // manifest's LastSeqNum). Increment LastSeqNum by 1 to get the
239+ // next sequence number that will be assigned.
240+ //
241+ // If LastSeqNum is less than SeqNumStart, increase it to at least
242+ // SeqNumStart to leave ample room for reserved sequence numbers.
243+ vs .logSeqNum = max (ve .LastSeqNum + 1 , base .SeqNumStart )
244+ }
245+ }
246+
247+ // We have already set vs.nextFileNum=1 at the beginning of the function and
248+ // could have only updated it to some other non-zero value, so it cannot be
249+ // 0 here.
250+ if vs .minUnflushedLogNum == 0 {
251+ if vs .nextFileNum >= 2 {
252+ // We either have a freshly created DB, or a DB created by RocksDB
253+ // that has not had a single flushed SSTable yet. This is because
254+ // RocksDB bumps up nextFileNum in this case without bumping up
255+ // minUnflushedLogNum, even if WALs with non-zero file numbers are
256+ // present in the directory.
257+ } else {
258+ return nil , base .CorruptionErrorf ("pebble: malformed manifest file %q for DB %q" ,
259+ errors .Safe (manifestFilename ), dirname )
260+ }
261+ }
262+ vs .nextFileNum = max (vs .nextFileNum , vs .minUnflushedLogNum + 1 )
263+
264+ // Populate the virtual backings for virtual sstables since we have finished
265+ // version edit accumulation.
266+ for _ , b := range bve .AddedFileBacking {
267+ isLocal := objstorage .IsLocalTable (provider , b .DiskFileNum )
268+ vs .latest .virtualBackings .AddAndRef (b , isLocal )
269+ }
270+ for l , addedLevel := range bve .AddedTables {
271+ for _ , m := range addedLevel {
272+ if m .Virtual {
273+ vs .latest .virtualBackings .AddTable (m , l )
274+ }
275+ }
276+ }
277+
278+ if invariants .Enabled {
279+ // There should be no deleted tables or backings, since we're starting
280+ // from an empty state.
281+ for _ , deletedLevel := range bve .DeletedTables {
282+ if len (deletedLevel ) != 0 {
283+ panic ("deleted files after manifest replay" )
284+ }
285+ }
286+ if len (bve .RemovedFileBacking ) > 0 {
287+ panic ("deleted backings after manifest replay" )
288+ }
289+ }
290+
291+ emptyVersion := manifest .NewInitialVersion (opts .Comparer )
292+ newVersion , err := bve .Apply (emptyVersion , opts .Experimental .ReadCompactionRate )
293+ if err != nil {
294+ return nil , err
295+ }
296+ vs .latest .l0Organizer .PerformUpdate (vs .latest .l0Organizer .PrepareUpdate (& bve , newVersion ), newVersion )
297+ vs .latest .l0Organizer .InitCompactingFileInfo (nil /* in-progress compactions */ )
298+ vs .latest .blobFiles .Init (& bve , manifest.BlobRewriteHeuristic {
299+ CurrentTime : opts .private .timeNow ,
300+ MinimumAge : opts .Experimental .ValueSeparationPolicy ().RewriteMinimumAge ,
301+ })
302+ vs .version = newVersion
303+
304+ for i := range vs .metrics .Levels {
305+ l := & vs .metrics .Levels [i ]
306+ l .TablesCount = int64 (newVersion .Levels [i ].Len ())
307+ files := newVersion .Levels [i ].Slice ()
308+ l .TablesSize = int64 (files .TableSizeSum ())
309+ }
310+ for _ , l := range newVersion .Levels {
311+ for f := range l .All () {
312+ if ! f .Virtual {
313+ isLocal , localSize := sizeIfLocal (f .TableBacking , provider )
314+ vs .metrics .Table .Local .LiveSize = uint64 (int64 (vs .metrics .Table .Local .LiveSize ) + localSize )
315+ if isLocal {
316+ vs .metrics .Table .Local .LiveCount ++
317+ }
318+ }
319+ }
320+ }
321+ for backing := range vs .latest .virtualBackings .All () {
322+ isLocal , localSize := sizeIfLocal (backing , provider )
323+ vs .metrics .Table .Local .LiveSize = uint64 (int64 (vs .metrics .Table .Local .LiveSize ) + localSize )
324+ if isLocal {
325+ vs .metrics .Table .Local .LiveCount ++
326+ }
327+ }
328+ return vs , nil
329+ }
0 commit comments