@@ -13,13 +13,16 @@ import (
 	"math/rand/v2"
 	"os"
 	"slices"
+	"strings"
 	"time"
 
 	"github.com/cockroachdb/errors"
 	"github.com/cockroachdb/pebble/internal/base"
 	"github.com/cockroachdb/pebble/internal/humanize"
 	"github.com/cockroachdb/pebble/objstorage"
 	"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
+	"github.com/cockroachdb/pebble/objstorage/remote"
+	"github.com/cockroachdb/pebble/sstable"
 	"github.com/cockroachdb/pebble/sstable/compressionanalyzer"
 	"github.com/cockroachdb/pebble/vfs"
 	"github.com/cockroachdb/tokenbucket"
@@ -34,28 +37,38 @@ func (d *dbT) runAnalyzeData(cmd *cobra.Command, args []string) {
 	isTTY := isTTY(stdout)
 
 	dir := args[0]
-	if err := d.initOptions(dir); err != nil {
-		fmt.Fprintf(stderr, "error initializing options: %s\n", err)
-		return
+
+	var dbStorage dbStorage
+	var isRemote bool
+	if strings.Contains(dir, "://") {
+		if d.remoteStorageFn == nil {
+			fmt.Fprintf(stderr, "path looks like remote storage, but remote storage not configured.\n")
+			return
+		}
+		remoteStorageImpl, err := d.remoteStorageFn(dir)
+		if err != nil {
+			fmt.Fprintf(stderr, "error initializing remote storage: %s\n", err)
+			return
+		}
+		dbStorage = newRemoteStorage(remoteStorageImpl)
+		isRemote = true
+	} else {
+		dbStorage = newVFSStorage(d.opts.FS, dir)
 	}
 	rng := rand.New(rand.NewPCG(rand.Uint64(), rand.Uint64()))
-	dbStorage := newVFSStorage(d.opts.FS, dir)
+	if isTTY {
+		fmt.Fprintf(stdout, "Listing files and sizes...\n")
+	}
 	files, err := makeFileSet(dbStorage, rng)
 	if err != nil {
 		fmt.Fprintf(stderr, "error loading file list: %s\n", err)
 		return
 	}
-	if files.Done() {
+	numFiles, totalSize := files.Remaining()
+	if numFiles == 0 {
 		fmt.Fprintf(stderr, "no sstables found\n")
 		return
 	}
-	totalSize := files.TotalSize()
-	// We do not recalculate the target size every time we refresh the file list.
-	// If the database is growing rapidly, we might not be able to keep up.
-	targetSize := totalSize
-	if d.analyzeData.samplePercent > 0 && d.analyzeData.samplePercent < 100 {
-		targetSize = (totalSize*int64(d.analyzeData.samplePercent) + 99) / 100
-	}
 	var readLimiter *tokenbucket.TokenBucket
 	if d.analyzeData.readMBPerSec > 0 {
 		readLimiter = &tokenbucket.TokenBucket{}
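The dispatch above hinges on a simple heuristic: any path containing "://" is treated as a remote-object URI, everything else as a local directory. A minimal illustration of the same check (the paths are hypothetical examples, not taken from the commit):

    package main

    import (
    	"fmt"
    	"strings"
    )

    func main() {
    	// Mirrors the check in runAnalyzeData: URI-like paths go to the
    	// remote storage backend, plain paths to the local VFS backend.
    	for _, dir := range []string{"s3://bucket/db-data", "/mnt/data/pebble"} {
    		fmt.Printf("%s -> remote=%v\n", dir, strings.Contains(dir, "://"))
    	}
    }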
@@ -64,7 +77,12 @@ func (d *dbT) runAnalyzeData(cmd *cobra.Command, args []string) {
 		readLimiter.Init(rate, burst)
 	}
 	if isTTY {
-		fmt.Fprintf(stdout, "Found %d files, total size %s.\n", len(files.files), humanize.Bytes.Int64(totalSize))
+		if isRemote {
+			// We don't obtain the sizes of the remote objects.
+			fmt.Fprintf(stdout, "Found %d objects.\n", numFiles)
+		} else {
+			fmt.Fprintf(stdout, "Found %d files, total size %s.\n", numFiles, humanize.Bytes.Int64(totalSize))
+		}
 		if d.analyzeData.readMBPerSec > 0 {
 			fmt.Fprintf(stdout, "Limiting read bandwidth to %s/s.\n", humanize.Bytes.Int64(int64(d.analyzeData.readMBPerSec)<<20))
 		} else {
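A side note on the bandwidth message above: readMBPerSec is expressed in mebibytes per second, and the <<20 shift converts it to bytes per second before formatting. A quick check of that arithmetic:

    package main

    import "fmt"

    func main() {
    	readMBPerSec := 8
    	// Shifting left by 20 multiplies by 2^20 (one MiB), so 8 MiB/s
    	// becomes 8,388,608 bytes per second.
    	fmt.Println(int64(readMBPerSec) << 20) // 8388608
    }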
@@ -84,11 +102,28 @@ func (d *dbT) runAnalyzeData(cmd *cobra.Command, args []string) {
 	startTime := time.Now()
 	lastReportTime := startTime
 
-	analyzer := compressionanalyzer.NewFileAnalyzer(readLimiter, d.opts.MakeReaderOptions())
-	var sampled int64
+	readerOptions := sstable.ReaderOptions{
+		Comparers:  d.comparers,
+		Mergers:    d.mergers,
+		KeySchemas: d.opts.KeySchemas,
+	}
+	analyzer := compressionanalyzer.NewFileAnalyzer(readLimiter, readerOptions)
+	var sampledFiles int
+	var sampledBytes int64
 	const reportPeriod = 10 * time.Second
 	for {
-		shouldStop := files.Done() || sampled >= targetSize ||
+		remainingFiles, remainingBytes := files.Remaining()
+		var percentage float64
+		if remainingFiles == 0 {
+			percentage = 100
+		} else if isRemote {
+			// We don't obtain the sizes of all remote objects, so we use the number
+			// of files.
+			percentage = float64(sampledFiles) * 100 / float64(sampledFiles+remainingFiles)
+		} else {
+			percentage = float64(sampledBytes) * 100 / float64(sampledBytes+remainingBytes)
+		}
+		shouldStop := percentage >= float64(d.analyzeData.samplePercent) ||
 			(d.analyzeData.timeout > 0 && time.Since(startTime) > d.analyzeData.timeout)
 		// Every 10 seconds, we:
 		// - print the current results and progress (if on a tty);
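This rework drops the precomputed targetSize in favor of recomputing progress on every iteration from Remaining(), which keeps the percentage meaningful after the file list is refreshed. A standalone sketch of the byte-based branch, with a worked example:

    package main

    import "fmt"

    // progressPercent mirrors the local-storage branch above: the fraction
    // of bytes sampled so far out of sampled plus remaining bytes.
    func progressPercent(sampledBytes, remainingBytes int64) float64 {
    	if remainingBytes == 0 {
    		return 100
    	}
    	return float64(sampledBytes) * 100 / float64(sampledBytes+remainingBytes)
    }

    func main() {
    	// With 2 GiB sampled and 18 GiB still unsampled, progress is 10%,
    	// so a samplePercent of 10 would stop the loop here.
    	fmt.Printf("%.2f%%\n", progressPercent(2<<30, 18<<30))
    }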
@@ -103,11 +138,8 @@ func (d *dbT) runAnalyzeData(cmd *cobra.Command, args []string) {
 			if isTTY || shouldStop {
 				partialResults := analyzer.Buckets().String(minSamples)
 				fmt.Fprintf(stdout, "\n%s\n", partialResults)
-				percentage := min(float64(sampled*100)/float64(totalSize), 100)
-				if files.Done() {
-					percentage = 100
-				}
-				fmt.Fprintf(stdout, "Sampled %.2f%% (%s)\n", percentage, humanize.Bytes.Int64(sampled))
+				fmt.Fprintf(stdout, "Sampled %s files, %s (%.2f%%)\n",
+					humanize.Count.Int64(int64(sampledFiles)), humanize.Bytes.Int64(sampledBytes), percentage)
 			}
 			if err := analyzeSaveCSVFile(analyzer, d.analyzeData.outputCSVFile); err != nil {
 				fmt.Fprintf(stderr, "error writing CSV file: %s\n", err)
@@ -116,39 +148,40 @@ func (d *dbT) runAnalyzeData(cmd *cobra.Command, args []string) {
 			if shouldStop {
 				return
 			}
-			if err := files.Refresh(); err != nil {
-				fmt.Fprintf(stderr, "error loading file list: %s\n", err)
-				return
+			if !isRemote {
+				if err := files.Refresh(); err != nil {
+					fmt.Fprintf(stderr, "error loading file list: %s\n", err)
+					return
+				}
 			}
 			lastReportTime = time.Now()
 		}
 		// Sample a file and analyze it.
-		filename, size := files.Sample()
-		path := d.opts.FS.PathJoin(dir, filename)
-		if err := d.analyzeSSTable(analyzer, path); err != nil {
+		filename := files.Sample()
+		size, err := d.analyzeSSTable(analyzer, dbStorage, filename)
+		if err != nil {
 			// We silently ignore errors from files that are deleted from under us.
 			if !errors.Is(err, os.ErrNotExist) {
 				// Note that errors can happen if the sstable file wasn't completed;
 				// they should not stop the process.
-				fmt.Fprintf(stderr, "error reading file %s: %s\n", path, err)
+				fmt.Fprintf(stderr, "error reading file %s: %s\n", filename, err)
 			}
 			continue
 		}
-		sampled += size
+		sampledBytes += size
+		sampledFiles++
 	}
 }
 
-func (d *dbT) analyzeSSTable(analyzer *compressionanalyzer.FileAnalyzer, path string) error {
-	file, err := d.opts.FS.Open(path)
-	if err != nil {
-		return err
-	}
-	readable, err := objstorageprovider.NewFileReadable(file, d.opts.FS, objstorageprovider.NewReadaheadConfig(), path)
+func (d *dbT) analyzeSSTable(
+	analyzer *compressionanalyzer.FileAnalyzer, dbStorage dbStorage, name string,
+) (size int64, _ error) {
+	readable, err := dbStorage.Open(name)
 	if err != nil {
-		return errors.CombineErrors(err, file.Close())
+		return 0, err
 	}
-	defer func() { _ = readable.Close() }()
-	return analyzer.SSTable(context.Background(), readable)
+	size = readable.Size()
+	return size, analyzer.SSTable(context.Background(), readable)
 }
 
 func analyzeSaveCSVFile(a *compressionanalyzer.FileAnalyzer, path string) error {
@@ -201,6 +234,34 @@ func (l *vfsStorage) Open(name string) (objstorage.Readable, error) {
 	return readable, nil
 }
 
+type remoteStorage struct {
+	storage remote.Storage
+}
+
+func newRemoteStorage(storage remote.Storage) *remoteStorage {
+	return &remoteStorage{storage: storage}
+}
+
+var _ dbStorage = (*remoteStorage)(nil)
+
+func (r *remoteStorage) List() ([]string, error) {
+	return r.storage.List("", "")
+}
+
+func (r *remoteStorage) Size(name string) int64 {
+	// Retrieving the size for each file from cloud storage would take too
+	// long; just make up a fixed value.
+	return 1024 * 1024
+}
+
+func (r *remoteStorage) Open(name string) (objstorage.Readable, error) {
+	objReader, size, err := r.storage.ReadObject(context.Background(), name)
+	if err != nil {
+		return nil, err
+	}
+	return objstorageprovider.NewRemoteReadable(objReader, size), nil
+}
+
 // We avoid files that are very large to prevent excessive memory usage. Note
 // that we have seen cases where large files contain a giant top index block, so
 // even getting the block layout of the file would use a lot of memory.
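The `var _ dbStorage = (*remoteStorage)(nil)` assertion above, together with the vfsStorage methods earlier in the file, implies the shape of the dbStorage interface. Its actual definition lies outside this diff's context lines; the following is a reconstruction from the two implementations, not the verbatim source:

    package tool // assumed package name; the definition is not shown in the diff

    import "github.com/cockroachdb/pebble/objstorage"

    // dbStorage abstracts the two backends used by analyze-data: a local
    // VFS directory and a remote object store.
    type dbStorage interface {
    	// List returns the names of all files/objects in the store.
    	List() ([]string, error)
    	// Size returns the size of the named file in bytes; the remote
    	// implementation returns a fixed placeholder instead of a real size.
    	Size(name string) int64
    	// Open returns a Readable positioned over the named file.
    	Open(name string) (objstorage.Readable, error)
    }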
@@ -210,8 +271,9 @@ type fileSet struct {
 	dbStorage dbStorage
 	rng       *rand.Rand
 
-	files     []fileInSet
-	sampleIdx []int
+	files         []fileInSet
+	sampleIdx     []int
+	bytesToSample int64
 }
 
 type fileInSet struct {
@@ -299,9 +361,11 @@ func (s *fileSet) Refresh() error {
 	}
 	// Generate the samples.
 	s.sampleIdx = s.sampleIdx[:0]
+	s.bytesToSample = 0
 	for i := range s.files {
 		if !s.files[i].sampled {
 			s.sampleIdx = append(s.sampleIdx, i)
+			s.bytesToSample += s.files[i].size
 		}
 	}
 	slices.SortFunc(s.sampleIdx, func(i, j int) int {
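Refresh now rebuilds bytesToSample alongside sampleIdx, and Sample (in the next hunk) decrements it, so the pair always describes the not-yet-sampled files. A hypothetical checker for that invariant, with stub types mirroring the relevant fields (not part of the commit):

    package main

    import "fmt"

    // Stubs mirroring the relevant fields of the commit's types.
    type fileInSet struct {
    	size    int64
    	sampled bool
    }

    type fileSet struct {
    	files         []fileInSet
    	sampleIdx     []int
    	bytesToSample int64
    }

    // checkInvariant reports whether bytesToSample equals the combined size
    // of the files still queued in sampleIdx (hypothetical test helper).
    func (s *fileSet) checkInvariant() bool {
    	var sum int64
    	for _, idx := range s.sampleIdx {
    		sum += s.files[idx].size
    	}
    	return sum == s.bytesToSample
    }

    func main() {
    	s := fileSet{
    		files:         []fileInSet{{size: 100}, {size: 200, sampled: true}, {size: 300}},
    		sampleIdx:     []int{0, 2},
    		bytesToSample: 400,
    	}
    	fmt.Println(s.checkInvariant()) // true: 100 + 300 == 400
    }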
@@ -310,25 +374,18 @@ func (s *fileSet) Refresh() error {
 	return nil
 }
 
-func (s *fileSet) TotalSize() int64 {
-	var sum int64
-	for i := range s.files {
-		sum += s.files[i].size
-	}
-	return sum
-}
-
-func (s *fileSet) Done() bool {
-	return len(s.sampleIdx) == 0
+func (s *fileSet) Remaining() (files int, bytes int64) {
+	return len(s.sampleIdx), s.bytesToSample
 }
 
 // Sample returns a random file from the set (which was not previously sampled),
 // weighted by size.
-func (s *fileSet) Sample() (filename string, size int64) {
+func (s *fileSet) Sample() (filename string) {
 	idx := s.sampleIdx[0]
 	s.sampleIdx = s.sampleIdx[1:]
 	s.files[idx].sampled = true
-	return s.files[idx].filename, s.files[idx].size
+	s.bytesToSample -= s.files[idx].size
+	return s.files[idx].filename
 }
 
 func isTTY(out io.Writer) bool {
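A closing note on Sample's contract: its doc comment promises an order "weighted by size", produced by the slices.SortFunc call in Refresh whose comparison body falls outside these hunks. One standard way to get such an order is Efraimidis-Spirakis weighted sampling; the sketch below shows that technique and is an assumption, not the commit's actual sort key:

    package main

    import (
    	"cmp"
    	"fmt"
    	"math"
    	"math/rand/v2"
    	"slices"
    )

    // weightedOrder returns the indices of sizes in a random order where
    // larger files tend to come first: each index gets an exponential key
    // with rate proportional to its size (key_i = -ln(U)/size_i) and the
    // indices are sorted by ascending key.
    func weightedOrder(rng *rand.Rand, sizes []int64) []int {
    	keys := make([]float64, len(sizes))
    	idx := make([]int, len(sizes))
    	for i, sz := range sizes {
    		idx[i] = i
    		keys[i] = -math.Log(1-rng.Float64()) / float64(sz)
    	}
    	slices.SortFunc(idx, func(a, b int) int { return cmp.Compare(keys[a], keys[b]) })
    	return idx
    }

    func main() {
    	rng := rand.New(rand.NewPCG(1, 2))
    	// Prints a permutation of [0 1 2]; index 1 (the 64 MiB file) is the
    	// most likely to come first.
    	fmt.Println(weightedOrder(rng, []int64{1 << 20, 64 << 20, 4 << 20}))
    }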