Skip to content

Commit cde5cbb

Browse files
authored
feat(storage/transfermanager): add Downloader (#10045)
transfermanager.Downloader provides an easy way to parallelize reads in Google Cloud Storage. NOTE: This package is in preview. It is not stable, and is likely to change.
1 parent f055a5d commit cde5cbb

File tree

7 files changed

+1540
-0
lines changed

7 files changed

+1540
-0
lines changed

storage/transfermanager/doc.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
// Copyright 2024 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
/*
16+
Package transfermanager provides an easy way to parallelize downloads in Google
17+
Cloud Storage.
18+
19+
More information about Google Cloud Storage is available at
20+
https://cloud.google.com/storage/docs.
21+
22+
See https://pkg.go.dev/cloud.google.com/go for authentication, timeouts,
23+
connection pooling and similar aspects of this package.
24+
25+
NOTE: This package is in preview. It is not stable, and is likely to change.
26+
*/
27+
package transfermanager // import "cloud.google.com/go/storage/transfermanager"
Lines changed: 308 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,308 @@
1+
// Copyright 2024 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package transfermanager
16+
17+
import (
18+
"context"
19+
"errors"
20+
"fmt"
21+
"io"
22+
"sync"
23+
"time"
24+
25+
"cloud.google.com/go/storage"
26+
)
27+
28+
// Downloader manages a set of parallelized downloads.
// Inputs queued through DownloadObject are forwarded to the work channel by a
// dispatcher goroutine and consumed by a fixed set of worker goroutines, all
// of which are started by NewDownloader.
type Downloader struct {
	client    *storage.Client           // Client handed to each download to read the object.
	config    *transferManagerConfig    // Settings built from the options passed to NewDownloader.
	inputs    []DownloadObjectInput     // Queued inputs not yet sent to work; guarded by inputsMu.
	results   []DownloadOutput          // Outputs collected when callbacks are not used; appended under resultsMu.
	errors    []error                   // Errors from failed downloads; appended under errorsMu.
	inputsMu  *sync.Mutex               // Guards inputs.
	resultsMu *sync.Mutex               // Guards appends to results.
	errorsMu  *sync.Mutex               // Guards appends to errors.
	work      chan *DownloadObjectInput // Piece of work to be executed.
	done      chan bool                 // Indicates to finish up work; expecting no more inputs.
	workers   *sync.WaitGroup           // Keeps track of the workers that are currently running.
}
42+
43+
// DownloadObject queues the download of a single object. This will initiate the
44+
// download but is non-blocking; call Downloader.Results or use the callback to
45+
// process the result. DownloadObject is thread-safe and can be called
46+
// simultaneously from different goroutines.
47+
// The download may not start immediately if all workers are busy, so a deadline
48+
// set on the ctx may time out before the download even starts. To set a timeout
49+
// that starts with the download, use the [WithPerOpTimeout()] option.
50+
func (d *Downloader) DownloadObject(ctx context.Context, input *DownloadObjectInput) error {
51+
if d.config.asynchronous && input.Callback == nil {
52+
return errors.New("transfermanager: input.Callback must not be nil when the WithCallbacks option is set")
53+
}
54+
if !d.config.asynchronous && input.Callback != nil {
55+
return errors.New("transfermanager: input.Callback must be nil unless the WithCallbacks option is set")
56+
}
57+
58+
select {
59+
case <-d.done:
60+
return errors.New("transfermanager: WaitAndClose called before DownloadObject")
61+
default:
62+
}
63+
64+
input.ctx = ctx
65+
d.addInput(input)
66+
return nil
67+
}
68+
69+
// WaitAndClose waits for all outstanding downloads to complete and closes the
70+
// Downloader. Adding new downloads after this has been called will cause an error.
71+
//
72+
// WaitAndClose returns all the results of the downloads and an error wrapping
73+
// all errors that were encountered by the Downloader when downloading objects.
74+
// These errors are also returned in the respective DownloadOutput for the
75+
// failing download. The results are not guaranteed to be in any order.
76+
// Results will be empty if using the [WithCallbacks] option.
77+
func (d *Downloader) WaitAndClose() ([]DownloadOutput, error) {
78+
errMsg := "transfermanager: at least one error encountered downloading objects:"
79+
select {
80+
case <-d.done: // this allows users to call WaitAndClose various times
81+
var err error
82+
if len(d.errors) > 0 {
83+
err = fmt.Errorf("%s\n%w", errMsg, errors.Join(d.errors...))
84+
}
85+
return d.results, err
86+
default:
87+
d.done <- true
88+
d.workers.Wait()
89+
close(d.done)
90+
91+
if len(d.errors) > 0 {
92+
return d.results, fmt.Errorf("%s\n%w", errMsg, errors.Join(d.errors...))
93+
}
94+
return d.results, nil
95+
}
96+
}
97+
98+
// sendInputsToWorkChan listens continuously to the inputs slice until d.done.
99+
// It will send all items in inputs to the d.work chan.
100+
// Once it receives from d.done, it drains the remaining items in the inputs
101+
// (sending them to d.work) and then closes the d.work chan.
102+
func (d *Downloader) sendInputsToWorkChan() {
103+
for {
104+
select {
105+
case <-d.done:
106+
d.drainInput()
107+
close(d.work)
108+
return
109+
default:
110+
d.drainInput()
111+
}
112+
}
113+
}
114+
115+
// drainInput consumes everything in the inputs slice and sends it to the work chan.
116+
// It will block if there are not enough workers to consume every input, until all
117+
// inputs are received on the work chan(ie. they're dispatched to an available worker).
118+
func (d *Downloader) drainInput() {
119+
for {
120+
d.inputsMu.Lock()
121+
if len(d.inputs) < 1 {
122+
d.inputsMu.Unlock()
123+
return
124+
}
125+
input := d.inputs[0]
126+
d.inputs = d.inputs[1:]
127+
d.inputsMu.Unlock()
128+
d.work <- &input
129+
}
130+
}
131+
132+
func (d *Downloader) addInput(input *DownloadObjectInput) {
133+
d.inputsMu.Lock()
134+
d.inputs = append(d.inputs, *input)
135+
d.inputsMu.Unlock()
136+
}
137+
138+
func (d *Downloader) addResult(result *DownloadOutput) {
139+
d.resultsMu.Lock()
140+
d.results = append(d.results, *result)
141+
d.resultsMu.Unlock()
142+
}
143+
144+
func (d *Downloader) error(err error) {
145+
d.errorsMu.Lock()
146+
d.errors = append(d.errors, err)
147+
d.errorsMu.Unlock()
148+
}
149+
150+
// downloadWorker continuously processes downloads until the work channel is closed.
151+
func (d *Downloader) downloadWorker() {
152+
for {
153+
input, ok := <-d.work
154+
if !ok {
155+
break // no more work; exit
156+
}
157+
158+
// TODO: break down the input into smaller pieces if necessary; maybe as follows:
159+
// Only request partSize data to begin with. If no error and we haven't finished
160+
// reading the object, enqueue the remaining pieces of work and mark in the
161+
// out var the amount of shards to wait for.
162+
out := input.downloadShard(d.client, d.config.perOperationTimeout)
163+
164+
// Keep track of any error that occurred.
165+
if out.Err != nil {
166+
d.error(fmt.Errorf("downloading %q from bucket %q: %w", input.Object, input.Bucket, out.Err))
167+
}
168+
169+
// Either execute the callback, or append to results.
170+
if d.config.asynchronous {
171+
input.Callback(out)
172+
} else {
173+
d.addResult(out)
174+
}
175+
}
176+
d.workers.Done()
177+
}
178+
179+
// NewDownloader creates a new Downloader to add operations to.
180+
// Choice of transport, etc is configured on the client that's passed in.
181+
// The returned Downloader can be shared across goroutines to initiate downloads.
182+
func NewDownloader(c *storage.Client, opts ...Option) (*Downloader, error) {
183+
d := &Downloader{
184+
client: c,
185+
config: initTransferManagerConfig(opts...),
186+
inputs: []DownloadObjectInput{},
187+
results: []DownloadOutput{},
188+
errors: []error{},
189+
inputsMu: &sync.Mutex{},
190+
resultsMu: &sync.Mutex{},
191+
errorsMu: &sync.Mutex{},
192+
work: make(chan *DownloadObjectInput),
193+
done: make(chan bool),
194+
workers: &sync.WaitGroup{},
195+
}
196+
197+
// Start a listener to send work through.
198+
go d.sendInputsToWorkChan()
199+
200+
// Start workers.
201+
for i := 0; i < d.config.numWorkers; i++ {
202+
d.workers.Add(1)
203+
go d.downloadWorker()
204+
}
205+
206+
return d, nil
207+
}
208+
209+
// DownloadRange specifies the object range.
// Offset and Length are passed unchanged as the offset and length of the
// range read performed for the download.
type DownloadRange struct {
	// Offset is the starting offset (inclusive) from which the object is read.
	// If offset is negative, the object is read abs(offset) bytes from the end,
	// and length must also be negative to indicate all remaining bytes will be read.
	Offset int64
	// Length is the number of bytes to read.
	// If length is negative or larger than the object size, the object is read
	// until the end.
	Length int64
}
220+
221+
// DownloadObjectInput is the input for a single object to download.
type DownloadObjectInput struct {
	// Required fields

	// Bucket is the name of the bucket that holds the object.
	Bucket string
	// Object is the name of the object to download.
	Object string
	// Destination receives the downloaded bytes, written starting at offset 0.
	Destination io.WriterAt

	// Optional fields

	// Generation, if non-nil, selects the object generation to read.
	Generation *int64
	// Conditions, if non-nil, are applied to the object handle used for the read.
	Conditions *storage.Conditions
	// EncryptionKey, if non-empty, is set as the key on the object handle used
	// for the read.
	EncryptionKey []byte
	// Range, if specified, reads only a range; otherwise the entire object is read.
	Range *DownloadRange

	// Callback will be run once the object is finished downloading. It must be
	// set if and only if the [WithCallbacks] option is set; otherwise, it must
	// not be set.
	Callback func(*DownloadOutput)

	// ctx is the context passed to DownloadObject; it is used for the download.
	ctx context.Context
}
241+
242+
// downloadShard will read a specific object into in.Destination.
243+
// If timeout is less than 0, no timeout is set.
244+
// TODO: download a single shard instead of the entire object.
245+
func (in *DownloadObjectInput) downloadShard(client *storage.Client, timeout time.Duration) (out *DownloadOutput) {
246+
out = &DownloadOutput{Bucket: in.Bucket, Object: in.Object}
247+
248+
// Set timeout.
249+
ctx := in.ctx
250+
if timeout > 0 {
251+
c, cancel := context.WithTimeout(ctx, timeout)
252+
defer cancel()
253+
ctx = c
254+
}
255+
256+
// Set options on the object.
257+
o := client.Bucket(in.Bucket).Object(in.Object)
258+
259+
if in.Conditions != nil {
260+
o = o.If(*in.Conditions)
261+
}
262+
if in.Generation != nil {
263+
o = o.Generation(*in.Generation)
264+
}
265+
if len(in.EncryptionKey) > 0 {
266+
o = o.Key(in.EncryptionKey)
267+
}
268+
269+
var offset, length int64 = 0, -1 // get the entire object by default
270+
271+
if in.Range != nil {
272+
offset, length = in.Range.Offset, in.Range.Length
273+
}
274+
275+
// Read.
276+
r, err := o.NewRangeReader(ctx, offset, length)
277+
if err != nil {
278+
out.Err = err
279+
return
280+
}
281+
282+
// TODO: write at a specific offset.
283+
off := io.NewOffsetWriter(in.Destination, 0)
284+
_, err = io.Copy(off, r)
285+
if err != nil {
286+
out.Err = err
287+
r.Close()
288+
return
289+
}
290+
291+
if err = r.Close(); err != nil {
292+
out.Err = err
293+
return
294+
}
295+
296+
out.Attrs = &r.Attrs
297+
return
298+
}
299+
300+
// DownloadOutput provides output for a single object download, including all
// errors received while downloading object parts. If the download was successful,
// Attrs will be populated.
type DownloadOutput struct {
	Bucket string                     // bucket of the object that was requested
	Object string                     // name of the object that was requested
	Err    error                      // error occurring during download
	Attrs  *storage.ReaderObjectAttrs // attributes of downloaded object, if successful
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// Copyright 2024 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package transfermanager
16+
17+
import (
18+
"context"
19+
"strings"
20+
"testing"
21+
)
22+
23+
func TestWaitAndClose(t *testing.T) {
24+
d, err := NewDownloader(nil)
25+
if err != nil {
26+
t.Fatalf("NewDownloader: %v", err)
27+
}
28+
29+
if _, err := d.WaitAndClose(); err != nil {
30+
t.Fatalf("WaitAndClose: %v", err)
31+
}
32+
33+
expectedErr := "transfermanager: WaitAndClose called before DownloadObject"
34+
err = d.DownloadObject(context.Background(), &DownloadObjectInput{})
35+
if err == nil {
36+
t.Fatalf("d.DownloadObject err was nil, should be %q", expectedErr)
37+
}
38+
if !strings.Contains(err.Error(), expectedErr) {
39+
t.Errorf("expected err %q, got: %v", expectedErr, err.Error())
40+
}
41+
}

0 commit comments

Comments
 (0)