Skip to content

Commit

Permalink
bugfix: zero-byte range reads.
Browse files Browse the repository at this point in the history
For http backends, zero byte reads tensorstore constructed a range header like 'Range: bytes=1-0'
This is not a valid range header, and allows servers to respond with the entire contents, which is undesirable.

These zero-byte reads are converted into HEAD requests or, in the case of GCS, metadata requests.

PiperOrigin-RevId: 563252891
Change-Id: I415cc8ad56a7a0979367b0e091a0d4f6b6fe05ee
  • Loading branch information
laramiel authored and Copybara-Service committed Sep 6, 2023
1 parent 7db8e6a commit ea7aecb
Show file tree
Hide file tree
Showing 12 changed files with 181 additions and 113 deletions.
1 change: 1 addition & 0 deletions tensorstore/internal/http/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ tensorstore_cc_test(
],
deps = [
":http",
"//tensorstore/kvstore:byte_range",
"@com_google_absl//absl/time",
"@com_google_googletest//:gtest_main",
],
Expand Down
10 changes: 7 additions & 3 deletions tensorstore/internal/http/http_request.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,20 @@

#include "tensorstore/internal/http/http_request.h"

#include <cassert>
#include <optional>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "absl/functional/function_ref.h"
#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "tensorstore/kvstore/byte_range.h"
#include "tensorstore/util/str_cat.h"

namespace tensorstore {
namespace internal_http {
Expand All @@ -34,7 +36,9 @@ std::optional<std::string> FormatRangeHeader(
OptionalByteRangeRequest byte_range) {
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Range
assert(byte_range.SatisfiesInvariants());
if (byte_range.IsRange()) {
if (byte_range.IsRange() &&
byte_range.exclusive_max > byte_range.inclusive_min) {
// The range header is inclusive; `Range: 1-0` is invalid.
return absl::StrFormat("Range: bytes=%d-%d", byte_range.inclusive_min,
byte_range.exclusive_max - 1);
}
Expand Down
1 change: 1 addition & 0 deletions tensorstore/internal/http/http_request.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <optional>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "absl/functional/function_ref.h"
Expand Down
2 changes: 2 additions & 0 deletions tensorstore/internal/http/http_request_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "tensorstore/kvstore/byte_range.h"

namespace {

Expand Down
5 changes: 3 additions & 2 deletions tensorstore/kvstore/byte_range.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
#ifndef TENSORSTORE_KVSTORE_BYTE_RANGE_REQUEST_H_
#define TENSORSTORE_KVSTORE_BYTE_RANGE_REQUEST_H_

#include <stddef.h>
#include <stdint.h>

#include <cassert>
#include <cstdint>
#include <optional>
#include <ostream>

#include "absl/strings/cord.h"
Expand Down
3 changes: 3 additions & 0 deletions tensorstore/kvstore/gcs_http/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ tensorstore_cc_library(
"//tensorstore/internal/cache_key",
"//tensorstore/internal/http",
"//tensorstore/internal/http:curl_transport",
"//tensorstore/internal/json",
"//tensorstore/internal/json_binding",
"//tensorstore/internal/json_binding:absl_time",
"//tensorstore/internal/json_binding:bindable",
Expand All @@ -37,6 +38,7 @@ tensorstore_cc_library(
"//tensorstore/kvstore",
"//tensorstore/kvstore:byte_range",
"//tensorstore/kvstore:generation",
"//tensorstore/kvstore:key_range",
"//tensorstore/kvstore/gcs:gcs_resource",
"//tensorstore/kvstore/gcs:validate",
"//tensorstore/serialization",
Expand All @@ -50,6 +52,7 @@ tensorstore_cc_library(
"//tensorstore/util/execution:any_receiver",
"//tensorstore/util/garbage_collection",
"@com_github_nlohmann_json//:nlohmann_json",
"@com_google_absl//absl/base:core_headers",
"@com_google_absl//absl/log:absl_log",
"@com_google_absl//absl/random",
"@com_google_absl//absl/status",
Expand Down
103 changes: 62 additions & 41 deletions tensorstore/kvstore/gcs_http/gcs_key_value_store.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <atomic>
#include <cassert>
#include <memory>
Expand All @@ -21,10 +25,12 @@
#include <utility>
#include <vector>

#include "absl/base/thread_annotations.h"
#include "absl/log/absl_log.h"
#include "absl/random/random.h"
#include "absl/status/status.h"
#include "absl/strings/cord.h"
#include "absl/strings/str_cat.h"
#include "absl/synchronization/mutex.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
Expand All @@ -38,6 +44,7 @@
#include "tensorstore/internal/http/http_response.h"
#include "tensorstore/internal/http/http_transport.h"
#include "tensorstore/internal/intrusive_ptr.h"
#include "tensorstore/internal/json/json.h"
#include "tensorstore/internal/json_binding/bindable.h"
#include "tensorstore/internal/json_binding/json_binding.h"
#include "tensorstore/internal/metrics/counter.h"
Expand All @@ -58,8 +65,12 @@
#include "tensorstore/kvstore/gcs_http/object_metadata.h"
#include "tensorstore/kvstore/gcs_http/rate_limiter.h"
#include "tensorstore/kvstore/generation.h"
#include "tensorstore/kvstore/key_range.h"
#include "tensorstore/kvstore/operations.h"
#include "tensorstore/kvstore/read_result.h"
#include "tensorstore/kvstore/registry.h"
#include "tensorstore/kvstore/spec.h"
#include "tensorstore/kvstore/supported_features.h"
#include "tensorstore/kvstore/url_registry.h"
#include "tensorstore/util/execution/any_receiver.h"
#include "tensorstore/util/execution/execution.h"
Expand All @@ -72,12 +83,12 @@
#include "tensorstore/util/str_cat.h"

/// specializations
#include "tensorstore/internal/cache_key/std_optional.h"
#include "tensorstore/internal/json_binding/std_array.h"
#include "tensorstore/internal/json_binding/std_optional.h"
#include "tensorstore/serialization/fwd.h"
#include "tensorstore/serialization/std_optional.h"
#include "tensorstore/util/garbage_collection/std_optional.h"
#include "tensorstore/internal/cache_key/std_optional.h" // IWYU pragma: keep
#include "tensorstore/internal/json_binding/std_array.h" // IWYU pragma: keep
#include "tensorstore/internal/json_binding/std_optional.h" // IWYU pragma: keep
#include "tensorstore/serialization/fwd.h" // IWYU pragma: keep
#include "tensorstore/serialization/std_optional.h" // IWYU pragma: keep
#include "tensorstore/util/garbage_collection/std_optional.h" // IWYU pragma: keep

// GCS reference links are:
//
Expand Down Expand Up @@ -497,7 +508,8 @@ struct ReadTask : public RateLimiterNode,
return;
}
/// Reads contents of a GCS object.
std::string media_url = tensorstore::StrCat(resource, "?alt=media");
std::string media_url = tensorstore::StrCat(
resource, options.byte_range.size() == 0 ? "?alt=json" : "?alt=media");

// Add the ifGenerationNotMatch condition.
AddGenerationParam(&media_url, true, "ifGenerationNotMatch",
Expand All @@ -521,10 +533,11 @@ struct ReadTask : public RateLimiterNode,
if (maybe_auth_header.value().has_value()) {
request_builder.AddHeader(*maybe_auth_header.value());
}
if (options.byte_range.size() != 0) {
request_builder.MaybeAddRangeHeader(options.byte_range);
}

auto request = request_builder.MaybeAddRangeHeader(options.byte_range)
.EnableAcceptEncoding()
.BuildRequest();
auto request = request_builder.EnableAcceptEncoding().BuildRequest();
start_time_ = absl::Now();

ABSL_LOG_IF(INFO, TENSORSTORE_INTERNAL_GCS_LOG_REQUESTS)
Expand Down Expand Up @@ -598,40 +611,48 @@ struct ReadTask : public RateLimiterNode,
return read_result;
}

size_t payload_size = httpresponse.payload.size();
if (httpresponse.status_code != 206) {
// This may or may not have been a range request; attempt to validate.
TENSORSTORE_ASSIGN_OR_RETURN(auto byte_range,
options.byte_range.Validate(payload_size));
read_result.state = kvstore::ReadResult::kValue;
read_result.value =
internal::GetSubCord(httpresponse.payload, byte_range);
} else {
// Server should return a parseable content-range header.
TENSORSTORE_ASSIGN_OR_RETURN(auto content_range_tuple,
ParseContentRangeHeader(httpresponse));

if (auto request_size = options.byte_range.size();
(options.byte_range.inclusive_min != -1 &&
options.byte_range.inclusive_min !=
std::get<0>(content_range_tuple)) ||
(request_size != -1 && request_size != payload_size)) {
// Return an error when the response does not start at the requested
// offset of when the response is smaller than the desired size.
return absl::OutOfRangeError(tensorstore::StrCat(
"Requested byte range ", options.byte_range,
" was not satisfied by GCS response of size ", payload_size));
read_result.state = kvstore::ReadResult::kValue;
ObjectMetadata metadata;
if (options.byte_range.size() != 0) {
if (httpresponse.status_code != 206) {
// This may or may not have been a range request; attempt to validate.
TENSORSTORE_ASSIGN_OR_RETURN(
auto byte_range,
options.byte_range.Validate(httpresponse.payload.size()));

read_result.value =
internal::GetSubCord(httpresponse.payload, byte_range);
} else {
read_result.value = httpresponse.payload;

// Server should return a parseable content-range header.
TENSORSTORE_ASSIGN_OR_RETURN(auto content_range_tuple,
ParseContentRangeHeader(httpresponse));

if (auto request_size = options.byte_range.size();
(options.byte_range.inclusive_min != -1 &&
options.byte_range.inclusive_min !=
std::get<0>(content_range_tuple)) ||
(request_size >= 0 && request_size != read_result.value.size())) {
// Return an error when the response does not start at the requested
// offset of when the response is smaller than the desired size.
return absl::OutOfRangeError(
tensorstore::StrCat("Requested byte range ", options.byte_range,
" was not satisfied by GCS response of size ",
httpresponse.payload.size()));
}
}
// assert(payload_size == std::get<2>(content_range_tuple));
read_result.state = kvstore::ReadResult::kValue;
read_result.value = httpresponse.payload;
// TODO: Avoid parsing the entire metadata & only extract the
// generation field.
SetObjectMetadataFromHeaders(httpresponse.headers, &metadata);
} else {
// 0-range reads issue metadata requests; parse the metadata for
// the storage generation.
absl::Cord cord = httpresponse.payload;
TENSORSTORE_ASSIGN_OR_RETURN(metadata,
ParseObjectMetadata(cord.Flatten()));
}

// TODO: Avoid parsing the entire metadata & only extract the
// generation field.
ObjectMetadata metadata;
SetObjectMetadataFromHeaders(httpresponse.headers, &metadata);

read_result.stamp.generation =
StorageGeneration::FromUint64(metadata.generation);
return read_result;
Expand Down
2 changes: 1 addition & 1 deletion tensorstore/kvstore/http/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ tensorstore_cc_library(
"//tensorstore/util:result",
"//tensorstore/util:status",
"//tensorstore/util:str_cat",
"//tensorstore/util/execution",
"//tensorstore/util/garbage_collection",
"@com_github_nlohmann_json//:nlohmann_json",
"@com_google_absl//absl/status",
"@com_google_absl//absl/strings",
Expand Down

0 comments on commit ea7aecb

Please sign in to comment.