Skip to content

Commit

Permalink
atob implementation (#1256)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #1256

Implement
[atob](https://html.spec.whatwg.org/multipage/webappapis.html#atob)
utility function for decoding a base64 string. This implementation
doesn't follow the HTML spec 100% in that for error cases, the code
doesn't throw DOMException. Existing alternatives people use with
Hermes simply throw Error, which is what this code throws as well.

Reviewed By: avp

Differential Revision: D52181353

fbshipit-source-id: c90ec95e1ed3b44a7668a6ae4071df536bb31a71
  • Loading branch information
dannysu authored and facebook-github-bot committed Jan 17, 2024
1 parent d2177c3 commit 13fafde
Show file tree
Hide file tree
Showing 9 changed files with 298 additions and 0 deletions.
17 changes: 17 additions & 0 deletions include/hermes/VM/JSLib/Base64Util.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#ifndef HERMES_VM_JSLIB_BASE64UTIL_H
#define HERMES_VM_JSLIB_BASE64UTIL_H

#include "hermes/Support/OptValue.h"
#include "hermes/VM/Runtime.h"

namespace hermes {
Expand All @@ -18,6 +19,22 @@ namespace vm {
template <typename T>
bool base64Encode(llvh::ArrayRef<T> str, StringBuilder &builder);

/// If \p str has a valid base64 encoded string length, then calculate the
/// expected length after decoding using the forgiving base64 algorithm. ASCII
/// whitespace in \p str is skipped and does not count towards its length.
/// \param str the base64 encoded input to measure.
/// \return the number of characters decoding is expected to produce, or
///   nullopt if \p str has an invalid length.
template <typename T>
OptValue<uint32_t> base64DecodeOutputLength(llvh::ArrayRef<T> str);

/// Implements the forgiving base64 decode algorithm:
/// https://infra.spec.whatwg.org/#forgiving-base64-decode
/// The key difference compared to other base64 decode algorithms is that the
/// forgiving algorithm ignores ASCII whitespace.
/// \param str string to be decoded
/// \param builder StringBuilder to store the output in. Decoding only reports
///   success when the builder ends up exactly full, so it should be created
///   with the length returned by base64DecodeOutputLength() for \p str.
/// \return true if successful, false otherwise
template <typename T>
bool base64Decode(llvh::ArrayRef<T> str, StringBuilder &builder);

} // namespace vm
} // namespace hermes

Expand Down
1 change: 1 addition & 0 deletions include/hermes/VM/NativeFunctions.def
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ NATIVE_FUNCTION(arrayPrototypeSome)
NATIVE_FUNCTION(arrayPrototypeUnshift)
NATIVE_FUNCTION(arrayPrototypeSplice)
NATIVE_FUNCTION(asyncFunctionConstructor)
NATIVE_FUNCTION(atob)

NATIVE_FUNCTION(bigintTruncate)
NATIVE_FUNCTION(bigintConstructor)
Expand Down
1 change: 1 addition & 0 deletions include/hermes/VM/PredefinedStrings.def
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ STR(isNaN, "isNaN")
STR(isFinite, "isFinite")
STR(escape, "escape")
STR(unescape, "unescape")
STR(atob, "atob")
STR(btoa, "btoa")
STR(decodeURI, "decodeURI")
STR(decodeURIComponent, "decodeURIComponent")
Expand Down
11 changes: 11 additions & 0 deletions include/hermes/VM/StringBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,17 @@ class StringBuilder {
return StringBuilder(runtime, crtRes->getString());
}

/// \return the number of characters accumulated in the StringBuilder so far.
uint32_t currentLength() const {
return index_;
}

/// \return the length this StringBuilder was constructed with, read from the
/// string primitive the builder holds. The max length doesn't ever change.
uint32_t maxLength() const {
return strPrim_->getStringLength();
}

/// Append an UTF16Ref \p str. Note that str should not point to a GC-managed
/// memory, as this function in theory can allocate.
void appendUTF16Ref(UTF16Ref str) {
Expand Down
34 changes: 34 additions & 0 deletions lib/VM/JSLib/Base64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,5 +50,39 @@ CallResult<HermesValue> btoa(void *, Runtime &runtime, NativeArgs args) {
return builder->getStringPrimitive().getHermesValue();
}

/// Native implementation of the global atob(): converts the first argument to
/// a string and base64-decodes it. An Error is raised when the input does not
/// have a valid base64 length or when it cannot be decoded.
CallResult<HermesValue> atob(void *, Runtime &runtime, NativeArgs args) {
  GCScope gcScope{runtime};

  auto strRes = toString_RJS(runtime, args.getArgHandle(0));
  if (LLVM_UNLIKELY(strRes == ExecutionStatus::EXCEPTION)) {
    return ExecutionStatus::EXCEPTION;
  }
  auto input = runtime.makeHandle(std::move(*strRes));

  // Work out the size of the result first so the output can be allocated in
  // one go; an input with an impossible length is rejected right here.
  OptValue<uint32_t> decodedLength = input->isASCII()
      ? base64DecodeOutputLength(input->getStringRef<char>())
      : base64DecodeOutputLength(input->getStringRef<char16_t>());
  if (!decodedLength) {
    return runtime.raiseError("Not a valid base64 encoded string length");
  }

  CallResult<StringBuilder> builder =
      StringBuilder::createStringBuilder(runtime, SafeUInt32(*decodedLength));
  if (LLVM_UNLIKELY(builder == ExecutionStatus::EXCEPTION)) {
    return ExecutionStatus::EXCEPTION;
  }

  // Decode using whichever character width the input string is stored in.
  bool decoded;
  if (input->isASCII()) {
    decoded = base64Decode(input->getStringRef<char>(), *builder);
  } else {
    decoded = base64Decode(input->getStringRef<char16_t>(), *builder);
  }
  if (!decoded) {
    return runtime.raiseError(
        "Found invalid character when decoding base64 string");
  }

  return builder->getStringPrimitive().getHermesValue();
}

} // namespace vm
} // namespace hermes
105 changes: 105 additions & 0 deletions lib/VM/JSLib/Base64Util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,30 @@ namespace hermes {
namespace vm {

namespace {

// The 64 characters of the base64 alphabet, in order: 'A'-'Z', 'a'-'z',
// '0'-'9', '+' and '/'. The character stored at index i stands for the 6-bit
// value i.
constexpr const std::array<char, 64> Base64Chars = {
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'};

// A lookup table that maps a (Base64-encoded) ASCII character back to binary.
// The table is indexed by the character's ASCII code (0-127) and yields the
// 6-bit value (0-63) that character stands for. Every code that is not part of
// the base64 alphabet, including '=', maps to 64, which the decoder treats as
// invalid.
constexpr const std::array<unsigned char, 128> decMap = {
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
64, 64, 64, 64, 64, 62, 64, 64, 64, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 64, 64, 64, 64, 64, 64, 64, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 64, 64, 64, 64,
64, 64, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
43, 44, 45, 46, 47, 48, 49, 50, 51, 64, 64, 64, 64, 64};

/// \return true if \p c is one of the five ASCII whitespace characters that
/// are skipped while decoding: TAB, LF, FF, CR or SPACE.
template <typename T>
inline bool isWhitespace(T c) {
  switch (c) {
    case '\x09': // TAB
    case '\x0A': // LF
    case '\x0C': // FF
    case '\x0D': // CR
    case '\x20': // SPACE
      return true;
    default:
      return false;
  }
}

} // namespace

template <typename T>
Expand Down Expand Up @@ -90,5 +108,92 @@ template bool base64Encode(
llvh::ArrayRef<char16_t> str,
StringBuilder &builder);

/// Computes how many characters forgiving-base64 decoding \p str will produce
/// (https://infra.spec.whatwg.org/#forgiving-base64-decode). ASCII whitespace
/// is skipped and does not count towards the length.
///
/// Following the spec, padding is optional: only a length that leaves a
/// remainder of 1 when divided by 4 is invalid, and trailing '=' characters
/// are treated as padding only when the length is a multiple of 4.
/// \param str the base64 encoded input to measure.
/// \return the expected decoded length, or nullopt if \p str has an invalid
///   length.
template <typename T>
OptValue<uint32_t> base64DecodeOutputLength(llvh::ArrayRef<T> str) {
  // Figure out the actual string length after ignoring all whitespace, and
  // remember the last two counted characters so padding can be detected.
  uint64_t strLength = 0;
  T lastChar = 0;
  T secondLastChar = 0;
  for (const auto c : str) {
    // Only increment length if character is not a whitespace
    if (!isWhitespace(c)) {
      strLength++;
      secondLastChar = lastChar;
      lastChar = c;
    }
  }

  const uint64_t remainder = strLength % 4;
  if (remainder == 1) {
    // A single leftover sextet holds fewer than 8 bits, so it can never be
    // decoded into a byte.
    return llvh::None;
  }

  // Padding is only recognized when the length is a multiple of 4. In that
  // case a non-empty input has at least 4 characters, so both lastChar and
  // secondLastChar hold real input characters.
  uint32_t numPadding = 0;
  if (remainder == 0 && strLength > 0 && lastChar == '=') {
    numPadding++;
    if (secondLastChar == '=') {
      numPadding++;
    }
  }

  // Each character carries 6 bits and any bits left over at the end are
  // dropped, hence floor(strLength * 6 / 8). The result is never larger than
  // the input length, and the subtraction cannot underflow because padding is
  // only counted when strLength >= 4.
  return static_cast<uint32_t>(strLength * 3 / 4 - numPadding);
}

template OptValue<uint32_t> base64DecodeOutputLength(llvh::ArrayRef<char> str);
template OptValue<uint32_t> base64DecodeOutputLength(
llvh::ArrayRef<char16_t> str);

/// Decodes the base64 characters of \p str into \p builder, skipping ASCII
/// whitespace and stopping at the first '=' character.
/// \param str the base64 encoded input.
/// \param builder receives one character per decoded byte.
/// \return false if a character outside the base64 alphabet is encountered,
///   or if \p builder is not exactly full once decoding stops; true otherwise.
template <typename T>
bool base64Decode(llvh::ArrayRef<T> str, StringBuilder &builder) {
  // Each accepted character contributes a sextet that is shifted into the low
  // end of \c pending. \c pendingBits tracks how many of the low bits of
  // \c pending have not been written to the output yet.
  uint32_t pending = 0;
  uint32_t pendingBits = 0;
  for (const auto ch : str) {
    if (isWhitespace(ch)) {
      continue;
    }

    // Anything outside 0-127 cannot be used to index the lookup table.
    if (LLVM_UNLIKELY(ch > 127) || LLVM_UNLIKELY(ch < 0)) {
      return false;
    }

    // Padding ends the data; the final length comparison decides whether it
    // appeared in an acceptable position.
    if (ch == '=') {
      break;
    }

    const unsigned char sextet = decMap[ch];
    if (LLVM_UNLIKELY(sextet >= 64)) {
      return false;
    }

    // The shift leaves the low 6 bits clear, so the new sextet slots in.
    pending = (pending << 6) | sextet;
    pendingBits += 6;
    if (pendingBits < 8) {
      continue;
    }

    // A full byte is available: emit the oldest 8 buffered bits.
    pendingBits -= 8;
    const char16_t decodedChar = (pending >> pendingBits) & 0xFF;
    builder.appendCharacter(decodedChar);
  }

  return builder.maxLength() == builder.currentLength();
}

template bool base64Decode(llvh::ArrayRef<char> str, StringBuilder &builder);
template bool base64Decode(
llvh::ArrayRef<char16_t> str,
StringBuilder &builder);

} // namespace vm
} // namespace hermes
3 changes: 3 additions & 0 deletions lib/VM/JSLib/GlobalObject.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,9 @@ void initGlobalObject(Runtime &runtime, const JSLibFlags &jsLibFlags) {
// Define the 'unescape' function.
defineGlobalFunc(Predefined::getSymbolID(Predefined::unescape), unescape, 1);

// Define the 'atob' function.
defineGlobalFunc(Predefined::getSymbolID(Predefined::atob), atob, 1);

// Define the 'btoa' function.
defineGlobalFunc(Predefined::getSymbolID(Predefined::btoa), btoa, 1);

Expand Down
26 changes: 26 additions & 0 deletions test/hermes/atob.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/**
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

// RUN: LC_ALL=en_US.UTF-8 %hermes -O -target=HBC %s | %FileCheck --match-full-lines %s
"use strict";

// Print a label so the expected output below is anchored to this test.
print('atob');
// CHECK-LABEL: atob
// Input ending in two '=' padding characters decodes to a single character.
print(atob('YQ=='));
// CHECK-NEXT: a
// The decoded character has code 0xD3, which is outside the ASCII range.
print(atob('0w=='));
// CHECK-NEXT: Ó
// Input ending in one '=' padding character decodes to two characters.
print(atob('000='));
// CHECK-NEXT: ÓM
// A single character is not a valid base64 length, so atob is expected to
// throw and the message of the thrown error is printed.
try {
atob('\u03A9');
} catch (e) {
print(e.message);
// CHECK-NEXT: Not a valid base64 encoded string length
}
// Round trip: decoding the output of btoa yields the original string.
print(atob(btoa("a")));
// CHECK-NEXT: a
100 changes: 100 additions & 0 deletions unittests/VMRuntime/Base64UtilTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,36 @@ using Base64UtilTest = RuntimeTestFixture;
EXPECT_ENCODED(createUTF16Ref(converted.data()), expected); \
}

// Decodes \p original (an ASCIIRef or UTF16Ref) the same way atob does: the
// output length is computed first, a StringBuilder of exactly that length is
// created, and the decoded result is compared against the UTF-16 literal
// \p expected.
//
// The length and builder checks are fatal (ASSERT_*) because the statements
// that follow dereference both values; continuing after a failed check would
// read an empty OptValue or a failed CallResult.
#define EXPECT_DECODED(original, expected) \
  { \
    hermes::OptValue<uint32_t> expectedLength = \
        base64DecodeOutputLength(original); \
    ASSERT_TRUE(expectedLength.hasValue()); \
    CallResult<StringBuilder> builder = StringBuilder::createStringBuilder( \
        runtime, hermes::SafeUInt32(*expectedLength)); \
    ASSERT_NE(builder, ExecutionStatus::EXCEPTION); \
    \
    bool success = base64Decode(original, *builder); \
    EXPECT_TRUE(success); \
    EXPECT_EQ( \
        builder->getStringPrimitive()->getStringRef<char16_t>(), \
        createUTF16Ref(expected)); \
  }

// Runs EXPECT_DECODED twice for the narrow string literal \p original: once on
// the ASCII characters as given, and once on a copy widened to char16_t, so
// that both character widths of the decoder are exercised.
#define EXPECT_DECODED_ASCII_AND_UTF16(original, expected) \
  { \
    ASCIIRef narrowInput = createASCIIRef(original); \
    EXPECT_DECODED(narrowInput, expected); \
    \
    /* One extra, zero-filled slot keeps the widened copy null-terminated. */ \
    std::vector<char16_t> wideInput(narrowInput.size() + 1, u'\0'); \
    for (uint32_t pos = 0; pos < narrowInput.size(); ++pos) { \
      wideInput[pos] = narrowInput[pos]; \
    } \
    EXPECT_DECODED(createUTF16Ref(wideInput.data()), expected); \
  }

TEST_F(Base64UtilTest, EdgeCases) {
EXPECT_ENCODED_ASCII_AND_UTF16("", "");
}
Expand Down Expand Up @@ -90,4 +120,74 @@ TEST_F(Base64UtilTest, EncodeInvalid) {
EXPECT_FALSE(base64Encode(createUTF16Ref(u"abc\U0001F600xyz"), *builder));
}

// Well-formed, padded base64 inputs and the strings they must decode to. Every
// case is run against both the ASCII and the UTF-16 form of the input.
TEST_F(Base64UtilTest, DecodeValid) {
// The empty string decodes to the empty string.
EXPECT_DECODED_ASCII_AND_UTF16("", u"");
EXPECT_DECODED_ASCII_AND_UTF16("YQ==", u"a");
// Same result as "YQ==": the bits of the last sextet that do not fit into a
// whole byte are dropped, even when they are not zero.
EXPECT_DECODED_ASCII_AND_UTF16("YR==", u"a");
// Outputs of increasing length, covering two, one and no padding characters.
EXPECT_DECODED_ASCII_AND_UTF16("YWI=", u"ab");
EXPECT_DECODED_ASCII_AND_UTF16("YWJj", u"abc");
EXPECT_DECODED_ASCII_AND_UTF16("YWJjZA==", u"abcd");
EXPECT_DECODED_ASCII_AND_UTF16("YWJjZGU=", u"abcde");
EXPECT_DECODED_ASCII_AND_UTF16("YWJjZGVm", u"abcdef");
// Decoded bytes above 0x7F come out as a single character with that code.
EXPECT_DECODED_ASCII_AND_UTF16("0w==", u"\xD3");
EXPECT_DECODED_ASCII_AND_UTF16("000=", u"\xD3M");
}

// ASCII whitespace (TAB, LF, FF, CR, SPACE) must be ignored wherever it
// appears in the input.
TEST_F(Base64UtilTest, DecodeWithWhitespace) {
// Whitespace-only input is equivalent to the empty string.
EXPECT_DECODED_ASCII_AND_UTF16(" ", u"");
EXPECT_DECODED_ASCII_AND_UTF16("\x09\x0A\x0C\x0D\x20", u"");
// Whitespace between the base64 characters, including between the padding.
EXPECT_DECODED_ASCII_AND_UTF16("Y Q = =", u"a");
EXPECT_DECODED_ASCII_AND_UTF16("\x09Y\x0AQ\x0C=\x0D=\x20", u"a");
// Leading and trailing whitespace.
EXPECT_DECODED_ASCII_AND_UTF16(" YR==", u"a");
EXPECT_DECODED_ASCII_AND_UTF16("YR== ", u"a");
}

// Inputs that must be rejected by the decoder.
TEST_F(Base64UtilTest, DecodeInvalid) {
  // Returns true if \p input is rejected, either because its length is not a
  // valid base64 length or because decoding it fails.
  //
  // Every input gets a fresh StringBuilder sized by base64DecodeOutputLength,
  // which is how atob drives the decoder. base64Decode reports failure
  // whenever the builder is not exactly full, so sharing one oversized builder
  // between cases would make every call fail for that reason alone and the
  // test would not notice a decoder that accepts bad input.
  auto isRejected = [&](auto input) {
    hermes::OptValue<uint32_t> expectedLength =
        base64DecodeOutputLength(input);
    if (!expectedLength.hasValue()) {
      return true;
    }
    CallResult<StringBuilder> builder = StringBuilder::createStringBuilder(
        runtime, hermes::SafeUInt32(*expectedLength));
    if (builder == ExecutionStatus::EXCEPTION) {
      // Failing to allocate the output is not a rejection of the input.
      return false;
    }
    return !base64Decode(input, *builder);
  };

  // A negative char value is outside the ASCII range.
  std::array<char, 5> hasNegative = {'A', 'b', 'c', -15, '\0'};
  EXPECT_TRUE(isRejected(createASCIIRef(hasNegative.data())));

  EXPECT_TRUE(isRejected(createASCIIRef("===")));
  EXPECT_TRUE(isRejected(createASCIIRef("0===")));
  EXPECT_TRUE(isRejected(createASCIIRef("aa=0")));
  EXPECT_TRUE(isRejected(createASCIIRef("$aaaaaaaaaaaaaaaaaaaaaaa")));
  EXPECT_TRUE(isRejected(createASCIIRef("aaaaaa$aaaaaaaaaaaaaaaaa")));
  EXPECT_TRUE(isRejected(
      createASCIIRef("bbbbbbbbbddddddddddddddddddddaaaaaaadddddddb=")));

  // Strings that are the wrong length to be a valid base64 encoded string
  EXPECT_TRUE(isRejected(createASCIIRef("A")));
  EXPECT_TRUE(isRejected(createASCIIRef("B=")));
  EXPECT_TRUE(isRejected(createASCIIRef("ba=")));
  EXPECT_TRUE(isRejected(createASCIIRef("aaaaa")));

  // Non-Base64 ASCII, i.e., not [a-z]|[A-Z]|[0-9]|\+|\/
  EXPECT_TRUE(isRejected(createASCIIRef("a*")));
  EXPECT_TRUE(isRejected(createASCIIRef("YQ*")));

  // Out of ASCII range.
  EXPECT_TRUE(isRejected(createASCIIRef("a\xFF")));

  // Padding where it's not expected
  EXPECT_TRUE(isRejected(createASCIIRef("=")));
  EXPECT_TRUE(isRejected(createASCIIRef("==")));

  // Padding in the middle
  EXPECT_TRUE(isRejected(createASCIIRef("YQ==YQ==")));

  // Extra padding
  EXPECT_TRUE(isRejected(createASCIIRef("YQ===")));

  // UTF-16 input containing characters above the ASCII range.
  EXPECT_TRUE(isRejected(createUTF16Ref(u"a\uff20==")));
  EXPECT_TRUE(isRejected(createUTF16Ref(u"\u0065\u0065\u0065\u03A9")));
}

} // end anonymous namespace

0 comments on commit 13fafde

Please sign in to comment.