Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Utf8String class to directly hold utf8 strings passed via JSRT #5348

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 75 additions & 67 deletions lib/Jsrt/Jsrt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3729,26 +3729,40 @@ JsErrorCode GetScriptBufferDetails(
}
const bool isUtf8 = !isString && !(parseAttributes & JsParseScriptAttributeArrayBufferIsUtf16Encoded);

*script = isExternalArray ?
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@boingoing I've refactored this method and added a use in CompileRun. As part of that I noticed that this method previously didn't cater for the case of an ExternalArrayBuffer with utf16 encoding; my change supports that, but I'm not sure if that was intentional or not.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that case was an oversight. Thanks for cleaning this up, looks good.

((Js::ExternalArrayBuffer*)(scriptVal))->GetBuffer() :
(const byte*)((Js::JavascriptString*)(scriptVal))->GetSz();
*cb = isExternalArray ?
((Js::ExternalArrayBuffer*)(scriptVal))->GetByteLength() :
((Js::JavascriptString*)(scriptVal))->GetSizeInBytes();

if (isExternalArray && isUtf8)
{
*scriptFlag = (LoadScriptFlag)(LoadScriptFlag_ExternalArrayBuffer | LoadScriptFlag_Utf8Source);
}
else if (isUtf8)
if (isExternalArray)
{
*scriptFlag = (LoadScriptFlag)(LoadScriptFlag_Utf8Source);
Js::ExternalArrayBuffer* scriptBuffer = (Js::ExternalArrayBuffer*)(scriptVal);
*script = scriptBuffer->GetBuffer();
*cb = scriptBuffer->GetByteLength();
if (isUtf8)
{
*scriptFlag = (LoadScriptFlag)(LoadScriptFlag_ExternalArrayBuffer | LoadScriptFlag_Utf8Source);
}
else
{
*scriptFlag = (LoadScriptFlag)(LoadScriptFlag_ExternalArrayBuffer);
}
}
else
{
*scriptFlag = LoadScriptFlag_None;
Js::JavascriptString* scriptString = (Js::JavascriptString*)(scriptVal);
if (Js::Utf8String::Is(scriptString))
{
Js::Utf8String * utf8String = Js::Utf8String::From(scriptString);
*script = (const byte*)utf8String->Utf8Buffer();
*cb = utf8String->Utf8Length();
*scriptFlag = (LoadScriptFlag)(LoadScriptFlag_Utf8Source);
}
else
{
return ContextAPINoScriptWrapper_NoRecord([&](Js::ScriptContext *scriptContext) -> JsErrorCode {
*script = (const byte*)scriptString->GetSz();
*cb = scriptString->GetSizeInBytes();
*scriptFlag = LoadScriptFlag_None;
return JsNoError;
});
}
}

return JsNoError;
}

Expand Down Expand Up @@ -4781,9 +4795,17 @@ CHAKRA_API JsCreateString(

return ContextAPINoScriptWrapper([&](Js::ScriptContext *scriptContext, TTDRecorder& _actionEntryPopper) -> JsErrorCode {

Js::JavascriptString *stringValue = Js::LiteralStringWithPropertyStringPtr::
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have any data on how many strings from JsCreateString end up used as property keys? We might possibly be regressing some scenarios by changing what is essentially a heuristic here.

NewFromCString(content, (CharCount)length, scriptContext->GetLibrary());
char * recyclerBuffer = RecyclerNewArrayLeaf(scriptContext->GetRecycler(), char, length);
if (recyclerBuffer == nullptr)
{
Js::JavascriptError::ThrowOutOfMemoryError(scriptContext);
}

memcpy_s(recyclerBuffer, length, content, length);

Js::JavascriptString *stringValue = RecyclerNew(scriptContext->GetRecycler(), Js::Utf8String, recyclerBuffer, length, scriptContext->GetLibrary()->GetStringTypeStatic());

// TODO: With TTD enabled we immediately flatten these strings to utf16. Perhaps we should handle this differently.
PERFORM_JSRT_TTD_RECORD_ACTION(scriptContext, RecordJsRTCreateString, stringValue->GetSz(), stringValue->GetLength());

*value = stringValue;
Expand Down Expand Up @@ -4934,44 +4956,20 @@ _ALWAYSINLINE JsErrorCode CompileRun(
VALIDATE_JSREF(scriptVal);
PARAM_NOT_NULL(sourceUrl);

bool isExternalArray = Js::ExternalArrayBuffer::Is(scriptVal),
isString = false;
bool isUtf8 = !(parseAttributes & JsParseScriptAttributeArrayBufferIsUtf16Encoded);

LoadScriptFlag scriptFlag = LoadScriptFlag_None;
const byte* script;
size_t cb;
const WCHAR *url;

if (isExternalArray)
{
script = ((Js::ExternalArrayBuffer*)(scriptVal))->GetBuffer();

cb = ((Js::ExternalArrayBuffer*)(scriptVal))->GetByteLength();
JsErrorCode error = GetScriptBufferDetails(scriptVal, parseAttributes, &scriptFlag, &cb, &script);

scriptFlag = (LoadScriptFlag)(isUtf8 ?
LoadScriptFlag_ExternalArrayBuffer | LoadScriptFlag_Utf8Source :
LoadScriptFlag_ExternalArrayBuffer);
}
else
if (error != JsNoError)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Surely we have some fancy macro for this

{
isString = Js::JavascriptString::Is(scriptVal);
if (!isString)
{
return JsErrorInvalidArgument;
}
return error;
}

JsErrorCode error = GlobalAPIWrapper_NoRecord([&]() -> JsErrorCode {
if (isString)
{
Js::JavascriptString* jsString = Js::JavascriptString::FromVar(scriptVal);
script = (const byte*)jsString->GetSz();

// JavascriptString is 2 bytes (WCHAR/char16)
cb = jsString->GetLength() * sizeof(WCHAR);
}
const WCHAR *url;

error = GlobalAPIWrapper_NoRecord([&]() -> JsErrorCode {
if (!Js::JavascriptString::Is(sourceUrl))
{
return JsErrorInvalidArgument;
Expand Down Expand Up @@ -5165,14 +5163,23 @@ CHAKRA_API JsRunSerialized(
{
PARAM_NOT_NULL(bufferVal);
const WCHAR *url;
JsErrorCode errorCode = ContextAPINoScriptWrapper_NoRecord([&](Js::ScriptContext *scriptContext) -> JsErrorCode {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does our usual rule about { on new line not apply in lambdas? Just curious, I see some of both in this file so either way can be argued to be consistent with the surrounding style.


if (sourceUrl && Js::JavascriptString::Is(sourceUrl))
{
url = ((Js::JavascriptString*)(sourceUrl))->GetSz();
}
else
if (sourceUrl && Js::JavascriptString::Is(sourceUrl))
{
url = ((Js::JavascriptString*)(sourceUrl))->GetSz();
}
else
{
return JsErrorInvalidArgument;
}

return JsNoError;
});

if (errorCode != JsNoError)
{
return JsErrorInvalidArgument;
return errorCode;
}

// JsParseSerialized only accepts ArrayBuffer (incl. ExternalArrayBuffer)
Expand Down Expand Up @@ -5689,12 +5696,23 @@ CHAKRA_API JsRunScriptWithParserState(
const WCHAR *url = nullptr;
uint sourceIndex = 0;

JsErrorCode errorCode = ContextAPINoScriptWrapper_NoRecord([&](Js::ScriptContext *scriptContext) -> JsErrorCode {
const byte* bytes;
size_t cb;
LoadScriptFlag loadScriptFlag;
const byte* bytes;
size_t cb;
LoadScriptFlag loadScriptFlag;

JsErrorCode errorCode = GetScriptBufferDetails(script, parseAttributes, &loadScriptFlag, &cb, &bytes);
JsErrorCode errorCode = GetScriptBufferDetails(script, parseAttributes, &loadScriptFlag, &cb, &bytes);

if (errorCode != JsNoError)
{
return errorCode;
}

if (!Js::ExternalArrayBuffer::Is(parserState))
{
return JsErrorInvalidArgument;
}

errorCode = ContextAPINoScriptWrapper_NoRecord([&](Js::ScriptContext *scriptContext) -> JsErrorCode {

if (sourceUrl && Js::JavascriptString::Is(sourceUrl))
{
Expand All @@ -5705,11 +5723,6 @@ CHAKRA_API JsRunScriptWithParserState(
return JsErrorInvalidArgument;
}

if (errorCode != JsNoError)
{
return errorCode;
}

SourceContextInfo* sourceContextInfo = scriptContext->GetSourceContextInfo(sourceContext, nullptr);

if (sourceContextInfo == nullptr)
Expand Down Expand Up @@ -5761,11 +5774,6 @@ CHAKRA_API JsRunScriptWithParserState(
return errorCode;
}

if (!Js::ExternalArrayBuffer::Is(parserState))
{
return JsErrorInvalidArgument;
}

Js::ArrayBuffer* arrayBuffer = Js::ArrayBuffer::FromVar(parserState);
byte* buffer = arrayBuffer->GetBuffer();
JsSerializedLoadScriptCallback dummy = DummyScriptLoadSourceCallbackForRunScriptWithParserState;
Expand Down
1 change: 1 addition & 0 deletions lib/Runtime/Library/Chakra.Runtime.Library.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@
<ClInclude Include="SparseArraySegment.h" />
<ClInclude Include="SubString.h" />
<ClInclude Include="UriHelper.h" />
<ClInclude Include="Utf8String.h" />
<ClInclude Include="WabtInterface.h" />
<ClInclude Include="WasmLibrary.h" />
<ClInclude Include="WebAssembly.h" />
Expand Down
1 change: 1 addition & 0 deletions lib/Runtime/Library/Chakra.Runtime.Library.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@
<ClInclude Include="JsBuiltIn\JsBuiltIn.js.nojit.bc.32b.h" />
<ClInclude Include="JsBuiltIn\JsBuiltIn.js.nojit.bc.64b.h" />
<ClInclude Include="PropertyRecordUsageCache.h" />
<ClInclude Include="Utf8String.h" />
<ClInclude Include="..\LibraryFunction.h" />
</ItemGroup>
<ItemGroup>
Expand Down
2 changes: 1 addition & 1 deletion lib/Runtime/Library/JavascriptString.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ namespace Js
void FinishCopy(__inout_xcount(m_charLength) char16 *const buffer, StringCopyInfoStack &nestedStringTreeCopyInfos);

public:
virtual int GetRandomAccessItemsFromConcatString(Js::JavascriptString * const *& items) const { return -1; }
virtual int GetRandomAccessItemsFromConcatString(_Out_ Js::JavascriptString * const *& items) const { items = nullptr; return -1; }
virtual bool IsTree() const { return false; }

virtual BOOL SetItem(uint32 index, Var value, PropertyOperationFlags propertyOperationFlags) override;
Expand Down
1 change: 1 addition & 0 deletions lib/Runtime/Library/RuntimeLibraryPch.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "Library/SingleCharString.h"
#include "Library/SubString.h"
#include "Library/BufferStringBuilder.h"
#include "Library/Utf8String.h"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is already included in Runtime.h above, can we remove here?


#include "Library/BoundFunction.h"
#include "Library/JavascriptGeneratorFunction.h"
Expand Down
142 changes: 142 additions & 0 deletions lib/Runtime/Library/Utf8String.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
//-------------------------------------------------------------------------------------------------------
// Copyright (C) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
//-------------------------------------------------------------------------------------------------------
#pragma once

#include "Codex/Utf8Codex.h"

namespace Js
{
class Utf8String : public JavascriptString
{
private:
typedef struct {
FieldNoBarrier(size_t) length;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why size_t not charcount_t?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the length of the utf8 string, which can be up to 3 times as long as the same string in utf16. Since a utf16 string can be up to ~2^31 in chakra, the corresponding utf8 string may be more than 2^32 in length, so utf8 lengths need to be size_t

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good point, thanks


In reply to: 205622942 [](ancestors = 205622942)

Field(char*) buffer;
} PrefixedUtf8String;
Field(PrefixedUtf8String*) utf8String;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If PrefixedUtf8String is only two pointers big, could we just put it directly in the object rather than requiring a separate allocation (and a separate pointer dereference every time we use it)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Part of the original motivation of this was to allow other kinds of strings to convert themselves into a Utf8String, and when I was doing that one of the candidate string types only had 1 pointer's worth of space available, which is why I went down this path (see the ConvertString method which implicitly assumes that there is space for the one pointer available). However right now that's not happening, so I could undo that for now and re-do it later on if necessary.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What had only one pointer's worth of space? All allocations are bumped up to the nearest 16 bytes anyway, so a 40-byte object actually gets allocated as 48


In reply to: 205622702 [](ancestors = 205622702)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe it was specifically on 32bit one of the types was otherwise full... I don't recall which at the moment though.


protected:
DEFINE_VTABLE_CTOR(Utf8String, JavascriptString);

private:

void SetUtf8Buffer(_In_reads_(utf8Length) char* buffer, size_t utf8Length)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I very much prefer reading the code with it all in the header like this, but I think we're meant to put most function definitions in cpp files unless they're templates

{
this->utf8String = RecyclerNew(this->GetRecycler(), PrefixedUtf8String);
this->utf8String->length = utf8Length;
this->utf8String->buffer = buffer;
}

public:

Utf8String(_In_ JavascriptString* originalString, _In_reads_(utf8Length) char* buffer, size_t utf8Length, _In_ StaticType* type) :
JavascriptString(type),
utf8String(nullptr)
{
SetUtf8Buffer(buffer, utf8Length);

this->SetLength(originalString->GetLength());
this->SetBuffer(originalString->UnsafeGetBuffer());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This worries me. Maybe originalString->GetSz() instead?

  • SubString's buffer is not guaranteed to be null-terminated
  • SubString and PropertyString both rely on holding other pointers to keep their buffers alive, because the buffer itself is not recycler allocated

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My thinking was that this should keep the finalized-ness of the source string: if the original string isn't finalized, it won't have a buffer so we would be setting it to nullptr (and then if someone asks for it, we generate it from the utf8 version), and otherwise it was finalized so we can remain finalized. I hadn't considered the case when a string would have an invalid buffer like a substring would.

Using getSz would be safe, but could maybe flatten more strings than expected... but it avoids having to decode the utf8 in future anyway. I think I will go with that.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd be fine with something that said explicitly "if finalized, GetSz, else null", but UnsafeGetBuffer is marked unsafe for a reason :)


In reply to: 205623819 [](ancestors = 205623819)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually even with GetSz the PropertyString problem persists. It returns an interior pointer to a PropertyRecord, which is kept alive by the string itself. We risk it getting collected from under us with this setup.


In reply to: 205624092 [](ancestors = 205624092,205623819)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Buffer ownership is also a very real problem with the conversion code: we swap out a vtable, overwrite our owning pointer with some other data, and then our data might get collected.


In reply to: 205624399 [](ancestors = 205624399,205624092,205623819)

}

Utf8String(_In_reads_(utf8Length) char* buffer, size_t utf8Length, _In_ StaticType* type) :
JavascriptString(type),
utf8String(nullptr)
{
SetUtf8Buffer(buffer, utf8Length);

charcount_t utf16Length = 0;
utf8::DecodeOptions opts = utf8::DecodeOptions::doDefault;
LPCUTF8 buf = reinterpret_cast<LPCUTF8>(buffer);
LPCUTF8 end = buf + utf8Length;
while (buf < end)
{
if ((*buf & 0x80) == 0)
{
// Single byte character
utf16Length++;
buf++;
}
else
{
// Decode a single utf16 character, and increment the pointer
// to the start of the next character.
char16 c1 = *buf;
++buf;
utf8::DecodeTail(c1, buf, end, opts);
utf16Length++;
}
}

this->SetLength(utf16Length);
this->SetBuffer(nullptr);
}


size_t Utf8Length() const
{
return this->utf8String->length;
}

const char* Utf8Buffer() const
{
return this->utf8String->buffer;
}

const char16* GetSz() override
{
if (this->IsFinalized())
{
return this->UnsafeGetBuffer();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this was the result of ConvertString from a SubString, then this buffer might not be null terminated, violating the rules of GetSz. This part I think is not actually dangerous, because any substring should have a null character eventually in the source string it came from, but we might end up reading a lot more data than intended.

}

// TODO: This is currently wrong in the presence of unmatched surrogate pairs.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: it might be nice to move this comment down to right before DecodeUnitsIntoAndNullTerminateNoAdvance, because in this position I originally though it was talking about the allocation size being wrong (which would be Very Bad).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's also not the 'correct' place for this comment; been a while since I wrote it. The actual issue is with the encoding into utf8, and determining how to decode it. As things stand I'm using 'actual' utf8 which doesn't allow lone surrogate halves, but we could instead use 'cesu-8' which does allow that.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I know canonical utf-8 doesn’t encode surrogate halves at all (paired or otherwise)—the character is encoded directly. If you need to preserve the surrogates independently I think cesu-8 is your only option.

// They will be converted to a 3 byte replacement character in utf8, and then
// converted back into a 1 utf16 character 0xfffd representation of that, not the original
// utf16 surrogate pair character
char16* buffer = RecyclerNewArrayLeaf(this->GetRecycler(), char16, this->GetLength() + 1);

// Fetching the char* from the Field(char*) first so we can then cast to LPCUTF8
const char* bufferStart = this->utf8String->buffer;
LPCUTF8 start = reinterpret_cast<LPCUTF8>(bufferStart);
size_t decodeLength = utf8::DecodeUnitsIntoAndNullTerminateNoAdvance(buffer, start, start + this->utf8String->length);

Assert(decodeLength == this->GetLength());

buffer[this->GetLength()] = 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems to already be done by the AndNullTerminate portion of the call above; maybe just assert that it is zero?


this->SetBuffer(buffer);
return this->UnsafeGetBuffer();
}

static bool Is(RecyclableObject* obj)
{
return VirtualTableInfo<Js::Utf8String>::HasVirtualTable(obj);
}

static Utf8String* From(RecyclableObject* obj)
{
if (Utf8String::Is(obj))
{
return static_cast<Utf8String*>(obj);
}

return nullptr;
}

template <typename StringType>
static Utf8String * ConvertString(StringType * originalString, _In_reads_(utf8Length) char* buffer, size_t utf8Length)
{
VirtualTableInfo<Utf8String>::SetVirtualTable(originalString);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I kind of expected SetVirtualTable to be a template that static-asserted that the source type was big enough for the destination type, but I don't see any such assertion. Could you please add one? Otherwise this is concerning because LiteralString is not big enough to convert successfully.

Utf8String * convertedString = (Utf8String *)originalString;

convertedString->SetUtf8Buffer(buffer, utf8Length);

// length and buffer are preserved from the original string, since that is part of JavascriptString

return convertedString;
}
};
}
2 changes: 2 additions & 0 deletions lib/Runtime/Runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,8 @@ enum tagDEBUG_EVENT_INFO_TYPE
#include "Library/PropertyRecordUsageCache.h"
#include "Library/PropertyString.h"
#include "Library/SingleCharString.h"
#include "Library/Utf8String.h"
#include "Library/LazyJSONString.h"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, this is left over from when it was combined with another change. Didn't notice that before.


#include "Library/JavascriptTypedNumber.h"
#include "Library/SparseArraySegment.h"
Expand Down