Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose a Utf8String type. #17872

Closed
wants to merge 58 commits into from
Closed
Changes from all commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
6b90660
Expose a Utf8String type.
May 3, 2018
fabbe5d
Fix Unix build breaks
May 3, 2018
f8639fa
Incorporate PR feedback
May 3, 2018
bcbbbb9
Fix allocation size & remove the manual managed code trappings
May 3, 2018
e4f9a43
GetPinnableReference and fix x86 build
May 3, 2018
b5f7f59
Resolve merge conflicts
May 4, 2018
9e13d11
Merge with master
May 7, 2018
624897f
Realign with System.String changes
May 7, 2018
78e15a2
Merge with master
May 8, 2018
5e800cd
Merge with master
May 9, 2018
10db0e3
Merge from master
May 10, 2018
3618be1
Merge from master
May 11, 2018
6cbb3f8
Merge from master
May 14, 2018
2cff568
Faster Utf8 string allocator.
May 14, 2018
7758492
Merge from master
May 15, 2018
663fe01
Merge with master
May 16, 2018
4f78043
Merge with master
May 17, 2018
e8e82db
Merge with master
May 18, 2018
a00b751
Bring up the Utf8 constructors
May 18, 2018
bbdcae4
Block RuntimeHelpers.GetUninitializedObject()
May 18, 2018
a382e1d
Incorporate PR feedback
May 18, 2018
56ea91c
No longer need private constructor
May 18, 2018
4279139
Put the readonly back.
May 18, 2018
a3ce65a
Make MethodBase.Invoke aware of Utf8String's specialness
May 18, 2018
24a8071
Merge with master
May 21, 2018
e16f421
Merge with master
May 22, 2018
5633b36
Merge with master
May 23, 2018
0711ac1
Merge with master
May 24, 2018
1119a96
Merge with master
May 25, 2018
3c85d80
Merge with master
May 29, 2018
5907d1c
Merge with master
May 30, 2018
049f0ed
Merge with master
May 31, 2018
603193a
Merge with master
Jun 1, 2018
f6d811e
Merge with master
Jun 4, 2018
6825173
Merge with master
Jun 5, 2018
b9b8040
Fix build break
Jun 5, 2018
103fce3
Merge with master
Jun 6, 2018
f997aaa
Merge with master
Jun 7, 2018
d026198
Merge with master
Jun 8, 2018
493c27f
Merge with master
Jun 11, 2018
5eb53cc
Merge with master
Jun 12, 2018
6cf128f
Merge with master
Jun 13, 2018
dd11e73
Merge with master
Jun 14, 2018
82ec9cb
Merge with master
Jun 15, 2018
4a01601
Merge with master
Jun 18, 2018
4ff38a1
Merge with master
Jun 19, 2018
fab65e9
Merge with master
Jun 20, 2018
d6258a4
Merge with master
Jun 21, 2018
21f47d3
Merge with master
Jun 22, 2018
4d60318
Merge with master
Jun 25, 2018
39a4e2e
Merge with master
Jun 26, 2018
279a4c5
Merge with master
Jun 27, 2018
b247f32
Merge with master
Jun 28, 2018
cfe461b
Merge with master
Jun 29, 2018
2f4ad81
Merge with master
Jul 2, 2018
11af9eb
Merge with master
Jul 3, 2018
00e7d26
Merge with master
Jul 5, 2018
5e9651f
Merge with master
Jul 9, 2018
File filter...
Filter file types
Jump to…
Jump to file
Failed to load files.

Always

Just for now

@@ -1400,7 +1400,7 @@
<value>The object has no underlying COM data associated with it.</value>
</data>
<data name="Argument_NoUninitializedStrings" xml:space="preserve">
<value>Uninitialized Strings cannot be created.</value>
<value>Uninitialized strings cannot be created.</value>
</data>
<data name="Argument_ObjIsWinRTObject" xml:space="preserve">
<value>The object's type must not be a Windows Runtime type.</value>
@@ -288,6 +288,7 @@
<Compile Include="$(BclSourcesRoot)\System\TypeNameParser.cs" />
<Compile Include="$(BclSourcesRoot)\System\TypedReference.cs" />
<Compile Include="$(BclSourcesRoot)\System\TypeLoadException.cs" />
<Compile Include="$(BclSourcesRoot)\System\Utf8String.cs" />
<Compile Include="$(BclSourcesRoot)\System\ValueType.cs" />
<Compile Include="$(BclSourcesRoot)\System\WeakReference.cs" />
<Compile Include="$(BclSourcesRoot)\System\WeakReferenceOfT.cs" />
@@ -0,0 +1,86 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Text;
using System.Runtime.InteropServices;
using System.Runtime.CompilerServices;

namespace System
{
// This is an experimental type and not referenced from CoreFx but needs to exists and be public so we can prototype in CoreFxLab.
public sealed class Utf8String
{
// Do not reorder these fields. Must match layout of Utf8StringObject in object.h.
private readonly int _length;
private readonly byte _firstByte;

public int Length => _length;
public ref readonly byte GetPinnableReference() => ref _firstByte;

public static readonly Utf8String Empty = FastAllocate(0);

// Utf8String constructors
// These are special. The implementation methods for these have a different signature from the
// declared constructors.

[MethodImpl(MethodImplOptions.InternalCall)]
public extern Utf8String(ReadOnlySpan<byte> value);

#if PROJECTN
[DependencyReductionRoot]
#endif
#if !CORECLR
static
#endif
private Utf8String Ctor(ReadOnlySpan<byte> value)
{

This comment has been minimized.

@jkotas

jkotas May 18, 2018
Member

Do we plan to specialize empty strings like they are specialized in regular strings?

if (value.Length == 0)
return Empty;

Utf8String newString = FastAllocate(value.Length);
unsafe
{
fixed (byte* pDst = &newString._firstByte)
fixed (byte* pSrc = &MemoryMarshal.GetNonNullPinnableReference(value))
{
Buffer.Memcpy(dest: pDst, src: pSrc, len: value.Length);
}
}
return newString;
}

[MethodImpl(MethodImplOptions.InternalCall)]
public extern Utf8String(ReadOnlySpan<char> value);

#if PROJECTN
[DependencyReductionRoot]
#endif
#if !CORECLR
static
#endif
private Utf8String Ctor(ReadOnlySpan<char> value)
{
if (value.Length == 0)
return Empty;

Encoding e = Encoding.UTF8;
int length = e.GetByteCount(value);
Utf8String newString = FastAllocate(length);
unsafe
{
fixed (byte* pFirstByte = &newString._firstByte)
fixed (char* pFirstChar = &MemoryMarshal.GetNonNullPinnableReference(value))
{
e.GetBytes(pFirstChar, length, pFirstByte, length);
}
}
return newString;
}

// Creates a new zero-initialized instance of the specified length. Actual storage allocated is "length + 1" bytes (the extra
// +1 is for the NUL terminator.)
[MethodImpl(MethodImplOptions.InternalCall)]
public static extern Utf8String FastAllocate(int length); //TODO: Is public for experimentation in CoreFxLab. Will be private in its ultimate form.
}
}
@@ -172,6 +172,7 @@ DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pObjectClass, ::g_pObjectClass
DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pRuntimeTypeClass, ::g_pRuntimeTypeClass)
DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pCanonMethodTableClass, ::g_pCanonMethodTableClass)
DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pStringClass, ::g_pStringClass)
DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pUtf8StringClass, ::g_pUtf8StringClass)
DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pArrayClass, ::g_pArrayClass)
DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pSZArrayHelperClass, ::g_pSZArrayHelperClass)
DEFINE_DACVAR(ULONG, UNKNOWN_POINTER_TYPE, dac__g_pNullableClass, ::g_pNullableClass)
@@ -2760,6 +2760,9 @@ void SystemDomain::LoadBaseSystemClasses()
// Load String
g_pStringClass = MscorlibBinder::LoadPrimitiveType(ELEMENT_TYPE_STRING);

// Load Utf8String
g_pUtf8StringClass = MscorlibBinder::GetClass(CLASS__UTF8_STRING);

// Used by Buffer::BlockCopy
g_pByteArrayMT = ClassLoader::LoadArrayTypeThrowing(
TypeHandle(MscorlibBinder::GetElementType(ELEMENT_TYPE_U1))).AsArray()->GetMethodTable();
@@ -147,6 +147,8 @@
#define g_TransparentProxyName "__TransparentProxy"
#define g_TypeClassName "System.Type"

#define g_Utf8StringName "Utf8String"

#define g_VariantClassName "System.Variant"
#define g_GuidClassName "System.Guid"

@@ -55,23 +55,43 @@ static_assert_no_msg(ECallCtor_First + 8 == ECall::CtorSBytePtrStartLengthEncodi

#define NumberOfStringConstructors 9

#define METHOD__UTF8STRING__CTORF_FIRST METHOD__UTF8_STRING__CTORF_READONLYSPANOFBYTE
static_assert_no_msg(METHOD__UTF8STRING__CTORF_FIRST + 0 == METHOD__UTF8_STRING__CTORF_READONLYSPANOFBYTE);
static_assert_no_msg(METHOD__UTF8STRING__CTORF_FIRST + 1 == METHOD__UTF8_STRING__CTORF_READONLYSPANOFCHAR);

#define ECallUtf8String_Ctor_First ECall::Utf8StringCtorReadOnlySpanOfByteManaged
static_assert_no_msg(ECallUtf8String_Ctor_First + 0 == ECall::Utf8StringCtorReadOnlySpanOfByteManaged);
static_assert_no_msg(ECallUtf8String_Ctor_First + 1 == ECall::Utf8StringCtorReadOnlySpanOfCharManaged);

#define NumberOfUtf8StringConstructors 2

static void PopulateConstructors(DWORD firstECallIndex, BinderMethodID firstMethod, int numConstructors)
{
STANDARD_VM_CONTRACT;

for (int i = 0; i < numConstructors; i++)
{
MethodDesc* pMD = MscorlibBinder::GetMethod((BinderMethodID)(firstMethod + i));
_ASSERTE(pMD != NULL);

PCODE pDest = pMD->GetMultiCallableAddrOfCode();

ECall::DynamicallyAssignFCallImpl(pDest, firstECallIndex + i);
}
}

void ECall::PopulateManagedStringConstructors()
{
STANDARD_VM_CONTRACT;

INDEBUG(static bool fInitialized = false);
_ASSERTE(!fInitialized); // assume this method is only called once
_ASSERTE(g_pStringClass != NULL);
_ASSERTE(g_pUtf8StringClass != NULL);

for (int i = 0; i < NumberOfStringConstructors; i++)
{
MethodDesc* pMD = MscorlibBinder::GetMethod((BinderMethodID)(METHOD__STRING__CTORF_FIRST + i));
_ASSERTE(pMD != NULL);

PCODE pDest = pMD->GetMultiCallableAddrOfCode();
PopulateConstructors(ECallCtor_First, METHOD__STRING__CTORF_FIRST, NumberOfStringConstructors);
PopulateConstructors(ECallUtf8String_Ctor_First, METHOD__UTF8STRING__CTORF_FIRST, NumberOfUtf8StringConstructors);

ECall::DynamicallyAssignFCallImpl(pDest, ECallCtor_First + i);
}
INDEBUG(fInitialized = true);
}

@@ -114,7 +114,10 @@ class ECall
DYNAMICALLY_ASSIGNED_FCALL_IMPL(CtorSBytePtrManaged, NULL) \
DYNAMICALLY_ASSIGNED_FCALL_IMPL(CtorSBytePtrStartLengthManaged, NULL) \
DYNAMICALLY_ASSIGNED_FCALL_IMPL(CtorSBytePtrStartLengthEncodingManaged, NULL) \
DYNAMICALLY_ASSIGNED_FCALL_IMPL(FastAllocateUtf8String, FramedAllocateUtf8String) \
DYNAMICALLY_ASSIGNED_FCALL_IMPL(InternalGetCurrentThread, NULL) \
DYNAMICALLY_ASSIGNED_FCALL_IMPL(Utf8StringCtorReadOnlySpanOfByteManaged, NULL) \
DYNAMICALLY_ASSIGNED_FCALL_IMPL(Utf8StringCtorReadOnlySpanOfCharManaged, NULL) \

enum
{
@@ -116,6 +116,12 @@ FCFuncStart(gStringFuncs)
#endif // FEATURE_COMINTEROP
FCFuncEnd()

FCFuncStart(gUtf8StringFuncs)
FCDynamic("FastAllocate", CORINFO_INTRINSIC_Illegal, ECall::FastAllocateUtf8String)
FCDynamicSig(COR_CTOR_METHOD_NAME, &gsig_IM_ReadOnlySpanOfByte_RetVoid, CORINFO_INTRINSIC_Illegal, ECall::Utf8StringCtorReadOnlySpanOfByteManaged)
FCDynamicSig(COR_CTOR_METHOD_NAME, &gsig_IM_ReadOnlySpanOfChar_RetVoid, CORINFO_INTRINSIC_Illegal, ECall::Utf8StringCtorReadOnlySpanOfCharManaged)
FCFuncEnd()

FCFuncStart(gValueTypeFuncs)
FCFuncElement("CanCompareBits", ValueTypeHelper::CanCompareBits)
FCFuncElement("FastEqualsCheck", ValueTypeHelper::FastEqualsCheck)
@@ -1377,6 +1383,7 @@ FCClassElement("TypedReference", "System", gTypedReferenceFuncs)
#ifdef FEATURE_COMINTEROP
FCClassElement("UriMarshaler", "System.StubHelpers", gUriMarshalerFuncs)
#endif
FCClassElement("Utf8String", "System", gUtf8StringFuncs)
FCClassElement("ValueClassMarshaler", "System.StubHelpers", gValueClassMarshalerFuncs)
FCClassElement("ValueType", "System", gValueTypeFuncs)
#ifdef FEATURE_COMINTEROP
@@ -1059,6 +1059,73 @@ STRINGREF SlowAllocateString( DWORD cchStringLength )
return( ObjectToSTRINGREF(orObject) );
}

Utf8StringObject *SlowAllocateUtf8String(DWORD cchStringLength)
{
CONTRACTL{
THROWS;
GC_TRIGGERS;
MODE_COOPERATIVE; // returns an objref without pinning it => cooperative
} CONTRACTL_END;

Utf8StringObject *orObject = NULL;

#ifdef _DEBUG
if (g_pConfig->ShouldInjectFault(INJECTFAULT_GCHEAP))
{
char *a = new char;
delete a;
}
#endif

// Limit the maximum string size to <2GB to mitigate risk of security issues caused by 32-bit integer
// overflows in buffer size calculations.
if (cchStringLength > 0x7FFFFFDF)
ThrowOutOfMemory();

SIZE_T ObjectSize = PtrAlign(Utf8StringObject::GetSize(cchStringLength));

This comment has been minimized.

@jkotas

jkotas May 3, 2018
Member

I think this allocates more than required.

I have noticed that we may have the same problem in regular string. For example:
new string('a', 5) allocates 0x28 bytes objects on x64 today. But it should be enough for it to allocate just 0x20 bytes:

8 syncblock
8 vtable
4 size
5*2 content
2 zero terminator

Do you know why it is the case?

This comment has been minimized.

@jkotas

jkotas May 3, 2018
Member

I have checked CoreRT. CoreRT allocates 0x20 bytes in this case, so there is definitely something pretty broken.

This comment has been minimized.

@mikedn

mikedn May 3, 2018

Isn't there some padding between size and content?

This comment has been minimized.

@jkotas

jkotas May 3, 2018
Member

There is padding for regular arrays (so that all arrays have same layout even when elements are 8-byte aligned). There is no padding for strings. The content starts right after length. The extra unnecessary bytes are at the end.

I remember folks went to a great length to ensure that strings do not pay the extra 4 bytes during the initial 64-bit ports. It looks like it has regressed.

This comment has been minimized.

@ghost

ghost May 3, 2018
Author

It's coming from sizeof(Utf8StringObject) - there should be a pragma pack 4 around that declaration.

Presumably the same for StringObject, though I want to keep anything like that separate from this PR. We haven't yet agreed to merge this into master.

This comment has been minimized.

@jkotas

jkotas May 3, 2018
Member

#17876 has the fix for the String overallocation problem.

_ASSERTE(ObjectSize > cchStringLength);

SetTypeHandleOnThreadForAlloc(TypeHandle(g_pUtf8StringClass));

orObject = (Utf8StringObject *)Alloc(ObjectSize, FALSE, FALSE);

// Object is zero-init already

This comment has been minimized.

@GrabYourPitchforks

GrabYourPitchforks May 3, 2018
Member

This includes the null terminator being zero-inited, I suppose?

This comment has been minimized.

@ghost

ghost May 3, 2018
Author

Yep.

_ASSERTE(orObject->HasEmptySyncBlockInfo());

// Initialize Object
orObject->SetMethodTable(g_pUtf8StringClass);
orObject->SetLength(cchStringLength);

if (ObjectSize >= LARGE_OBJECT_SIZE)
{
GCHeapUtilities::GetGCHeap()->PublishObject((BYTE*)orObject);
}

// Notify the profiler of the allocation
if (TrackAllocations())
{
OBJECTREF objref = ObjectToOBJECTREF((Object*)orObject);
GCPROTECT_BEGIN(objref);
ProfilerObjectAllocatedCallback(objref, (ClassID)orObject->GetTypeHandle().AsPtr());
GCPROTECT_END();

orObject = (Utf8StringObject *)OBJECTREFToObject(objref);
}

#ifdef FEATURE_EVENT_TRACE
// Send ETW event for allocation
if (ETW::TypeSystemLog::IsHeapAllocEventEnabled())
{
ETW::TypeSystemLog::SendObjectAllocatedEvent(orObject);
}
#endif // FEATURE_EVENT_TRACE

LogAlloc(ObjectSize, g_pUtf8StringClass, orObject);

return orObject;
}


#ifdef FEATURE_COMINTEROP_UNMANAGED_ACTIVATION
// OBJECTREF AllocateComClassObject(ComClassFactory* pComClsFac)
void AllocateComClassObject(ComClassFactory* pComClsFac, OBJECTREF* ppRefClass)
@@ -71,6 +71,8 @@ STRINGREF AllocateString( DWORD cchStringLength );
// The slow version, implemented in gcscan.cpp
STRINGREF SlowAllocateString( DWORD cchStringLength );

Utf8StringObject *SlowAllocateUtf8String( DWORD cchStringLength );

#else

// On other platforms, go to the (somewhat less efficient) implementations in gcscan.cpp
@@ -83,6 +85,8 @@ OBJECTREF AllocateObjectArray(DWORD cElements, TypeHandle ElementType, BOOL bAll

STRINGREF SlowAllocateString( DWORD cchStringLength );

Utf8StringObject *SlowAllocateUtf8String( DWORD cchStringLength );

inline STRINGREF AllocateString( DWORD cchStringLength )
{
WRAPPER_NO_CONTRACT;
@@ -2881,6 +2881,60 @@ HCIMPL1(StringObject*, AllocateString_MP_FastPortable, DWORD stringLength)
}
HCIMPLEND

HCIMPL1(Utf8StringObject*, AllocateUtf8String_MP_FastPortable, DWORD stringLength)
{
FCALL_CONTRACT;

do
{
_ASSERTE(GCHeapUtilities::UseThreadAllocationContexts());

// Instead of doing elaborate overflow checks, we just limit the number of elements. This will avoid all overflow
// problems, as well as making sure big string objects are correctly allocated in the big object heap.
if (stringLength >= LARGE_OBJECT_SIZE - 256)
{
break;
}

// This is typically the only call in the fast path. Making the call early seems to be better, as it allows the compiler
// to use volatile registers for intermediate values. This reduces the number of push/pop instructions and eliminates
// some reshuffling of intermediate values into nonvolatile registers around the call.
Thread *thread = GetThread();

SIZE_T totalSize = Utf8StringObject::GetSize(stringLength);

// The method table's base size includes space for a terminating null character
_ASSERTE(totalSize >= g_pUtf8StringClass->GetBaseSize());
_ASSERTE(totalSize - g_pUtf8StringClass->GetBaseSize() == stringLength);

SIZE_T alignedTotalSize = ALIGN_UP(totalSize, DATA_ALIGNMENT);
_ASSERTE(alignedTotalSize >= totalSize);
totalSize = alignedTotalSize;

gc_alloc_context *allocContext = thread->GetAllocContext();
BYTE *allocPtr = allocContext->alloc_ptr;
_ASSERTE(allocPtr <= allocContext->alloc_limit);
if (totalSize > static_cast<SIZE_T>(allocContext->alloc_limit - allocPtr))
{
break;
}
allocContext->alloc_ptr = allocPtr + totalSize;

_ASSERTE(allocPtr != nullptr);
Utf8StringObject *stringObject = reinterpret_cast<Utf8StringObject *>(allocPtr);
stringObject->SetMethodTable(g_pUtf8StringClass);
stringObject->SetLength(stringLength);

return stringObject;
} while (false);

// Tail call to the slow helper
ENDFORBIDGC();
return HCCALL1(FramedAllocateUtf8String, stringLength);
}
HCIMPLEND


#include <optdefault.h>

/*********************************************************************/
@@ -2920,6 +2974,20 @@ HCIMPL1(StringObject*, FramedAllocateString, DWORD stringLength)
}
HCIMPLEND

HCIMPL1(Utf8StringObject*, FramedAllocateUtf8String, DWORD stringLength)
{
FCALL_CONTRACT;

Utf8StringObject* result = NULL;
HELPER_METHOD_FRAME_BEGIN_RET_0(); // Set up a frame

result = SlowAllocateUtf8String(stringLength);

HELPER_METHOD_FRAME_END();
return result;
}
HCIMPLEND

/*********************************************************************/
OBJECTHANDLE ConstructStringLiteral(CORINFO_MODULE_HANDLE scopeHnd, mdToken metaTok)
{
@@ -223,6 +223,9 @@ extern FCDECL1(StringObject*, AllocateString_MP_FastPortable, DWORD stringLength
extern FCDECL1(StringObject*, UnframedAllocateString, DWORD stringLength);
extern FCDECL1(StringObject*, FramedAllocateString, DWORD stringLength);

extern FCDECL1(Utf8StringObject*, AllocateUtf8String_MP_FastPortable, DWORD stringLength);
extern FCDECL1(Utf8StringObject*, FramedAllocateUtf8String, DWORD stringLength);

extern FCDECL2(Object*, JIT_NewArr1VC_MP_FastPortable, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size);
extern FCDECL2(Object*, JIT_NewArr1OBJ_MP_FastPortable, CORINFO_CLASS_HANDLE arrayMT, INT_PTR size);
extern FCDECL2(Object*, JIT_NewArr1_R2R, CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size);
ProTip! Use n and p to navigate between commits in a pull request.