New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
wasm-only memset #5245
wasm-only memset #5245
Changes from 8 commits
f98dc0a
6796808
47f7203
1cdd9c4
74a99fa
7f03fac
a0c1965
7b6d026
521179e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -823,13 +823,65 @@ LibraryManager.library = { | |
if ((num|0) >= | ||
#if SIMD | ||
196608 | ||
#else | ||
#if WASM_ONLY | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This pattern seems to come up a fair bit. Might be worth extending the preprocessor to handle something like an `#elif` directive.
Wouldn't belong to this PR, but might be worth doing. |
||
16384 | ||
#else | ||
8192 | ||
#endif | ||
#endif | ||
) { | ||
return _emscripten_memcpy_big(dest|0, src|0, num|0)|0; | ||
} | ||
|
||
#if WASM_ONLY | ||
ret = dest|0; | ||
dest_end = (dest + num)|0; | ||
if ((dest&7) == (src&7)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In wasm-only mode, this raises an interesting question: if src or dst is unaligned, is it faster to do unaligned 8-byte copies, or should one resort to aligned 1-byte copies? On x86, unaligned 8-byte copies will definitely still be faster than 1-byte copies, since x86 practically doesn't care (IIRC it was a 1-cycle penalty on either loads or stores on Intel, and a 1-cycle penalty on both loads and stores on AMD). However, on ARM this might be a different story. Does anyone have a super-efficient ARM memcpy at hand? |
||
// The initial unaligned < 8-byte front. | ||
while (dest & 7) { | ||
if ((num|0) == 0) return ret|0; | ||
store1(dest, load1(src, 1), 1); | ||
dest = (dest+1)|0; | ||
src = (src+1)|0; | ||
num = (num-1)|0; | ||
} | ||
aligned_dest_end = (dest_end & -8)|0; | ||
block_aligned_dest_end = (aligned_dest_end - 64)|0; | ||
while ((dest|0) <= (block_aligned_dest_end|0) ) { | ||
store8(dest, load8(src, 8), 8); | ||
store8(dest + 8 | 0, load8(src + 8 | 0, 8), 8); | ||
store8(dest + 16 | 0, load8(src + 16 | 0, 8), 8); | ||
store8(dest + 24 | 0, load8(src + 24 | 0, 8), 8); | ||
store8(dest + 32 | 0, load8(src + 32 | 0, 8), 8); | ||
store8(dest + 40 | 0, load8(src + 40 | 0, 8), 8); | ||
store8(dest + 48 | 0, load8(src + 48 | 0, 8), 8); | ||
store8(dest + 56 | 0, load8(src + 56 | 0, 8), 8); | ||
dest = (dest+64)|0; | ||
src = (src+64)|0; | ||
} | ||
while ((dest|0) < (aligned_dest_end|0) ) { | ||
store8(dest, load8(src, 8), 8); | ||
dest = (dest+8)|0; | ||
src = (src+8)|0; | ||
} | ||
} else { | ||
// In the unaligned copy case, unroll a bit as well. | ||
aligned_dest_end = (dest_end - 4)|0; | ||
while ((dest|0) < (aligned_dest_end|0) ) { | ||
store4(dest, load4(src, 1), 1); // unaligned | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm, why is this a 4-byte store marked with alignment 1? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The last argument is the alignment hint; in this branch src and dest are not known to be 4-byte aligned, so the access is marked unaligned. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah right, that makes sense. Does this currently emit unaligned loads and stores on wasm, or does it break them up into 1-byte writes in binaryen? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. asm2wasm will turn it into a single 4-byte load/store, marked with alignment 1. |
||
dest = (dest+4)|0; | ||
src = (src+4)|0; | ||
} | ||
} | ||
// The remaining unaligned < 8 byte tail. | ||
while ((dest|0) < (dest_end|0)) { | ||
store1(dest, load1(src, 1), 1); | ||
dest = (dest+1)|0; | ||
src = (src+1)|0; | ||
} | ||
return ret|0; | ||
#else | ||
ret = dest|0; | ||
dest_end = (dest + num)|0; | ||
if ((dest&3) == (src&3)) { | ||
|
@@ -894,6 +946,7 @@ LibraryManager.library = { | |
src = (src+1)|0; | ||
} | ||
return ret|0; | ||
#endif | ||
}, | ||
|
||
llvm_memcpy_i32: 'memcpy', | ||
|
@@ -939,9 +992,54 @@ LibraryManager.library = { | |
var end = 0, aligned_end = 0, block_aligned_end = 0, value4 = 0; | ||
#if SIMD | ||
var value16 = SIMD_Int32x4(0,0,0,0); | ||
#else | ||
#if WASM_ONLY | ||
var value8 = i64(); | ||
#endif | ||
#endif | ||
end = (ptr + num)|0; | ||
|
||
#if WASM_ONLY | ||
value = value & 0xff; | ||
if ((num|0) >= 71 /* 64 bytes for an unrolled loop + 7 bytes for unaligned head*/) { | ||
value4 = value | (value << 8) | (value << 16) | (value << 24); | ||
while ((ptr&3) != 0) { | ||
store1(ptr, value, 1); | ||
ptr = (ptr+1)|0; | ||
} | ||
if (ptr&4) { | ||
store4(ptr, value4, 4); | ||
ptr = (ptr+4)|0; | ||
} | ||
|
||
aligned_end = (end & -8)|0; | ||
block_aligned_end = (aligned_end - 64)|0; | ||
value8 = i64_or(i64_zext(value4), i64_shl(i64_zext(value4), i64(32))); | ||
|
||
while ((ptr|0) <= (block_aligned_end|0)) { | ||
store8(ptr , value8, 8); | ||
store8(ptr + 8 | 0, value8, 8); | ||
store8(ptr + 16 | 0, value8, 8); | ||
store8(ptr + 24 | 0, value8, 8); | ||
store8(ptr + 32 | 0, value8, 8); | ||
store8(ptr + 40 | 0, value8, 8); | ||
store8(ptr + 48 | 0, value8, 8); | ||
store8(ptr + 56 | 0, value8, 8); | ||
ptr = (ptr + 64)|0; | ||
} | ||
|
||
while ((ptr|0) < (aligned_end|0) ) { | ||
store8(ptr, value8, 8); | ||
ptr = (ptr+8)|0; | ||
} | ||
} | ||
// The remaining bytes. | ||
while ((ptr|0) < (end|0)) { | ||
store1(ptr, value, 1); | ||
ptr = (ptr+1)|0; | ||
} | ||
return (end-num)|0; | ||
#else | ||
value = value & 0xff; | ||
if ((num|0) >= 67 /* 64 bytes for an unrolled loop + 3 bytes for unaligned head*/) { | ||
while ((ptr&3) != 0) { | ||
|
@@ -956,7 +1054,7 @@ LibraryManager.library = { | |
value16 = SIMD_Int32x4_splat(value4); | ||
#endif | ||
|
||
while((ptr|0) <= (block_aligned_end|0)) { | ||
while ((ptr|0) <= (block_aligned_end|0)) { | ||
#if SIMD | ||
SIMD_Int32x4_store(HEAPU8, ptr, value16); | ||
SIMD_Int32x4_store(HEAPU8, ptr+16, value16); | ||
|
@@ -994,6 +1092,7 @@ LibraryManager.library = { | |
ptr = (ptr+1)|0; | ||
} | ||
return (end-num)|0; | ||
#endif | ||
}, | ||
llvm_memset_i32: 'memset', | ||
llvm_memset_p0i8_i32: 'memset', | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Wait, so we add a `WASM_ONLY` flag that we assert gets set when we unset `LEGALIZE_JS_FFI`, but then we clobber it with our autodetected `shared.Building.is_wasm_only()` regardless?
We've promoted `WASM_ONLY` to a Setting so that it's available to the js compiler, not because it's something users should be setting. Because of that, I think we should move this to before the assert and just always have it autodetected. Maybe add a note to settings.js saying we ignore this if you try to set it.
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
emcc.py order was a little wrong — thanks, fixed.
And clarified in settings.js what is internal use only.