Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

wasm-only memset #5245

Closed
wants to merge 9 commits into from
45 changes: 25 additions & 20 deletions emcc.py
Expand Up @@ -1090,12 +1090,6 @@ def check(input_file):
if shared.Settings.WASM:
shared.Settings.BINARYEN = 1 # these are synonyms

# When only targeting wasm, the .asm.js file is not executable, so is treated as an intermediate build file that can be cleaned up.
if shared.Building.is_wasm_only():
asm_target = asm_target.replace('.asm.js', '.temp.asm.js')
if not DEBUG:
misc_temp_files.note(asm_target)

assert shared.Settings.TOTAL_MEMORY >= 16*1024*1024, 'TOTAL_MEMORY must be at least 16MB, was ' + str(shared.Settings.TOTAL_MEMORY)
if shared.Settings.BINARYEN:
assert shared.Settings.TOTAL_MEMORY % 65536 == 0, 'For wasm, TOTAL_MEMORY must be a multiple of 64KB, was ' + str(shared.Settings.TOTAL_MEMORY)
Expand Down Expand Up @@ -1147,16 +1141,6 @@ def check(input_file):
# * if we also supported js mem inits we'd have 4 modes
# * and js mem inits are useful for avoiding a side file, but the wasm module avoids that anyhow
options.memory_init_file = True
# async compilation requires wasm-only mode, and also not interpreting (the interpreter needs sync input)
if shared.Settings.BINARYEN_ASYNC_COMPILATION == 1 and shared.Building.is_wasm_only() and 'interpret' not in shared.Settings.BINARYEN_METHOD:
# async compilation requires a swappable module - we swap it in when it's ready
shared.Settings.SWAPPABLE_ASM_MODULE = 1
else:
# if not wasm-only, we can't do async compilation as the build can run in other
# modes than wasm (like asm.js) which may not support an async step
shared.Settings.BINARYEN_ASYNC_COMPILATION = 0
if 'BINARYEN_ASYNC_COMPILATION=1' in settings_changes:
logging.warning('BINARYEN_ASYNC_COMPILATION requested, but disabled since not in wasm-only mode')

# wasm outputs are only possible with a side wasm
if target.endswith(WASM_ENDINGS):
Expand Down Expand Up @@ -1216,8 +1200,29 @@ def check(input_file):
newargs.append('-mllvm')
newargs.append('-disable-llvm-optzns')

if shared.Settings.BINARYEN:
# determine wasm-only mode, now that all settings are settled on
shared.Settings.WASM_ONLY = shared.Building.is_wasm_only()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wait, so we add a WASM_ONLY flag that we assert gets set when we unset LEGALIZE_JS_FFI, but then we clobber it with our autodetected shared.Building.is_wasm_only() regardless?

We've promoted WASM_ONLY to a Setting so it's available to the js compiler, not because it's something users should be setting. Because of that I think we should move this to before the assert, and just always have it autodetected. Maybe add a note to settings.js, saying we ignore this if you try to set it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

emcc.py order was a little wrong, thanks, fixed.

And clarified in settings.js what is internal use only.


# When only targeting wasm, the .asm.js file is not executable, so is treated as an intermediate build file that can be cleaned up.
if shared.Settings.WASM_ONLY:
asm_target = asm_target.replace('.asm.js', '.temp.asm.js')
if not DEBUG:
misc_temp_files.note(asm_target)

# async compilation requires wasm-only mode, and also not interpreting (the interpreter needs sync input)
if shared.Settings.BINARYEN_ASYNC_COMPILATION == 1 and shared.Settings.WASM_ONLY and 'interpret' not in shared.Settings.BINARYEN_METHOD:
# async compilation requires a swappable module - we swap it in when it's ready
shared.Settings.SWAPPABLE_ASM_MODULE = 1
else:
# if not wasm-only, we can't do async compilation as the build can run in other
# modes than wasm (like asm.js) which may not support an async step
shared.Settings.BINARYEN_ASYNC_COMPILATION = 0
if 'BINARYEN_ASYNC_COMPILATION=1' in settings_changes:
logging.warning('BINARYEN_ASYNC_COMPILATION requested, but disabled since not in wasm-only mode')

if not shared.Settings.LEGALIZE_JS_FFI:
assert shared.Building.is_wasm_only(), 'LEGALIZE_JS_FFI incompatible with RUNNING_JS_OPTS and non-wasm BINARYEN_METHOD.'
assert shared.Settings.WASM_ONLY, 'LEGALIZE_JS_FFI incompatible with RUNNING_JS_OPTS and non-wasm BINARYEN_METHOD.'

shared.Settings.EMSCRIPTEN_VERSION = shared.EMSCRIPTEN_VERSION
shared.Settings.OPT_LEVEL = options.opt_level
Expand Down Expand Up @@ -2207,7 +2212,7 @@ def do_binaryen(final, target, asm_target, options, memfile, wasm_binary_target,
if not shared.Settings.WASM_BACKEND:
if DEBUG:
# save the asm.js input
shutil.copyfile(asm_target, os.path.join(emscripten_temp_dir, os.path.basename(asm_target)))
shared.safe_copy(asm_target, os.path.join(emscripten_temp_dir, os.path.basename(asm_target)))
cmd = [os.path.join(binaryen_bin, 'asm2wasm'), asm_target, '--total-memory=' + str(shared.Settings.TOTAL_MEMORY)]
if shared.Settings.BINARYEN_TRAP_MODE == 'js':
cmd += ['--emit-jsified-potential-traps']
Expand Down Expand Up @@ -2240,7 +2245,7 @@ def do_binaryen(final, target, asm_target, options, memfile, wasm_binary_target,
cmd += ['--mem-max=' + str(shared.Settings.BINARYEN_MEM_MAX)]
if shared.Settings.LEGALIZE_JS_FFI != 1:
cmd += ['--no-legalize-javascript-ffi']
if shared.Building.is_wasm_only():
if shared.Settings.WASM_ONLY:
cmd += ['--wasm-only'] # this asm.js is code not intended to run as asm.js, it is only ever going to be wasm, and can contain special fastcomp-wasm support
if options.debug_level >= 2 or options.profiling_funcs:
cmd += ['-g']
Expand Down Expand Up @@ -2275,7 +2280,7 @@ def do_binaryen(final, target, asm_target, options, memfile, wasm_binary_target,
log_time('asm2wasm')
if shared.Settings.BINARYEN_PASSES:
shutil.move(wasm_binary_target, wasm_binary_target + '.pre')
cmd = [os.path.join(binaryen_bin, 'wasm-opt'), wasm_binary_target + '.pre', '-o', wasm_binary_target] + map(lambda p: '--' + p, shared.Settings.BINARYEN_PASSES.split(','))
cmd = [os.path.join(binaryen_bin, 'wasm-opt'), wasm_binary_target + '.pre', '-o', wasm_binary_target] + map(lambda p: ('--' + p) if p[0] != '-' else p, shared.Settings.BINARYEN_PASSES.split(','))
logging.debug('wasm-opt on BINARYEN_PASSES: ' + ' '.join(cmd))
subprocess.check_call(cmd)
if not wrote_wasm_text and 'interpret-s-expr' in shared.Settings.BINARYEN_METHOD:
Expand Down
101 changes: 100 additions & 1 deletion src/library.js
Expand Up @@ -823,13 +823,65 @@ LibraryManager.library = {
if ((num|0) >=
#if SIMD
196608
#else
#if WASM_ONLY
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This pattern seems to come up a fair bit. Might be worth extending the preprocessor to handle something like

#if SIMD
...
#elif WASM_ONLY
...
#endif

Wouldn't belong to this PR but might be worth doing.

16384
#else
8192
#endif
#endif
) {
return _emscripten_memcpy_big(dest|0, src|0, num|0)|0;
}

#if WASM_ONLY
ret = dest|0;
dest_end = (dest + num)|0;
if ((dest&7) == (src&7)) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In wasm-only mode, this raises an interesting question: if src or dst are unaligned, is it faster to do unaligned 8-byte copies, or should one resort to aligned 1-byte copies? On x86, unaligned 8-byte copies will definitely still be faster than 1-byte copies, since x86 practically doesn't care (IIRC it was a 1-cycle penalty on either loads or stores on Intel, and a 1-cycle penalty on both loads and stores on AMD). However, on ARM this might be a different story. Does anyone have a super efficient ARM memcpy at hand?

// The initial unaligned < 8-byte front.
while (dest & 7) {
if ((num|0) == 0) return ret|0;
store1(dest, load1(src, 1), 1);
dest = (dest+1)|0;
src = (src+1)|0;
num = (num-1)|0;
}
aligned_dest_end = (dest_end & -8)|0;
block_aligned_dest_end = (aligned_dest_end - 64)|0;
while ((dest|0) <= (block_aligned_dest_end|0) ) {
store8(dest, load8(src, 8), 8);
store8(dest + 8 | 0, load8(src + 8 | 0, 8), 8);
store8(dest + 16 | 0, load8(src + 16 | 0, 8), 8);
store8(dest + 24 | 0, load8(src + 24 | 0, 8), 8);
store8(dest + 32 | 0, load8(src + 32 | 0, 8), 8);
store8(dest + 40 | 0, load8(src + 40 | 0, 8), 8);
store8(dest + 48 | 0, load8(src + 48 | 0, 8), 8);
store8(dest + 56 | 0, load8(src + 56 | 0, 8), 8);
dest = (dest+64)|0;
src = (src+64)|0;
}
while ((dest|0) < (aligned_dest_end|0) ) {
store8(dest, load8(src, 8), 8);
dest = (dest+8)|0;
src = (src+8)|0;
}
} else {
// In the unaligned copy case, unroll a bit as well.
aligned_dest_end = (dest_end - 4)|0;
while ((dest|0) < (aligned_dest_end|0) ) {
store4(dest, load4(src, 1), 1); // unaligned
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, why is this a store4 and a load4? Is that alright since this is doing one byte loads and stores?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

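The trailing `, 1` argument tells wasm that the access has alignment 1. So this should be at least as efficient as four 1-byte stores: if a single unaligned 4-byte access were slower, the engine could break it up itself, since it has all the info it needs to do so.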

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah right, that makes sense. Does this currently emit unaligned loads and stores on wasm or does it break up to 1 byte writes in binaryen?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

asm2wasm will turn it into a single 4-byte load/store, marked with alignment 1.

dest = (dest+4)|0;
src = (src+4)|0;
}
}
// The remaining unaligned < 8 byte tail.
while ((dest|0) < (dest_end|0)) {
store1(dest, load1(src, 1), 1);
dest = (dest+1)|0;
src = (src+1)|0;
}
return ret|0;
#else
ret = dest|0;
dest_end = (dest + num)|0;
if ((dest&3) == (src&3)) {
Expand Down Expand Up @@ -894,6 +946,7 @@ LibraryManager.library = {
src = (src+1)|0;
}
return ret|0;
#endif
},

llvm_memcpy_i32: 'memcpy',
Expand Down Expand Up @@ -939,9 +992,54 @@ LibraryManager.library = {
var end = 0, aligned_end = 0, block_aligned_end = 0, value4 = 0;
#if SIMD
var value16 = SIMD_Int32x4(0,0,0,0);
#else
#if WASM_ONLY
var value8 = i64();
#endif
#endif
end = (ptr + num)|0;

#if WASM_ONLY
value = value & 0xff;
if ((num|0) >= 71 /* 64 bytes for an unrolled loop + 7 bytes for unaligned head*/) {
value4 = value | (value << 8) | (value << 16) | (value << 24);
while ((ptr&3) != 0) {
store1(ptr, value, 1);
ptr = (ptr+1)|0;
}
if (ptr&4) {
store4(ptr, value4, 4);
ptr = (ptr+4)|0;
}

aligned_end = (end & -8)|0;
block_aligned_end = (aligned_end - 64)|0;
value8 = i64_or(i64_zext(value4), i64_shl(i64_zext(value4), i64(32)));

while ((ptr|0) <= (block_aligned_end|0)) {
store8(ptr , value8, 8);
store8(ptr + 8 | 0, value8, 8);
store8(ptr + 16 | 0, value8, 8);
store8(ptr + 24 | 0, value8, 8);
store8(ptr + 32 | 0, value8, 8);
store8(ptr + 40 | 0, value8, 8);
store8(ptr + 48 | 0, value8, 8);
store8(ptr + 56 | 0, value8, 8);
ptr = (ptr + 64)|0;
}

while ((ptr|0) < (aligned_end|0) ) {
store8(ptr, value8, 8);
ptr = (ptr+8)|0;
}
}
// The remaining bytes.
while ((ptr|0) < (end|0)) {
store1(ptr, value, 1);
ptr = (ptr+1)|0;
}
return (end-num)|0;
#else
value = value & 0xff;
if ((num|0) >= 67 /* 64 bytes for an unrolled loop + 3 bytes for unaligned head*/) {
while ((ptr&3) != 0) {
Expand All @@ -956,7 +1054,7 @@ LibraryManager.library = {
value16 = SIMD_Int32x4_splat(value4);
#endif

while((ptr|0) <= (block_aligned_end|0)) {
while ((ptr|0) <= (block_aligned_end|0)) {
#if SIMD
SIMD_Int32x4_store(HEAPU8, ptr, value16);
SIMD_Int32x4_store(HEAPU8, ptr+16, value16);
Expand Down Expand Up @@ -994,6 +1092,7 @@ LibraryManager.library = {
ptr = (ptr+1)|0;
}
return (end-num)|0;
#endif
},
llvm_memset_i32: 'memset',
llvm_memset_p0i8_i32: 'memset',
Expand Down
10 changes: 7 additions & 3 deletions src/settings.js
Expand Up @@ -678,9 +678,6 @@ var SPLIT_MEMORY = 0; // If > 0, we split memory into chunks, of the size given
// TODO: add malloc-split to embuilder
var SAFE_SPLIT_MEMORY = 0; // Similar to SAFE_HEAP, but for SPLIT_MEMORY.

var RUNNING_JS_OPTS = 0; // whether js opts will be run, after the main compiler
var BOOTSTRAPPING_STRUCT_INFO = 0; // whether we are in the generate struct_info bootstrap phase

var EMSCRIPTEN_TRACING = 0; // Add some calls to emscripten tracing APIs

var USE_GLFW = 2; // Specify the GLFW version that is being linked against.
Expand Down Expand Up @@ -856,6 +853,13 @@ var FETCH = 0; // If nonzero, enables emscripten_fetch API.

var ASMFS = 0; // If set to 1, uses the multithreaded filesystem that is implemented within the asm.js module, using emscripten_fetch. Implies -s FETCH=1.

// Internal use only, users should not set these

var RUNNING_JS_OPTS = 0; // whether js opts will be run, after the main compiler
var BOOTSTRAPPING_STRUCT_INFO = 0; // whether we are in the generate struct_info bootstrap phase

var WASM_TEXT_FILE = ''; // name of the file containing wasm text, if relevant
var WASM_BINARY_FILE = ''; // name of the file containing wasm binary, if relevant
var ASMJS_CODE_FILE = ''; // name of the file containing asm.js, if relevant
var WASM_ONLY = 0; // Whether we are only targeting wasm (i.e., there is no fallback to asm.js)