Skip to content

Commit

Permalink
Fix python metadata extraction with pthreads (#17263)
Browse files Browse the repository at this point in the history
We have to jump through a lot more hoops to find the EM_JS segment
data when the segments are passive.  In this case the address of each
segment is not in the segment information, but embedded in the
`__wasm_init_memory` code which gets called from `_start`.  We have to
do more work here to partially decode this function, but it's still less
than what binaryen is doing.

Fixes: #16573
  • Loading branch information
sbc100 committed Jun 22, 2022
1 parent 18e6560 commit 25591f3
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 24 deletions.
4 changes: 1 addition & 3 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -365,9 +365,7 @@ jobs:
EMCC_READ_METADATA: "compare"
steps:
- run-tests-linux:
# test_em_js_pthreads currently triggers a bug in python metadata
# extraction: https://github.com/emscripten-core/emscripten/issues/16573
test_targets: "core0 skip:core0.test_em_js_pthreads"
test_targets: "core0"
test-core2:
executor: bionic
environment:
Expand Down
7 changes: 4 additions & 3 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2230,12 +2230,13 @@ def test_em_asm_direct(self):
@parameterized({
'': ([], False),
'pthreads': (['-sUSE_PTHREADS', '-sPROXY_TO_PTHREAD', '-sEXIT_RUNTIME'], False),
'pthreads_dylink': (['-sUSE_PTHREADS', '-sPROXY_TO_PTHREAD', '-sEXIT_RUNTIME', '-sMAIN_MODULE=2', '-Wno-experimental'], False),
'c': ([], True),
'linked': (['-sMAIN_MODULE'], False),
'linked_c': (['-sMAIN_MODULE'], True),
'dylink': (['-sMAIN_MODULE=2'], False),
'dylink_c': (['-sMAIN_MODULE=2'], True),
})
def test_em_js(self, args, force_c):
if '-sMAIN_MODULE' in args:
if '-sMAIN_MODULE=2' in args:
self.check_dylink()
else:
self.emcc_args += ['-sEXPORTED_FUNCTIONS=_main,_malloc']
Expand Down
119 changes: 102 additions & 17 deletions tools/extract_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,43 +4,48 @@
# found in the LICENSE file.

from . import webassembly
from .webassembly import OpCode, AtomicOpCode, MemoryOpCode
from .shared import exit_with_error
from .settings import settings


def is_wrapper_function(module, function):
module.seek(function.offset)
def skip_function_header(module):
  """Advance `module` past the local declarations that open a function body.

  Reads the number of local declaration groups, then consumes one
  (count, type) pair per group.  The values are discarded; only the read
  position of `module` changes.
  """
  for _ in range(module.read_uleb()):
    module.read_uleb()  # number of locals in this group (unused)
    module.read_type()  # type shared by the locals in this group (unused)


def is_wrapper_function(module, function):
module.seek(function.offset)
skip_function_header(module)
end = function.offset + function.size
while module.tell() != end:
opcode = module.read_byte()
try:
opcode = webassembly.OpCode(opcode)
opcode = OpCode(opcode)
except ValueError as e:
print(e)
return False
if opcode == webassembly.OpCode.CALL:
if opcode == OpCode.CALL:
callee = module.read_uleb() # noqa
elif opcode == webassembly.OpCode.END:
elif opcode == OpCode.END:
break
else:
return False
assert opcode == webassembly.OpCode.END
assert opcode == OpCode.END
return True


def get_const_expr_value(expr):
assert len(expr) == 2
assert expr[1][0] == webassembly.OpCode.END
assert expr[1][0] == OpCode.END
opcode, immediates = expr[0]
if opcode in (webassembly.OpCode.I32_CONST, webassembly.OpCode.I64_CONST):
if opcode in (OpCode.I32_CONST, OpCode.I64_CONST):
assert len(immediates) == 1
return immediates[0]
elif opcode in (webassembly.OpCode.GLOBAL_GET,):
elif opcode in (OpCode.GLOBAL_GET,):
return 0
else:
exit_with_error('unexpected opcode in const expr: ' + str(opcode))
Expand All @@ -50,23 +55,103 @@ def get_global_value(globl):
return get_const_expr_value(globl.init)


def find_segment_with_address(module, address, size=0):
def parse_function_for_memory_inits(module, func_index, offset_map):
  """Very limited function parser that uses `memory.init` instructions
  to derive segment offsets.

  When segments are passive they don't have an offset but (at least with
  llvm-generated code) are loaded during the start function
  (`__wasm_init_memory`) using `memory.init` instructions.

  Here we parse the `__wasm_init_memory` function and make many assumptions
  about its layout.  For example, we assume the first argument to `memory.init`
  is either an `i32.const` or the result of an `i32.add`.

  Args:
    module: the wasm module reader; must provide `get_segments`,
      `get_function`, `seek`, `tell`, `read_byte`, `read_uleb` and
      `read_type`.
    func_index: index of the function to scan, as accepted by
      `module.get_function`.
    offset_map: dict that is updated in place, mapping each segment used by
      a `memory.init` to the destination offset derived for it.

  Raises:
    ValueError: if a byte is not a member of `OpCode`, `MemoryOpCode` or
      `AtomicOpCode`.
    AssertionError: if a known-prefix instruction is not one of the handled
      ones, or a memory index other than 0 is used.
    IndexError: if a `memory.init` is reached before three constants have
      been seen.
  """
  segments = module.get_segments()
  func = module.get_function(func_index)
  module.seek(func.offset)
  skip_function_header(module)
  end = func.offset + func.size
  const_values = []
  call_targets = []
  while module.tell() != end:
    opcode = OpCode(module.read_byte())
    if opcode in (OpCode.END, OpCode.NOP, OpCode.DROP, OpCode.I32_ADD):
      # No immediates to skip.
      pass
    elif opcode == OpCode.BLOCK:
      module.read_type()
    elif opcode in (OpCode.I32_CONST, OpCode.I64_CONST):
      # NOTE(review): the wasm spec encodes these immediates as signed LEB128;
      # the unsigned read is kept from the original - confirm this is fine
      # for the address/size constants seen here.
      const_values.append(module.read_uleb())
    elif opcode in (OpCode.GLOBAL_SET, OpCode.BR, OpCode.GLOBAL_GET,
                    OpCode.LOCAL_SET, OpCode.LOCAL_GET, OpCode.LOCAL_TEE):
      # Single index immediate that we do not care about.
      module.read_uleb()
    elif opcode == OpCode.CALL:
      call_targets.append(module.read_uleb())
    elif opcode == OpCode.MEMORY_PREFIX:
      sub_opcode = MemoryOpCode(module.read_byte())
      if sub_opcode == MemoryOpCode.MEMORY_INIT:
        segment = segments[module.read_uleb()]
        # The last three constants seen are taken to be the operands of
        # `memory.init`, so the third from last is the destination address.
        offset_map[segment] = const_values[-3]
        if module.read_uleb() != 0:
          raise AssertionError('memory.init: expected memory index 0')
      elif sub_opcode == MemoryOpCode.MEMORY_FILL:
        if module.read_uleb() != 0:
          raise AssertionError('memory.fill: expected memory index 0')
      elif sub_opcode == MemoryOpCode.MEMORY_DROP:
        module.read_uleb()  # segment index (unused)
      else:
        # Explicit raise rather than `assert False`, which `python -O` strips,
        # leaving the unread immediates to be misparsed as opcodes.
        raise AssertionError("unknown: %s" % sub_opcode)
    elif opcode == OpCode.ATOMIC_PREFIX:
      sub_opcode = AtomicOpCode(module.read_byte())
      if sub_opcode in (AtomicOpCode.ATOMIC_I32_RMW_CMPXCHG,
                        AtomicOpCode.ATOMIC_I32_STORE,
                        AtomicOpCode.ATOMIC_NOTIFY,
                        AtomicOpCode.ATOMIC_WAIT32,
                        AtomicOpCode.ATOMIC_WAIT64):
        # Each of these carries two uleb immediates, which are skipped.
        module.read_uleb()
        module.read_uleb()
      else:
        raise AssertionError("unknown: %s" % sub_opcode)
    elif opcode == OpCode.BR_TABLE:
      count = module.read_uleb()
      for _ in range(count):
        module.read_uleb()  # branch depth (unused)
      module.read_uleb()  # default target (unused)
    else:
      raise AssertionError("unknown: %s" % opcode)

  # Recursion is safe here because the layout of the wasm-ld-generated
  # start function has a specific structure and has at most one level
  # of call stack depth.
  for target in call_targets:
    parse_function_for_memory_inits(module, target, offset_map)


@webassembly.memoize
def get_passive_segment_offsets(module):
  """Return a dict mapping passive segments to their load offsets.

  The offsets are collected by scanning the module's start function (and the
  functions it calls) for `memory.init` instructions.  The module is required
  to have a start function.
  """
  start_index = module.get_start()
  assert start_index is not None
  offsets = {}
  parse_function_for_memory_inits(module, start_index, offsets)
  return offsets


def find_segment_with_address(module, address):
segments = module.get_segments()
active = [s for s in segments if s.init]

for seg in active:
offset = get_const_expr_value(seg.init)
if offset is None:
continue
if offset == address:
return (seg, 0)
if address > offset and address < offset + seg.size:
if address >= offset and address < offset + seg.size:
return (seg, address - offset)

passive = [s for s in segments if not s.init]
for seg in passive:
if seg.size == size:
return (seg, 0)
if passive:
offset_map = get_passive_segment_offsets(module)
for seg, offset in offset_map.items():
if address >= offset and address < offset + seg.size:
return (seg, address - offset)

raise AssertionError('unable to find segment for address: %s' % address)

Expand Down Expand Up @@ -95,7 +180,7 @@ def get_asm_strings(module, export_map):
start_addr = get_global_value(start_global)
end_addr = get_global_value(end_global)

seg = find_segment_with_address(module, start_addr, end_addr - start_addr)
seg = find_segment_with_address(module, start_addr)
if not seg:
exit_with_error('unable to find segment starting at __start_em_asm: %s' % start_addr)
seg, seg_offset = seg
Expand Down
33 changes: 32 additions & 1 deletion tools/webassembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,23 +87,46 @@ class Type(IntEnum):
V128 = 0x7b # -0x5
FUNCREF = 0x70 # -0x10
EXTERNREF = 0x6f # -0x11
VOID = 0x40 # -0x40


class OpCode(IntEnum):
NOP = 0x01
BLOCK = 0x02
CALL = 0x10
END = 0x0b
BR = 0x0c
BR_TABLE = 0x0e
CALL = 0x10
DROP = 0x1a
LOCAL_GET = 0x20
LOCAL_SET = 0x21
LOCAL_TEE = 0x22
GLOBAL_GET = 0x23
GLOBAL_SET = 0x24
RETURN = 0x0f
I32_CONST = 0x41
I64_CONST = 0x42
F32_CONST = 0x43
F64_CONST = 0x44
I32_ADD = 0x6a
REF_NULL = 0xd0
ATOMIC_PREFIX = 0xfe
MEMORY_PREFIX = 0xfc


class MemoryOpCode(IntEnum):
  """Secondary opcodes read after the `OpCode.MEMORY_PREFIX` byte."""

  MEMORY_INIT = 0x08  # carries a segment index and a memory index
  MEMORY_DROP = 0x09  # carries a segment index
  MEMORY_COPY = 0x0a
  MEMORY_FILL = 0x0b  # carries a memory index


class AtomicOpCode(IntEnum):
  """Secondary opcodes read after the `OpCode.ATOMIC_PREFIX` byte."""

  # Wait/notify operations.
  ATOMIC_NOTIFY = 0x00
  ATOMIC_WAIT32 = 0x01
  ATOMIC_WAIT64 = 0x02
  # Atomic memory accesses.
  ATOMIC_I32_STORE = 0x17
  ATOMIC_I32_RMW_CMPXCHG = 0x48


class SecType(IntEnum):
Expand Down Expand Up @@ -406,6 +429,14 @@ def get_globals(self):
globls.append(Global(global_type, mutable, init))
return globls

@memoize
def get_start(self):
  """Return the function index stored in the START section.

  Returns None when the module has no START section.
  """
  section = self.get_section(SecType.START)
  if section:
    # The section payload is a single uleb-encoded function index.
    self.seek(section.offset)
    return self.read_uleb()
  return None

@memoize
def get_functions(self):
code_section = self.get_section(SecType.CODE)
Expand Down

0 comments on commit 25591f3

Please sign in to comment.