diff --git a/test/core/test_dwarf.cpp b/test/core/test_dwarf.cpp new file mode 100644 index 0000000000000..ad91ccda9cd4a --- /dev/null +++ b/test/core/test_dwarf.cpp @@ -0,0 +1,26 @@ +#include + +EM_JS(int, out_to_js, (int x), {}) + +class MyClass { +public: + void foo(); + void bar(); +}; + +void __attribute__((noinline)) MyClass::foo() { + out_to_js(0); // line 12 + out_to_js(1); + out_to_js(2); +} + +void __attribute__((always_inline)) MyClass::bar() { + out_to_js(3); + __builtin_trap(); // line 19 +} + +int main() { + MyClass mc; + mc.foo(); + mc.bar(); +} diff --git a/test/test_other.py b/test/test_other.py index 07f1b3f090f0a..e419997dda2af 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -9639,12 +9639,49 @@ def check_dwarf_loc_info(address, funcs, locs): for loc in locs: self.assertIn(loc, out) - def check_source_map_loc_info(address, loc): + def check_source_map_loc_info(address, func, loc): out = self.run_process( [emsymbolizer, '-s', 'sourcemap', 'test_dwarf.wasm', address], stdout=PIPE).stdout + self.assertIn(func, out) self.assertIn(loc, out) + def do_tests(src): + # 1. Test DWARF + source map together + # For DWARF, we check for the full inlined info for both function names and + # source locations. Source maps does not provide inlined info. So we only + # check for the info of the outermost function. + self.run_process([EMCC, test_file(src), '-g', '-gsource-map', '-O1', '-o', + 'test_dwarf.js']) + check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, + out_to_js_call_loc) + check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_func[0], + out_to_js_call_loc[0]) + check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) + # Source map shows the original (inlined) source location with the original + # function name + check_source_map_loc_info(unreachable_addr, unreachable_func[0], + unreachable_loc[0]) + + # 2. 
Test source map only + # The addresses, function names, and source locations are the same across + # the builds because they are relative offsets from the code section, so we + # don't need to recompute them + self.run_process([EMCC, test_file(src), '-gsource-map', '-O1', '-o', + 'test_dwarf.js']) + check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_func[0], + out_to_js_call_loc[0]) + check_source_map_loc_info(unreachable_addr, unreachable_func[0], + unreachable_loc[0]) + + # 3. Test DWARF only + self.run_process([EMCC, test_file(src), '-g', '-O1', '-o', + 'test_dwarf.js']) + check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, + out_to_js_call_loc) + check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) + + # -- C program test -- # We test two locations within test_dwarf.c: # out_to_js(0); // line 6 # __builtin_trap(); // line 13 @@ -9667,31 +9704,32 @@ def check_source_map_loc_info(address, loc): # The first one corresponds to the innermost inlined location. unreachable_loc = ['test_dwarf.c:13:3', 'test_dwarf.c:18:3'] - # 1. Test DWARF + source map together - # For DWARF, we check for the full inlined info for both function names and - # source locations. Source maps provide neither function names nor inlined - # info. So we only check for the source location of the outermost function. - check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, - out_to_js_call_loc) - check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_loc[0]) - check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) - check_source_map_loc_info(unreachable_addr, unreachable_loc[0]) - - # 2. 
Test source map only - # The addresses, function names, and source locations are the same across - # the builds because they are relative offsets from the code section, so we - # don't need to recompute them - self.run_process([EMCC, test_file('core/test_dwarf.c'), - '-gsource-map', '-O1', '-o', 'test_dwarf.js']) - check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_loc[0]) - check_source_map_loc_info(unreachable_addr, unreachable_loc[0]) + do_tests('core/test_dwarf.c') - # 3. Test DWARF only - self.run_process([EMCC, test_file('core/test_dwarf.c'), - '-g', '-O1', '-o', 'test_dwarf.js']) - check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, - out_to_js_call_loc) - check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) + # -- C++ program test -- + # We test two locations within test_dwarf.cpp: + # out_to_js(0); // line 12 + # __builtin_trap(); // line 19 + self.run_process([EMCC, test_file('core/test_dwarf.cpp'), + '-g', '-gsource-map', '-O1', '-o', 'test_dwarf.js']) + # Address of out_to_js(0) within MyClass::foo(), uninlined + out_to_js_call_addr = self.get_instr_addr('call\t0', 'test_dwarf.wasm') + # Address of __builtin_trap() within MyClass::bar(), inlined into main() + unreachable_addr = self.get_instr_addr('unreachable', 'test_dwarf.wasm') + + # Function name of out_to_js(0) within MyClass::foo(), uninlined + out_to_js_call_func = ['MyClass::foo()'] + # Function names of __builtin_trap() within MyClass::bar(), inlined into + # main(). The first one corresponds to the innermost inlined function. + unreachable_func = ['MyClass::bar()', 'main'] + + # Source location of out_to_js(0) within MyClass::foo(), uninlined + out_to_js_call_loc = ['test_dwarf.cpp:12:3'] + # Source locations of __builtin_trap() within MyClass::bar(), inlined into + # main(). The first one corresponds to the innermost inlined location. 
+ unreachable_loc = ['test_dwarf.cpp:19:3', 'test_dwarf.cpp:25:6'] + + do_tests('core/test_dwarf.cpp') def test_emsymbolizer_functions(self): 'Test emsymbolizer use cases that only provide function-granularity info' diff --git a/tools/emsymbolizer.py b/tools/emsymbolizer.py index c71fc26da890d..62f6b7830a0a9 100755 --- a/tools/emsymbolizer.py +++ b/tools/emsymbolizer.py @@ -118,6 +118,7 @@ class Location: def __init__(self): self.version = None self.sources = [] + self.funcs = [] self.mappings = {} self.offsets = [] @@ -129,6 +130,7 @@ def parse(self, filename): self.version = source_map_json['version'] self.sources = source_map_json['sources'] + self.funcs = source_map_json['names'] chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=' vlq_map = {c: i for i, c in enumerate(chars)} @@ -156,6 +158,7 @@ def decodeVLQ(string): src = 0 line = 1 col = 1 + func = 0 for segment in source_map_json['mappings'].split(','): data = decodeVLQ(segment) info = [] @@ -170,7 +173,9 @@ def decodeVLQ(string): if len(data) >= 4: col += data[3] info.append(col) - # TODO: see if we need the name, which is the next field (data[4]) + if len(data) == 5: + func += data[4] + info.append(func) self.mappings[offset] = WasmSourceMap.Location(*info) self.offsets.append(offset) @@ -208,6 +213,7 @@ def lookup(self, offset, lower_bound=None): self.sources[info.source] if info.source is not None else None, info.line, info.column, + self.funcs[info.func] if info.func is not None else None, ) diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index 8d9fefc4fcc7f..b7075ded76873 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -24,9 +24,11 @@ __rootdir__ = os.path.dirname(__scriptdir__) sys.path.insert(0, __rootdir__) -from tools import utils +from tools import shared, utils from tools.system_libs import DETERMINISTIC_PREFIX +LLVM_CXXFILT = shared.llvm_tool_path('llvm-cxxfilt') + EMSCRIPTEN_PREFIX = utils.normalize_path(utils.path_from_root()) 
def demangle_names(names):
  """Demangle the C++ names found in `names` with a single llvm-cxxfilt run.

  Only names starting with '_Z' are considered mangled. Returns a dict mapping
  each such name to the corresponding line printed by llvm-cxxfilt. This is
  best effort: if the tool is missing, fails, or prints an unexpected number of
  lines, a warning is logged and an empty dict is returned, so callers simply
  keep the mangled names.
  """
  # De-duplicate and sort so that input lines and output lines can be paired up
  # positionally in a deterministic order.
  mangled_names = sorted({n for n in names if n.startswith('_Z')})
  if not mangled_names:
    return {}
  if not os.path.exists(LLVM_CXXFILT):
    logger.warning('llvm-cxxfilt does not exist')
    return {}

  # Gather all mangled names and call llvm-cxxfilt only once for all of them.
  # The pipes are explicitly UTF-8 rather than locale-encoded: the resulting
  # names end up in a source map that is written as UTF-8, and a locale codec
  # may be unable to represent non-ASCII identifiers.
  try:
    process = Popen([LLVM_CXXFILT], stdin=PIPE, stdout=PIPE, stderr=PIPE,
                    encoding='utf-8')
    stdout, stderr = process.communicate(input='\n'.join(mangled_names))
  except (OSError, UnicodeError):
    logger.warning('Failed to run llvm-cxxfilt')
    return {}

  if process.returncode != 0:
    logger.warning('llvm-cxxfilt failed: %s', stderr)
    return {}

  demangled_list = stdout.splitlines()
  if len(demangled_list) != len(mangled_names):
    # Positional pairing would be wrong; better to not demangle at all.
    logger.warning('llvm-cxxfilt output length mismatch')
    return {}

  return dict(zip(mangled_names, demangled_list))


class FuncRange:
  """A function name together with the [low_pc, high_pc) range it covers."""

  def __init__(self, name, low_pc, high_pc):
    self.name = name
    self.low_pc = low_pc
    self.high_pc = high_pc

  def __repr__(self):
    return 'FuncRange(%r, 0x%x, 0x%x)' % (self.name, self.low_pc, self.high_pc)


# Patterns used by extract_func_ranges, compiled once instead of per DIE.
# Splits llvm-dwarfdump output right before each '0x<offset>:' DIE header.
_DIE_SPLIT_RE = re.compile(r'\r?\n(?=0x[0-9a-f]+:)')
_SUBPROGRAM_RE = re.compile(r"0x[0-9a-f]+:\s+DW_TAG_subprogram")
_INLINED_SUBROUTINE_RE = re.compile(r"0x[0-9a-f]+:\s+DW_TAG_inlined_subroutine")
_LOW_PC_RE = re.compile(r'DW_AT_low_pc\s+\(0x([0-9a-f]+)\)')
_HIGH_PC_RE = re.compile(r'DW_AT_high_pc\s+\(0x([0-9a-f]+)\)')
_LINKAGE_NAME_RE = re.compile(r'DW_AT_linkage_name\s+\("([^"]+)"\)')
_NAME_RE = re.compile(r'DW_AT_name\s+\("([^"]+)"\)')
_SPECIFICATION_RE = re.compile(r'DW_AT_specification\s+\(0x[0-9a-f]+\s+"([^"]+)"\)')
_ABSTRACT_ORIGIN_RE = re.compile(r'DW_AT_abstract_origin\s+\(0x[0-9a-f]+\s+"([^"]+)"\)')


def _get_subprogram_name(tag):
  """Return the best available name of a DW_TAG_subprogram DIE, or None.

  Preference order: DW_AT_linkage_name (so that C++ names can be demangled to
  their fully qualified form), then DW_AT_name, then the name annotation that
  llvm-dwarfdump prints next to DW_AT_specification.
  """
  for pattern in (_LINKAGE_NAME_RE, _NAME_RE, _SPECIFICATION_RE):
    m = pattern.search(tag)
    if m:
      return m.group(1)
  return None


# This function parses DW_TAG_subprogram and DW_TAG_inlined_subroutine entries
# and gets low_pc and high_pc for each function in a list of FuncRanges. The
# result list is sorted in the increasing order of low_pcs.
def extract_func_ranges(text):
  """Extract named function PC ranges from llvm-dwarfdump's debug info text.

  `text` is the textual DIE dump. Returns a list of FuncRange objects whose
  names have been demangled when possible. DIEs that lack a name, a
  DW_AT_low_pc or a DW_AT_high_pc are ignored.
  """
  # This function handles four cases:
  # 1. DW_TAG_subprogram with DW_AT_name, DW_AT_low_pc, and DW_AT_high_pc.
  #   0x000000ba:   DW_TAG_subprogram
  #                   DW_AT_low_pc  (0x0000005f)
  #                   DW_AT_high_pc (0x00000071)
  #                   DW_AT_name    ("foo")
  #                   ...
  #
  # 2. DW_TAG_subprogram with DW_AT_linkage_name, DW_AT_low_pc, and
  #    DW_AT_high_pc. Applies to mangled C++ functions.
  #    (We parse DW_AT_linkage_name instead of DW_AT_name here.)
  #   0x000000ba:   DW_TAG_subprogram
  #                   DW_AT_low_pc  (0x0000005f)
  #                   DW_AT_high_pc (0x00000071)
  #                   DW_AT_linkage_name    ("_ZN7MyClass3fooEv")
  #                   DW_AT_name    ("foo")
  #                   ...
  #
  # 3. DW_TAG_subprogram with DW_AT_specification, DW_AT_low_pc, and
  #    DW_AT_high_pc. C++ function info can be split into two DIEs (one with
  #    DW_AT_linkage_name and DW_AT_declaration (true) and the other with
  #    DW_AT_specification). In this case we parse DW_AT_specification for the
  #    function name.
  #   0x0000006d:   DW_TAG_subprogram
  #                   DW_AT_linkage_name    ("_ZN7MyClass3fooEv")
  #                   DW_AT_name    ("foo")
  #                   DW_AT_declaration     (true)
  #                   ...
  #   0x00000097:   DW_TAG_subprogram
  #                   DW_AT_low_pc  (0x00000007)
  #                   DW_AT_high_pc (0x0000004c)
  #                   DW_AT_specification   (0x0000006d "_ZN7MyClass3fooEv")
  #                   ...
  #
  # 4. DW_TAG_inlined_subroutine with DW_AT_abstract_origin, DW_AT_low_pc, and
  #    DW_AT_high_pc. This represents an inlined function. We parse
  #    DW_AT_abstract_origin for the original function name.
  #   0x0000011a:   DW_TAG_inlined_subroutine
  #                   DW_AT_abstract_origin (0x000000da "_ZN7MyClass3barEv")
  #                   DW_AT_low_pc  (0x00000078)
  #                   DW_AT_high_pc (0x00000083)
  #                   ...
  func_ranges = []
  for tag in _DIE_SPLIT_RE.split(text):
    is_subprogram = _SUBPROGRAM_RE.search(tag)
    if not is_subprogram and not _INLINED_SUBROUTINE_RE.search(tag):
      continue

    if is_subprogram:
      name = _get_subprogram_name(tag)
    else:
      origin = _ABSTRACT_ORIGIN_RE.search(tag)
      name = origin.group(1) if origin else None

    low = _LOW_PC_RE.search(tag)
    high = _HIGH_PC_RE.search(tag)
    # Declarations and abstract instances have no PC range; nothing to map.
    if name and low and high:
      func_ranges.append(FuncRange(name, int(low.group(1), 16), int(high.group(1), 16)))

  # Demangle names (a no-op mapping when nothing looks mangled or when
  # llvm-cxxfilt is unavailable).
  demangled_map = demangle_names([item.name for item in func_ranges])
  for func_range in func_ranges:
    func_range.name = demangled_map.get(func_range.name, func_range.name)

  # To correctly identify the innermost function for a given address,
  # func_ranges is sorted primarily by low_pc in ascending order and secondarily
  # by high_pc in descending order. This ensures that for overlapping ranges,
  # the more specific (inner) range appears later in the list.
  func_ranges.sort(key=lambda item: (item.low_pc, -item.high_pc))
  return func_ranges


# Returns true if the given llvm-dwarfdump has --filter-child-tags / -t option
def has_filter_child_tag_option(dwarfdump):
  """Probe whether the `dwarfdump` executable understands the -t option.

  Returns False when the tool cannot be launched at all.
  """
  # To check if --filter-child-tags / -t option is available, run
  # `llvm-dwarfdump -t`. If it is available, it will print to stderr:
  #   ... for the -t option: requires a value!
  # If not, it will print:
  #   ... Unknown command line argument '-t'.
  try:
    # errors='replace' keeps this probe from raising on stderr bytes that the
    # locale codec cannot decode; only an ASCII substring is inspected.
    process = Popen([dwarfdump, '-t'], stdout=PIPE, stderr=PIPE, text=True,
                    errors='replace')
    _, err = process.communicate()
  except OSError:
    return False
  return 'requires a value' in err
"names" field will not be generated in the source map.') + # Only print DW_TAG_compile_units + dwarfdump_cmd += ['--recurse-depth=0'] + + process = Popen(dwarfdump_cmd, stdout=PIPE, stderr=PIPE) output, err = process.communicate() exit_code = process.wait() if exit_code != 0: @@ -296,22 +466,61 @@ def read_dwarf_entries(wasm, options): remove_dead_entries(entries) # return entries sorted by the address field - return sorted(entries, key=lambda entry: entry['address']) + entries = sorted(entries, key=lambda entry: entry['address']) + func_ranges = extract_func_ranges(debug_line_chunks[0]) + return entries, func_ranges -def build_sourcemap(entries, code_section_offset, options): + +def build_sourcemap(entries, func_ranges, code_section_offset, options): base_path = options.basepath collect_sources = options.sources prefixes = SourceMapPrefixes(options.prefix, options.load_prefix, base_path) + # Add code section offset to the low/high pc in the function PC ranges + for func_range in func_ranges: + func_range.low_pc += code_section_offset + func_range.high_pc += code_section_offset + sources = [] sources_content = [] + # There can be duplicate names in case an original source function has + # multiple disjoint PC ranges or is inlined to multiple callsites. Make the + # 'names' list a unique list of names, and map the function ranges to the + # indices in that list. + names = sorted({item.name for item in func_ranges}) + name_to_id = {name: i for i, name in enumerate(names)} mappings = [] sources_map = {} last_address = 0 last_source_id = 0 last_line = 1 last_column = 1 + last_func_id = 0 + + active_funcs = [] + next_func_range_id = 0 + + # Get the function name ID that the given address falls into + def get_function_id(address): + nonlocal active_funcs + nonlocal next_func_range_id + + # Maintain a list of "active functions" whose ranges currently cover the + # address. As the address advances, it adds new functions that start and + # removes functions that end. 
The last function remaining in the active list + # at any point is the innermost function. + while next_func_range_id < len(func_ranges) and func_ranges[next_func_range_id].low_pc <= address: + # active_funcs contains (high_pc, id) pair + active_funcs.append((func_ranges[next_func_range_id].high_pc, next_func_range_id)) + next_func_range_id += 1 + active_funcs = [f for f in active_funcs if f[0] > address] + + if active_funcs: + func_range_id = active_funcs[-1][1] + name = func_ranges[func_range_id].name + return name_to_id[name] + return None for entry in entries: line = entry['line'] @@ -342,21 +551,27 @@ def build_sourcemap(entries, code_section_offset, options): sources_content.append(None) else: source_id = sources_map[source_name] + func_id = get_function_id(address) address_delta = address - last_address source_id_delta = source_id - last_source_id line_delta = line - last_line column_delta = column - last_column - mappings.append(encode_vlq(address_delta) + encode_vlq(source_id_delta) + encode_vlq(line_delta) + encode_vlq(column_delta)) last_address = address last_source_id = source_id last_line = line last_column = column + mapping = encode_vlq(address_delta) + encode_vlq(source_id_delta) + encode_vlq(line_delta) + encode_vlq(column_delta) + if func_id is not None: + func_id_delta = func_id - last_func_id + last_func_id = func_id + mapping += encode_vlq(func_id_delta) + mappings.append(mapping) return {'version': 3, 'sources': sources, 'sourcesContent': sources_content, - 'names': [], + 'names': names, 'mappings': ','.join(mappings)} @@ -367,12 +582,12 @@ def main(args): with open(wasm_input, 'rb') as infile: wasm = infile.read() - entries = read_dwarf_entries(wasm_input, options) + entries, func_ranges = read_dwarf_info(wasm_input, options) code_section_offset = get_code_section_offset(wasm) logger.debug('Saving to %s' % options.output) - map = build_sourcemap(entries, code_section_offset, options) + map = build_sourcemap(entries, func_ranges, 
code_section_offset, options) with open(options.output, 'w', encoding='utf-8') as outfile: json.dump(map, outfile, separators=(',', ':'), ensure_ascii=False)