From e307e14992c644cd679aa17a2ab2a19618947a79 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Fri, 17 Oct 2025 06:29:08 +0000 Subject: [PATCH 1/6] [empath-split] Support multi-paths modules This changes the format of the user input "paths" file so that multiple paths can be split into a single module. The new paths file structure is now very similar to wasm-split's manifest file, with functions replaced with paths. For example, the format will be like ``` module1 path/to/a path/to/b module2 path/to/c ``` Where `module1` and `module2` are module names. --- test/test_other.py | 19 ++++--- tools/empath-split.py | 125 ++++++++++++++++++++++++++++++++++-------- 2 files changed, 113 insertions(+), 31 deletions(-) diff --git a/test/test_other.py b/test/test_other.py index 8184bb6ca3b02..35f234368c900 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -15760,9 +15760,14 @@ def test_empath_split(self): void foo() { std::cout << "foo" << std::endl; } ''') create_file('path_list', r''' + myapp main.cpp foo.cpp + + lib0 /emsdk/emscripten/system + + lib1 /emsdk/emscripten/system/lib/libc/musl /emsdk/emscripten/system/lib/libcxx ''') @@ -15780,17 +15785,17 @@ def has_defined_function(file, func): return pattern.search(f.read()) is not None # main.cpp - self.assertTrue(has_defined_function('test_0.wasm', '__original_main')) + self.assertTrue(has_defined_function('test_myapp.wasm', '__original_main')) # foo.cpp - self.assertTrue(has_defined_function('test_1.wasm', r'foo\\28\\29')) + self.assertTrue(has_defined_function('test_myapp.wasm', r'foo\\28\\29')) # /emsdk/emscripten/system - self.assertTrue(has_defined_function('test_2.wasm', '__abort_message')) - self.assertTrue(has_defined_function('test_2.wasm', 'pthread_cond_wait')) + self.assertTrue(has_defined_function('test_lib0.wasm', '__abort_message')) + self.assertTrue(has_defined_function('test_lib0.wasm', 'pthread_cond_wait')) # /emsdk/emscripten/system/lib/libc/musl - self.assertTrue(has_defined_function('test_3.wasm', 'strcmp')) + self.assertTrue(has_defined_function('test_lib1.wasm', 'strcmp')) # /emsdk/emscripten/system/lib/libcxx - self.assertTrue(has_defined_function('test_4.wasm', r'std::__2::ios_base::getloc\\28\\29\\20const')) - self.assertTrue(has_defined_function('test_4.wasm', r'std::uncaught_exceptions\\28\\29')) + self.assertTrue(has_defined_function('test_lib1.wasm', r'std::__2::ios_base::getloc\\28\\29\\20const')) + self.assertTrue(has_defined_function('test_lib1.wasm', r'std::uncaught_exceptions\\28\\29')) # Check --print-sources option out = self.run_process([empath_split, 'test.wasm', '--print-sources'], stdout=PIPE).stdout diff --git a/tools/empath-split.py b/tools/empath-split.py index 3fabebb0ce883..8daca46b07eeb 100755 --- a/tools/empath-split.py +++ b/tools/empath-split.py @@ -21,13 +21,26 @@ $ emcc -g2 -gsource-map a.o b.o -o result.js See https://emscripten.org/docs/porting/Debugging.html for more details. -This takes a wasm file and a paths file, which is a text file containing a list -of paths as inputs. The paths file should contain a single path per line. A -single split module will be generated per specified path. If a specified path -contains another specified path, functions contained in the inner path will be -split as the inner path's module, and the rest of the functions will be split as -the outer path's module. Functions that do not belong to any of the specified -paths will remain in the primary module. +This takes a wasm file and a paths file as inputs. The paths file defines how +to split modules. The format is similar to the manifest file for wasm-split, but +with paths instead of function names. A module is defined by a name on a line, +followed by paths on subsequent lines. Modules are separated by empty lines. +For example: +module1 +path/to/a +path/to/b + +module2 +path/to/c + +This will create two modules, 'module1' and 'module2'. 'module1' will contain +functions from source files under path/to/a and path/to/b. 'module2' will +contain functions from source files under path/to/c. + +If a specified path contains another specified path, functions contained in the +inner path will be split as the inner path's module, and the rest of the +functions will be split as the outer path's module. Functions that do not belong +to any of the specified paths will remain in the primary module. The paths in the paths file can be either absolute or relative, but they should match those of 'sources' field in the source map file. Sometimes a source map's @@ -243,6 +256,61 @@ def is_synthesized_func(func): return path_to_funcs +# 1. Strip whitespaces +# 2. Normalize separators +# 3. Make /a/b/c and /a/b/c/ equivalent +def normalize_path(path): + return utils.normalize_path(path.strip()).rstrip(os.sep) + + +# Parses a paths file that can specify module names and multiple paths per +# module. A module is defined by a name on a line, followed by paths on +# subsequent lines. Modules are separated by empty lines. +# For example: +# +# module1 +# path/to/a +# path/to/b +# +# module2 +# path/to/c +def parse_paths_file(paths_file_content): + module_to_paths = {} + path_to_module = {} + cur_module = None + cur_paths = set() + + for line in paths_file_content.splitlines(): + line = line.strip() + if not line: + if cur_module: + if not cur_paths: + diagnostics.warn(f"Module '{cur_module}' has no paths specified.") + module_to_paths[cur_module] = cur_paths + cur_module = None + cur_paths = set() + continue + + if not cur_module: + cur_module = line + else: + path = normalize_path(line) + if path in path_to_module: + exit_with_error("Path '{path}' cannot be assigned to module '{cur_module}; it is already assigned to module '{path_to_module[path]}'") + cur_paths.add(path) + path_to_module[path] = cur_module + + if cur_module: + if not cur_paths: + diagnostics.warn(f"Module '{cur_module}' has no paths specified.") + module_to_paths[cur_module] = cur_paths + + if not module_to_paths: + exit_with_error('The paths file is empty or invalid.') + + return module_to_paths + + def main(): args, forwarded_args = parse_args() check_errors(args) @@ -252,31 +320,40 @@ def main(): print_sources(sourcemap) return - paths = utils.read_file(args.paths_file).splitlines() - paths = [utils.normalize_path(path.strip()) for path in paths if path.strip()] - # To make /a/b/c and /a/b/c/ equivalent - paths = [path.rstrip(os.sep) for path in paths] - # Remove duplicates - paths = list(dict.fromkeys(paths)) + content = utils.read_file(args.paths_file) + module_to_paths = parse_paths_file(content) # Compute {path: list of functions} map - path_to_funcs = get_path_to_functions_map(args.wasm, sourcemap, paths) + all_paths = [] + for paths in module_to_paths.values(): + all_paths.extend(paths) + path_to_funcs = get_path_to_functions_map(args.wasm, sourcemap, all_paths) # Write .manifest file - with tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=args.preserve_manifest) as f: + with tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=not args.preserve_manifest) as f: manifest = f.name - for i, path in enumerate(paths): - f.write(f'{i}\n') - if not path_to_funcs[path]: - diagnostics.warn(f'{path} does not match any functions') + for i, (module, paths) in enumerate(module_to_paths.items()): + funcs = set() + for path in paths: + if not path_to_funcs[path]: + diagnostics.warn(f'{path} does not match any functions') + funcs.update(path_to_funcs[path]) + if not funcs: + diagnostics.warn(f"Module '{module}' does not match any functions") + if args.verbose: - print(f'{path}: {len(path_to_funcs[path])} functions') - for func in path_to_funcs[path]: - print(' ' + func) + print(f'{module}: {len(funcs)} functions') + for path in paths: + if path in path_to_funcs: + print(f' {path}: {len(path_to_funcs[path])} functions') + for func in path_to_funcs[path]: + print(' ' + func) print() - for func in path_to_funcs[path]: + + f.write(f'{module}\n') + for func in funcs: f.write(func + '\n') - if i < len(paths) - 1: + if i < len(module_to_paths) - 1: f.write('\n') f.flush() From e53e2dbed6622191818177aedc5b19aea2301724 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Fri, 17 Oct 2025 21:13:36 +0000 Subject: [PATCH 2/6] Remove duplicate comments --- test/test_other.py | 14 +++++++------- tools/empath-split.py | 11 ----------- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/test/test_other.py b/test/test_other.py index 35f234368c900..c949de7ea8864 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -15764,10 +15764,10 @@ def test_empath_split(self): main.cpp foo.cpp - lib0 + lib1 /emsdk/emscripten/system - lib1 + lib2 /emsdk/emscripten/system/lib/libc/musl /emsdk/emscripten/system/lib/libcxx ''') @@ -15789,13 +15789,13 @@ def has_defined_function(file, func): # foo.cpp self.assertTrue(has_defined_function('test_myapp.wasm', r'foo\\28\\29')) # /emsdk/emscripten/system - self.assertTrue(has_defined_function('test_lib0.wasm', '__abort_message')) - self.assertTrue(has_defined_function('test_lib0.wasm', 'pthread_cond_wait')) + self.assertTrue(has_defined_function('test_lib1.wasm', '__abort_message')) + self.assertTrue(has_defined_function('test_lib1.wasm', 'pthread_cond_wait')) # /emsdk/emscripten/system/lib/libc/musl - self.assertTrue(has_defined_function('test_lib1.wasm', 'strcmp')) + self.assertTrue(has_defined_function('test_lib2.wasm', 'strcmp')) # /emsdk/emscripten/system/lib/libcxx - self.assertTrue(has_defined_function('test_lib1.wasm', r'std::__2::ios_base::getloc\\28\\29\\20const')) - self.assertTrue(has_defined_function('test_lib1.wasm', r'std::uncaught_exceptions\\28\\29')) + self.assertTrue(has_defined_function('test_lib2.wasm', r'std::__2::ios_base::getloc\\28\\29\\20const')) + self.assertTrue(has_defined_function('test_lib2.wasm', r'std::uncaught_exceptions\\28\\29')) # Check --print-sources option out = self.run_process([empath_split, 'test.wasm', '--print-sources'], stdout=PIPE).stdout diff --git a/tools/empath-split.py b/tools/empath-split.py index 8daca46b07eeb..ce672c233a4cf 100755 --- a/tools/empath-split.py +++ b/tools/empath-split.py @@ -263,17 +263,6 @@ def normalize_path(path): return utils.normalize_path(path.strip()).rstrip(os.sep) -# Parses a paths file that can specify module names and multiple paths per -# module. A module is defined by a name on a line, followed by paths on -# subsequent lines. Modules are separated by empty lines. -# For example: -# -# module1 -# path/to/a -# path/to/b -# -# module2 -# path/to/c def parse_paths_file(paths_file_content): module_to_paths = {} path_to_module = {} From ac1d2454ad55e38375483705c7147fde5654bd2a Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 21 Oct 2025 18:04:49 +0000 Subject: [PATCH 3/6] path_list -> path_list.txt --- test/test_other.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_other.py b/test/test_other.py index aaf364af86e50..f2b6a45b81dfd 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -15157,7 +15157,7 @@ def test_empath_split(self): #include void foo() { std::cout << "foo" << std::endl; } ''') - create_file('path_list', r''' + create_file('path_list.txt', r''' myapp main.cpp foo.cpp @@ -15171,7 +15171,7 @@ def test_empath_split(self): ''') self.run_process([EMCC, 'main.cpp', 'foo.cpp', '-gsource-map', '-g2', '-o', 'test.js']) - self.run_process([empath_split, 'test.wasm', 'path_list', '-g', '-o', 'test_primary.wasm', '--out-prefix=test_']) + self.run_process([empath_split, 'test.wasm', 'path_list.txt', '-g', '-o', 'test_primary.wasm', '--out-prefix=test_']) # Check if functions are correctly assigned and split with the specified # paths. When one path contains another, the inner path should take its From 97258c682d7491ec7bd8d2d2f02e318b9477a27f Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 21 Oct 2025 18:05:00 +0000 Subject: [PATCH 4/6] Check i != 0 at the top --- tools/empath-split.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/empath-split.py b/tools/empath-split.py index 8aad2375f98fe..f0a27961ab7b5 100755 --- a/tools/empath-split.py +++ b/tools/empath-split.py @@ -317,6 +317,8 @@ def main(): with tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=not args.preserve_manifest) as f: manifest = f.name for i, (module, paths) in enumerate(module_to_paths.items()): + if i != 0: # Unless we are the first entry add a newline separator + f.write('\n') funcs = set() for path in paths: if not path_to_funcs[path]: From 7eedf6d01c9f36e7eda03da917799fb3d19c0851 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 21 Oct 2025 18:08:24 +0000 Subject: [PATCH 5/6] Change sets to lists to ensure deterministic order --- tools/empath-split.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tools/empath-split.py b/tools/empath-split.py index f0a27961ab7b5..17d48ae46e059 100755 --- a/tools/empath-split.py +++ b/tools/empath-split.py @@ -262,7 +262,7 @@ def parse_paths_file(paths_file_content): module_to_paths = {} path_to_module = {} cur_module = None - cur_paths = set() + cur_paths = [] for line in paths_file_content.splitlines(): line = line.strip() @@ -272,7 +272,7 @@ def parse_paths_file(paths_file_content): diagnostics.warn(f"Module '{cur_module}' has no paths specified.") module_to_paths[cur_module] = cur_paths cur_module = None - cur_paths = set() + cur_paths = [] continue if not cur_module: @@ -281,7 +281,7 @@ def parse_paths_file(paths_file_content): path = normalize_path(line) if path in path_to_module: exit_with_error("Path '{path}' cannot be assigned to module '{cur_module}; it is already assigned to module '{path_to_module[path]}'") - cur_paths.add(path) + cur_paths.append(path) path_to_module[path] = cur_module if cur_module: @@ -319,11 +319,11 @@ def main(): for i, (module, paths) in enumerate(module_to_paths.items()): if i != 0: # Unless we are the first entry add a newline separator f.write('\n') - funcs = set() + funcs = [] for path in paths: if not path_to_funcs[path]: diagnostics.warn(f'{path} does not match any functions') - funcs.update(path_to_funcs[path]) + funcs += path_to_funcs[path] if not funcs: diagnostics.warn(f"Module '{module}' does not match any functions") @@ -339,8 +339,6 @@ def main(): f.write(f'{module}\n') for func in funcs: f.write(func + '\n') - if i < len(module_to_paths) - 1: - f.write('\n') f.flush() cmd = [args.wasm_split, '--multi-split', args.wasm, '--manifest', manifest] From b4815748216a6172e571b4d60c826a4bbaba507f Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 22 Oct 2025 03:14:02 +0000 Subject: [PATCH 6/6] Revert the drive-by fix --- tools/empath-split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/empath-split.py b/tools/empath-split.py index 17d48ae46e059..f091f2264f75a 100755 --- a/tools/empath-split.py +++ b/tools/empath-split.py @@ -314,7 +314,7 @@ def main(): path_to_funcs = get_path_to_functions_map(args.wasm, sourcemap, all_paths) # Write .manifest file - with tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=not args.preserve_manifest) as f: + with tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=args.preserve_manifest) as f: manifest = f.name for i, (module, paths) in enumerate(module_to_paths.items()): if i != 0: # Unless we are the first entry add a newline separator