89 changes: 84 additions & 5 deletions emar.py
@@ -6,19 +6,98 @@

"""Archive helper script

This script is a simple wrapper around llvm-ar. It used to have special
handling for duplicate basenames in order to allow bitcode linking process to
read such files. This is now handled by using tools/arfile.py to read archives.
This script acts as a frontend replacement for `ar`. See emcc.
This is needed because, unlike a traditional linker, emscripten can't handle
archives with duplicate member names. This is because emscripten extracts
archives to a temporary location, and duplicate filenames would clobber each
other there.
"""

# TODO(sbc): Implement `ar x` within emscripten, in python, to avoid this issue
# and delete this file.

from __future__ import print_function
import hashlib
import os
import shutil
import sys

from tools.toolchain_profiler import ToolchainProfiler
from tools import shared
from tools.response_file import substitute_response_files, create_response_file

if __name__ == '__main__':
  ToolchainProfiler.record_process_start()


#
# Main run() function
#
def run():
  newargs = [shared.LLVM_AR] + sys.argv[1:]
  return shared.run_process(newargs, stdin=sys.stdin, check=False).returncode
  args = substitute_response_files(sys.argv)
  newargs = [shared.LLVM_AR] + args[1:]
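  # substitute_response_files has already expanded any @rsp-file arguments
  # above, so the logic below always sees the real member paths.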

  to_delete = []

  # The 3-argument form of ar doesn't involve other input files. For example
  # 'ar x libfoo.a'.
  if len(newargs) > 3:
    cmd = newargs[1]
    if 'r' in cmd:
      # We are adding files to the archive.
      # Normally the output file is then arg 2, except in the case where the
      # 'a' or 'b' modifiers are used, in which case it's arg 3.
      if 'a' in cmd or 'b' in cmd:
        out_arg_index = 3
      else:
        out_arg_index = 2
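      # For example, 'ar rcs libfoo.a foo.o' names the archive as arg 2, but
      # 'ar ra relpos libfoo.a foo.o' takes a relpos member name first, which
      # pushes the archive to arg 3.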

      contents = set()
      if os.path.exists(newargs[out_arg_index]):
        cmd = [shared.LLVM_AR, 't', newargs[out_arg_index]]
        output = shared.check_call(cmd, stdout=shared.PIPE).stdout
        contents.update(output.split('\n'))
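      # 'ar t' prints one member name per line; seeding `contents` with the
      # existing members means new additions can't collide with them either.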

      # Add a hash to colliding basenames, to make them unique.
      for j in range(out_arg_index + 1, len(newargs)):
        orig_name = newargs[j]
        full_name = os.path.abspath(orig_name)
        dirname = os.path.dirname(full_name)
        basename = os.path.basename(full_name)
        if basename not in contents:
          contents.add(basename)
          continue
        h = hashlib.md5(full_name.encode('utf-8')).hexdigest()[:8]
        parts = basename.split('.')
        parts[0] += '_' + h
        newname = '.'.join(parts)
        full_newname = os.path.join(dirname, newname)
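        # Illustrative example (hash value made up): b/common.o becomes
        # b/common_1b2c3d4e.o before being added to the archive.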
        assert not os.path.exists(full_newname)
        try:
          shutil.copyfile(orig_name, full_newname)
          newargs[j] = full_newname
          to_delete.append(full_newname)
          contents.add(newname)
        except OSError:
          # It is OK to fail here; we just don't get the hash-renaming.
          contents.add(basename)

  if shared.DEBUG:
    print('emar:', sys.argv, ' ==> ', newargs, file=sys.stderr)

  response_filename = create_response_file(newargs[3:], shared.get_emscripten_temp_dir())
  to_delete += [response_filename]
  newargs = newargs[:3] + ['@' + response_filename]
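  # Passing the member list via a response file keeps the llvm-ar command
  # line short, avoiding platform limits on argument length.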

  if shared.DEBUG:
    print('emar:', sys.argv, ' ==> ', newargs, file=sys.stderr)

  try:
    return shared.run_process(newargs, stdin=sys.stdin, check=False).returncode
  finally:
    for d in to_delete:
      shared.try_delete(d)


if __name__ == '__main__':
  sys.exit(run())
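
For reference, the renaming scheme above can be read in isolation. The sketch
below is illustrative and not part of the change; the helper name
uniquify_basenames is made up, and it only computes the new paths rather than
copying any files:

import hashlib
import os


def uniquify_basenames(paths):
  """Return paths with unique basenames, renaming collisions the way emar.py
  does: an 8-hex-digit md5 of the absolute path is spliced into the first
  dot-separated component of the colliding name."""
  seen = set()
  out = []
  for path in paths:
    full = os.path.abspath(path)
    dirname, base = os.path.split(full)
    if base not in seen:
      seen.add(base)
      out.append(path)
      continue
    h = hashlib.md5(full.encode('utf-8')).hexdigest()[:8]
    parts = base.split('.')
    parts[0] += '_' + h
    newbase = '.'.join(parts)
    seen.add(newbase)
    out.append(os.path.join(dirname, newbase))
  return out


# uniquify_basenames(['a/common.o', 'b/common.o'])
# -> ['a/common.o', '/abs/cwd/b/common_<hash>.o']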
14 changes: 8 additions & 6 deletions tests/test_core.py
Expand Up @@ -8,6 +8,7 @@
import hashlib
import json
import os
import random
import re
import shutil
import sys
@@ -5163,25 +5164,26 @@ def test_iostream_and_determinism(self):
return 0;
}
'''
    num = 3
    num = 5

    def test():
      print('(iteration)')
      time.sleep(1.0)
      time.sleep(random.random() / (10 * num)) # add some timing nondeterminism here, not that we need it, but whatever
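      # If timestamps or other time-dependent state ever leaked into the
      # output, the varying delay between iterations would show up as a byte
      # difference between builds.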
      self.do_run(src, 'hello world\n77.\n')
      ret = open('src.cpp.o.js', 'rb').read()
      if self.get_setting('WASM') and not self.get_setting('WASM2JS'):
        ret += open('src.cpp.o.wasm', 'rb').read()
      return ret

    builds = [test() for i in range(num)]
    print([len(b) for b in builds])
    print(list(map(len, builds)))
    uniques = set(builds)
    if len(uniques) != 1:
      for i, unique in enumerate(uniques):
      i = 0
      for unique in uniques:
        open('unique_' + str(i) + '.js', 'wb').write(unique)
      # builds must be deterministic, see unique_N.js
      self.assertEqual(len(uniques), 1)
        i += 1
      assert 0, 'builds must be deterministic, see unique_X.js'

  def test_stdvec(self):
    self.do_run_in_out_file_test('tests', 'core', 'test_stdvec')
37 changes: 32 additions & 5 deletions tests/test_other.py
@@ -1450,10 +1450,16 @@ def test_archive_duplicate_basenames(self):
    ''')
    run_process([PYTHON, EMCC, os.path.join('b', 'common.c'), '-c', '-o', os.path.join('b', 'common.o')])

    try_delete('libdup.a')
    run_process([PYTHON, EMAR, 'rc', 'libdup.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
    text = run_process([PYTHON, EMAR, 't', 'libdup.a'], stdout=PIPE).stdout
    self.assertEqual(text.count('common.o'), 2)
    try_delete('liba.a')
    run_process([PYTHON, EMAR, 'rc', 'liba.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])

    # Verify that the archive contains basenames with hashes, to avoid duplication
    text = run_process([PYTHON, EMAR, 't', 'liba.a'], stdout=PIPE).stdout
    self.assertEqual(text.count('common.o'), 1)
    self.assertContained('common_', text)
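    # Only one member keeps the plain name; the colliding member gets a short
    # hash suffix, so 'common.o' appears once and 'common_' appears in the
    # renamed entry.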
    for line in text.split('\n'):
      # should not have huge hash names
      self.assertLess(len(line), 20, line)

    create_test_file('main.c', r'''
void a(void);
@@ -1463,9 +1469,30 @@ def test_archive_duplicate_basenames(self):
b();
}
    ''')
    run_process([PYTHON, EMCC, 'main.c', '-L.', '-ldup'])
    err = run_process([PYTHON, EMCC, 'main.c', '-L.', '-la'], stderr=PIPE).stderr
    self.assertNotIn('archive file contains duplicate entries', err)
    self.assertContained('a\nb...\n', run_js('a.out.js'))

    # Using llvm-ar directly should cause duplicate basenames
    try_delete('libdup.a')
    run_process([LLVM_AR, 'rc', 'libdup.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
    text = run_process([PYTHON, EMAR, 't', 'libdup.a'], stdout=PIPE).stdout
    assert text.count('common.o') == 2, text
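    # The ar format itself is fine with this: members are addressed by
    # position, not by name, so llvm-ar happily stores both entries.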

    # With fastcomp we don't support duplicate members, so this should
    # generate a warning. With the wasm backend (lld) this is fully supported.
    cmd = [PYTHON, EMCC, 'main.c', '-L.', '-ldup']
    if self.is_wasm_backend():
      run_process(cmd)
      self.assertContained('a\nb...\n', run_js('a.out.js'))
    else:
      err = self.expect_fail(cmd)
      self.assertIn('libdup.a: archive file contains duplicate entries', err)
      self.assertIn('error: undefined symbol: a', err)
      # others are not duplicates - the hashing keeps them separate
      self.assertEqual(err.count('duplicate: '), 1)
      self.assertContained('a\nb...\n', run_js('a.out.js'))

  def test_export_from_archive(self):
    export_name = 'this_is_an_entry_point'
    full_export_name = '_' + export_name
191 changes: 0 additions & 191 deletions tools/arfile.py

This file was deleted.
