emscripten-core · juj · Nov 15, 2021 · Nov 2, 2021 · Nov 2, 2021 · Nov 4, 2021
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -430,7 +430,7 @@ jobs:
       # note we do *not* build all libraries and freeze the cache; as we run
       # only limited tests here, it's more efficient to build on demand
       - run-tests:
-          test_targets: "other.test_emcc_cflags other.test_stdin other.test_bad_triple wasm2.test_sse1 wasm2.test_ccall other.test_closure_externs other.test_binaryen_debug other.test_js_optimizer_parse_error other.test_output_to_nowhere other.test_emcc_dev_null other.test_cmake* other.test_system_include_paths other.test_emar_response_file wasm2.test_utf16 other.test_special_chars_in_arguments other.test_toolchain_profiler other.test_realpath_nodefs"
+          test_targets: "other.test_emcc_cflags other.test_stdin other.test_bad_triple wasm2.test_sse1 wasm2.test_ccall other.test_closure_externs other.test_binaryen_debug other.test_js_optimizer_parse_error other.test_output_to_nowhere other.test_emcc_dev_null other.test_cmake* other.test_system_include_paths other.test_emar_response_file wasm2.test_utf16 other.test_special_chars_in_arguments other.test_toolchain_profiler other.test_realpath_nodefs other.test_response_file_encoding"
   test-mac:
     executor: mac
     environment:

diff --git a/ChangeLog.md b/ChangeLog.md
@@ -36,6 +36,10 @@ See docs/process.md for more on how version tagging works.
 2.0.33 - 11/01/2021
 -------------------
 - Bug fixes
+- Added support for specifying the text encoding of response files by passing
+  the encoding as a filename suffix (e.g. "a.rsp.utf-8" or "a.rsp.cp1252").
+  If not specified, the encoding is autodetected as either UTF-8 or the Python
+  default "locale.getpreferredencoding()". (#15406, #15292, #15426)
 
 2.0.32 - 10/19/2021
 -------------------

diff --git a/tests/test_other.py b/tests/test_other.py
@@ -10728,6 +10728,22 @@ def create_o(name, i):
     self.run_process(building.get_command_with_possible_response_file([EMCC, 'main.c'] + files))
     self.assertContained(str(count * (count - 1) // 2), self.run_js('a.out.js'))
 
+  # Tests that a response file's encoding is taken from its filename suffix, and is autodetected when no encoding suffix is present.
+  def test_response_file_encoding(self):
+    open('äö.c', 'w').write('int main(){}') # Source file with a non-ASCII name. NOTE(review): handle is never explicitly closed - consider a with-block.
+
+    open('a.rsp', 'w', encoding='utf-8').write('äö.c') # Write a UTF-8 encoded response file with non-ASCII contents ...
+    self.run_process([EMCC, '@a.rsp']) # ... and test that, in the absence of an encoding suffix, it is read successfully (UTF-8 is tried first).
+
+    open('a.rsp.cp437', 'w', encoding='cp437').write('äö.c') # Write a response file encoded as CP-437 ...
+    self.run_process([EMCC, '@a.rsp.cp437']) # ... and test that, with the explicit encoding suffix present, it is properly decoded.
+
+    import locale
+    preferred_encoding = locale.getpreferredencoding(do_setlocale=False)
+    print('Python locale preferredencoding: ' + preferred_encoding)
+    open('a.rsp', 'w', encoding=preferred_encoding).write('äö.c') # Overwrite a.rsp using Python's preferred locale encoding ...
+    self.run_process([EMCC, '@a.rsp']) # ... and test that it is still read successfully without an encoding suffix.
+
   def test_output_name_collision(self):
     # Ensure that the seconday filenames never collide with the primary output filename
     # In this case we explcitly ask for JS to be ceated in a file with the `.wasm` suffix.

diff --git a/tools/response_file.py b/tools/response_file.py
@@ -13,13 +13,16 @@
 DEBUG = int(os.environ.get('EMCC_DEBUG', '0'))
 
 
-def create_response_file(args, directory):
+def create_response_file(args, directory, suffix='.rsp.utf-8'):
   """Routes the given cmdline param list in args into a new response file and
   returns the filename to it.
 
-  The returned filename has a suffix '.rsp'.
+  By default the returned filename has a suffix '.rsp.utf-8'. Pass a suffix parameter to override.
   """
-  response_fd, response_filename = tempfile.mkstemp(prefix='emscripten_', suffix='.rsp', dir=directory, text=True)
+
+  assert suffix.startswith('.')
+
+  response_fd, response_filename = tempfile.mkstemp(prefix='emscripten_', suffix=suffix, dir=directory, text=True)
 
   # Backslashes and other special chars need to be escaped in the response file.
   escape_chars = ['\\', '\"']
@@ -41,16 +44,12 @@ def escape(arg):
       arg = '"%s"' % arg
     contents += arg + '\n'
 
-  # When writing windows repsonse files force the encoding to UTF8 which we know
-  # that llvm tools understand.  Without this, we get whatever the default codepage
-  # might be.
-  # See: https://github.com/llvm/llvm-project/blob/3f3d1c901d7abcc5b91468335679b1b27d8a02dd/llvm/include/llvm/Support/Program.h#L168-L170
-  # And: https://github.com/llvm/llvm-project/blob/63d16d06f5b8f71382033b5ea4aa668f8150817a/clang/include/clang/Driver/Job.h#L58-L69
-  # TODO(sbc): Should we also force utf-8 on non-windows?
-  if WINDOWS:
-    encoding = 'utf-8'
+  # Decide the encoding of the generated file based on the requested file suffix
+  if suffix.count('.') == 2:
+    # Use the encoding specified in the suffix of the response file
+    encoding = suffix.split('.')[2]
   else:
-    encoding = None
+    encoding = 'utf-8'
 
   with os.fdopen(response_fd, 'w', encoding=encoding) as f:
     f.write(contents)
@@ -70,15 +69,37 @@ def read_response_file(response_filename):
   """Reads a response file, and returns the list of cmdline params found in the
   file.
 
+  The encoding that the response file should be read with can be specified
+  as a suffix to the filename, e.g. "foo.rsp.utf-8" or "foo.rsp.cp1252". If not
+  specified, first UTF-8 and then Python locale.getpreferredencoding() are
+  attempted.
+
   The parameter response_filename may start with '@'."""
   if response_filename.startswith('@'):
     response_filename = response_filename[1:]
 
   if not os.path.exists(response_filename):
     raise IOError("response file not found: %s" % response_filename)
 
-  with open(response_filename) as f:
-    args = f.read()
+  # Guess encoding based on the file suffix
+  components = os.path.basename(response_filename).split('.')
+  encoding_suffix = components[-1].lower()
+  if len(components) > 1 and (encoding_suffix.startswith('utf') or encoding_suffix.startswith('cp') or encoding_suffix.startswith('iso') or encoding_suffix in ['ascii', 'latin-1']):
+    guessed_encoding = encoding_suffix
+  else:
+    guessed_encoding = 'utf-8'
+
+  try:
+    # First try with the guessed encoding
+    with open(response_filename, encoding=guessed_encoding) as f:
+      args = f.read()
+  except (ValueError, LookupError): # UnicodeDecodeError is a subclass of ValueError, and Python raises either a ValueError or a UnicodeDecodeError on decode errors. LookupError is raised if guessed encoding is not an encoding.
+    if DEBUG:
+      logging.warning(f'Failed to parse response file {response_filename} with guessed encoding "{guessed_encoding}". Trying default system encoding...')
+    # If that fails, try with the Python default locale.getpreferredencoding()
+    with open(response_filename) as f:
+      args = f.read()
+
   args = shlex.split(args)
 
   if DEBUG: