From 378cf65fc28b7c6690868f14928732b524883b28 Mon Sep 17 00:00:00 2001 From: Quratulain-bilal Date: Sun, 26 Apr 2026 17:29:17 +0500 Subject: [PATCH 1/3] fix(extensions): use explicit UTF-8 encoding when reading manifest YAML On Windows, Python's open() defaults to the system locale encoding (e.g., GBK on Chinese Windows), which causes UnicodeDecodeError when extension.yml or preset.yml contains non-ASCII content such as Chinese characters in description fields. Add encoding='utf-8' to ExtensionManifest._load_yaml and PresetManifest._load_yaml so manifests are read consistently across platforms. Fixes #2325 --- src/specify_cli/extensions.py | 2 +- src/specify_cli/presets.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/specify_cli/extensions.py b/src/specify_cli/extensions.py index 916038cd5f..ffdb21e2cd 100644 --- a/src/specify_cli/extensions.py +++ b/src/specify_cli/extensions.py @@ -139,7 +139,7 @@ def __init__(self, manifest_path: Path): def _load_yaml(self, path: Path) -> dict: """Load YAML file safely.""" try: - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: data = yaml.safe_load(f) except yaml.YAMLError as e: raise ValidationError(f"Invalid YAML in {path}: {e}") diff --git a/src/specify_cli/presets.py b/src/specify_cli/presets.py index 24de73521e..1b6971f61f 100644 --- a/src/specify_cli/presets.py +++ b/src/specify_cli/presets.py @@ -136,7 +136,7 @@ def __init__(self, manifest_path: Path): def _load_yaml(self, path: Path) -> dict: """Load YAML file safely.""" try: - with open(path, 'r') as f: + with open(path, 'r', encoding='utf-8') as f: return yaml.safe_load(f) or {} except yaml.YAMLError as e: raise PresetValidationError(f"Invalid YAML in {path}: {e}") From a2946773217e52768db115d9c1a5d198a45d2c22 Mon Sep 17 00:00:00 2001 From: Quratulain-bilal Date: Mon, 27 Apr 2026 19:49:17 +0500 Subject: [PATCH 2/3] test(extensions,presets): add UTF-8 manifest regression tests for #2325 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Positive: extension.yml/preset.yml with non-ASCII (Chinese + emoji) descriptions load correctly when written as UTF-8 bytes — fails on Windows without explicit encoding='utf-8'. Negative: files containing invalid UTF-8 bytes raise a clean error (ValidationError or UnicodeDecodeError), not a silent crash. --- tests/test_extensions.py | 29 +++++++++++++++++++++++++++++ tests/test_presets.py | 24 ++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/tests/test_extensions.py b/tests/test_extensions.py index e6a206c069..18fb9f1493 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -225,6 +225,35 @@ def test_non_mapping_yaml_raises_validation_error(self, temp_dir): with pytest.raises(ValidationError, match="YAML mapping"): ExtensionManifest(manifest_path) + def test_utf8_non_ascii_description_loads(self, temp_dir, valid_manifest_data): + """Regression for #2325: non-ASCII (UTF-8) description loads on any platform. + + On Windows, Python's default text-mode encoding is the locale codepage + (e.g. cp1252/GBK), which raises UnicodeDecodeError on UTF-8 bytes + outside the ASCII range. The loader must open with encoding='utf-8'. + """ + import yaml + + valid_manifest_data["extension"]["description"] = "中文测试 — émojis 🚀" + manifest_path = temp_dir / "extension.yml" + # Write UTF-8 bytes explicitly so the test exercises the read path, + # not the (locale-dependent) write path. + manifest_path.write_bytes( + yaml.safe_dump(valid_manifest_data, allow_unicode=True).encode("utf-8") + ) + + manifest = ExtensionManifest(manifest_path) + assert manifest.description == "中文测试 — émojis 🚀" + + def test_invalid_utf8_bytes_raises_validation_error(self, temp_dir): + """Negative case: file containing invalid UTF-8 bytes should not crash with raw UnicodeDecodeError.""" + manifest_path = temp_dir / "extension.yml" + # 0xFF/0xFE are not valid UTF-8 lead bytes. + manifest_path.write_bytes(b"\xff\xfe not valid utf-8 \xff\n") + + with pytest.raises((ValidationError, UnicodeDecodeError)): + ExtensionManifest(manifest_path) + def test_invalid_extension_id(self, temp_dir, valid_manifest_data): """Test manifest with invalid extension ID format.""" import yaml diff --git a/tests/test_presets.py b/tests/test_presets.py index ee4a6dddb1..0089738ba9 100644 --- a/tests/test_presets.py +++ b/tests/test_presets.py @@ -160,6 +160,30 @@ def test_invalid_yaml(self, temp_dir): with pytest.raises(PresetValidationError, match="Invalid YAML"): PresetManifest(bad_file) + def test_utf8_non_ascii_description_loads(self, temp_dir, valid_pack_data): + """Regression for #2325: non-ASCII (UTF-8) description loads on any platform. + + On Windows, Python's default text-mode encoding is the locale codepage + (e.g. cp1252/GBK), which raises UnicodeDecodeError on UTF-8 bytes + outside the ASCII range. The loader must open with encoding='utf-8'. + """ + valid_pack_data["preset"]["description"] = "中文测试 — émojis 🚀" + manifest_path = temp_dir / "preset.yml" + manifest_path.write_bytes( + yaml.safe_dump(valid_pack_data, allow_unicode=True).encode("utf-8") + ) + + manifest = PresetManifest(manifest_path) + assert manifest.description == "中文测试 — émojis 🚀" + + def test_invalid_utf8_bytes_raises_validation_error(self, temp_dir): + """Negative case: file containing invalid UTF-8 bytes should not crash silently.""" + manifest_path = temp_dir / "preset.yml" + manifest_path.write_bytes(b"\xff\xfe not valid utf-8 \xff\n") + + with pytest.raises((PresetValidationError, UnicodeDecodeError)): + PresetManifest(manifest_path) + def test_missing_schema_version(self, temp_dir, valid_pack_data): """Test missing schema_version field.""" del valid_pack_data["schema_version"] From 474e88cf490f57812d3aeac77963d45511474d99 Mon Sep 17 00:00:00 2001 From: Quratulain-bilal Date: Mon, 27 Apr 2026 21:29:55 +0500 Subject: [PATCH 3/3] fix(extensions,presets): wrap I/O and decode errors as ValidationError Address remaining Copilot concerns on #2370: - Catch UnicodeDecodeError and OSError in both manifest loaders and re-raise as ValidationError / PresetValidationError so callers see a consistent error type, not a bare decode/IO traceback. - Validate that PresetManifest YAML root is a mapping (extensions.py already had this; presets.py was missing it). Treat None as {} for empty-file compatibility. - Tighten the negative regression tests to assert the specific message, and add a non-mapping-root test for PresetManifest matching the existing one for ExtensionManifest. --- src/specify_cli/extensions.py | 6 ++++++ src/specify_cli/presets.py | 15 ++++++++++++++- tests/test_extensions.py | 4 ++-- tests/test_presets.py | 12 ++++++++++-- 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/src/specify_cli/extensions.py b/src/specify_cli/extensions.py index ffdb21e2cd..a419ebf1d2 100644 --- a/src/specify_cli/extensions.py +++ b/src/specify_cli/extensions.py @@ -145,6 +145,12 @@ def _load_yaml(self, path: Path) -> dict: raise ValidationError(f"Invalid YAML in {path}: {e}") except FileNotFoundError: raise ValidationError(f"Manifest not found: {path}") + except UnicodeDecodeError as e: + raise ValidationError( + f"Manifest is not valid UTF-8: {path} ({e.reason} at byte {e.start})" + ) + except OSError as e: + raise ValidationError(f"Could not read manifest {path}: {e}") if not isinstance(data, dict): raise ValidationError( f"Manifest must be a YAML mapping, got {type(data).__name__}: {path}" diff --git a/src/specify_cli/presets.py b/src/specify_cli/presets.py index 1b6971f61f..27054a77fc 100644 --- a/src/specify_cli/presets.py +++ b/src/specify_cli/presets.py @@ -137,11 +137,24 @@ def _load_yaml(self, path: Path) -> dict: """Load YAML file safely.""" try: with open(path, 'r', encoding='utf-8') as f: - return yaml.safe_load(f) or {} + data = yaml.safe_load(f) except yaml.YAMLError as e: raise PresetValidationError(f"Invalid YAML in {path}: {e}") except FileNotFoundError: raise PresetValidationError(f"Manifest not found: {path}") + except UnicodeDecodeError as e: + raise PresetValidationError( + f"Manifest is not valid UTF-8: {path} ({e.reason} at byte {e.start})" + ) + except OSError as e: + raise PresetValidationError(f"Could not read manifest {path}: {e}") + if data is None: + return {} + if not isinstance(data, dict): + raise PresetValidationError( + f"Manifest must be a YAML mapping, got {type(data).__name__}: {path}" + ) + return data def _validate(self): """Validate manifest structure and required fields.""" diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 18fb9f1493..c5be0ab4f3 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -246,12 +246,12 @@ def test_utf8_non_ascii_description_loads(self, temp_dir, valid_manifest_data): assert manifest.description == "中文测试 — émojis 🚀" def test_invalid_utf8_bytes_raises_validation_error(self, temp_dir): - """Negative case: file containing invalid UTF-8 bytes should not crash with raw UnicodeDecodeError.""" + """Negative case: file containing invalid UTF-8 bytes raises ValidationError, not raw UnicodeDecodeError.""" manifest_path = temp_dir / "extension.yml" # 0xFF/0xFE are not valid UTF-8 lead bytes. manifest_path.write_bytes(b"\xff\xfe not valid utf-8 \xff\n") - with pytest.raises((ValidationError, UnicodeDecodeError)): + with pytest.raises(ValidationError, match="not valid UTF-8"): ExtensionManifest(manifest_path) def test_invalid_extension_id(self, temp_dir, valid_manifest_data): diff --git a/tests/test_presets.py b/tests/test_presets.py index 0089738ba9..4b167ed9be 100644 --- a/tests/test_presets.py +++ b/tests/test_presets.py @@ -177,13 +177,21 @@ def test_utf8_non_ascii_description_loads(self, temp_dir, valid_pack_data): assert manifest.description == "中文测试 — émojis 🚀" def test_invalid_utf8_bytes_raises_validation_error(self, temp_dir): - """Negative case: file containing invalid UTF-8 bytes should not crash silently.""" + """Negative case: file containing invalid UTF-8 bytes raises PresetValidationError, not raw UnicodeDecodeError.""" manifest_path = temp_dir / "preset.yml" manifest_path.write_bytes(b"\xff\xfe not valid utf-8 \xff\n") - with pytest.raises((PresetValidationError, UnicodeDecodeError)): + with pytest.raises(PresetValidationError, match="not valid UTF-8"): PresetManifest(manifest_path) + def test_non_mapping_yaml_raises_validation_error(self, temp_dir): + """Manifest whose YAML root is a scalar or list raises PresetValidationError, not TypeError.""" + manifest_path = temp_dir / "preset.yml" + for bad_content in ("42\n", "[1, 2]\n"): + manifest_path.write_text(bad_content, encoding="utf-8") + with pytest.raises(PresetValidationError, match="YAML mapping"): + PresetManifest(manifest_path) + def test_missing_schema_version(self, temp_dir, valid_pack_data): """Test missing schema_version field.""" del valid_pack_data["schema_version"]