diff --git a/src/codegen/git/repo_operator/repo_operator.py b/src/codegen/git/repo_operator/repo_operator.py index 9f1b66ca9..dd00d7e42 100644 --- a/src/codegen/git/repo_operator/repo_operator.py +++ b/src/codegen/git/repo_operator/repo_operator.py @@ -1,3 +1,4 @@ +import codecs import fnmatch import glob import logging @@ -577,6 +578,25 @@ def get_filepaths_for_repo(self, ignore_list): if ignore_list: filepaths = [f for f in filepaths if not any(fnmatch.fnmatch(f, pattern) or f.startswith(pattern) for pattern in ignore_list)] + # Fix bug where unicode characters are not handled correctly + for i, filepath in enumerate(filepaths): + # Check if it is one of the broken cases + if filepath.startswith('"'): + # Step 1: Strip the quotes + filepath = filepath.strip('"').strip("'") + + # Step 2: Convert the Python string to raw ASCII bytes (so \\ stays as two 0x5C). + raw_filepath = filepath.encode("ascii") + + # Step 3: Use escape_decode to process backslash escapes like \346 -> 0xE6 + decoded_filepath, _ = codecs.escape_decode(raw_filepath) + + # Step 4: Decode those bytes as UTF-8 to get the actual Unicode text + filepath = decoded_filepath.decode("utf-8") + + # Step 5: Replace the original filepath with the decoded filepath + filepaths[i] = filepath + return filepaths # TODO: unify param naming i.e. 
def test_unicode_in_filename(tmpdir) -> None:
    """Check that a file with a non-ASCII path can be fetched and read back.

    The session is created with one plain ASCII file and one file whose path
    mixes CJK characters, a space, an apostrophe and an emoji. The test then
    looks the second file up by that exact path and compares its content.
    """
    unicode_path = "test/我很喜欢冰激淋/test-file 12'3_🍦.py"
    source_text = "print('Hello, world!')"
    # Insertion order matches the original literal: ASCII file first.
    session_files = {"ascii.py": source_text, unicode_path: source_text}

    with get_codebase_session(
        tmpdir=tmpdir,
        files=session_files,
        programming_language=ProgrammingLanguage.PYTHON,
        verify_output=True,
    ) as codebase:
        unicode_file = codebase.get_file(unicode_path)
        assert unicode_file is not None
        assert unicode_file.content == source_text