From 3aec21ffa8922e24fac85b8ef4852d095402749d Mon Sep 17 00:00:00 2001 From: tkucar Date: Thu, 20 Feb 2025 02:35:08 +0100 Subject: [PATCH 1/5] patch --- src/codegen/sdk/codebase/codebase_context.py | 6 ++- .../codebase_graph/test_codebase_graph.py | 38 +++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/codegen/sdk/codebase/codebase_context.py b/src/codegen/sdk/codebase/codebase_context.py index 957efe708..c4337683d 100644 --- a/src/codegen/sdk/codebase/codebase_context.py +++ b/src/codegen/sdk/codebase/codebase_context.py @@ -474,7 +474,11 @@ def _process_diff_files(self, files_to_sync: Mapping[SyncType, list[Path]], incr task = self.progress.begin("Adding new files", count=len(files_to_sync[SyncType.ADD])) for idx, filepath in enumerate(files_to_sync[SyncType.ADD]): task.update(f"Adding {self.to_relative(filepath)}", count=idx) - content = self.io.read_text(filepath) + try: + content = self.io.read_text(filepath) + except UnicodeDecodeError as e: + logger.warning(f"Can't read file at:{filepath} since it contains non-unicode characters. File will be ignored!") + continue # TODO: this is wrong with context changes if filepath.suffix in self.extensions: file_cls = self.node_classes.file_cls diff --git a/tests/unit/codegen/sdk/codebase/codebase_graph/test_codebase_graph.py b/tests/unit/codegen/sdk/codebase/codebase_graph/test_codebase_graph.py index a5e8f5d5f..1583875d1 100644 --- a/tests/unit/codegen/sdk/codebase/codebase_graph/test_codebase_graph.py +++ b/tests/unit/codegen/sdk/codebase/codebase_graph/test_codebase_graph.py @@ -58,3 +58,41 @@ def __init__(self): assert len(import_resolution_edges) == 4 assert len(file_contains_node_edges) == 14 assert len(symbol_usage_edges) == 6 + + +def test_codebase_broken_file(tmpdir) -> None: + # language=python + content = """ +from some_file import x, y, z +import numpy as np + +global_var_1 = 1 +global_var_2 = 2 + +def foo(): + return bar() + +def bar(): + return 42 + +class MyClass: + def __init__(self): + pass + +class MySubClass(MyClass): + def __init__(self): + super().__init__() + pass + """ + content_broken=bytes('你好','big5hkscs') + with get_codebase_session(tmpdir=tmpdir, files={"test.py": content,"test2.py": content_broken}) as codebase: + assert codebase is not None + assert isinstance(codebase.ctx, CodebaseContext) + import_resolution_edges = [edge for edge in codebase.ctx.edges if edge[2].type == EdgeType.IMPORT_SYMBOL_RESOLUTION] + file_contains_node_edges = list(itertools.chain.from_iterable(file.get_nodes() for file in codebase.files)) + symbol_usage_edges = [edge for edge in codebase.ctx.edges if edge[2].type == EdgeType.SYMBOL_USAGE] + + assert len(import_resolution_edges) == 4 + assert len(file_contains_node_edges) == 14 + assert len(symbol_usage_edges) == 6 + From ac16d6ee2e2bfec153afb37d6d77efccbec06ade Mon Sep 17 00:00:00 2001 From: tomcodgen <191515280+tomcodgen@users.noreply.github.com> Date: Thu, 20 Feb 2025 01:37:02 +0000 Subject: [PATCH 2/5] Automated pre-commit update --- .../sdk/codebase/codebase_graph/test_codebase_graph.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/unit/codegen/sdk/codebase/codebase_graph/test_codebase_graph.py b/tests/unit/codegen/sdk/codebase/codebase_graph/test_codebase_graph.py index 1583875d1..61c8e5534 100644 --- a/tests/unit/codegen/sdk/codebase/codebase_graph/test_codebase_graph.py +++ b/tests/unit/codegen/sdk/codebase/codebase_graph/test_codebase_graph.py @@ -84,8 +84,8 @@ def __init__(self): super().__init__() pass """ - content_broken=bytes('你好','big5hkscs') - with get_codebase_session(tmpdir=tmpdir, files={"test.py": content,"test2.py": content_broken}) as codebase: + content_broken = bytes("你好", "big5hkscs") + with get_codebase_session(tmpdir=tmpdir, files={"test.py": content, "test2.py": content_broken}) as codebase: assert codebase is not None assert isinstance(codebase.ctx, CodebaseContext) import_resolution_edges = [edge for edge in codebase.ctx.edges if edge[2].type == EdgeType.IMPORT_SYMBOL_RESOLUTION] @@ -95,4 +95,3 @@ def __init__(self): assert len(import_resolution_edges) == 4 assert len(file_contains_node_edges) == 14 assert len(symbol_usage_edges) == 6 - From 25ed0a52ec08a5d04cae1e7684a02aff95d28531 Mon Sep 17 00:00:00 2001 From: tkucar Date: Thu, 20 Feb 2025 02:54:15 +0100 Subject: [PATCH 3/5] fallback --- src/codegen/sdk/codebase/codebase_context.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/codegen/sdk/codebase/codebase_context.py b/src/codegen/sdk/codebase/codebase_context.py index c4337683d..77a61a446 100644 --- a/src/codegen/sdk/codebase/codebase_context.py +++ b/src/codegen/sdk/codebase/codebase_context.py @@ -477,8 +477,12 @@ def _process_diff_files(self, files_to_sync: Mapping[SyncType, list[Path]], incr try: content = self.io.read_text(filepath) except UnicodeDecodeError as e: - logger.warning(f"Can't read file at:{filepath} since it contains non-unicode characters. File will be ignored!") - continue + logger.warning(f"File at:{filepath} contains non-unicode characters. Attempting to decode with replacement characters.") + try: + content = self.io.read_text(filepath, errors='replace') + except Exception as e: + logger.warning(f"Failed to decode file even with replacement: {e}. File will be ignored!") + continue # TODO: this is wrong with context changes if filepath.suffix in self.extensions: file_cls = self.node_classes.file_cls From 2f65c683f0140b8f687ec6e9a64635fcd641f954 Mon Sep 17 00:00:00 2001 From: tomcodgen <191515280+tomcodgen@users.noreply.github.com> Date: Thu, 20 Feb 2025 01:55:48 +0000 Subject: [PATCH 4/5] Automated pre-commit update --- src/codegen/sdk/codebase/codebase_context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codegen/sdk/codebase/codebase_context.py b/src/codegen/sdk/codebase/codebase_context.py index 77a61a446..ed9a38ee8 100644 --- a/src/codegen/sdk/codebase/codebase_context.py +++ b/src/codegen/sdk/codebase/codebase_context.py @@ -479,7 +479,7 @@ def _process_diff_files(self, files_to_sync: Mapping[SyncType, list[Path]], incr except UnicodeDecodeError as e: logger.warning(f"File at:{filepath} contains non-unicode characters. Attempting to decode with replacement characters.") try: - content = self.io.read_text(filepath, errors='replace') + content = self.io.read_text(filepath, errors="replace") except Exception as e: logger.warning(f"Failed to decode file even with replacement: {e}. File will be ignored!") continue From cfb66786ea853d8c5ebc4fb205ac6c8509955f6d Mon Sep 17 00:00:00 2001 From: tkucar Date: Thu, 20 Feb 2025 18:33:59 +0100 Subject: [PATCH 5/5] revert fallback --- src/codegen/sdk/codebase/codebase_context.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/codegen/sdk/codebase/codebase_context.py b/src/codegen/sdk/codebase/codebase_context.py index ed9a38ee8..c4337683d 100644 --- a/src/codegen/sdk/codebase/codebase_context.py +++ b/src/codegen/sdk/codebase/codebase_context.py @@ -477,12 +477,8 @@ def _process_diff_files(self, files_to_sync: Mapping[SyncType, list[Path]], incr try: content = self.io.read_text(filepath) except UnicodeDecodeError as e: - logger.warning(f"File at:{filepath} contains non-unicode characters. Attempting to decode with replacement characters.") - try: - content = self.io.read_text(filepath, errors="replace") - except Exception as e: - logger.warning(f"Failed to decode file even with replacement: {e}. File will be ignored!") - continue + logger.warning(f"Can't read file at:{filepath} since it contains non-unicode characters. File will be ignored!") + continue # TODO: this is wrong with context changes if filepath.suffix in self.extensions: file_cls = self.node_classes.file_cls