From 87788681f13d08cf6cd732e472536ec58d09ccae Mon Sep 17 00:00:00 2001 From: Edward Li Date: Tue, 18 Feb 2025 16:38:07 -0800 Subject: [PATCH 01/13] Add a minimum language ratio for language detection --- src/codegen/git/utils/language.py | 34 ++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/src/codegen/git/utils/language.py b/src/codegen/git/utils/language.py index 7701a3459..597e70225 100644 --- a/src/codegen/git/utils/language.py +++ b/src/codegen/git/utils/language.py @@ -5,6 +5,9 @@ from codegen.git.utils.file_utils import split_git_path from codegen.shared.enums.programming_language import ProgrammingLanguage +# Minimum ratio of files that must match the dominant language +MIN_LANGUAGE_RATIO = 0.1 + def determine_project_language(folder_path: str, strategy: Literal["most_common", "git_most_common", "package_json"] = "git_most_common") -> ProgrammingLanguage: """Determines the primary programming language of a project. @@ -28,7 +31,6 @@ def determine_project_language(folder_path: str, strategy: Literal["most_common" msg = f"Invalid strategy: {strategy}" raise ValueError(msg) - def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage: """Analyzes a folder to determine the primary programming language based on file extensions. Returns the language with the most matching files. 
@@ -38,6 +40,7 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage: Returns: ProgrammingLanguage: The dominant programming language, or UNSUPPORTED if no matching files found + or if less than MIN_LANGUAGE_RATIO of files match the dominant language """ from codegen.sdk.python import PyFile from codegen.sdk.typescript.file import TSFile @@ -54,6 +57,7 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage: # Initialize counters for each language language_counts = Counter() + total_files = 0 # Walk through the directory for file_path in folder.rglob("*"): @@ -65,6 +69,8 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage: if any(ignore in str(file_path) for ignore in [".git", "node_modules", "__pycache__", "venv", ".env"]): continue + total_files += 1 + # Count files for each language based on extensions for language, exts in EXTENSIONS.items(): if file_path.suffix in exts: @@ -74,8 +80,14 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage: if not language_counts: return ProgrammingLanguage.UNSUPPORTED - # Return the language with the highest count - return language_counts.most_common(1)[0][0] + # Get the most common language and its count + most_common_language, count = language_counts.most_common(1)[0] + + # Check if the most common language makes up at least MIN_LANGUAGE_RATIO of all files + if total_files > 0 and (count / total_files) < MIN_LANGUAGE_RATIO: + return ProgrammingLanguage.UNSUPPORTED + + return most_common_language def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLanguage: @@ -87,6 +99,7 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua Returns: ProgrammingLanguage: The dominant programming language, or UNSUPPORTED if no matching files found + or if less than MIN_LANGUAGE_RATIO of files match the dominant language """ from codegen.git.repo_operator.local_repo_operator import 
LocalRepoOperator from codegen.git.schemas.repo_config import RepoConfig @@ -105,6 +118,7 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua # Initialize counters for each language language_counts = Counter() + total_files = 0 # Initiate LocalRepoOperator git_root, base_path = split_git_path(folder_path) @@ -120,6 +134,8 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua if file_path.is_dir() or file_path.name.startswith("."): continue + total_files += 1 + # Count files for each language based on extensions for language, exts in EXTENSIONS.items(): if file_path.suffix in exts: @@ -129,8 +145,16 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua if not language_counts: return ProgrammingLanguage.UNSUPPORTED - # Return the language with the highest count - return language_counts.most_common(1)[0][0] + # Get the most common language and its count + most_common_language, count = language_counts.most_common(1)[0] + + print(f"Most common language: {most_common_language}, count: {count}, total files: {total_files}") + + # Check if the most common language makes up at least MIN_LANGUAGE_RATIO of all files + if total_files > 0 and (count / total_files) < MIN_LANGUAGE_RATIO: + return ProgrammingLanguage.UNSUPPORTED + + return most_common_language def _determine_language_by_package_json(folder_path: str) -> ProgrammingLanguage: From 7483b631fb3c40e3e5acfc2177b23f854fb29b7d Mon Sep 17 00:00:00 2001 From: Edward Li Date: Tue, 18 Feb 2025 16:39:12 -0800 Subject: [PATCH 02/13] Remove debug code --- src/codegen/git/utils/language.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/codegen/git/utils/language.py b/src/codegen/git/utils/language.py index 597e70225..05699a76b 100644 --- a/src/codegen/git/utils/language.py +++ b/src/codegen/git/utils/language.py @@ -148,8 +148,6 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua # Get the most common 
language and its count most_common_language, count = language_counts.most_common(1)[0] - print(f"Most common language: {most_common_language}, count: {count}, total files: {total_files}") - # Check if the most common language makes up at least MIN_LANGUAGE_RATIO of all files if total_files > 0 and (count / total_files) < MIN_LANGUAGE_RATIO: return ProgrammingLanguage.UNSUPPORTED From 44f2849ba2b2695dfb00686d711534744c409ce3 Mon Sep 17 00:00:00 2001 From: Edward Li Date: Tue, 18 Feb 2025 16:39:58 -0800 Subject: [PATCH 03/13] Add some logging --- src/codegen/git/utils/language.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/codegen/git/utils/language.py b/src/codegen/git/utils/language.py index 05699a76b..4459eee78 100644 --- a/src/codegen/git/utils/language.py +++ b/src/codegen/git/utils/language.py @@ -1,3 +1,4 @@ +import logging from collections import Counter from pathlib import Path from typing import Literal @@ -5,6 +6,8 @@ from codegen.git.utils.file_utils import split_git_path from codegen.shared.enums.programming_language import ProgrammingLanguage +logger = logging.getLogger(__name__) + # Minimum ratio of files that must match the dominant language MIN_LANGUAGE_RATIO = 0.1 @@ -83,6 +86,8 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage: # Get the most common language and its count most_common_language, count = language_counts.most_common(1)[0] + logger.debug(f"Most common language: {most_common_language}, count: {count}, total files: {total_files}") + # Check if the most common language makes up at least MIN_LANGUAGE_RATIO of all files if total_files > 0 and (count / total_files) < MIN_LANGUAGE_RATIO: return ProgrammingLanguage.UNSUPPORTED @@ -148,6 +153,8 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua # Get the most common language and its count most_common_language, count = language_counts.most_common(1)[0] + logger.debug(f"Most common language: {most_common_language}, 
count: {count}, total files: {total_files}") + # Check if the most common language makes up at least MIN_LANGUAGE_RATIO of all files if total_files > 0 and (count / total_files) < MIN_LANGUAGE_RATIO: return ProgrammingLanguage.UNSUPPORTED @@ -167,6 +174,8 @@ def _determine_language_by_package_json(folder_path: str) -> ProgrammingLanguage """ package_json_path = Path(folder_path) / "package.json" if package_json_path.exists(): + logger.debug(f"Found package.json at {package_json_path}") return ProgrammingLanguage.TYPESCRIPT else: + logger.debug(f"No package.json found at {package_json_path}") return ProgrammingLanguage.PYTHON From d66ebf0e5476d1e273b0c2456727cd902c211eb8 Mon Sep 17 00:00:00 2001 From: Edward Li Date: Tue, 18 Feb 2025 16:48:22 -0800 Subject: [PATCH 04/13] Enable generic "Unsupported" language parsing --- src/codegen/sdk/codebase/codebase_context.py | 5 +++-- .../node_classes/generic_node_classes.py | 22 +++++++++++++++++++ src/codegen/sdk/core/file.py | 5 +++++ 3 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 src/codegen/sdk/codebase/node_classes/generic_node_classes.py diff --git a/src/codegen/sdk/codebase/codebase_context.py b/src/codegen/sdk/codebase/codebase_context.py index 78a1577ed..13d501637 100644 --- a/src/codegen/sdk/codebase/codebase_context.py +++ b/src/codegen/sdk/codebase/codebase_context.py @@ -75,8 +75,9 @@ def get_node_classes(programming_language: ProgrammingLanguage) -> NodeClasses: return TSNodeClasses else: - msg = f"Unsupported programming language: {programming_language}!" 
- raise ValueError(msg) + from codegen.sdk.codebase.node_classes.generic_node_classes import GenericNodeClasses + + return GenericNodeClasses class CodebaseContext: diff --git a/src/codegen/sdk/codebase/node_classes/generic_node_classes.py b/src/codegen/sdk/codebase/node_classes/generic_node_classes.py new file mode 100644 index 000000000..a3b67a8ab --- /dev/null +++ b/src/codegen/sdk/codebase/node_classes/generic_node_classes.py @@ -0,0 +1,22 @@ +from codegen.sdk.codebase.node_classes.node_classes import NodeClasses +from codegen.sdk.core.class_definition import Class +from codegen.sdk.core.detached_symbols.code_block import CodeBlock +from codegen.sdk.core.detached_symbols.function_call import FunctionCall +from codegen.sdk.core.detached_symbols.parameter import Parameter +from codegen.sdk.core.file import File +from codegen.sdk.core.function import Function +from codegen.sdk.core.import_resolution import Import +from codegen.sdk.core.statements.comment import Comment + +GenericNodeClasses = NodeClasses( + file_cls=File, + class_cls=Class, + function_cls=Function, + import_cls=Import, + parameter_cls=Parameter, + comment_cls=Comment, + code_block_cls=CodeBlock, + function_call_cls=FunctionCall, + bool_conversion={}, + dynamic_import_parent_types={}, +) diff --git a/src/codegen/sdk/core/file.py b/src/codegen/sdk/core/file.py index 9164994a7..9b3c05425 100644 --- a/src/codegen/sdk/core/file.py +++ b/src/codegen/sdk/core/file.py @@ -398,6 +398,11 @@ def replace(self, old: str, new: str, count: int = -1, is_regex: bool = False, p else: return super().replace(old, new, count, is_regex, priority) + @staticmethod + @noapidoc + def get_extensions() -> list[str]: + """Returns a list of file extensions for the given programming language file.""" + return [] # By default, no extensions are "supported" for generic files TImport = TypeVar("TImport", bound="Import") TFunction = TypeVar("TFunction", bound="Function") From 2cf9f059db119dc9ebe577daa69de95482e4ddc4 Mon Sep 17 
00:00:00 2001 From: Edward Li Date: Tue, 18 Feb 2025 16:54:07 -0800 Subject: [PATCH 05/13] Add "OTHER" as a language --- src/codegen/git/utils/language.py | 12 ++++++------ src/codegen/shared/enums/programming_language.py | 1 + 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/codegen/git/utils/language.py b/src/codegen/git/utils/language.py index 4459eee78..2c55c9343 100644 --- a/src/codegen/git/utils/language.py +++ b/src/codegen/git/utils/language.py @@ -42,7 +42,7 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage: folder_path (str): Path to the folder to analyze Returns: - ProgrammingLanguage: The dominant programming language, or UNSUPPORTED if no matching files found + ProgrammingLanguage: The dominant programming language, or OTHER if no matching files found or if less than MIN_LANGUAGE_RATIO of files match the dominant language """ from codegen.sdk.python import PyFile @@ -81,7 +81,7 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage: # If no files found, return None if not language_counts: - return ProgrammingLanguage.UNSUPPORTED + return ProgrammingLanguage.OTHER # Get the most common language and its count most_common_language, count = language_counts.most_common(1)[0] @@ -90,7 +90,7 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage: # Check if the most common language makes up at least MIN_LANGUAGE_RATIO of all files if total_files > 0 and (count / total_files) < MIN_LANGUAGE_RATIO: - return ProgrammingLanguage.UNSUPPORTED + return ProgrammingLanguage.OTHER return most_common_language @@ -103,7 +103,7 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua folder_path (str): Path to the git repo to analyze Returns: - ProgrammingLanguage: The dominant programming language, or UNSUPPORTED if no matching files found + ProgrammingLanguage: The dominant programming language, or OTHER if no matching files found or if less 
than MIN_LANGUAGE_RATIO of files match the dominant language """ from codegen.git.repo_operator.local_repo_operator import LocalRepoOperator @@ -148,7 +148,7 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua # If no files found, return None if not language_counts: - return ProgrammingLanguage.UNSUPPORTED + return ProgrammingLanguage.OTHER # Get the most common language and its count most_common_language, count = language_counts.most_common(1)[0] @@ -157,7 +157,7 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua # Check if the most common language makes up at least MIN_LANGUAGE_RATIO of all files if total_files > 0 and (count / total_files) < MIN_LANGUAGE_RATIO: - return ProgrammingLanguage.UNSUPPORTED + return ProgrammingLanguage.OTHER return most_common_language diff --git a/src/codegen/shared/enums/programming_language.py b/src/codegen/shared/enums/programming_language.py index 8f33ccec6..28eb52d2b 100644 --- a/src/codegen/shared/enums/programming_language.py +++ b/src/codegen/shared/enums/programming_language.py @@ -4,4 +4,5 @@ class ProgrammingLanguage(StrEnum): TYPESCRIPT = "TYPESCRIPT" PYTHON = "PYTHON" + OTHER = "OTHER" UNSUPPORTED = "UNSUPPORTED" From e15c9909ad301c252e17c7501cab446873558477 Mon Sep 17 00:00:00 2001 From: Edward Li Date: Tue, 18 Feb 2025 17:02:32 -0800 Subject: [PATCH 06/13] Add support for querying files with `OTHER` language type --- src/codegen/sdk/core/codebase.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/codegen/sdk/core/codebase.py b/src/codegen/sdk/core/codebase.py index d4c347dfc..9b4fa9a76 100644 --- a/src/codegen/sdk/core/codebase.py +++ b/src/codegen/sdk/core/codebase.py @@ -259,7 +259,11 @@ def files(self, *, extensions: list[str] | Literal["*"] | None = None) -> list[T By default, this only returns source files. 
Setting `extensions='*'` will return all files in the codebase, and `extensions=[...]` will return all files with the specified extensions. - `extensions='*'` is REQUIRED for listing all non source code files. Or else, codebase.files will ONLY return source files (e.g. .py, .ts). + For Python and Typescript repos WITH file parsing enabled, + `extensions='*'` is REQUIRED for listing all non source code files. + Or else, codebase.files will ONLY return source files (e.g. .py, .ts). + + For repos with file parsing disabled or repos with other languages, this will return all files in the codebase. Returns all Files in the codebase, sorted alphabetically. For Python codebases, returns PyFiles (python files). For Typescript codebases, returns TSFiles (typescript files). @@ -267,7 +271,8 @@ def files(self, *, extensions: list[str] | Literal["*"] | None = None) -> list[T Returns: list[TSourceFile]: A sorted list of source files in the codebase. """ - if extensions is None: + if extensions is None and len(self.ctx.get_nodes(NodeType.FILE)) > 0: + # If extensions is None AND there is at least one file in the codebase (This checks for unsupported languages or parse-off repos), # Return all source files files = self.ctx.get_nodes(NodeType.FILE) elif isinstance(extensions, str) and extensions != "*": From 295253fa413b1b36ef503b594e6ae98c3c028c9f Mon Sep 17 00:00:00 2001 From: Edward Li Date: Tue, 18 Feb 2025 17:05:04 -0800 Subject: [PATCH 07/13] Add warning if unsupported language --- src/codegen/sdk/codebase/codebase_context.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/codegen/sdk/codebase/codebase_context.py b/src/codegen/sdk/codebase/codebase_context.py index 13d501637..57124dbf7 100644 --- a/src/codegen/sdk/codebase/codebase_context.py +++ b/src/codegen/sdk/codebase/codebase_context.py @@ -164,6 +164,11 @@ def __init__( self.language_engine = get_language_engine(context.programming_language, self) self.programming_language = context.programming_language + # 
Raise warning if language is not supported + if self.programming_language is ProgrammingLanguage.UNSUPPORTED or self.programming_language is ProgrammingLanguage.OTHER: + logger.warning("WARNING: The codebase is using an unsupported language!") + logger.warning("Some features may not work as expected. Advanced static analysis will be disabled but simple file IO will still work.") + # Build the graph self.build_graph(context.repo_operator) try: From ddd22fc1f3170eb62a5224402b7539eb7b369c6f Mon Sep 17 00:00:00 2001 From: EdwardJXLi <20020059+EdwardJXLi@users.noreply.github.com> Date: Wed, 19 Feb 2025 01:13:27 +0000 Subject: [PATCH 08/13] Automated pre-commit update --- src/codegen/git/utils/language.py | 1 + src/codegen/sdk/core/file.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/codegen/git/utils/language.py b/src/codegen/git/utils/language.py index 2c55c9343..4f8e52c49 100644 --- a/src/codegen/git/utils/language.py +++ b/src/codegen/git/utils/language.py @@ -34,6 +34,7 @@ def determine_project_language(folder_path: str, strategy: Literal["most_common" msg = f"Invalid strategy: {strategy}" raise ValueError(msg) + def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage: """Analyzes a folder to determine the primary programming language based on file extensions. Returns the language with the most matching files. 
diff --git a/src/codegen/sdk/core/file.py b/src/codegen/sdk/core/file.py index 9b3c05425..930c9394c 100644 --- a/src/codegen/sdk/core/file.py +++ b/src/codegen/sdk/core/file.py @@ -404,6 +404,7 @@ def get_extensions() -> list[str]: """Returns a list of file extensions for the given programming language file.""" return [] # By default, no extensions are "supported" for generic files + TImport = TypeVar("TImport", bound="Import") TFunction = TypeVar("TFunction", bound="Function") TClass = TypeVar("TClass", bound="Class") From f5f2b88080d6a84933b37e9a103698bd60aa45cf Mon Sep 17 00:00:00 2001 From: Edward Li Date: Wed, 19 Feb 2025 11:06:26 -0800 Subject: [PATCH 09/13] Fix tests --- .../python/codebase/test_codebase_reset.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/unit/codegen/sdk/python/codebase/test_codebase_reset.py b/tests/unit/codegen/sdk/python/codebase/test_codebase_reset.py index 9f49f7f42..0bfcbf13a 100644 --- a/tests/unit/codegen/sdk/python/codebase/test_codebase_reset.py +++ b/tests/unit/codegen/sdk/python/codebase/test_codebase_reset.py @@ -129,8 +129,23 @@ def test_codebase_reset_gitignore(tmpdir: str) -> None: def square(x: a): return x * x """ - with get_codebase_session(tmpdir=tmpdir, files={"dir/file0.py": file0_content, ".gitignore": gitignore_content}, programming_language=ProgrammingLanguage.PYTHON) as codebase: - assert len(codebase.files) == 0 + file1_content = """ +from dir.file0 import square + +class MyClass: + def foo(self, arg1, arg2): + return arg1 + square(arg2) + """ + with get_codebase_session( + tmpdir=tmpdir, + files={ + "dir/file0.py": file0_content, + "dir/file1.py": file1_content, + ".gitignore": gitignore_content, + }, + programming_language=ProgrammingLanguage.PYTHON, + ) as codebase: + assert len(codebase.files) == 1 codebase.reset() codebase.checkout(branch="test-branch", create_if_missing=True) codebase.commit(sync_graph=True) From fcc770fb293e1c59b30757ed09898f683ec747c4 Mon 
Sep 17 00:00:00 2001 From: Edward Li Date: Wed, 19 Feb 2025 12:59:48 -0800 Subject: [PATCH 10/13] Add tests for determine_project_language --- src/codegen/sdk/codebase/codebase_context.py | 1 + .../git/utils/test_language_detection.py | 73 +++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 tests/unit/codegen/git/utils/test_language_detection.py diff --git a/src/codegen/sdk/codebase/codebase_context.py b/src/codegen/sdk/codebase/codebase_context.py index 57124dbf7..957efe708 100644 --- a/src/codegen/sdk/codebase/codebase_context.py +++ b/src/codegen/sdk/codebase/codebase_context.py @@ -148,6 +148,7 @@ def __init__( self.config = config self.repo_name = context.repo_operator.repo_name self.repo_path = str(Path(context.repo_operator.repo_path).resolve()) + self.full_path = os.path.join(self.repo_path, context.base_path) if context.base_path else self.repo_path self.codeowners_parser = context.repo_operator.codeowners_parser self.base_url = context.repo_operator.base_url # =====[ computed attributes ]===== diff --git a/tests/unit/codegen/git/utils/test_language_detection.py b/tests/unit/codegen/git/utils/test_language_detection.py new file mode 100644 index 000000000..7556a7e1e --- /dev/null +++ b/tests/unit/codegen/git/utils/test_language_detection.py @@ -0,0 +1,73 @@ +from codegen.git.utils.language import determine_project_language +from codegen.sdk.codebase.factory.get_session import get_codebase_session +from codegen.shared.enums.programming_language import ProgrammingLanguage + + +def test_determine_language_python(tmpdir) -> None: + with get_codebase_session(tmpdir=tmpdir, files={"file1.py": "", "file2.py": "", "file3.py": ""}, programming_language=ProgrammingLanguage.PYTHON) as codebase: + # Check for package.json -> False, therefore return PYTHON + assert determine_project_language(codebase.ctx.full_path, strategy="package_json") == ProgrammingLanguage.PYTHON + # Check for git_most_common -> PYTHON + assert 
determine_project_language(codebase.ctx.full_path, strategy="git_most_common") == ProgrammingLanguage.PYTHON + # Check for most_common -> PYTHON + assert determine_project_language(codebase.ctx.full_path, strategy="most_common") == ProgrammingLanguage.PYTHON + + +def test_determine_language_typescript(tmpdir) -> None: + with get_codebase_session(tmpdir=tmpdir, files={"file1.ts": "", "file2.ts": "", "file3.ts": ""}, programming_language=ProgrammingLanguage.TYPESCRIPT) as codebase: + # Check for package.json -> False, therefore return PYTHON (THIS IS EXPECTED, even if it's a TS project) + assert determine_project_language(codebase.ctx.full_path, strategy="package_json") == ProgrammingLanguage.PYTHON + # Check for git_most_common -> TYPESCRIPT + assert determine_project_language(codebase.ctx.full_path, strategy="git_most_common") == ProgrammingLanguage.TYPESCRIPT + # Check for most_common -> TYPESCRIPT + assert determine_project_language(codebase.ctx.full_path, strategy="most_common") == ProgrammingLanguage.TYPESCRIPT + + +def test_determine_language_other(tmpdir) -> None: + with get_codebase_session(tmpdir=tmpdir, files={"file1.txt": "", "file2.txt": "", "file3.txt": ""}, programming_language=ProgrammingLanguage.OTHER) as codebase: + # Check for package.json -> False, therefore return PYTHON (THIS IS EXPECTED) + assert determine_project_language(codebase.ctx.full_path, strategy="package_json") == ProgrammingLanguage.PYTHON + # Check for git_most_common -> OTHER + assert determine_project_language(codebase.ctx.full_path, strategy="git_most_common") == ProgrammingLanguage.OTHER + # Check for most_common -> OTHER + assert determine_project_language(codebase.ctx.full_path, strategy="most_common") == ProgrammingLanguage.OTHER + + +def test_determine_language_package_json(tmpdir) -> None: + with get_codebase_session(tmpdir=tmpdir, files={"package.json": ""}, programming_language=ProgrammingLanguage.TYPESCRIPT) as codebase: + # Check for package.json -> True, therefore 
return Typescript + assert determine_project_language(codebase.ctx.full_path, strategy="package_json") == ProgrammingLanguage.TYPESCRIPT + # Check for git_most_common -> OTHER + assert determine_project_language(codebase.ctx.full_path, strategy="git_most_common") == ProgrammingLanguage.OTHER + # Check for most_common -> OTHER + assert determine_project_language(codebase.ctx.full_path, strategy="most_common") == ProgrammingLanguage.OTHER + + +def test_determine_language_mixed(tmpdir) -> None: + with get_codebase_session(tmpdir=tmpdir, files={"file1.py": "", "file2.ts": "", "file3.txt": ""}, programming_language=ProgrammingLanguage.PYTHON) as codebase: + # Check for package.json -> False, therefore return PYTHON + assert determine_project_language(codebase.ctx.full_path, strategy="package_json") == ProgrammingLanguage.PYTHON + # Check for git_most_common -> PYTHON + assert determine_project_language(codebase.ctx.full_path, strategy="git_most_common") == ProgrammingLanguage.PYTHON + # Check for most_common -> PYTHON + assert determine_project_language(codebase.ctx.full_path, strategy="most_common") == ProgrammingLanguage.PYTHON + + +def test_determine_language_threshold(tmpdir) -> None: + with get_codebase_session(tmpdir=tmpdir, files={"file0.py": ""} | {f"file{i}.txt": "" for i in range(1, 20)}, programming_language=ProgrammingLanguage.PYTHON) as codebase: + # Check for package.json -> False, therefore return PYTHON + assert determine_project_language(codebase.ctx.full_path, strategy="package_json") == ProgrammingLanguage.PYTHON + # Check for git_most_common -> OTHER + assert determine_project_language(codebase.ctx.full_path, strategy="git_most_common") == ProgrammingLanguage.OTHER + # Check for most_common -> OTHER + assert determine_project_language(codebase.ctx.full_path, strategy="most_common") == ProgrammingLanguage.OTHER + + +def test_determine_language_gitignore(tmpdir) -> None: + with get_codebase_session(tmpdir=tmpdir, files={"dir/file1.py": "", 
"dir/file2.py": "", "dir/file3.py": "", ".gitignore": "dir/*"}, programming_language=ProgrammingLanguage.PYTHON) as codebase: + # Check for package.json -> False, therefore return PYTHON + assert determine_project_language(codebase.ctx.full_path, strategy="package_json") == ProgrammingLanguage.PYTHON + # Check for git_most_common -> OTHER (follows gitignore, therefore finds no files) + assert determine_project_language(codebase.ctx.full_path, strategy="git_most_common") == ProgrammingLanguage.OTHER + # Check for most_common -> PYTHON (ignores gitignore) + assert determine_project_language(codebase.ctx.full_path, strategy="most_common") == ProgrammingLanguage.PYTHON From 4a61fdd69b44b6ea317bff70f67109868c26754d Mon Sep 17 00:00:00 2001 From: Edward Li Date: Wed, 19 Feb 2025 13:28:01 -0800 Subject: [PATCH 11/13] Update and fix tests --- .../git/utils/test_language_detection.py | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tests/unit/codegen/git/utils/test_language_detection.py b/tests/unit/codegen/git/utils/test_language_detection.py index 7556a7e1e..e6effd1f7 100644 --- a/tests/unit/codegen/git/utils/test_language_detection.py +++ b/tests/unit/codegen/git/utils/test_language_detection.py @@ -6,68 +6,68 @@ def test_determine_language_python(tmpdir) -> None: with get_codebase_session(tmpdir=tmpdir, files={"file1.py": "", "file2.py": "", "file3.py": ""}, programming_language=ProgrammingLanguage.PYTHON) as codebase: # Check for package.json -> False, therefore return PYTHON - assert determine_project_language(codebase.ctx.full_path, strategy="package_json") == ProgrammingLanguage.PYTHON + assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON # Check for git_most_common -> PYTHON - assert determine_project_language(codebase.ctx.full_path, strategy="git_most_common") == ProgrammingLanguage.PYTHON + assert determine_project_language(tmpdir, strategy="git_most_common") == 
ProgrammingLanguage.PYTHON # Check for most_common -> PYTHON - assert determine_project_language(codebase.ctx.full_path, strategy="most_common") == ProgrammingLanguage.PYTHON + assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.PYTHON def test_determine_language_typescript(tmpdir) -> None: with get_codebase_session(tmpdir=tmpdir, files={"file1.ts": "", "file2.ts": "", "file3.ts": ""}, programming_language=ProgrammingLanguage.TYPESCRIPT) as codebase: # Check for package.json -> False, therefore return PYTHON (THIS IS EXPECTED, even if it's a TS project) - assert determine_project_language(codebase.ctx.full_path, strategy="package_json") == ProgrammingLanguage.PYTHON + assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON # Check for git_most_common -> TYPESCRIPT - assert determine_project_language(codebase.ctx.full_path, strategy="git_most_common") == ProgrammingLanguage.TYPESCRIPT + assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.TYPESCRIPT # Check for most_common -> TYPESCRIPT - assert determine_project_language(codebase.ctx.full_path, strategy="most_common") == ProgrammingLanguage.TYPESCRIPT + assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.TYPESCRIPT def test_determine_language_other(tmpdir) -> None: with get_codebase_session(tmpdir=tmpdir, files={"file1.txt": "", "file2.txt": "", "file3.txt": ""}, programming_language=ProgrammingLanguage.OTHER) as codebase: # Check for package.json -> False, therefore return PYTHON (THIS IS EXPECTED) - assert determine_project_language(codebase.ctx.full_path, strategy="package_json") == ProgrammingLanguage.PYTHON + assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON # Check for git_most_common -> OTHER - assert determine_project_language(codebase.ctx.full_path, strategy="git_most_common") == ProgrammingLanguage.OTHER + 
assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.OTHER # Check for most_common -> OTHER - assert determine_project_language(codebase.ctx.full_path, strategy="most_common") == ProgrammingLanguage.OTHER + assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.OTHER def test_determine_language_package_json(tmpdir) -> None: with get_codebase_session(tmpdir=tmpdir, files={"package.json": ""}, programming_language=ProgrammingLanguage.TYPESCRIPT) as codebase: # Check for package.json -> True, therefore return Typescript - assert determine_project_language(codebase.ctx.full_path, strategy="package_json") == ProgrammingLanguage.TYPESCRIPT + assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.TYPESCRIPT # Check for git_most_common -> OTHER - assert determine_project_language(codebase.ctx.full_path, strategy="git_most_common") == ProgrammingLanguage.OTHER + assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.OTHER # Check for most_common -> OTHER - assert determine_project_language(codebase.ctx.full_path, strategy="most_common") == ProgrammingLanguage.OTHER + assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.OTHER def test_determine_language_mixed(tmpdir) -> None: with get_codebase_session(tmpdir=tmpdir, files={"file1.py": "", "file2.ts": "", "file3.txt": ""}, programming_language=ProgrammingLanguage.PYTHON) as codebase: # Check for package.json -> False, therefore return PYTHON - assert determine_project_language(codebase.ctx.full_path, strategy="package_json") == ProgrammingLanguage.PYTHON + assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON # Check for git_most_common -> PYTHON - assert determine_project_language(codebase.ctx.full_path, strategy="git_most_common") == ProgrammingLanguage.PYTHON + assert 
determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.PYTHON # Check for most_common -> PYTHON - assert determine_project_language(codebase.ctx.full_path, strategy="most_common") == ProgrammingLanguage.PYTHON + assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.PYTHON def test_determine_language_threshold(tmpdir) -> None: with get_codebase_session(tmpdir=tmpdir, files={"file0.py": ""} | {f"file{i}.txt": "" for i in range(1, 20)}, programming_language=ProgrammingLanguage.PYTHON) as codebase: # Check for package.json -> False, therefore return PYTHON - assert determine_project_language(codebase.ctx.full_path, strategy="package_json") == ProgrammingLanguage.PYTHON + assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON # Check for git_most_common -> OTHER - assert determine_project_language(codebase.ctx.full_path, strategy="git_most_common") == ProgrammingLanguage.OTHER + assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.OTHER # Check for most_common -> OTHER - assert determine_project_language(codebase.ctx.full_path, strategy="most_common") == ProgrammingLanguage.OTHER + assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.OTHER def test_determine_language_gitignore(tmpdir) -> None: - with get_codebase_session(tmpdir=tmpdir, files={"dir/file1.py": "", "dir/file2.py": "", "dir/file3.py": "", ".gitignore": "dir/*"}, programming_language=ProgrammingLanguage.PYTHON) as codebase: + with get_codebase_session(tmpdir=tmpdir, files={"dir/file1.py": "", "dir/file2.py": "", "dir/file3.py": "", ".gitignore": "dir"}, programming_language=ProgrammingLanguage.PYTHON) as codebase: # Check for package.json -> False, therefore return PYTHON - assert determine_project_language(codebase.ctx.full_path, strategy="package_json") == ProgrammingLanguage.PYTHON + assert determine_project_language(tmpdir, 
strategy="package_json") == ProgrammingLanguage.PYTHON # Check for git_most_common -> OTHER (follows gitignore, therefore finds no files) - assert determine_project_language(codebase.ctx.full_path, strategy="git_most_common") == ProgrammingLanguage.OTHER + assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.OTHER # Check for most_common -> PYTHON (ignores gitignore) - assert determine_project_language(codebase.ctx.full_path, strategy="most_common") == ProgrammingLanguage.PYTHON + assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.PYTHON From 15782ccac616fa72f4aee21e2c1aa0c02d1df74b Mon Sep 17 00:00:00 2001 From: Edward Li Date: Wed, 19 Feb 2025 13:35:56 -0800 Subject: [PATCH 12/13] Add `test_codebase_files_other_language` test --- .../codegen/sdk/codebase/file/test_file.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/unit/codegen/sdk/codebase/file/test_file.py b/tests/unit/codegen/sdk/codebase/file/test_file.py index c3a6fdce2..d9ef9e87a 100644 --- a/tests/unit/codegen/sdk/codebase/file/test_file.py +++ b/tests/unit/codegen/sdk/codebase/file/test_file.py @@ -4,6 +4,7 @@ from codegen.sdk.codebase.factory.get_session import get_codebase_session from codegen.sdk.core.file import File, SourceFile +from codegen.shared.enums.programming_language import ProgrammingLanguage def test_file(tmpdir) -> None: @@ -59,6 +60,26 @@ def test_codebase_files(tmpdir) -> None: assert {f for f in codebase.files(extensions=[".bin"])} == {file3} +def test_codebase_files_other_language(tmpdir) -> None: + with get_codebase_session(tmpdir=tmpdir, files={"file1.py": "print(123)", "file2.py": "print(456)", "file3.bin": b"\x89PNG", "file4": "Hello world!"}, programming_language=ProgrammingLanguage.OTHER) as codebase: + file1 = codebase.get_file("file1.py") + file2 = codebase.get_file("file2.py") + file3 = codebase.get_file("file3.bin") + file4 = codebase.get_file("file4") + + assert 
len(codebase.files) == 4 # Match all files if the language is OTHER + assert {f for f in codebase.files} == {file1, file2, file3, file4} + + assert len(codebase.files(extensions="*")) == 4 + assert {f for f in codebase.files(extensions="*")} == {file1, file2, file3, file4} + + assert len(codebase.files(extensions=[".py"])) == 2 + assert {f for f in codebase.files(extensions=[".py"])} == {file1, file2} + + assert len(codebase.files(extensions=[".bin"])) == 1 + assert {f for f in codebase.files(extensions=[".bin"])} == {file3} + + @pytest.mark.skipif(sys.platform == "darwin", reason="macOS is case-insensitive") def test_file_extensions_ignore_case(tmpdir) -> None: with get_codebase_session(tmpdir=tmpdir, files={"file1.py": "print(123)", "file2.py": "print(456)", "file3.bin": b"\x89PNG", "file4": "Hello world!"}) as codebase: From 71abecbf0b156b552c69432ea0b3c50239bfd90f Mon Sep 17 00:00:00 2001 From: EdwardJXLi <20020059+EdwardJXLi@users.noreply.github.com> Date: Wed, 19 Feb 2025 21:36:48 +0000 Subject: [PATCH 13/13] Automated pre-commit update --- tests/unit/codegen/sdk/codebase/file/test_file.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/codegen/sdk/codebase/file/test_file.py b/tests/unit/codegen/sdk/codebase/file/test_file.py index d9ef9e87a..2c6b4dfa0 100644 --- a/tests/unit/codegen/sdk/codebase/file/test_file.py +++ b/tests/unit/codegen/sdk/codebase/file/test_file.py @@ -61,7 +61,9 @@ def test_codebase_files(tmpdir) -> None: def test_codebase_files_other_language(tmpdir) -> None: - with get_codebase_session(tmpdir=tmpdir, files={"file1.py": "print(123)", "file2.py": "print(456)", "file3.bin": b"\x89PNG", "file4": "Hello world!"}, programming_language=ProgrammingLanguage.OTHER) as codebase: + with get_codebase_session( + tmpdir=tmpdir, files={"file1.py": "print(123)", "file2.py": "print(456)", "file3.bin": b"\x89PNG", "file4": "Hello world!"}, programming_language=ProgrammingLanguage.OTHER + ) as codebase: file1 = 
codebase.get_file("file1.py") file2 = codebase.get_file("file2.py") file3 = codebase.get_file("file3.bin")