diff --git a/src/codegen/git/utils/language.py b/src/codegen/git/utils/language.py index 60d8f041d..6868b02dc 100644 --- a/src/codegen/git/utils/language.py +++ b/src/codegen/git/utils/language.py @@ -1,3 +1,4 @@ +import logging from collections import Counter from pathlib import Path from typing import Literal @@ -5,6 +6,11 @@ from codegen.git.utils.file_utils import split_git_path from codegen.shared.enums.programming_language import ProgrammingLanguage +logger = logging.getLogger(__name__) + +# Minimum ratio of files that must match the dominant language +MIN_LANGUAGE_RATIO = 0.1 + def determine_project_language(folder_path: str, strategy: Literal["most_common", "git_most_common", "package_json"] = "git_most_common") -> ProgrammingLanguage: """Determines the primary programming language of a project. @@ -37,7 +43,8 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage: folder_path (str): Path to the folder to analyze Returns: - ProgrammingLanguage: The dominant programming language, or UNSUPPORTED if no matching files found + ProgrammingLanguage: The dominant programming language, or OTHER if no matching files found + or if less than MIN_LANGUAGE_RATIO of files match the dominant language """ from codegen.sdk.python import PyFile from codegen.sdk.typescript.file import TSFile @@ -54,6 +61,7 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage: # Initialize counters for each language language_counts = Counter() + total_files = 0 # Walk through the directory for file_path in folder.rglob("*"): @@ -65,6 +73,8 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage: if any(ignore in str(file_path) for ignore in [".git", "node_modules", "__pycache__", "venv", ".env"]): continue + total_files += 1 + # Count files for each language based on extensions for language, exts in EXTENSIONS.items(): if file_path.suffix in exts: @@ -72,10 +82,18 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage: # If no files found, return None if not language_counts: - return ProgrammingLanguage.UNSUPPORTED + return ProgrammingLanguage.OTHER + + # Get the most common language and its count + most_common_language, count = language_counts.most_common(1)[0] + + logger.debug(f"Most common language: {most_common_language}, count: {count}, total files: {total_files}") - # Return the language with the highest count - return language_counts.most_common(1)[0][0] + # Check if the most common language makes up at least MIN_LANGUAGE_RATIO of all files + if total_files > 0 and (count / total_files) < MIN_LANGUAGE_RATIO: + return ProgrammingLanguage.OTHER + + return most_common_language def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLanguage: @@ -86,7 +104,8 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua folder_path (str): Path to the git repo to analyze Returns: - ProgrammingLanguage: The dominant programming language, or UNSUPPORTED if no matching files found + ProgrammingLanguage: The dominant programming language, or OTHER if no matching files found + or if less than MIN_LANGUAGE_RATIO of files match the dominant language """ from codegen.git.repo_operator.repo_operator import RepoOperator from codegen.git.schemas.repo_config import RepoConfig @@ -105,6 +124,7 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua # Initialize counters for each language language_counts = Counter() + total_files = 0 # Initiate RepoOperator git_root, base_path = split_git_path(folder_path) @@ -120,6 +140,8 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua if file_path.is_dir() or file_path.name.startswith("."): continue + total_files += 1 + # Count files for each language based on extensions for language, exts in EXTENSIONS.items(): if file_path.suffix in exts: @@ -127,10 +149,18 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua # If no files found, return None if not language_counts: - return ProgrammingLanguage.UNSUPPORTED + return ProgrammingLanguage.OTHER + + # Get the most common language and its count + most_common_language, count = language_counts.most_common(1)[0] + + logger.debug(f"Most common language: {most_common_language}, count: {count}, total files: {total_files}") + + # Check if the most common language makes up at least MIN_LANGUAGE_RATIO of all files + if total_files > 0 and (count / total_files) < MIN_LANGUAGE_RATIO: + return ProgrammingLanguage.OTHER - # Return the language with the highest count - return language_counts.most_common(1)[0][0] + return most_common_language def _determine_language_by_package_json(folder_path: str) -> ProgrammingLanguage: @@ -145,6 +175,8 @@ def _determine_language_by_package_json(folder_path: str) -> ProgrammingLanguage """ package_json_path = Path(folder_path) / "package.json" if package_json_path.exists(): + logger.debug(f"Found package.json at {package_json_path}") return ProgrammingLanguage.TYPESCRIPT else: + logger.debug(f"No package.json found at {package_json_path}") return ProgrammingLanguage.PYTHON diff --git a/src/codegen/sdk/codebase/codebase_context.py b/src/codegen/sdk/codebase/codebase_context.py index 78a1577ed..957efe708 100644 --- a/src/codegen/sdk/codebase/codebase_context.py +++ b/src/codegen/sdk/codebase/codebase_context.py @@ -75,8 +75,9 @@ def get_node_classes(programming_language: ProgrammingLanguage) -> NodeClasses: return TSNodeClasses else: - msg = f"Unsupported programming language: {programming_language}!" - raise ValueError(msg) + from codegen.sdk.codebase.node_classes.generic_node_classes import GenericNodeClasses + + return GenericNodeClasses class CodebaseContext: @@ -147,6 +148,7 @@ def __init__( self.config = config self.repo_name = context.repo_operator.repo_name self.repo_path = str(Path(context.repo_operator.repo_path).resolve()) + self.full_path = os.path.join(self.repo_path, context.base_path) if context.base_path else self.repo_path self.codeowners_parser = context.repo_operator.codeowners_parser self.base_url = context.repo_operator.base_url # =====[ computed attributes ]===== @@ -163,6 +165,11 @@ def __init__( self.language_engine = get_language_engine(context.programming_language, self) self.programming_language = context.programming_language + # Raise warning if language is not supported + if self.programming_language is ProgrammingLanguage.UNSUPPORTED or self.programming_language is ProgrammingLanguage.OTHER: + logger.warning("WARNING: The codebase is using an unsupported language!") + logger.warning("Some features may not work as expected. Advanced static analysis will be disabled but simple file IO will still work.") + # Build the graph self.build_graph(context.repo_operator) try: diff --git a/src/codegen/sdk/codebase/node_classes/generic_node_classes.py b/src/codegen/sdk/codebase/node_classes/generic_node_classes.py new file mode 100644 index 000000000..a3b67a8ab --- /dev/null +++ b/src/codegen/sdk/codebase/node_classes/generic_node_classes.py @@ -0,0 +1,22 @@ +from codegen.sdk.codebase.node_classes.node_classes import NodeClasses +from codegen.sdk.core.class_definition import Class +from codegen.sdk.core.detached_symbols.code_block import CodeBlock +from codegen.sdk.core.detached_symbols.function_call import FunctionCall +from codegen.sdk.core.detached_symbols.parameter import Parameter +from codegen.sdk.core.file import File +from codegen.sdk.core.function import Function +from codegen.sdk.core.import_resolution import Import +from codegen.sdk.core.statements.comment import Comment + +GenericNodeClasses = NodeClasses( + file_cls=File, + class_cls=Class, + function_cls=Function, + import_cls=Import, + parameter_cls=Parameter, + comment_cls=Comment, + code_block_cls=CodeBlock, + function_call_cls=FunctionCall, + bool_conversion={}, + dynamic_import_parent_types={}, +) diff --git a/src/codegen/sdk/core/codebase.py b/src/codegen/sdk/core/codebase.py index bfe0e05dd..961f170b5 100644 --- a/src/codegen/sdk/core/codebase.py +++ b/src/codegen/sdk/core/codebase.py @@ -257,7 +257,11 @@ def files(self, *, extensions: list[str] | Literal["*"] | None = None) -> list[T By default, this only returns source files. Setting `extensions='*'` will return all files in the codebase, and `extensions=[...]` will return all files with the specified extensions. - `extensions='*'` is REQUIRED for listing all non source code files. Or else, codebase.files will ONLY return source files (e.g. .py, .ts). + For Python and Typescript repos WITH file parsing enabled, + `extensions='*'` is REQUIRED for listing all non source code files. + Or else, codebase.files will ONLY return source files (e.g. .py, .ts). + + For repos with file parsing disabled or repos with other languages, this will return all files in the codebase. Returns all Files in the codebase, sorted alphabetically. For Python codebases, returns PyFiles (python files). For Typescript codebases, returns TSFiles (typescript files). @@ -265,7 +269,8 @@ def files(self, *, extensions: list[str] | Literal["*"] | None = None) -> list[T Returns: list[TSourceFile]: A sorted list of source files in the codebase. """ - if extensions is None: + if extensions is None and len(self.ctx.get_nodes(NodeType.FILE)) > 0: + # If extensions is None AND there is at least one file in the codebase (This checks for unsupported languages or parse-off repos), # Return all source files files = self.ctx.get_nodes(NodeType.FILE) elif isinstance(extensions, str) and extensions != "*": diff --git a/src/codegen/sdk/core/file.py b/src/codegen/sdk/core/file.py index 9164994a7..930c9394c 100644 --- a/src/codegen/sdk/core/file.py +++ b/src/codegen/sdk/core/file.py @@ -398,6 +398,12 @@ def replace(self, old: str, new: str, count: int = -1, is_regex: bool = False, p else: return super().replace(old, new, count, is_regex, priority) + @staticmethod + @noapidoc + def get_extensions() -> list[str]: + """Returns a list of file extensions for the given programming language file.""" + return [] # By default, no extensions are "supported" for generic files + TImport = TypeVar("TImport", bound="Import") TFunction = TypeVar("TFunction", bound="Function") diff --git a/src/codegen/shared/enums/programming_language.py b/src/codegen/shared/enums/programming_language.py index 8f33ccec6..28eb52d2b 100644 --- a/src/codegen/shared/enums/programming_language.py +++ b/src/codegen/shared/enums/programming_language.py @@ -4,4 +4,5 @@ class ProgrammingLanguage(StrEnum): TYPESCRIPT = "TYPESCRIPT" PYTHON = "PYTHON" + OTHER = "OTHER" UNSUPPORTED = "UNSUPPORTED" diff --git a/tests/unit/codegen/git/utils/test_language_detection.py b/tests/unit/codegen/git/utils/test_language_detection.py new file mode 100644 index 000000000..e6effd1f7 --- /dev/null +++ b/tests/unit/codegen/git/utils/test_language_detection.py @@ -0,0 +1,73 @@ +from codegen.git.utils.language import determine_project_language +from codegen.sdk.codebase.factory.get_session import get_codebase_session +from codegen.shared.enums.programming_language import ProgrammingLanguage + + +def test_determine_language_python(tmpdir) -> None: + with get_codebase_session(tmpdir=tmpdir, files={"file1.py": "", "file2.py": "", "file3.py": ""}, programming_language=ProgrammingLanguage.PYTHON) as codebase: + # Check for package.json -> False, therefore return PYTHON + assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON + # Check for git_most_common -> PYTHON + assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.PYTHON + # Check for most_common -> PYTHON + assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.PYTHON + + +def test_determine_language_typescript(tmpdir) -> None: + with get_codebase_session(tmpdir=tmpdir, files={"file1.ts": "", "file2.ts": "", "file3.ts": ""}, programming_language=ProgrammingLanguage.TYPESCRIPT) as codebase: + # Check for package.json -> False, therefore return PYTHON (THIS IS EXPECTED, even if it's a TS project) + assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON + # Check for git_most_common -> TYPESCRIPT + assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.TYPESCRIPT + # Check for most_common -> TYPESCRIPT + assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.TYPESCRIPT + + +def test_determine_language_other(tmpdir) -> None: + with get_codebase_session(tmpdir=tmpdir, files={"file1.txt": "", "file2.txt": "", "file3.txt": ""}, programming_language=ProgrammingLanguage.OTHER) as codebase: + # Check for package.json -> False, therefore return PYTHON (THIS IS EXPECTED) + assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON + # Check for git_most_common -> OTHER + assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.OTHER + # Check for most_common -> OTHER + assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.OTHER + + +def test_determine_language_package_json(tmpdir) -> None: + with get_codebase_session(tmpdir=tmpdir, files={"package.json": ""}, programming_language=ProgrammingLanguage.TYPESCRIPT) as codebase: + # Check for package.json -> True, therefore return Typescript + assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.TYPESCRIPT + # Check for git_most_common -> OTHER + assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.OTHER + # Check for most_common -> OTHER + assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.OTHER + + +def test_determine_language_mixed(tmpdir) -> None: + with get_codebase_session(tmpdir=tmpdir, files={"file1.py": "", "file2.ts": "", "file3.txt": ""}, programming_language=ProgrammingLanguage.PYTHON) as codebase: + # Check for package.json -> False, therefore return PYTHON + assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON + # Check for git_most_common -> PYTHON + assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.PYTHON + # Check for most_common -> PYTHON + assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.PYTHON + + +def test_determine_language_threshold(tmpdir) -> None: + with get_codebase_session(tmpdir=tmpdir, files={"file0.py": ""} | {f"file{i}.txt": "" for i in range(1, 20)}, programming_language=ProgrammingLanguage.PYTHON) as codebase: + # Check for package.json -> False, therefore return PYTHON + assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON + # Check for git_most_common -> OTHER + assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.OTHER + # Check for most_common -> OTHER + assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.OTHER + + +def test_determine_language_gitignore(tmpdir) -> None: + with get_codebase_session(tmpdir=tmpdir, files={"dir/file1.py": "", "dir/file2.py": "", "dir/file3.py": "", ".gitignore": "dir"}, programming_language=ProgrammingLanguage.PYTHON) as codebase: + # Check for package.json -> False, therefore return PYTHON + assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON + # Check for git_most_common -> OTHER (follows gitignore, therefore finds no files) + assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.OTHER + # Check for most_common -> PYTHON (ignores gitignore) + assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.PYTHON diff --git a/tests/unit/codegen/sdk/codebase/file/test_file.py b/tests/unit/codegen/sdk/codebase/file/test_file.py index c3a6fdce2..2c6b4dfa0 100644 --- a/tests/unit/codegen/sdk/codebase/file/test_file.py +++ b/tests/unit/codegen/sdk/codebase/file/test_file.py @@ -4,6 +4,7 @@ from codegen.sdk.codebase.factory.get_session import get_codebase_session from codegen.sdk.core.file import File, SourceFile +from codegen.shared.enums.programming_language import ProgrammingLanguage def test_file(tmpdir) -> None: @@ -59,6 +60,28 @@ def test_codebase_files(tmpdir) -> None: assert {f for f in codebase.files(extensions=[".bin"])} == {file3} +def test_codebase_files_other_language(tmpdir) -> None: + with get_codebase_session( + tmpdir=tmpdir, files={"file1.py": "print(123)", "file2.py": "print(456)", "file3.bin": b"\x89PNG", "file4": "Hello world!"}, programming_language=ProgrammingLanguage.OTHER + ) as codebase: + file1 = codebase.get_file("file1.py") + file2 = codebase.get_file("file2.py") + file3 = codebase.get_file("file3.bin") + file4 = codebase.get_file("file4") + + assert len(codebase.files) == 4 # Match all files if the language is OTHER + assert {f for f in codebase.files} == {file1, file2, file3, file4} + + assert len(codebase.files(extensions="*")) == 4 + assert {f for f in codebase.files(extensions="*")} == {file1, file2, file3, file4} + + assert len(codebase.files(extensions=[".py"])) == 2 + assert {f for f in codebase.files(extensions=[".py"])} == {file1, file2} + + assert len(codebase.files(extensions=[".bin"])) == 1 + assert {f for f in codebase.files(extensions=[".bin"])} == {file3} + + @pytest.mark.skipif(sys.platform == "darwin", reason="macOS is case-insensitive") def test_file_extensions_ignore_case(tmpdir) -> None: with get_codebase_session(tmpdir=tmpdir, files={"file1.py": "print(123)", "file2.py": "print(456)", "file3.bin": b"\x89PNG", "file4": "Hello world!"}) as codebase: diff --git a/tests/unit/codegen/sdk/python/codebase/test_codebase_reset.py b/tests/unit/codegen/sdk/python/codebase/test_codebase_reset.py index 9f49f7f42..0bfcbf13a 100644 --- a/tests/unit/codegen/sdk/python/codebase/test_codebase_reset.py +++ b/tests/unit/codegen/sdk/python/codebase/test_codebase_reset.py @@ -129,8 +129,23 @@ def test_codebase_reset_gitignore(tmpdir: str) -> None: def square(x: a): return x * x """ - with get_codebase_session(tmpdir=tmpdir, files={"dir/file0.py": file0_content, ".gitignore": gitignore_content}, programming_language=ProgrammingLanguage.PYTHON) as codebase: - assert len(codebase.files) == 0 + file1_content = """ +from dir.file0 import square + +class MyClass: + def foo(self, arg1, arg2): + return arg1 + square(arg2) + """ + with get_codebase_session( + tmpdir=tmpdir, + files={ + "dir/file0.py": file0_content, + "dir/file1.py": file1_content, + ".gitignore": gitignore_content, + }, + programming_language=ProgrammingLanguage.PYTHON, + ) as codebase: + assert len(codebase.files) == 1 codebase.reset() codebase.checkout(branch="test-branch", create_if_missing=True) codebase.commit(sync_graph=True)