From 3240904878d67fe49953e8b2557e184921ed67af Mon Sep 17 00:00:00 2001 From: codegen-bot Date: Mon, 24 Feb 2025 14:23:27 -0800 Subject: [PATCH 1/2] . --- src/codegen/sdk/core/codebase.py | 113 ++++++++++++++++++ .../session/test_codebase_from_files.py | 66 ++++++++++ .../session/test_codebase_from_string.py | 78 ++++++++++++ 3 files changed, 257 insertions(+) create mode 100644 tests/unit/codegen/sdk/codebase/session/test_codebase_from_files.py create mode 100644 tests/unit/codegen/sdk/codebase/session/test_codebase_from_string.py diff --git a/src/codegen/sdk/core/codebase.py b/src/codegen/sdk/core/codebase.py index 090e87c50..efac03f9b 100644 --- a/src/codegen/sdk/core/codebase.py +++ b/src/codegen/sdk/core/codebase.py @@ -1311,6 +1311,119 @@ def from_repo( logger.exception(f"Failed to initialize codebase: {e}") raise + @classmethod + def from_string( + cls, + code: str, + *, + language: Literal["python", "typescript"] | ProgrammingLanguage, + ) -> "Codebase": + """Creates a Codebase instance from a string of code. + + Args: + code (str): The source code string + language (Literal["python", "typescript"] | ProgrammingLanguage): The programming language of the code. 
+ + Returns: + Codebase: A Codebase instance initialized with the provided code + """ + logger.info("Creating codebase from string") + + # Determine language and filename + prog_lang = ProgrammingLanguage(language.upper()) if isinstance(language, str) else language + filename = "test.ts" if prog_lang == ProgrammingLanguage.TYPESCRIPT else "test.py" + + # Create temporary directory + import tempfile + + tmp_dir = tempfile.mkdtemp(prefix="codegen_") + logger.info(f"Using directory: {tmp_dir}") + + # Create codebase using factory + from codegen.sdk.codebase.factory.codebase_factory import CodebaseFactory + + files = {filename: code} + codebase = CodebaseFactory.get_codebase_from_files(repo_path=tmp_dir, files=files, programming_language=prog_lang) + logger.info("Codebase initialization complete") + return codebase + + @classmethod + def from_files( + cls, + files: dict[str, str], + *, + language: Literal["python", "typescript"] | ProgrammingLanguage | None = None, + ) -> "Codebase": + """Creates a Codebase instance from multiple files. + + Args: + files: Dictionary mapping filenames to their content, e.g. {"main.py": "print('hello')"} + language: Optional language override. If not provided, will be inferred from file extensions. + All files must have extensions matching the same language. + + Returns: + Codebase: A Codebase instance initialized with the provided files + + Raises: + ValueError: If file extensions don't match a single language or if explicitly provided + language doesn't match the extensions + + Example: + >>> # Language inferred as Python + >>> files = {"main.py": "print('hello')", "utils.py": "def add(a, b): return a + b"} + >>> codebase = Codebase.from_files(files) + + >>> # Language inferred as TypeScript + >>> files = {"index.ts": "console.log('hello')", "utils.tsx": "export const App = () =>
<div>Hello</div>
"} + >>> codebase = Codebase.from_files(files) + """ + logger.info("Creating codebase from files") + + if not files: + # Default to Python if no files provided + prog_lang = ProgrammingLanguage.PYTHON if language is None else (ProgrammingLanguage(language.upper()) if isinstance(language, str) else language) + logger.info(f"No files provided, using {prog_lang}") + else: + # Map extensions to languages + py_extensions = {".py"} + ts_extensions = {".ts", ".tsx", ".js", ".jsx"} + + # Get unique extensions from files + extensions = {os.path.splitext(f)[1].lower() for f in files} + + # Determine language from extensions + inferred_lang = None + if all(ext in py_extensions for ext in extensions): + inferred_lang = ProgrammingLanguage.PYTHON + elif all(ext in ts_extensions for ext in extensions): + inferred_lang = ProgrammingLanguage.TYPESCRIPT + else: + msg = f"Cannot determine single language from extensions: {extensions}. Files must all be Python (.py) or TypeScript (.ts, .tsx, .js, .jsx)" + raise ValueError(msg) + + # If language was explicitly provided, verify it matches inferred language + if language is not None: + explicit_lang = ProgrammingLanguage(language.upper()) if isinstance(language, str) else language + if explicit_lang != inferred_lang: + msg = f"Provided language {explicit_lang} doesn't match inferred language {inferred_lang} from file extensions" + raise ValueError(msg) + + prog_lang = inferred_lang + logger.info(f"Using language: {prog_lang} ({'inferred' if language is None else 'explicit'})") + + # Create temporary directory + import tempfile + + tmp_dir = tempfile.mkdtemp(prefix="codegen_") + logger.info(f"Using directory: {tmp_dir}") + + # Create codebase using factory + from codegen.sdk.codebase.factory.codebase_factory import CodebaseFactory + + codebase = CodebaseFactory.get_codebase_from_files(repo_path=tmp_dir, files=files, programming_language=prog_lang) + logger.info("Codebase initialization complete") + return codebase + def 
get_modified_symbols_in_pr(self, pr_id: int) -> tuple[str, dict[str, str], list[str]]: """Get all modified symbols in a pull request""" pr = self._op.get_pull_request(pr_id) diff --git a/tests/unit/codegen/sdk/codebase/session/test_codebase_from_files.py b/tests/unit/codegen/sdk/codebase/session/test_codebase_from_files.py new file mode 100644 index 000000000..b5eb25b5f --- /dev/null +++ b/tests/unit/codegen/sdk/codebase/session/test_codebase_from_files.py @@ -0,0 +1,66 @@ +import pytest + +from codegen.sdk.core.codebase import Codebase + + +def test_from_files_python(): + """Test creating a Python codebase from multiple files""" + files = {"main.py": "from utils import add\nprint(add(1, 2))", "utils.py": "def add(a, b):\n return a + b"} + # Language is optional, will be inferred + codebase = Codebase.from_files(files) + assert len(codebase.files) == 2 + assert any(f.filepath.endswith("main.py") for f in codebase.files) + assert any(f.filepath.endswith("utils.py") for f in codebase.files) + assert any("from utils import add" in f.content for f in codebase.files) + + +def test_from_files_typescript(): + """Test creating a TypeScript codebase from multiple files""" + files = {"index.ts": "import { add } from './utils';\nconsole.log(add(1, 2));", "utils.ts": "export function add(a: number, b: number): number {\n return a + b;\n}"} + # Language is optional, will be inferred + codebase = Codebase.from_files(files) + assert len(codebase.files) == 2 + assert any(f.filepath.endswith("index.ts") for f in codebase.files) + assert any(f.filepath.endswith("utils.ts") for f in codebase.files) + assert any("import { add }" in f.content for f in codebase.files) + + +def test_from_files_empty(): + """Test creating a codebase with no files""" + # Defaults to Python when no files provided + codebase = Codebase.from_files({}) + assert len(codebase.files) == 0 + + +def test_from_files_mixed_extensions(): + """Test files with mixed extensions raises error""" + files = {"main.py": 
"print('hello')", "test.ts": "console.log('world')"} + with pytest.raises(ValueError, match="Cannot determine single language from extensions"): + Codebase.from_files(files) + + +def test_from_files_typescript_multiple_extensions(): + """Test TypeScript codebase with various valid extensions""" + files = { + "index.ts": "console.log('hi')", + "component.tsx": "export const App = () =>
<div>Hello</div>
", + "utils.js": "module.exports = { add: (a, b) => a + b }", + "button.jsx": "export const Button = () => ", + } + # Language is optional, will be inferred as TypeScript + codebase = Codebase.from_files(files) + assert len(codebase.files) == 4 + + +def test_from_files_explicit_language_mismatch(): + """Test error when explicit language doesn't match extensions""" + files = {"main.py": "print('hello')", "utils.py": "def add(a, b): return a + b"} + with pytest.raises(ValueError, match="Provided language.*doesn't match inferred language"): + Codebase.from_files(files, language="typescript") + + +def test_from_files_explicit_language_match(): + """Test explicit language matching file extensions works""" + files = {"main.py": "print('hello')", "utils.py": "def add(a, b): return a + b"} + codebase = Codebase.from_files(files, language="python") + assert len(codebase.files) == 2 diff --git a/tests/unit/codegen/sdk/codebase/session/test_codebase_from_string.py b/tests/unit/codegen/sdk/codebase/session/test_codebase_from_string.py new file mode 100644 index 000000000..ee9f73a6f --- /dev/null +++ b/tests/unit/codegen/sdk/codebase/session/test_codebase_from_string.py @@ -0,0 +1,78 @@ +import pytest + +from codegen.sdk.core.codebase import Codebase +from codegen.shared.enums.programming_language import ProgrammingLanguage + + +def test_from_string_python(): + """Test creating a Python codebase from string""" + code = """ +def hello(): + return "world" + """ + codebase = Codebase.from_string(code, language="python") + assert len(codebase.files) == 1 + assert codebase.files[0].filepath.endswith("test.py") + assert "def hello" in codebase.files[0].content + + +def test_from_string_typescript(): + """Test creating a TypeScript codebase from string""" + code = """ +function hello(): string { + return "world"; +} + """ + codebase = Codebase.from_string(code, language="typescript") + assert len(codebase.files) == 1 + assert codebase.files[0].filepath.endswith("test.ts") + assert 
"function hello" in codebase.files[0].content + + +def test_from_string_with_enum(): + """Test creating a codebase using ProgrammingLanguage enum""" + code = "const x = 42;" + codebase = Codebase.from_string(code, language=ProgrammingLanguage.TYPESCRIPT) + assert len(codebase.files) == 1 + assert codebase.files[0].filepath.endswith("test.ts") + + +def test_from_string_invalid_syntax(): + """Test that invalid syntax is still accepted (parsing happens later)""" + code = "this is not valid python" + codebase = Codebase.from_string(code, language="python") + assert len(codebase.files) == 1 + assert codebase.files[0].content == code + + +def test_from_string_empty(): + """Test creating a codebase from empty string""" + codebase = Codebase.from_string("", language="python") + assert len(codebase.files) == 1 + assert codebase.files[0].content == "" + + +def test_from_string_missing_language(): + """Test that language is required""" + with pytest.raises(TypeError, match="missing.*required.*argument.*language"): + Codebase.from_string("print('hello')") + + +def test_from_string_invalid_language(): + """Test that invalid language raises error""" + with pytest.raises(ValueError): + Codebase.from_string("print('hello')", language="invalid") + + +def test_from_string_multifile(): + """Test that multifile is not supported yet""" + code = """ +# file1.py +def hello(): pass + +# file2.py +def world(): pass + """ + # Still works, just puts everything in one file + codebase = Codebase.from_string(code, language="python") + assert len(codebase.files) == 1 From ac77a7434ed2fcce816b8f9f7676ddf389adbe16 Mon Sep 17 00:00:00 2001 From: tawsif kamal Date: Wed, 26 Feb 2025 15:19:04 -0800 Subject: [PATCH 2/2] done --- src/codegen/sdk/core/codebase.py | 79 +++++++++++-------- .../session/test_codebase_from_files.py | 11 ++- .../session/test_codebase_from_string.py | 8 +- 3 files changed, 57 insertions(+), 41 deletions(-) diff --git a/src/codegen/sdk/core/codebase.py 
b/src/codegen/sdk/core/codebase.py index efac03f9b..6ee3fd089 100644 --- a/src/codegen/sdk/core/codebase.py +++ b/src/codegen/sdk/core/codebase.py @@ -5,6 +5,7 @@ import logging import os import re +import tempfile from collections.abc import Generator from contextlib import contextmanager from functools import cached_property @@ -1321,31 +1322,41 @@ def from_string( """Creates a Codebase instance from a string of code. Args: - code (str): The source code string - language (Literal["python", "typescript"] | ProgrammingLanguage): The programming language of the code. + code: String containing code + language: Language of the code ("python", "typescript", or a ProgrammingLanguage). Required keyword argument; there is no default. Returns: Codebase: A Codebase instance initialized with the provided code + + Example: + >>> # Python code + >>> code = "def add(a, b): return a + b" + >>> codebase = Codebase.from_string(code, language="python") + + >>> # TypeScript code + >>> code = "function add(a: number, b: number): number { return a + b; }" + >>> codebase = Codebase.from_string(code, language="typescript") """ + if not language: + msg = "missing required argument language" + raise TypeError(msg) + logger.info("Creating codebase from string") # Determine language and filename prog_lang = ProgrammingLanguage(language.upper()) if isinstance(language, str) else language filename = "test.ts" if prog_lang == ProgrammingLanguage.TYPESCRIPT else "test.py" - # Create temporary directory - import tempfile - - tmp_dir = tempfile.mkdtemp(prefix="codegen_") - logger.info(f"Using directory: {tmp_dir}") - # Create codebase using factory from codegen.sdk.codebase.factory.codebase_factory import CodebaseFactory files = {filename: code} - codebase = CodebaseFactory.get_codebase_from_files(repo_path=tmp_dir, files=files, programming_language=prog_lang) - logger.info("Codebase initialization complete") - return codebase + + with tempfile.TemporaryDirectory(prefix="codegen_") as tmp_dir: + logger.info(f"Using directory: {tmp_dir}") + codebase =
CodebaseFactory.get_codebase_from_files(repo_path=tmp_dir, files=files, programming_language=prog_lang) + logger.info("Codebase initialization complete") + return codebase @classmethod def from_files( @@ -1377,22 +1388,26 @@ def from_files( >>> files = {"index.ts": "console.log('hello')", "utils.tsx": "export const App = () =>
<div>Hello</div>
"} >>> codebase = Codebase.from_files(files) """ - logger.info("Creating codebase from files") + # Create codebase using factory + from codegen.sdk.codebase.factory.codebase_factory import CodebaseFactory if not files: - # Default to Python if no files provided - prog_lang = ProgrammingLanguage.PYTHON if language is None else (ProgrammingLanguage(language.upper()) if isinstance(language, str) else language) - logger.info(f"No files provided, using {prog_lang}") - else: - # Map extensions to languages + msg = "No files provided" + raise ValueError(msg) + + logger.info("Creating codebase from files") + + prog_lang = ProgrammingLanguage.PYTHON # Default language + + if files: py_extensions = {".py"} ts_extensions = {".ts", ".tsx", ".js", ".jsx"} - # Get unique extensions from files extensions = {os.path.splitext(f)[1].lower() for f in files} - - # Determine language from extensions inferred_lang = None + + # The all() checks require every file to share one language: from_files is intended for small testing purposes only. + # To parse an actual repo, do not use it; use Codebase("path/to/repo") instead. if all(ext in py_extensions for ext in extensions): inferred_lang = ProgrammingLanguage.PYTHON elif all(ext in ts_extensions for ext in extensions): @@ -1401,7 +1416,6 @@ def from_files( msg = f"Cannot determine single language from extensions: {extensions}. 
Files must all be Python (.py) or TypeScript (.ts, .tsx, .js, .jsx)" raise ValueError(msg) - # If language was explicitly provided, verify it matches inferred language if language is not None: explicit_lang = ProgrammingLanguage(language.upper()) if isinstance(language, str) else language if explicit_lang != inferred_lang: @@ -1409,20 +1423,23 @@ def from_files( raise ValueError(msg) prog_lang = inferred_lang - logger.info(f"Using language: {prog_lang} ({'inferred' if language is None else 'explicit'})") + else: + # Default to Python if no files provided + prog_lang = ProgrammingLanguage.PYTHON if language is None else (ProgrammingLanguage(language.upper()) if isinstance(language, str) else language) - # Create temporary directory - import tempfile + logger.info(f"Using language: {prog_lang}") - tmp_dir = tempfile.mkdtemp(prefix="codegen_") - logger.info(f"Using directory: {tmp_dir}") + with tempfile.TemporaryDirectory(prefix="codegen_") as tmp_dir: + logger.info(f"Using directory: {tmp_dir}") - # Create codebase using factory - from codegen.sdk.codebase.factory.codebase_factory import CodebaseFactory + # Initialize git repo to avoid "not in a git repository" error + import subprocess + + subprocess.run(["git", "init"], cwd=tmp_dir, check=True, capture_output=True) - codebase = CodebaseFactory.get_codebase_from_files(repo_path=tmp_dir, files=files, programming_language=prog_lang) - logger.info("Codebase initialization complete") - return codebase + codebase = CodebaseFactory.get_codebase_from_files(repo_path=tmp_dir, files=files, programming_language=prog_lang) + logger.info("Codebase initialization complete") + return codebase def get_modified_symbols_in_pr(self, pr_id: int) -> tuple[str, dict[str, str], list[str]]: """Get all modified symbols in a pull request""" diff --git a/tests/unit/codegen/sdk/codebase/session/test_codebase_from_files.py b/tests/unit/codegen/sdk/codebase/session/test_codebase_from_files.py index b5eb25b5f..5415b0ffc 100644 --- 
a/tests/unit/codegen/sdk/codebase/session/test_codebase_from_files.py +++ b/tests/unit/codegen/sdk/codebase/session/test_codebase_from_files.py @@ -11,7 +11,7 @@ def test_from_files_python(): assert len(codebase.files) == 2 assert any(f.filepath.endswith("main.py") for f in codebase.files) assert any(f.filepath.endswith("utils.py") for f in codebase.files) - assert any("from utils import add" in f.content for f in codebase.files) + assert any("from utils import add" in f.source for f in codebase.files) def test_from_files_typescript(): @@ -22,14 +22,13 @@ def test_from_files_typescript(): assert len(codebase.files) == 2 assert any(f.filepath.endswith("index.ts") for f in codebase.files) assert any(f.filepath.endswith("utils.ts") for f in codebase.files) - assert any("import { add }" in f.content for f in codebase.files) + assert any("import { add }" in f.source for f in codebase.files) def test_from_files_empty(): - """Test creating a codebase with no files""" - # Defaults to Python when no files provided - codebase = Codebase.from_files({}) - assert len(codebase.files) == 0 + """Test creating a codebase with no files raises ValueError""" + with pytest.raises(ValueError, match="No files provided"): + Codebase.from_files({}) def test_from_files_mixed_extensions(): diff --git a/tests/unit/codegen/sdk/codebase/session/test_codebase_from_string.py b/tests/unit/codegen/sdk/codebase/session/test_codebase_from_string.py index ee9f73a6f..328b318a9 100644 --- a/tests/unit/codegen/sdk/codebase/session/test_codebase_from_string.py +++ b/tests/unit/codegen/sdk/codebase/session/test_codebase_from_string.py @@ -13,7 +13,7 @@ def hello(): codebase = Codebase.from_string(code, language="python") assert len(codebase.files) == 1 assert codebase.files[0].filepath.endswith("test.py") - assert "def hello" in codebase.files[0].content + assert "def hello" in codebase.files[0].source def test_from_string_typescript(): @@ -26,7 +26,7 @@ def test_from_string_typescript(): codebase = 
Codebase.from_string(code, language="typescript") assert len(codebase.files) == 1 assert codebase.files[0].filepath.endswith("test.ts") - assert "function hello" in codebase.files[0].content + assert "function hello" in codebase.files[0].source def test_from_string_with_enum(): @@ -42,14 +42,14 @@ def test_from_string_invalid_syntax(): code = "this is not valid python" codebase = Codebase.from_string(code, language="python") assert len(codebase.files) == 1 - assert codebase.files[0].content == code + assert codebase.files[0].source == code def test_from_string_empty(): """Test creating a codebase from empty string""" codebase = Codebase.from_string("", language="python") assert len(codebase.files) == 1 - assert codebase.files[0].content == "" + assert codebase.files[0].source == "" def test_from_string_missing_language():