diff --git a/src/codegen/sdk/core/codebase.py b/src/codegen/sdk/core/codebase.py
index 1979e5f89..359830e22 100644
--- a/src/codegen/sdk/core/codebase.py
+++ b/src/codegen/sdk/core/codebase.py
@@ -5,6 +5,7 @@
 import logging
 import os
 import re
+import tempfile
 from collections.abc import Generator
 from contextlib import contextmanager
 from functools import cached_property
@@ -1298,6 +1299,136 @@ def from_repo(
             logger.exception(f"Failed to initialize codebase: {e}")
             raise
+
+    @classmethod
+    def from_string(
+        cls,
+        code: str,
+        *,
+        language: Literal["python", "typescript"] | ProgrammingLanguage,
+    ) -> "Codebase":
+        """Creates a Codebase instance from a string of code.
+
+        The code is written as a single file ("test.py" or "test.ts") that is handed to
+        CodebaseFactory together with a temporary directory used as the repo path.
+
+        Args:
+            code: String containing code
+            language: Language of the code. Required; there is no default.
+
+        Returns:
+            Codebase: A Codebase instance initialized with the provided code
+
+        Raises:
+            TypeError: If language is falsy (e.g. None or an empty string)
+            ValueError: If language is a string that is not a valid ProgrammingLanguage
+
+        Example:
+            >>> # Python code
+            >>> code = "def add(a, b): return a + b"
+            >>> codebase = Codebase.from_string(code, language="python")
+
+            >>> # TypeScript code
+            >>> code = "function add(a: number, b: number): number { return a + b; }"
+            >>> codebase = Codebase.from_string(code, language="typescript")
+        """
+        if not language:
+            msg = "missing required argument language"
+            raise TypeError(msg)
+
+        logger.info("Creating codebase from string")
+
+        # Determine language and filename
+        prog_lang = ProgrammingLanguage(language.upper()) if isinstance(language, str) else language
+        filename = "test.ts" if prog_lang == ProgrammingLanguage.TYPESCRIPT else "test.py"
+
+        # Create codebase using factory
+        from codegen.sdk.codebase.factory.codebase_factory import CodebaseFactory
+
+        files = {filename: code}
+
+        # NOTE(review): the temporary directory is removed when this block exits, i.e. as the
+        # codebase is returned - confirm the Codebase does not need it on disk afterwards.
+        with tempfile.TemporaryDirectory(prefix="codegen_") as tmp_dir:
+            logger.info(f"Using directory: {tmp_dir}")
+            codebase = CodebaseFactory.get_codebase_from_files(repo_path=tmp_dir, files=files, programming_language=prog_lang)
+            logger.info("Codebase initialization complete")
+            return codebase
+
+    @classmethod
+    def from_files(
+        cls,
+        files: dict[str, str],
+        *,
+        language: Literal["python", "typescript"] | ProgrammingLanguage | None = None,
+    ) -> "Codebase":
+        """Creates a Codebase instance from multiple files.
+
+        Args:
+            files: Dictionary mapping filenames to their content, e.g. {"main.py": "print('hello')"}
+            language: Optional language override. If not provided, will be inferred from file extensions.
+                All files must have extensions matching the same language.
+
+        Returns:
+            Codebase: A Codebase instance initialized with the provided files
+
+        Raises:
+            ValueError: If no files are provided, if file extensions don't match a single language,
+                or if explicitly provided language doesn't match the extensions
+
+        Example:
+            >>> # Language inferred as Python
+            >>> files = {"main.py": "print('hello')", "utils.py": "def add(a, b): return a + b"}
+            >>> codebase = Codebase.from_files(files)
+
+            >>> # Language inferred as TypeScript
+            >>> files = {"index.ts": "console.log('hello')", "utils.tsx": "export const App = () => <div>Hello</div>"}
+            >>> codebase = Codebase.from_files(files)
+        """
+        # Create codebase using factory
+        from codegen.sdk.codebase.factory.codebase_factory import CodebaseFactory
+
+        if not files:
+            msg = "No files provided"
+            raise ValueError(msg)
+
+        logger.info("Creating codebase from files")
+
+        py_extensions = {".py"}
+        ts_extensions = {".ts", ".tsx", ".js", ".jsx"}
+        extensions = {os.path.splitext(f)[1].lower() for f in files}
+
+        # The all() checks restrict from_files to small, single-language inputs used for testing.
+        # To parse an actual repo, use Codebase("path/to/repo") instead.
+        if all(ext in py_extensions for ext in extensions):
+            prog_lang = ProgrammingLanguage.PYTHON
+        elif all(ext in ts_extensions for ext in extensions):
+            prog_lang = ProgrammingLanguage.TYPESCRIPT
+        else:
+            msg = f"Cannot determine single language from extensions: {extensions}. Files must all be Python (.py) or TypeScript (.ts, .tsx, .js, .jsx)"
+            raise ValueError(msg)
+
+        if language is not None:
+            explicit_lang = ProgrammingLanguage(language.upper()) if isinstance(language, str) else language
+            if explicit_lang != prog_lang:
+                msg = f"Provided language {explicit_lang} doesn't match inferred language {prog_lang} from file extensions"
+                raise ValueError(msg)
+
+        logger.info(f"Using language: {prog_lang}")
+
+        # NOTE(review): the temporary directory is removed when this block exits, i.e. as the
+        # codebase is returned - confirm the Codebase does not need it on disk afterwards.
+        with tempfile.TemporaryDirectory(prefix="codegen_") as tmp_dir:
+            logger.info(f"Using directory: {tmp_dir}")
+
+            # Initialize git repo to avoid "not in a git repository" error
+            import subprocess
+
+            subprocess.run(["git", "init"], cwd=tmp_dir, check=True, capture_output=True)
+
+            codebase = CodebaseFactory.get_codebase_from_files(repo_path=tmp_dir, files=files, programming_language=prog_lang)
+            logger.info("Codebase initialization complete")
+            return codebase
 
     def get_modified_symbols_in_pr(self, pr_id: int) -> tuple[str, dict[str, str], list[str]]:
         """Get all modified symbols in a pull request"""
         pr = self._op.get_pull_request(pr_id)
diff --git a/tests/unit/codegen/sdk/codebase/session/test_codebase_from_files.py b/tests/unit/codegen/sdk/codebase/session/test_codebase_from_files.py
new file mode 100644
index 000000000..5415b0ffc
--- /dev/null
+++ b/tests/unit/codegen/sdk/codebase/session/test_codebase_from_files.py
@@ -0,0 +1,65 @@
+import pytest
+
+from codegen.sdk.core.codebase import Codebase
+
+
+def test_from_files_python():
+    """Test creating a Python codebase from multiple files"""
+    files = {"main.py": "from utils import add\nprint(add(1, 2))", "utils.py": "def add(a, b):\n    return a + b"}
+    # Language is optional, will be inferred
+    codebase = Codebase.from_files(files)
+    assert len(codebase.files) == 2
+    assert any(f.filepath.endswith("main.py") for f in codebase.files)
+    assert any(f.filepath.endswith("utils.py") for f in codebase.files)
+    assert any("from utils import add" in f.source for f in codebase.files)
+
+
+def test_from_files_typescript():
+    """Test creating a TypeScript codebase from multiple files"""
+    files = {"index.ts": "import { add } from './utils';\nconsole.log(add(1, 2));", "utils.ts": "export function add(a: number, b: number): number {\n    return a + b;\n}"}
+    # Language is optional, will be inferred
+    codebase = Codebase.from_files(files)
+    assert len(codebase.files) == 2
+    assert any(f.filepath.endswith("index.ts") for f in codebase.files)
+    assert any(f.filepath.endswith("utils.ts") for f in codebase.files)
+    assert any("import { add }" in f.source for f in codebase.files)
+
+
+def test_from_files_empty():
+    """Test creating a codebase with no files raises ValueError"""
+    with pytest.raises(ValueError, match="No files provided"):
+        Codebase.from_files({})
+
+
+def test_from_files_mixed_extensions():
+    """Test files with mixed extensions raises error"""
+    files = {"main.py": "print('hello')", "test.ts": "console.log('world')"}
+    with pytest.raises(ValueError, match="Cannot determine single language from extensions"):
+        Codebase.from_files(files)
+
+
+def test_from_files_typescript_multiple_extensions():
+    """Test TypeScript codebase with various valid extensions"""
+    files = {
+        "index.ts": "console.log('hi')",
+        "component.tsx": "export const App = () => <div>Hello</div>",
+        "utils.js": "module.exports = { add: (a, b) => a + b }",
+        "button.jsx": "export const Button = () => <button>Click</button>",
+    }
+    # Language is optional, will be inferred as TypeScript
+    codebase = Codebase.from_files(files)
+    assert len(codebase.files) == 4
+
+
+def test_from_files_explicit_language_mismatch():
+    """Test error when explicit language doesn't match extensions"""
+    files = {"main.py": "print('hello')", "utils.py": "def add(a, b): return a + b"}
+    with pytest.raises(ValueError, match="Provided language.*doesn't match inferred language"):
+        Codebase.from_files(files, language="typescript")
+
+
+def test_from_files_explicit_language_match():
+    """Test explicit language matching file extensions works"""
+    files = {"main.py": "print('hello')", "utils.py": "def add(a, b): return a + b"}
+    codebase = Codebase.from_files(files, language="python")
+    assert len(codebase.files) == 2
diff --git a/tests/unit/codegen/sdk/codebase/session/test_codebase_from_string.py b/tests/unit/codegen/sdk/codebase/session/test_codebase_from_string.py
new file mode 100644
index 000000000..328b318a9
--- /dev/null
+++ b/tests/unit/codegen/sdk/codebase/session/test_codebase_from_string.py
@@ -0,0 +1,78 @@
+import pytest
+
+from codegen.sdk.core.codebase import Codebase
+from codegen.shared.enums.programming_language import ProgrammingLanguage
+
+
+def test_from_string_python():
+    """Test creating a Python codebase from string"""
+    code = """
+def hello():
+    return "world"
+    """
+    codebase = Codebase.from_string(code, language="python")
+    assert len(codebase.files) == 1
+    assert codebase.files[0].filepath.endswith("test.py")
+    assert "def hello" in codebase.files[0].source
+
+
+def test_from_string_typescript():
+    """Test creating a TypeScript codebase from string"""
+    code = """
+function hello(): string {
+    return "world";
+}
+    """
+    codebase = Codebase.from_string(code, language="typescript")
+    assert len(codebase.files) == 1
+    assert codebase.files[0].filepath.endswith("test.ts")
+    assert "function hello" in codebase.files[0].source
+
+
+def test_from_string_with_enum():
+    """Test creating a codebase using ProgrammingLanguage enum"""
+    code = "const x = 42;"
+    codebase = Codebase.from_string(code, language=ProgrammingLanguage.TYPESCRIPT)
+    assert len(codebase.files) == 1
+    assert codebase.files[0].filepath.endswith("test.ts")
+
+
+def test_from_string_invalid_syntax():
+    """Test that invalid syntax is still accepted (parsing happens later)"""
+    code = "this is not valid python"
+    codebase = Codebase.from_string(code, language="python")
+    assert len(codebase.files) == 1
+    assert codebase.files[0].source == code
+
+
+def test_from_string_empty():
+    """Test creating a codebase from empty string"""
+    codebase = Codebase.from_string("", language="python")
+    assert len(codebase.files) == 1
+    assert codebase.files[0].source == ""
+
+
+def test_from_string_missing_language():
+    """Test that language is required"""
+    with pytest.raises(TypeError, match="missing.*required.*argument.*language"):
+        Codebase.from_string("print('hello')")
+
+
+def test_from_string_invalid_language():
+    """Test that invalid language raises error"""
+    with pytest.raises(ValueError):
+        Codebase.from_string("print('hello')", language="invalid")
+
+
+def test_from_string_multifile():
+    """Test that multifile is not supported yet"""
+    code = """
+# file1.py
+def hello(): pass
+
+# file2.py
+def world(): pass
+    """
+    # Still works, just puts everything in one file
+    codebase = Codebase.from_string(code, language="python")
+    assert len(codebase.files) == 1