From e5914be02b4a9e03b8f77edd7a1bf92baab7a665 Mon Sep 17 00:00:00 2001 From: KopekC Date: Tue, 4 Mar 2025 15:53:42 -0500 Subject: [PATCH 1/4] feat: Better search tool --- src/codegen/extensions/index/file_index.py | 170 ++++++++++++++++++- src/codegen/extensions/langchain/tools.py | 162 +++++++++++++++--- src/codegen/extensions/tools/search.py | 185 +++++++++------------ 3 files changed, 384 insertions(+), 133 deletions(-) diff --git a/src/codegen/extensions/index/file_index.py b/src/codegen/extensions/index/file_index.py index 6672221c4..43f2cf260 100644 --- a/src/codegen/extensions/index/file_index.py +++ b/src/codegen/extensions/index/file_index.py @@ -7,6 +7,7 @@ import tiktoken from openai import OpenAI from tqdm import tqdm +import modal from codegen.extensions.index.code_index import CodeIndex from codegen.sdk.core.codebase import Codebase @@ -26,6 +27,7 @@ class FileIndex(CodeIndex): EMBEDDING_MODEL = "text-embedding-3-small" MAX_TOKENS = 8000 BATCH_SIZE = 100 + USE_MODAL_DICT = True # Flag to control whether to use Modal Dict def __init__(self, codebase: Codebase): """Initialize the file index. @@ -37,9 +39,86 @@ def __init__(self, codebase: Codebase): self.client = OpenAI() self.encoding = tiktoken.get_encoding("cl100k_base") + def set_use_modal_dict(self, use_modal: bool) -> None: + """Set whether to use Modal Dict for storage. + + Args: + use_modal: Whether to use Modal Dict for storage + """ + self.USE_MODAL_DICT = use_modal + logger.info(f"Modal Dict storage {'enabled' if use_modal else 'disabled'}") + @property def save_file_name(self) -> str: return "file_index_{commit}.pkl" + + @property + def modal_dict_id(self) -> str: + """Get the Modal Dict ID based on the same naming convention as the pickle file.""" + if not self.commit_hash: + return "file_index_latest" + return f"file_index_{self.commit_hash}" + + def delete_modal_dict(self) -> bool: + """Delete the Modal Dict storage for this index. + + Returns: + bool: True if successfully deleted, False otherwise + """ + if not self.USE_MODAL_DICT: + logger.warning("Modal Dict storage is disabled") + return False + + try: + dict_id = self.modal_dict_id + logger.info(f"Deleting Modal Dict: {dict_id}") + + # Check if the dict exists before trying to delete + try: + # Use modal.Dict.delete to properly delete the dict + modal.Dict.delete(dict_id) + logger.info(f"Successfully deleted Modal Dict: {dict_id}") + return True + except Exception as e: + logger.info(f"Modal Dict {dict_id} does not exist or cannot be deleted: {e}") + return False + except Exception as e: + logger.error(f"Failed to delete Modal Dict: {e}") + return False + + def modal_dict_exists(self, commit_hash: str = None) -> bool: + """Check if a Modal Dict exists for a specific commit. + + Args: + commit_hash: The commit hash to check, or None to use the current commit + + Returns: + bool: True if the Modal Dict exists, False otherwise + """ + if not self.USE_MODAL_DICT: + return False + + try: + # Use provided commit hash or current one + old_commit = self.commit_hash + if commit_hash is not None: + self.commit_hash = commit_hash + + dict_id = self.modal_dict_id + + # Restore original commit hash + if commit_hash is not None: + self.commit_hash = old_commit + + try: + # Try to access the dict - this will raise an exception if it doesn't exist + modal_dict = modal.Dict.from_name(dict_id, create_if_missing=False) + # Check if our data is in the dict + return "index_data" in modal_dict + except Exception: + return False + except Exception: + return False def _split_by_tokens(self, text: str) -> list[str]: """Split text into chunks that fit within token limit.""" @@ -135,17 +214,73 @@ def _get_changed_items(self) -> set[File]: return changed_files def _save_index(self, path: Path) -> None: - """Save index data to disk.""" + """Save index data to disk and optionally to Modal Dict.""" + # Save to local pickle file with open(path, "wb") as f: pickle.dump({"E": self.E, "items": self.items, "commit_hash": self.commit_hash}, f) + + # Save to Modal Dict if enabled + if self.USE_MODAL_DICT: + try: + dict_id = self.modal_dict_id + logger.info(f"Saving index to Modal Dict: {dict_id}") + + # Convert numpy arrays to lists for JSON serialization + modal_data = { + "E": self.E.tolist() if self.E is not None else None, + "items": self.items.tolist() if self.items is not None else None, + "commit_hash": self.commit_hash + } + + # Create or update Modal Dict + # Note: from_name is lazy, so we need to explicitly set the data + modal_dict = modal.Dict.from_name(dict_id, create_if_missing=True) + modal_dict["index_data"] = modal_data + + logger.info(f"Successfully saved index to Modal Dict: {dict_id}") + except Exception as e: + logger.error(f"Failed to save index to Modal Dict: {e}") def _load_index(self, path: Path) -> None: - """Load index data from disk.""" - with open(path, "rb") as f: - data = pickle.load(f) - self.E = data["E"] - self.items = data["items"] - self.commit_hash = data["commit_hash"] + """Load index data from disk or Modal Dict.""" + # Try loading from Modal Dict first if enabled + if self.USE_MODAL_DICT: + try: + dict_id = self.modal_dict_id + logger.info(f"Attempting to load index from Modal Dict: {dict_id}") + + # from_name is lazy, so we need to check if the dict exists first + try: + modal_dict = modal.Dict.from_name(dict_id, create_if_missing=False) + # Check if the dict contains our data + if "index_data" in modal_dict: + data = modal_dict["index_data"] + + # Convert lists back to numpy arrays + self.E = np.array(data["E"]) if data["E"] is not None else None + self.items = np.array(data["items"]) if data["items"] is not None else None + self.commit_hash = data["commit_hash"] + + logger.info(f"Successfully loaded index from Modal Dict: {dict_id}") + return + else: + logger.info(f"No index data found in Modal Dict: {dict_id}") + except Exception as e: + logger.warning(f"Modal Dict {dict_id} not found or error accessing it: {e}") + except Exception as e: + logger.warning(f"Failed to load index from Modal Dict, falling back to local file: {e}") + + # Fall back to loading from local file + try: + with open(path, "rb") as f: + data = pickle.load(f) + self.E = data["E"] + self.items = data["items"] + self.commit_hash = data["commit_hash"] + logger.info(f"Loaded index from local file: {path}") + except Exception as e: + logger.error(f"Failed to load index from local file: {e}") + raise def similarity_search(self, query: str, k: int = 5) -> list[tuple[File, float]]: """Find the k most similar files to a query. @@ -216,3 +351,24 @@ def update(self) -> None: # Update commit hash self.commit_hash = self._get_current_commit() + + # Save updated index to Modal Dict if enabled + if self.USE_MODAL_DICT and (num_updated > 0 or num_added > 0): + try: + dict_id = self.modal_dict_id + logger.info(f"Updating index in Modal Dict: {dict_id}") + + # Convert numpy arrays to lists for JSON serialization + modal_data = { + "E": self.E.tolist() if self.E is not None else None, + "items": self.items.tolist() if self.items is not None else None, + "commit_hash": self.commit_hash + } + + # Create or update Modal Dict + modal_dict = modal.Dict.from_name(dict_id, create_if_missing=True) + modal_dict["index_data"] = modal_data + + logger.info(f"Successfully updated index in Modal Dict: {dict_id}") + except Exception as e: + logger.error(f"Failed to update index in Modal Dict: {e}") diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py index f4fc68471..b740680e0 100644 --- a/src/codegen/extensions/langchain/tools.py +++ b/src/codegen/extensions/langchain/tools.py @@ -110,25 +110,111 @@ def _run(self, dirpath: str = "./", depth: int = 1) -> str: class SearchInput(BaseModel): - """Input for searching the codebase.""" - query: str = Field( ..., - description="The search query to find in the codebase. When ripgrep is available, this will be passed as a ripgrep pattern. " - "For regex searches, set use_regex=True. Ripgrep is the preferred method.", + description="""The text or pattern to search for in the codebase. + + For simple text search (use_regex=False): + - Uses ripgrep's fixed-strings mode (--fixed-strings) + - Case-insensitive matching (--ignore-case) + - All characters are treated literally, including special regex characters + - Exact string matching (no regex interpretation) + + For regex search (use_regex=True): + - Full regex pattern support + - Case-sensitive by default + - Special characters have regex meaning and need proper escaping + - Uses ripgrep's default regex mode + + If no exact matches are found, automatically falls back to semantic search + to find relevant code even without exact text matches.""" + ) + + target_directories: Optional[list[str]] = Field( + default=None, + description="""Optional list of directories to limit the search scope. + + - Paths should be relative to the workspace root + - Multiple directories are searched in parallel + - If None, searches the entire codebase + + Example: ["src/frontend", "tests/unit"]""" + ) + + file_extensions: Optional[list[str]] = Field( + default=None, + description="""Optional list of file extensions to filter the search. + + - Include the dot in extensions (e.g. ['.py', '.ts']) + - Multiple extensions are combined with OR logic + - If None, searches all file types + - Binary files are automatically excluded + + Example: [".py", ".tsx", ".md"]""" + ) + + page: int = Field( + default=1, + description="""Page number for paginated results (1-based indexing). + + - Use with files_per_page to navigate large result sets + - If page exceeds available pages, returns last available page + - Note: When falling back to semantic search, pagination is not supported + + Example: page=2 with files_per_page=10 shows files 11-20""" + ) + + files_per_page: int = Field( + default=10, + description="""Number of files to show per page. + + - Each file can contain multiple matching lines + - Reasonable values are between 5 and 50 + - Larger values may impact performance + - When falling back to semantic search, this becomes the number of semantic results + + Example: files_per_page=20 shows up to 20 files with matches""" + ) + + use_regex: bool = Field( + default=False, + description="""Whether to treat the query as a regex pattern. + + - False (default): Simple text search, case-insensitive + - True: Full regex syntax, case-sensitive + - Invalid regex patterns will return an error + - Note: Semantic fallback is used regardless of this setting when no matches found + + Example: Set to True to use patterns like "test_.*_func.*" """ ) - target_directories: Optional[list[str]] = Field(default=None, description="Optional list of directories to search in") - file_extensions: Optional[list[str]] = Field(default=None, description="Optional list of file extensions to search (e.g. ['.py', '.ts'])") - page: int = Field(default=1, description="Page number to return (1-based, default: 1)") - files_per_page: int = Field(default=10, description="Number of files to return per page (default: 10)") - use_regex: bool = Field(default=False, description="Whether to treat query as a regex pattern (default: False)") class SearchTool(BaseTool): """Tool for searching the codebase.""" name: ClassVar[str] = "search" - description: ClassVar[str] = "Search the codebase using text search or regex pattern matching" + description: ClassVar[str] = """Search the codebase using text search or regex pattern matching. + + This tool provides powerful text-based search capabilities across your codebase, + with support for both simple text matching and regular expressions. It uses ripgrep + when available for high-performance searches. + + If no exact matches are found, automatically falls back to semantic search to find + relevant code even without exact text matches. + + Features: + - Plain text or regex pattern matching + - Directory and file type filtering + - Paginated results for large codebases + - Case-insensitive by default for simple text searches + - Semantic fallback for finding related code + + Example queries: + 1. Simple text: "function calculateTotal" (matches exactly, case-insensitive) + 2. Regex: "def.*calculate.*\(.*\)" (with use_regex=True) + 3. File-specific: "TODO" with file_extensions=[".py", ".ts"] + 4. Directory-specific: "api" with target_directories=["src/backend"] + """ args_schema: ClassVar[type[BaseModel]] = SearchInput codebase: Codebase = Field(exclude=True) @@ -151,7 +237,27 @@ class EditFileTool(BaseTool): """Tool for editing files.""" name: ClassVar[str] = "edit_file" - description: ClassVar[str] = "Edit a file by replacing its entire content. This tool should only be used for replacing entire file contents." + description: ClassVar[str] = """ +Edit a file by replacing its entire content. This tool should only be used for replacing entire file contents. +Input for searching the codebase. + + This tool provides powerful text-based search capabilities across your codebase, + with support for both simple text matching and regular expressions. It uses ripgrep + when available for high-performance searches, falling back to Python's regex engine + when necessary. + + Features: + - Plain text or regex pattern matching + - Directory and file type filtering + - Paginated results for large codebases + - Case-insensitive by default for simple text searches + + Example queries: + 1. Simple text: "function calculateTotal" (matches exactly, case-insensitive) + 2. Regex: "def.*calculate.*\(.*\)" (with use_regex=True) + 3. File-specific: "TODO" with file_extensions=[".py", ".ts"] + 4. Directory-specific: "api" with target_directories=["src/backend"] + """ args_schema: ClassVar[type[BaseModel]] = EditFileInput codebase: Codebase = Field(exclude=True) @@ -741,7 +847,7 @@ def get_workspace_tools(codebase: Codebase) -> list["BaseTool"]: RunBashCommandTool(), # Note: This tool doesn't need the codebase SearchTool(codebase), # SemanticEditTool(codebase), - SemanticSearchTool(codebase), + # SemanticSearchTool(codebase), ViewFileTool(codebase), RelaceEditTool(codebase), ReflectionTool(codebase), @@ -761,14 +867,30 @@ def get_workspace_tools(codebase: Codebase) -> list["BaseTool"]: class ReplacementEditInput(BaseModel): - """Input for regex-based replacement editing.""" - - filepath: str = Field(..., description="Path to the file to edit") - pattern: str = Field(..., description="Regex pattern to match") - replacement: str = Field(..., description="Replacement text (can include regex groups)") - start: int = Field(default=1, description="Starting line number (1-indexed, inclusive). Default is 1.") - end: int = Field(default=-1, description="Ending line number (1-indexed, inclusive). Default is -1 (end of file).") - count: Optional[int] = Field(default=None, description="Maximum number of replacements. Default is None (replace all).") + filepath: str = Field( + ..., + description="Path to the file to edit relative to the workspace root. The file must exist and be a text file." + ) + pattern: str = Field( + ..., + description="Regular expression pattern to match text that should be replaced. Supports all Python regex syntax including capture groups (\1, \2, etc). The pattern is compiled with re.MULTILINE flag by default." + ) + replacement: str = Field( + ..., + description="Text to replace matched patterns with. Can reference regex capture groups using \1, \2, etc. If using regex groups in pattern, make sure to preserve them in replacement if needed." + ) + start: int = Field( + default=1, + description="Starting line number (1-indexed, inclusive) to begin replacements from. Use this with 'end' to limit changes to a specific region. Default is 1 (start of file)." + ) + end: int = Field( + default=-1, + description="Ending line number (1-indexed, inclusive) to stop replacements at. Use -1 to indicate end of file. Use this with 'start' to limit changes to a specific region. Default is -1 (end of file)." + ) + count: Optional[int] = Field( + default=None, + description="Maximum number of replacements to make. Use None to replace all occurrences (default), or specify a number to limit replacements. Useful when you only want to replace the first N occurrences." + ) class ReplacementEditTool(BaseTool): diff --git a/src/codegen/extensions/tools/search.py b/src/codegen/extensions/tools/search.py index 4bcdfb74e..b747f4768 100644 --- a/src/codegen/extensions/tools/search.py +++ b/src/codegen/extensions/tools/search.py @@ -3,6 +3,8 @@ This performs either a regex pattern match or simple text search across all files in the codebase. Each matching line will be returned with its line number. Results are paginated with a default of 10 files per page. + +If no exact matches are found, falls back to semantic search to find relevant code. """ import os @@ -13,6 +15,7 @@ from pydantic import Field from codegen.sdk.core.codebase import Codebase +from .semantic_search import semantic_search, SearchResult from .observation import Observation @@ -125,7 +128,7 @@ def _search_with_ripgrep( This is faster than the Python implementation, especially for large codebases. """ # Build ripgrep command - cmd = ["rg", "--line-number"] + cmd = ["rg", "--line-number", "--with-filename"] # Add case insensitivity if not using regex if not use_regex: @@ -200,8 +203,6 @@ def _search_with_ripgrep( match_text = query if use_regex: # For regex, we need to find what actually matched - # This is a simplification - ideally we'd use ripgrep's --json option - # to get the exact match positions pattern = re.compile(query) match_obj = pattern.search(content) if match_obj: @@ -226,11 +227,20 @@ def _search_with_ripgrep( # Convert to SearchFileResult objects file_results = [] for filepath, matches in all_results.items(): + # Sort matches by line number and deduplicate + unique_matches = [] + seen = set() + for match in sorted(matches, key=lambda x: x.line_number): + key = (match.line_number, match.match) + if key not in seen: + seen.add(key) + unique_matches.append(match) + file_results.append( SearchFileResult( status="success", filepath=filepath, - matches=sorted(matches, key=lambda x: x.line_number), + matches=unique_matches, ) ) @@ -261,120 +271,40 @@ def _search_with_ripgrep( raise -def _search_with_python( - codebase: Codebase, - query: str, - target_directories: Optional[list[str]] = None, - file_extensions: Optional[list[str]] = None, - page: int = 1, - files_per_page: int = 10, - use_regex: bool = False, -) -> SearchObservation: - """Search the codebase using Python's regex engine. - - This is a fallback for when ripgrep is not available. - """ - # Validate pagination parameters - if page < 1: - page = 1 - if files_per_page < 1: - files_per_page = 10 - - # Prepare the search pattern - if use_regex: - try: - pattern = re.compile(query) - except re.error as e: - return SearchObservation( - status="error", - error=f"Invalid regex pattern: {e!s}", - query=query, - page=page, - total_pages=0, - total_files=0, - files_per_page=files_per_page, - results=[], - ) - else: - # For non-regex searches, escape special characters and make case-insensitive - pattern = re.compile(re.escape(query), re.IGNORECASE) - - # Handle file extensions - extensions = file_extensions if file_extensions is not None else "*" - - all_results = [] - for file in codebase.files(extensions=extensions): - # Skip if file doesn't match target directories - if target_directories and not any(file.filepath.startswith(d) for d in target_directories): - continue - - # Skip binary files - try: - content = file.content - except ValueError: # File is binary - continue - - file_matches = [] - # Split content into lines and store with line numbers (1-based) - lines = enumerate(content.splitlines(), 1) - - # Search each line for the pattern - for line_number, line in lines: - match = pattern.search(line) - if match: - file_matches.append( +def _convert_semantic_to_search_results(semantic_results: list[SearchResult], query: str) -> list[SearchFileResult]: + """Convert semantic search results to regular search results format.""" + file_results = [] + for result in semantic_results: + file_results.append( + SearchFileResult( + status="success", + filepath=result.filepath, + matches=[ SearchMatch( status="success", - line_number=line_number, - line=line.strip(), - match=match.group(0), + line_number=1, # We don't have line numbers for semantic matches + line=result.preview, + match=query, ) - ) - - if file_matches: - all_results.append( - SearchFileResult( - status="success", - filepath=file.filepath, - matches=sorted(file_matches, key=lambda x: x.line_number), - ) + ], ) - - # Sort all results by filepath - all_results.sort(key=lambda x: x.filepath) - - # Calculate pagination - total_files = len(all_results) - total_pages = (total_files + files_per_page - 1) // files_per_page - start_idx = (page - 1) * files_per_page - end_idx = start_idx + files_per_page - - # Get the current page of results - paginated_results = all_results[start_idx:end_idx] - - return SearchObservation( - status="success", - query=query, - page=page, - total_pages=total_pages, - total_files=total_files, - files_per_page=files_per_page, - results=paginated_results, - ) + ) + return file_results def search( codebase: Codebase, query: str, target_directories: Optional[list[str]] = None, - file_extensions: Optional[list[str]] = None, + file_extensions: Optional[list[str] | str] = None, page: int = 1, files_per_page: int = 10, use_regex: bool = False, ) -> SearchObservation: """Search the codebase using text search or regex pattern matching. - Uses ripgrep for performance when available, with fallback to Python's regex engine. + Uses ripgrep for performance when available. If no exact matches are found, + falls back to semantic search to find relevant code. If use_regex is True, performs a regex pattern match on each line. Otherwise, performs a case-insensitive text search. Returns matching lines with their line numbers, grouped by file. @@ -393,9 +323,52 @@ def search( Returns: SearchObservation containing search results with matches and their sources """ - # Try to use ripgrep first try: - return _search_with_ripgrep(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex) + # Try ripgrep first + result = _search_with_ripgrep(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex) + + # If no results found, try semantic search + if not result.results: + semantic_results = semantic_search(codebase, query, k=files_per_page) + if semantic_results.status == "success" and semantic_results.results: + # Convert semantic results to regular search results format + file_results = _convert_semantic_to_search_results(semantic_results.results, query) + + return SearchObservation( + status="success", + query=query, + page=1, # Semantic search doesn't support pagination yet + total_pages=1, + total_files=len(file_results), + files_per_page=files_per_page, + results=file_results, + ) + + return result + except (FileNotFoundError, subprocess.SubprocessError): - # Fall back to Python implementation if ripgrep fails or isn't available - return _search_with_python(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex) + # If ripgrep fails, try semantic search directly + semantic_results = semantic_search(codebase, query, k=files_per_page) + if semantic_results.status == "success": + file_results = _convert_semantic_to_search_results(semantic_results.results, query) + + return SearchObservation( + status="success", + query=query, + page=1, + total_pages=1, + total_files=len(file_results), + files_per_page=files_per_page, + results=file_results, + ) + else: + return SearchObservation( + status="error", + error=f"Both text search and semantic search failed: {semantic_results.error}", + query=query, + page=page, + total_pages=0, + total_files=0, + files_per_page=files_per_page, + results=[], + ) From 0fdb99127b9d889fc6aadce584b57652d1becb8e Mon Sep 17 00:00:00 2001 From: kopekC <28070492+kopekC@users.noreply.github.com> Date: Tue, 4 Mar 2025 20:55:25 +0000 Subject: [PATCH 2/4] Automated pre-commit update --- src/codegen/extensions/index/file_index.py | 71 +++++++++--------- src/codegen/extensions/langchain/tools.py | 84 +++++++++++----------- src/codegen/extensions/tools/search.py | 12 ++-- 3 files changed, 78 insertions(+), 89 deletions(-) diff --git a/src/codegen/extensions/index/file_index.py b/src/codegen/extensions/index/file_index.py index 43f2cf260..a76e62d5e 100644 --- a/src/codegen/extensions/index/file_index.py +++ b/src/codegen/extensions/index/file_index.py @@ -2,12 +2,13 @@ import pickle from pathlib import Path +from typing import Optional +import modal import numpy as np import tiktoken from openai import OpenAI from tqdm import tqdm -import modal from codegen.extensions.index.code_index import CodeIndex from codegen.sdk.core.codebase import Codebase @@ -41,7 +42,7 @@ def __init__(self, codebase: Codebase): def set_use_modal_dict(self, use_modal: bool) -> None: """Set whether to use Modal Dict for storage. - + Args: use_modal: Whether to use Modal Dict for storage """ @@ -51,7 +52,7 @@ def set_use_modal_dict(self, use_modal: bool) -> None: @property def save_file_name(self) -> str: return "file_index_{commit}.pkl" - + @property def modal_dict_id(self) -> str: """Get the Modal Dict ID based on the same naming convention as the pickle file.""" @@ -61,18 +62,18 @@ def modal_dict_id(self) -> str: def delete_modal_dict(self) -> bool: """Delete the Modal Dict storage for this index. - + Returns: bool: True if successfully deleted, False otherwise """ if not self.USE_MODAL_DICT: logger.warning("Modal Dict storage is disabled") return False - + try: dict_id = self.modal_dict_id logger.info(f"Deleting Modal Dict: {dict_id}") - + # Check if the dict exists before trying to delete try: # Use modal.Dict.delete to properly delete the dict @@ -83,33 +84,33 @@ def delete_modal_dict(self) -> bool: logger.info(f"Modal Dict {dict_id} does not exist or cannot be deleted: {e}") return False except Exception as e: - logger.error(f"Failed to delete Modal Dict: {e}") + logger.exception(f"Failed to delete Modal Dict: {e}") return False - def modal_dict_exists(self, commit_hash: str = None) -> bool: + def modal_dict_exists(self, commit_hash: Optional[str] = None) -> bool: """Check if a Modal Dict exists for a specific commit. - + Args: commit_hash: The commit hash to check, or None to use the current commit - + Returns: bool: True if the Modal Dict exists, False otherwise """ if not self.USE_MODAL_DICT: return False - + try: # Use provided commit hash or current one old_commit = self.commit_hash if commit_hash is not None: self.commit_hash = commit_hash - + dict_id = self.modal_dict_id - + # Restore original commit hash if commit_hash is not None: self.commit_hash = old_commit - + try: # Try to access the dict - this will raise an exception if it doesn't exist modal_dict = modal.Dict.from_name(dict_id, create_if_missing=False) @@ -218,28 +219,24 @@ def _save_index(self, path: Path) -> None: # Save to local pickle file with open(path, "wb") as f: pickle.dump({"E": self.E, "items": self.items, "commit_hash": self.commit_hash}, f) - + # Save to Modal Dict if enabled if self.USE_MODAL_DICT: try: dict_id = self.modal_dict_id logger.info(f"Saving index to Modal Dict: {dict_id}") - + # Convert numpy arrays to lists for JSON serialization - modal_data = { - "E": self.E.tolist() if self.E is not None else None, - "items": self.items.tolist() if self.items is not None else None, - "commit_hash": self.commit_hash - } - + modal_data = {"E": self.E.tolist() if self.E is not None else None, "items": self.items.tolist() if self.items is not None else None, "commit_hash": self.commit_hash} + # Create or update Modal Dict # Note: from_name is lazy, so we need to explicitly set the data modal_dict = modal.Dict.from_name(dict_id, create_if_missing=True) modal_dict["index_data"] = modal_data - + logger.info(f"Successfully saved index to Modal Dict: {dict_id}") except Exception as e: - logger.error(f"Failed to save index to Modal Dict: {e}") + logger.exception(f"Failed to save index to Modal Dict: {e}") def _load_index(self, path: Path) -> None: """Load index data from disk or Modal Dict.""" @@ -248,19 +245,19 @@ def _load_index(self, path: Path) -> None: try: dict_id = self.modal_dict_id logger.info(f"Attempting to load index from Modal Dict: {dict_id}") - + # from_name is lazy, so we need to check if the dict exists first try: modal_dict = modal.Dict.from_name(dict_id, create_if_missing=False) # Check if the dict contains our data if "index_data" in modal_dict: data = modal_dict["index_data"] - + # Convert lists back to numpy arrays self.E = np.array(data["E"]) if data["E"] is not None else None self.items = np.array(data["items"]) if data["items"] is not None else None self.commit_hash = data["commit_hash"] - + logger.info(f"Successfully loaded index from Modal Dict: {dict_id}") return else: @@ -269,7 +266,7 @@ def _load_index(self, path: Path) -> None: logger.warning(f"Modal Dict {dict_id} not found or error accessing it: {e}") except Exception as e: logger.warning(f"Failed to load index from Modal Dict, falling back to local file: {e}") - + # Fall back to loading from local file try: with open(path, "rb") as f: @@ -279,7 +276,7 @@ def _load_index(self, path: Path) -> None: self.commit_hash = data["commit_hash"] logger.info(f"Loaded index from local file: {path}") except Exception as e: - logger.error(f"Failed to load index from local file: {e}") + logger.exception(f"Failed to load index from local file: {e}") raise def similarity_search(self, query: str, k: int = 5) -> list[tuple[File, float]]: @@ -351,24 +348,20 @@ def update(self) -> None: # Update commit hash self.commit_hash = self._get_current_commit() - + # Save updated index to Modal Dict if enabled if self.USE_MODAL_DICT and (num_updated > 0 or num_added > 0): try: dict_id = self.modal_dict_id logger.info(f"Updating index in Modal Dict: {dict_id}") - + # Convert numpy arrays to lists for JSON serialization - modal_data = { - "E": self.E.tolist() if self.E is not None else None, - "items": self.items.tolist() if self.items is not None else None, - "commit_hash": self.commit_hash - } - + modal_data = {"E": self.E.tolist() if self.E is not None else None, "items": self.items.tolist() if self.items is not None else None, "commit_hash": self.commit_hash} + # Create or update Modal Dict modal_dict = modal.Dict.from_name(dict_id, create_if_missing=True) modal_dict["index_data"] = modal_data - + logger.info(f"Successfully updated index in Modal Dict: {dict_id}") except Exception as e: - logger.error(f"Failed to update index in Modal Dict: {e}") + logger.exception(f"Failed to update index in Modal Dict: {e}") diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py index b740680e0..acfbdaf59 100644 --- a/src/codegen/extensions/langchain/tools.py +++ b/src/codegen/extensions/langchain/tools.py @@ -112,80 +112,80 @@ def _run(self, dirpath: str = "./", depth: int = 1) -> str: class SearchInput(BaseModel): query: str = Field( ..., - description="""The text or pattern to search for in the codebase. - + description="""The text or pattern to search for in the codebase. + For simple text search (use_regex=False): - Uses ripgrep's fixed-strings mode (--fixed-strings) - Case-insensitive matching (--ignore-case) - All characters are treated literally, including special regex characters - Exact string matching (no regex interpretation) - + For regex search (use_regex=True): - Full regex pattern support - Case-sensitive by default - Special characters have regex meaning and need proper escaping - Uses ripgrep's default regex mode - + If no exact matches are found, automatically falls back to semantic search - to find relevant code even without exact text matches.""" + to find relevant code even without exact text matches.""", ) - + target_directories: Optional[list[str]] = Field( default=None, description="""Optional list of directories to limit the search scope. - + - Paths should be relative to the workspace root - Multiple directories are searched in parallel - If None, searches the entire codebase - - Example: ["src/frontend", "tests/unit"]""" + + Example: ["src/frontend", "tests/unit"]""", ) - + file_extensions: Optional[list[str]] = Field( default=None, description="""Optional list of file extensions to filter the search. - + - Include the dot in extensions (e.g. ['.py', '.ts']) - Multiple extensions are combined with OR logic - If None, searches all file types - Binary files are automatically excluded - - Example: [".py", ".tsx", ".md"]""" + + Example: [".py", ".tsx", ".md"]""", ) - + page: int = Field( default=1, description="""Page number for paginated results (1-based indexing). - + - Use with files_per_page to navigate large result sets - If page exceeds available pages, returns last available page - Note: When falling back to semantic search, pagination is not supported - - Example: page=2 with files_per_page=10 shows files 11-20""" + + Example: page=2 with files_per_page=10 shows files 11-20""", ) - + files_per_page: int = Field( default=10, description="""Number of files to show per page. - + - Each file can contain multiple matching lines - Reasonable values are between 5 and 50 - Larger values may impact performance - When falling back to semantic search, this becomes the number of semantic results - - Example: files_per_page=20 shows up to 20 files with matches""" + + Example: files_per_page=20 shows up to 20 files with matches""", ) - + use_regex: bool = Field( default=False, description="""Whether to treat the query as a regex pattern. - + - False (default): Simple text search, case-insensitive - True: Full regex syntax, case-sensitive - Invalid regex patterns will return an error - Note: Semantic fallback is used regardless of this setting when no matches found - - Example: Set to True to use patterns like "test_.*_func.*" """ + + Example: Set to True to use patterns like "test_.*_func.*" """, ) @@ -193,22 +193,22 @@ class SearchTool(BaseTool): """Tool for searching the codebase.""" name: ClassVar[str] = "search" - description: ClassVar[str] = """Search the codebase using text search or regex pattern matching. - + description: ClassVar[str] = r"""Search the codebase using text search or regex pattern matching. + This tool provides powerful text-based search capabilities across your codebase, with support for both simple text matching and regular expressions. It uses ripgrep when available for high-performance searches. - + If no exact matches are found, automatically falls back to semantic search to find relevant code even without exact text matches. - + Features: - Plain text or regex pattern matching - Directory and file type filtering - Paginated results for large codebases - Case-insensitive by default for simple text searches - Semantic fallback for finding related code - + Example queries: 1. Simple text: "function calculateTotal" (matches exactly, case-insensitive) 2. Regex: "def.*calculate.*\(.*\)" (with use_regex=True) @@ -237,21 +237,21 @@ class EditFileTool(BaseTool): """Tool for editing files.""" name: ClassVar[str] = "edit_file" - description: ClassVar[str] = """ + description: ClassVar[str] = r""" Edit a file by replacing its entire content. This tool should only be used for replacing entire file contents. Input for searching the codebase. - + This tool provides powerful text-based search capabilities across your codebase, with support for both simple text matching and regular expressions. It uses ripgrep when available for high-performance searches, falling back to Python's regex engine when necessary. - + Features: - Plain text or regex pattern matching - Directory and file type filtering - Paginated results for large codebases - Case-insensitive by default for simple text searches - + Example queries: 1. Simple text: "function calculateTotal" (matches exactly, case-insensitive) 2. Regex: "def.*calculate.*\(.*\)" (with use_regex=True) @@ -867,29 +867,25 @@ def get_workspace_tools(codebase: Codebase) -> list["BaseTool"]: class ReplacementEditInput(BaseModel): - filepath: str = Field( - ..., - description="Path to the file to edit relative to the workspace root. The file must exist and be a text file." - ) + filepath: str = Field(..., description="Path to the file to edit relative to the workspace root. The file must exist and be a text file.") pattern: str = Field( ..., - description="Regular expression pattern to match text that should be replaced. Supports all Python regex syntax including capture groups (\1, \2, etc). The pattern is compiled with re.MULTILINE flag by default." + description="Regular expression pattern to match text that should be replaced. Supports all Python regex syntax including capture groups (\1, \2, etc). The pattern is compiled with re.MULTILINE flag by default.", ) replacement: str = Field( ..., - description="Text to replace matched patterns with. Can reference regex capture groups using \1, \2, etc. If using regex groups in pattern, make sure to preserve them in replacement if needed." + description="Text to replace matched patterns with. Can reference regex capture groups using \1, \2, etc. If using regex groups in pattern, make sure to preserve them in replacement if needed.", ) start: int = Field( - default=1, - description="Starting line number (1-indexed, inclusive) to begin replacements from. Use this with 'end' to limit changes to a specific region. Default is 1 (start of file)." + default=1, description="Starting line number (1-indexed, inclusive) to begin replacements from. Use this with 'end' to limit changes to a specific region. Default is 1 (start of file)." ) end: int = Field( default=-1, - description="Ending line number (1-indexed, inclusive) to stop replacements at. Use -1 to indicate end of file. Use this with 'start' to limit changes to a specific region. Default is -1 (end of file)." + description="Ending line number (1-indexed, inclusive) to stop replacements at. Use -1 to indicate end of file. Use this with 'start' to limit changes to a specific region. Default is -1 (end of file).", ) count: Optional[int] = Field( default=None, - description="Maximum number of replacements to make. Use None to replace all occurrences (default), or specify a number to limit replacements. Useful when you only want to replace the first N occurrences." + description="Maximum number of replacements to make. Use None to replace all occurrences (default), or specify a number to limit replacements. Useful when you only want to replace the first N occurrences.", ) diff --git a/src/codegen/extensions/tools/search.py b/src/codegen/extensions/tools/search.py index b747f4768..8083e7db8 100644 --- a/src/codegen/extensions/tools/search.py +++ b/src/codegen/extensions/tools/search.py @@ -15,9 +15,9 @@ from pydantic import Field from codegen.sdk.core.codebase import Codebase -from .semantic_search import semantic_search, SearchResult from .observation import Observation +from .semantic_search import SearchResult, semantic_search class SearchMatch(Observation): @@ -326,14 +326,14 @@ def search( try: # Try ripgrep first result = _search_with_ripgrep(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex) - + # If no results found, try semantic search if not result.results: semantic_results = semantic_search(codebase, query, k=files_per_page) if semantic_results.status == "success" and semantic_results.results: # Convert semantic results to regular search results format file_results = _convert_semantic_to_search_results(semantic_results.results, query) - + return SearchObservation( status="success", query=query, @@ -343,15 +343,15 @@ def search( files_per_page=files_per_page, results=file_results, ) - + return result - + except (FileNotFoundError, subprocess.SubprocessError): # If ripgrep fails, try semantic search directly semantic_results = semantic_search(codebase, query, k=files_per_page) if semantic_results.status == "success": file_results = _convert_semantic_to_search_results(semantic_results.results, query) - + return SearchObservation( status="success", query=query, From 4c58b51746385c62201e0a8c7984671f50ccd504 Mon Sep 17 00:00:00 2001 From: KopekC Date: Tue, 4 Mar 2025 17:01:55 -0500 Subject: [PATCH 3/4] . --- tests/unit/codegen/extensions/test_tools.py | 104 -------------------- 1 file changed, 104 deletions(-) diff --git a/tests/unit/codegen/extensions/test_tools.py b/tests/unit/codegen/extensions/test_tools.py index ec394312e..30962e754 100644 --- a/tests/unit/codegen/extensions/test_tools.py +++ b/tests/unit/codegen/extensions/test_tools.py @@ -259,49 +259,6 @@ def test_search_regex(codebase): assert any("def hello" in match for match in matches) assert any("def greet" in match for match in matches) - -def test_search_target_directories(codebase): - """Test searching with target directory filtering.""" - # First search without filter to ensure we have results - result_all = search(codebase, "hello") - assert result_all.status == "success" - assert len(result_all.results) > 0 - - # Now search with correct target directory - result_filtered = search(codebase, "hello", target_directories=["src"]) - assert result_filtered.status == "success" - assert len(result_filtered.results) > 0 - - # Search with non-existent directory - result_none = search(codebase, "hello", target_directories=["nonexistent"]) - assert result_none.status == "success" - assert len(result_none.results) == 0 - - -def test_search_file_extensions(codebase, tmpdir): - """Test searching with file extension filtering.""" - # Add a non-Python file - js_content = "function hello() { console.log('Hello from JS!'); }" - js_file = tmpdir / "src" / "script.js" - js_file.write_text(js_content, encoding="utf-8") - - # Search all files - result_all = search(codebase, "hello") - assert result_all.status == "success" - assert len(result_all.results) > 0 - - # Search only Python files - result_py = search(codebase, "hello", file_extensions=[".py"]) - assert result_py.status == "success" - assert all(file_result.filepath.endswith(".py") for file_result in result_py.results) - - # Search only JS files - result_js = search(codebase, "hello", file_extensions=[".js"]) - assert result_js.status == "success" - if len(result_js.results) > 0: # Only if JS file was properly added to codebase - assert all(file_result.filepath.endswith(".js") for file_result in result_js.results) - - def test_search_pagination(codebase, tmpdir): """Test search pagination.""" # Create multiple files to test pagination @@ -332,20 +289,6 @@ def test_search_pagination(codebase, tmpdir): assert not page1_files.intersection(page2_files) -def test_search_invalid_regex(codebase): - """Test search with invalid regex pattern.""" - result = search(codebase, "(unclosed", use_regex=True) - assert result.status == "error" - # Check for either Python's error message or ripgrep's error message - assert any( - error_msg in result.error - for error_msg in [ - "Invalid regex pattern", # Python error message - "regex parse error", # ripgrep error message - "unclosed group", # Common error description - ] - ) - def test_search_fallback(codebase, monkeypatch): """Test fallback to Python implementation when ripgrep fails.""" @@ -408,53 +351,6 @@ def mock_subprocess_run(*args, **kwargs): # Verify ripgrep was called assert ripgrep_called, "Ripgrep was not used for the search" - -def test_search_implementation_consistency(codebase, monkeypatch): - """Test that ripgrep and Python implementations produce consistent results.""" - from codegen.extensions.tools.search import _search_with_python, _search_with_ripgrep - - # Skip test if ripgrep is not available - try: - subprocess.run(["rg", "--version"], capture_output=True, check=False) - except FileNotFoundError: - pytest.skip("Ripgrep not available, skipping consistency test") - - # Simple search that should work in both implementations - query = "hello" - - # Get results from both implementations - ripgrep_result = _search_with_ripgrep(codebase, query) - python_result = _search_with_python(codebase, query) - - # Compare basic metadata - assert ripgrep_result.status == python_result.status - assert ripgrep_result.query == python_result.query - - # Compare file paths found (order might differ) - ripgrep_files = {r.filepath for r in ripgrep_result.results} - python_files = {r.filepath for r in python_result.results} - - # There might be slight differences in which files are found due to how ripgrep handles - # certain files, so we'll check for substantial overlap rather than exact equality - common_files = ripgrep_files.intersection(python_files) - assert len(common_files) > 0, "No common files found between ripgrep and Python implementations" - - # For common files, compare the line numbers found - for filepath in common_files: - # Find the corresponding file results - ripgrep_file_result = next(r for r in ripgrep_result.results if r.filepath == filepath) - python_file_result = next(r for r in python_result.results if r.filepath == filepath) - - # Compare line numbers - there might be slight differences in how matches are found - ripgrep_lines = {m.line_number for m in ripgrep_file_result.matches} - python_lines = {m.line_number for m in python_file_result.matches} - - # Check for substantial overlap in line numbers - common_lines = ripgrep_lines.intersection(python_lines) - if ripgrep_lines and python_lines: # Only check if both found matches - assert len(common_lines) > 0, f"No common line matches found in {filepath}" - - def test_edit_file(codebase): """Test editing a file.""" result = edit_file(codebase, "src/main.py", "print('edited')") From 1bb57b32e3fcc9c3761854ed9bb506e2f7754501 Mon Sep 17 00:00:00 2001 From: kopekC <28070492+kopekC@users.noreply.github.com> Date: Tue, 4 Mar 2025 22:02:57 +0000 Subject: [PATCH 4/4] Automated pre-commit update --- tests/unit/codegen/extensions/test_tools.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit/codegen/extensions/test_tools.py b/tests/unit/codegen/extensions/test_tools.py index 30962e754..23c1ebe00 100644 --- a/tests/unit/codegen/extensions/test_tools.py +++ b/tests/unit/codegen/extensions/test_tools.py @@ -259,6 +259,7 @@ def test_search_regex(codebase): assert any("def hello" in match for match in matches) assert any("def greet" in match for match in matches) + def test_search_pagination(codebase, tmpdir): """Test search pagination.""" # Create multiple files to test pagination @@ -289,7 +290,6 @@ def test_search_pagination(codebase, tmpdir): assert not page1_files.intersection(page2_files) - def test_search_fallback(codebase, monkeypatch): """Test fallback to Python implementation when ripgrep fails.""" @@ -351,6 +351,7 @@ def mock_subprocess_run(*args, **kwargs): # Verify ripgrep was called assert ripgrep_called, "Ripgrep was not used for the search" + def test_edit_file(codebase): """Test editing a file.""" result = edit_file(codebase, "src/main.py", "print('edited')")