163 changes: 156 additions & 7 deletions src/codegen/extensions/index/file_index.py
@@ -2,7 +2,9 @@

import pickle
from pathlib import Path
from typing import Optional

import modal
import numpy as np
import tiktoken
from openai import OpenAI
@@ -26,6 +28,7 @@ class FileIndex(CodeIndex):
EMBEDDING_MODEL = "text-embedding-3-small"
MAX_TOKENS = 8000
BATCH_SIZE = 100
USE_MODAL_DICT = True # Flag to control whether to use Modal Dict

def __init__(self, codebase: Codebase):
"""Initialize the file index.
@@ -37,10 +40,87 @@ def __init__(self, codebase: Codebase):
self.client = OpenAI()
self.encoding = tiktoken.get_encoding("cl100k_base")

def set_use_modal_dict(self, use_modal: bool) -> None:
"""Set whether to use Modal Dict for storage.

Args:
use_modal: Whether to use Modal Dict for storage
"""
self.USE_MODAL_DICT = use_modal
logger.info(f"Modal Dict storage {'enabled' if use_modal else 'disabled'}")

@property
def save_file_name(self) -> str:
return "file_index_{commit}.pkl"

@property
def modal_dict_id(self) -> str:
"""Get the Modal Dict ID based on the same naming convention as the pickle file."""
if not self.commit_hash:
return "file_index_latest"
return f"file_index_{self.commit_hash}"

def delete_modal_dict(self) -> bool:
"""Delete the Modal Dict storage for this index.

Returns:
bool: True if successfully deleted, False otherwise
"""
if not self.USE_MODAL_DICT:
logger.warning("Modal Dict storage is disabled")
return False

try:
dict_id = self.modal_dict_id
logger.info(f"Deleting Modal Dict: {dict_id}")

# modal.Dict.delete raises if the dict does not exist or cannot be deleted
try:
# Use modal.Dict.delete to properly delete the dict
modal.Dict.delete(dict_id)
logger.info(f"Successfully deleted Modal Dict: {dict_id}")
return True
except Exception as e:
logger.info(f"Modal Dict {dict_id} does not exist or cannot be deleted: {e}")
return False
except Exception as e:
logger.exception(f"Failed to delete Modal Dict: {e}")
return False

def modal_dict_exists(self, commit_hash: Optional[str] = None) -> bool:
"""Check if a Modal Dict exists for a specific commit.

Args:
commit_hash: The commit hash to check, or None to use the current commit

Returns:
bool: True if the Modal Dict exists, False otherwise
"""
if not self.USE_MODAL_DICT:
return False

try:
# Use provided commit hash or current one
old_commit = self.commit_hash
if commit_hash is not None:
self.commit_hash = commit_hash

dict_id = self.modal_dict_id

# Restore original commit hash
if commit_hash is not None:
self.commit_hash = old_commit

try:
# Try to access the dict - this will raise an exception if it doesn't exist
modal_dict = modal.Dict.from_name(dict_id, create_if_missing=False)
# Check if our data is in the dict
return "index_data" in modal_dict
except Exception:
return False
except Exception:
return False
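
One sharp edge worth noting: `modal_dict_exists` temporarily mutates `self.commit_hash` and restores it by hand, so an exception raised between the swap and the restore would leave the index pointing at the wrong commit. A `try`/`finally` variant (a sketch, not part of this PR) makes the restore unconditional:

```python
from typing import Optional
import modal

def modal_dict_exists(self, commit_hash: Optional[str] = None) -> bool:
    """Sketch: exception-safe variant of the existence check above."""
    if not self.USE_MODAL_DICT:
        return False
    old_commit = self.commit_hash
    try:
        if commit_hash is not None:
            self.commit_hash = commit_hash  # swap in the requested commit
        dict_id = self.modal_dict_id        # derives the name from commit_hash
    finally:
        self.commit_hash = old_commit       # always restored, even on error
    try:
        modal_dict = modal.Dict.from_name(dict_id, create_if_missing=False)
        return "index_data" in modal_dict   # raises if the dict does not exist
    except Exception:
        return False
```
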

def _split_by_tokens(self, text: str) -> list[str]:
"""Split text into chunks that fit within token limit."""
tokens = self.encoding.encode(text)
@@ -135,17 +215,69 @@ def _get_changed_items(self) -> set[File]:
return changed_files

def _save_index(self, path: Path) -> None:
"""Save index data to disk."""
"""Save index data to disk and optionally to Modal Dict."""
# Save to local pickle file
with open(path, "wb") as f:
pickle.dump({"E": self.E, "items": self.items, "commit_hash": self.commit_hash}, f)

# Save to Modal Dict if enabled
if self.USE_MODAL_DICT:
try:
dict_id = self.modal_dict_id
logger.info(f"Saving index to Modal Dict: {dict_id}")

# Convert numpy arrays to lists for JSON serialization
modal_data = {"E": self.E.tolist() if self.E is not None else None, "items": self.items.tolist() if self.items is not None else None, "commit_hash": self.commit_hash}

# Create or update Modal Dict
# Note: from_name is lazy, so we need to explicitly set the data
modal_dict = modal.Dict.from_name(dict_id, create_if_missing=True)
modal_dict["index_data"] = modal_data

logger.info(f"Successfully saved index to Modal Dict: {dict_id}")
except Exception as e:
logger.exception(f"Failed to save index to Modal Dict: {e}")

def _load_index(self, path: Path) -> None:
"""Load index data from disk."""
with open(path, "rb") as f:
data = pickle.load(f)
self.E = data["E"]
self.items = data["items"]
self.commit_hash = data["commit_hash"]
"""Load index data from disk or Modal Dict."""
# Try loading from Modal Dict first if enabled
if self.USE_MODAL_DICT:
try:
dict_id = self.modal_dict_id
logger.info(f"Attempting to load index from Modal Dict: {dict_id}")

# from_name is lazy; the lookup below raises if the dict does not exist
try:
modal_dict = modal.Dict.from_name(dict_id, create_if_missing=False)
# Check if the dict contains our data
if "index_data" in modal_dict:
data = modal_dict["index_data"]

# Convert lists back to numpy arrays
self.E = np.array(data["E"]) if data["E"] is not None else None
self.items = np.array(data["items"]) if data["items"] is not None else None
self.commit_hash = data["commit_hash"]

logger.info(f"Successfully loaded index from Modal Dict: {dict_id}")
return
else:
logger.info(f"No index data found in Modal Dict: {dict_id}")
except Exception as e:
logger.warning(f"Modal Dict {dict_id} not found or error accessing it: {e}")
except Exception as e:
logger.warning(f"Failed to load index from Modal Dict, falling back to local file: {e}")

# Fall back to loading from local file
try:
with open(path, "rb") as f:
data = pickle.load(f)
self.E = data["E"]
self.items = data["items"]
self.commit_hash = data["commit_hash"]
logger.info(f"Loaded index from local file: {path}")
except Exception as e:
logger.exception(f"Failed to load index from local file: {e}")
raise

def similarity_search(self, query: str, k: int = 5) -> list[tuple[File, float]]:
"""Find the k most similar files to a query.
Expand Down Expand Up @@ -216,3 +348,20 @@ def update(self) -> None:

# Update commit hash
self.commit_hash = self._get_current_commit()

# Save updated index to Modal Dict if enabled
if self.USE_MODAL_DICT and (num_updated > 0 or num_added > 0):
try:
dict_id = self.modal_dict_id
logger.info(f"Updating index in Modal Dict: {dict_id}")

# Convert numpy arrays to lists for JSON serialization
modal_data = {"E": self.E.tolist() if self.E is not None else None, "items": self.items.tolist() if self.items is not None else None, "commit_hash": self.commit_hash}

# Create or update Modal Dict
modal_dict = modal.Dict.from_name(dict_id, create_if_missing=True)
modal_dict["index_data"] = modal_data

logger.info(f"Successfully updated index in Modal Dict: {dict_id}")
except Exception as e:
logger.exception(f"Failed to update index in Modal Dict: {e}")
158 changes: 138 additions & 20 deletions src/codegen/extensions/langchain/tools.py
@@ -110,25 +110,111 @@ def _run(self, dirpath: str = "./", depth: int = 1) -> str:


class SearchInput(BaseModel):
"""Input for searching the codebase."""

query: str = Field(
...,
description="The search query to find in the codebase. When ripgrep is available, this will be passed as a ripgrep pattern. "
"For regex searches, set use_regex=True. Ripgrep is the preferred method.",
description="""The text or pattern to search for in the codebase.

For simple text search (use_regex=False):
- Uses ripgrep's fixed-strings mode (--fixed-strings)
- Case-insensitive matching (--ignore-case)
- All characters are treated literally, including special regex characters
- Exact string matching (no regex interpretation)

For regex search (use_regex=True):
- Full regex pattern support
- Case-sensitive by default
- Special characters have regex meaning and need proper escaping
- Uses ripgrep's default regex mode

If no exact matches are found, automatically falls back to semantic search
to find relevant code even without exact text matches.""",
)

target_directories: Optional[list[str]] = Field(
default=None,
description="""Optional list of directories to limit the search scope.

- Paths should be relative to the workspace root
- Multiple directories are searched in parallel
- If None, searches the entire codebase

Example: ["src/frontend", "tests/unit"]""",
)

file_extensions: Optional[list[str]] = Field(
default=None,
description="""Optional list of file extensions to filter the search.

- Include the dot in extensions (e.g. ['.py', '.ts'])
- Multiple extensions are combined with OR logic
- If None, searches all file types
- Binary files are automatically excluded

Example: [".py", ".tsx", ".md"]""",
)

page: int = Field(
default=1,
description="""Page number for paginated results (1-based indexing).

- Use with files_per_page to navigate large result sets
- If page exceeds available pages, returns last available page
- Note: When falling back to semantic search, pagination is not supported

Example: page=2 with files_per_page=10 shows files 11-20""",
)

files_per_page: int = Field(
default=10,
description="""Number of files to show per page.

- Each file can contain multiple matching lines
- Reasonable values are between 5 and 50
- Larger values may impact performance
- When falling back to semantic search, this becomes the number of semantic results

Example: files_per_page=20 shows up to 20 files with matches""",
)

use_regex: bool = Field(
default=False,
description="""Whether to treat the query as a regex pattern.

- False (default): Simple text search, case-insensitive
- True: Full regex syntax, case-sensitive
- Invalid regex patterns will return an error
- Note: Semantic fallback is used regardless of this setting when no matches found

Example: Set to True to use patterns like "test_.*_func.*" """,
)
target_directories: Optional[list[str]] = Field(default=None, description="Optional list of directories to search in")
file_extensions: Optional[list[str]] = Field(default=None, description="Optional list of file extensions to search (e.g. ['.py', '.ts'])")
page: int = Field(default=1, description="Page number to return (1-based, default: 1)")
files_per_page: int = Field(default=10, description="Number of files to return per page (default: 10)")
use_regex: bool = Field(default=False, description="Whether to treat query as a regex pattern (default: False)")
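
The plain-text vs. regex behavior documented in these fields corresponds to ripgrep flags roughly as follows (a sketch of the assumed invocation; the PR's actual search implementation is not shown in this diff):

```python
import subprocess
from typing import Optional

def rg_command(query: str, use_regex: bool = False,
               extensions: Optional[list[str]] = None) -> list[str]:
    cmd = ["rg", "--line-number"]
    if not use_regex:
        # Matches the documented defaults: literal matching, case-insensitive.
        cmd += ["--fixed-strings", "--ignore-case"]
    for ext in extensions or []:
        cmd += ["--glob", f"*{ext}"]  # e.g. ".py" -> "*.py"
    cmd.append(query)
    return cmd

# Example: subprocess.run(rg_command("calculateTotal(", extensions=[".py"]),
#                         capture_output=True, text=True)
```
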


class SearchTool(BaseTool):
"""Tool for searching the codebase."""

name: ClassVar[str] = "search"
description: ClassVar[str] = "Search the codebase using text search or regex pattern matching"
description: ClassVar[str] = r"""Search the codebase using text search or regex pattern matching.

This tool provides powerful text-based search capabilities across your codebase,
with support for both simple text matching and regular expressions. It uses ripgrep
when available for high-performance searches.

If no exact matches are found, automatically falls back to semantic search to find
relevant code even without exact text matches.

Features:
- Plain text or regex pattern matching
- Directory and file type filtering
- Paginated results for large codebases
- Case-insensitive by default for simple text searches
- Semantic fallback for finding related code

Example queries:
1. Simple text: "function calculateTotal" (matches exactly, case-insensitive)
2. Regex: "def.*calculate.*\(.*\)" (with use_regex=True)
3. File-specific: "TODO" with file_extensions=[".py", ".ts"]
4. Directory-specific: "api" with target_directories=["src/backend"]
"""
args_schema: ClassVar[type[BaseModel]] = SearchInput
codebase: Codebase = Field(exclude=True)

@@ -151,7 +237,27 @@ class EditFileTool(BaseTool):
"""Tool for editing files."""

name: ClassVar[str] = "edit_file"
description: ClassVar[str] = "Edit a file by replacing its entire content. This tool should only be used for replacing entire file contents."
description: ClassVar[str] = r"""
Edit a file by replacing its entire content. This tool should only be used for replacing entire file contents.
Input for searching the codebase.

This tool provides powerful text-based search capabilities across your codebase,
with support for both simple text matching and regular expressions. It uses ripgrep
when available for high-performance searches, falling back to Python's regex engine
when necessary.

Features:
- Plain text or regex pattern matching
- Directory and file type filtering
- Paginated results for large codebases
- Case-insensitive by default for simple text searches

Example queries:
1. Simple text: "function calculateTotal" (matches exactly, case-insensitive)
2. Regex: "def.*calculate.*\(.*\)" (with use_regex=True)
3. File-specific: "TODO" with file_extensions=[".py", ".ts"]
4. Directory-specific: "api" with target_directories=["src/backend"]
"""
args_schema: ClassVar[type[BaseModel]] = EditFileInput
codebase: Codebase = Field(exclude=True)

Expand Down Expand Up @@ -741,7 +847,7 @@ def get_workspace_tools(codebase: Codebase) -> list["BaseTool"]:
RunBashCommandTool(), # Note: This tool doesn't need the codebase
SearchTool(codebase),
# SemanticEditTool(codebase),
SemanticSearchTool(codebase),
# SemanticSearchTool(codebase),
ViewFileTool(codebase),
RelaceEditTool(codebase),
ReflectionTool(codebase),
@@ -761,14 +867,26 @@ def get_workspace_tools(codebase: Codebase) -> list["BaseTool"]:


class ReplacementEditInput(BaseModel):
"""Input for regex-based replacement editing."""

filepath: str = Field(..., description="Path to the file to edit")
pattern: str = Field(..., description="Regex pattern to match")
replacement: str = Field(..., description="Replacement text (can include regex groups)")
start: int = Field(default=1, description="Starting line number (1-indexed, inclusive). Default is 1.")
end: int = Field(default=-1, description="Ending line number (1-indexed, inclusive). Default is -1 (end of file).")
count: Optional[int] = Field(default=None, description="Maximum number of replacements. Default is None (replace all).")
filepath: str = Field(..., description="Path to the file to edit relative to the workspace root. The file must exist and be a text file.")
pattern: str = Field(
    ...,
    description=r"Regular expression pattern to match text that should be replaced. Supports all Python regex syntax, including capture groups (\1, \2, etc.). The pattern is compiled with the re.MULTILINE flag by default.",
)
replacement: str = Field(
    ...,
    description=r"Text to replace matched patterns with. Can reference regex capture groups using \1, \2, etc. If the pattern uses capture groups, make sure to preserve them in the replacement where needed.",
)
start: int = Field(
default=1, description="Starting line number (1-indexed, inclusive) to begin replacements from. Use this with 'end' to limit changes to a specific region. Default is 1 (start of file)."
)
end: int = Field(
default=-1,
description="Ending line number (1-indexed, inclusive) to stop replacements at. Use -1 to indicate end of file. Use this with 'start' to limit changes to a specific region. Default is -1 (end of file).",
)
count: Optional[int] = Field(
default=None,
description="Maximum number of replacements to make. Use None to replace all occurrences (default), or specify a number to limit replacements. Useful when you only want to replace the first N occurrences.",
)
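
A minimal sketch of the replacement semantics these fields describe — a regex applied within a 1-indexed, inclusive line window, with an optional cap on the number of replacements. This assumes the tool applies `re.subn` over the selected region; the PR's actual implementation is not shown in this diff:

```python
import re
from typing import Optional

def replace_in_region(text: str, pattern: str, replacement: str,
                      start: int = 1, end: int = -1,
                      count: Optional[int] = None) -> str:
    lines = text.splitlines(keepends=True)
    stop = len(lines) if end == -1 else end                  # -1 == end of file
    head = "".join(lines[: start - 1])
    region = "".join(lines[start - 1 : stop])
    tail = "".join(lines[stop:])
    new_region, _n = re.subn(pattern, replacement, region,
                             count=0 if count is None else count,  # 0 == all
                             flags=re.MULTILINE)
    return head + new_region + tail
```
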


class ReplacementEditTool(BaseTool):