From bce4249f3617c995edf804f0f56a2f747e54e555 Mon Sep 17 00:00:00 2001 From: Vishal Shenoy Date: Tue, 11 Feb 2025 10:39:02 -0800 Subject: [PATCH 1/3] . --- src/codegen/extensions/langchain/agent.py | 4 +- src/codegen/extensions/langchain/tools.py | 21 +-- tinygen.py | 213 ++++++++++++++++++++++ 3 files changed, 226 insertions(+), 12 deletions(-) create mode 100644 tinygen.py diff --git a/src/codegen/extensions/langchain/agent.py b/src/codegen/extensions/langchain/agent.py index 458903c24..57a9b146f 100644 --- a/src/codegen/extensions/langchain/agent.py +++ b/src/codegen/extensions/langchain/agent.py @@ -3,7 +3,7 @@ from langchain import hub from langchain.agents import AgentExecutor from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent -from langchain_core.chat_history import ChatMessageHistory +from langchain_community.chat_message_histories import ChatMessageHistory from langchain_core.runnables.history import RunnableWithMessageHistory from langchain_openai import ChatOpenAI @@ -21,6 +21,7 @@ SearchTool, SemanticEditTool, ViewFileTool, + SemanticSearchTool, ) @@ -59,6 +60,7 @@ def create_codebase_agent( MoveSymbolTool(codebase), RevealSymbolTool(codebase), SemanticEditTool(codebase), + SemanticSearchTool(codebase), CommitTool(codebase), ] diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py index fcfcd2997..1b9d4c752 100644 --- a/src/codegen/extensions/langchain/tools.py +++ b/src/codegen/extensions/langchain/tools.py @@ -1,7 +1,7 @@ """Langchain tools for workspace operations.""" import json -from typing import ClassVar, Literal, Optional +from typing import ClassVar, Literal, Optional, Type from langchain.tools import BaseTool from pydantic import BaseModel, Field @@ -233,6 +233,8 @@ def new_function(): ) + + class SemanticEditTool(BaseTool): """Tool for semantic editing of files.""" @@ -312,20 +314,17 @@ def _run( return json.dumps(result, indent=2) +class SemanticSearchInput(BaseModel): + query: str = Field(..., description="The natural language search query") + k: int = Field(default=5, description="Number of results to return") + preview_length: int = Field(default=200, description="Length of content preview in characters") + class SemanticSearchTool(BaseTool): """Tool for semantic code search.""" name: ClassVar[str] = "semantic_search" description: ClassVar[str] = "Search the codebase using natural language queries and semantic similarity" - args_schema: ClassVar[type[BaseModel]] = type( - "SemanticSearchInput", - (BaseModel,), - { - "query": (str, Field(..., description="The natural language search query")), - "k": (int, Field(default=5, description="Number of results to return")), - "preview_length": (int, Field(default=200, description="Length of content preview in characters")), - }, - ) + args_schema: ClassVar[Type[BaseModel]] = SemanticSearchInput codebase: Codebase = Field(exclude=True) def __init__(self, codebase: Codebase) -> None: @@ -333,4 +332,4 @@ def __init__(self, codebase: Codebase) -> None: def _run(self, query: str, k: int = 5, preview_length: int = 200) -> str: result = semantic_search(self.codebase, query, k=k, preview_length=preview_length) - return json.dumps(result, indent=2) + return json.dumps(result, indent=2) \ No newline at end of file diff --git a/tinygen.py b/tinygen.py new file mode 100644 index 000000000..40fe5da56 --- /dev/null +++ b/tinygen.py @@ -0,0 +1,213 @@ +from typing import List, Tuple +from codegen import Codebase +from codegen.extensions.vector_index import VectorIndex +from codegen.extensions.langchain.agent import Agent, create_codebase_agent +import shutil +import tempfile +import git +from dotenv import load_dotenv + +load_dotenv() + + +def setup_vector_search(repo_path: str) -> VectorIndex: + """Initialize and create vector index for a codebase.""" + # Initialize codebase + codebase = Codebase(repo_path) + + # Create vector index + index = VectorIndex(codebase) + + # Try to load existing index, create if not found + try: + index.load() + print("✓ Loaded existing vector index") + except FileNotFoundError: + print("⚡ Creating new vector index...") + index.create() + index.save() + print("✓ Created and saved vector index") + + return index + + +def find_relevant_files( + index: VectorIndex, query: str, k: int = 10, min_similarity: float = 0.1 +) -> List[Tuple[str, float, str]]: + """Find most relevant files for a query with previews.""" + # Perform semantic search + results = index.similarity_search(query, k=k) + + relevant_files = [] + for filepath, similarity in results: + print(filepath, similarity) + # Skip if similarity is too low + if similarity < min_similarity: + continue + + # Get file content preview + try: + file = index.codebase.get_file(filepath) + preview = file.content[:200].replace("\n", " ").strip() + if len(file.content) > 200: + preview += "..." + + relevant_files.append((filepath, similarity, preview)) + except Exception as e: + print(f"Warning: Could not read file {filepath}: {e}") + + return relevant_files + + +def clone_repo(repo_url: str) -> str: + temp_dir = tempfile.mkdtemp() + try: + print(f"Cloning repository from {repo_url} to {temp_dir}") + git.Repo.clone_from(repo_url, temp_dir) + except git.exc.GitError as e: + shutil.rmtree(temp_dir) + raise ValueError(f"Failed to clone repository: {e}") + return temp_dir + + +def process_files_with_agent( + codebase: Codebase, + files_to_process: List[Tuple[str, float, str]], + query: str, + model_name: str = "gpt-4", + temperature: float = 0, +) -> None: + """Process relevant files using the LangChain agent. + + Args: + codebase: The codebase object containing the files + files_to_process: List of tuples containing (filepath, similarity, preview) + query: The original query/prompt describing the changes to make + model_name: Name of the model to use + temperature: Model temperature + """ + # Create the agent with the codebase tools + print("\nInitializing AI agent...") + agent = create_codebase_agent( + codebase=codebase, + model_name=model_name, + temperature=temperature, + verbose=True # Enable verbose mode to see agent's thought process + ) + + print("\nProcessing files with AI agent...") + print("=" * 80) + + modifications_made = False + + # Process each file + for filepath, similarity, preview in files_to_process: + if not codebase.has_file(filepath): + print(f"⚠️ File not found: {filepath}") + continue + + print(f"\nProcessing file: {filepath}") + print(f"Similarity score: {similarity:.2f}") + print(f"Preview: {preview[:100]}...") + + # Create a specific prompt for this file + file_prompt = f""" +Analyze and modify the file {filepath} based on this request: "{query}" + +Follow these steps: +1. First use the view_file tool to see the current content +2. Analyze the changes needed based on the request +3. Use the edit_file tool to make the necessary modifications +4. Explain what changes were made and why + +Guidelines: +- Preserve the overall structure and functionality +- Only make changes that align with the request +- Ensure code quality and consistency +""" + + try: + # Invoke the agent with the file-specific prompt + result = agent.invoke( + { + "input": file_prompt, + "config": {"configurable": {"session_id": filepath}}, + } + ) + + print(f"✅ Processed {filepath}") + print(f"Agent output: {result['output']}") + print("-" * 80) + modifications_made = True + + except Exception as e: + print(f"❌ Error processing {filepath}: {str(e)}") + print("-" * 80) + + # Only commit if changes were made + if modifications_made: + print("\nCommitting changes to disk...") + try: + codebase.commit() + print("✅ Changes committed successfully") + except Exception as e: + print(f"❌ Error committing changes: {str(e)}") + else: + print("\nℹ️ No changes were made to commit") + + +def main(): + # Example usage + repo_url = "https://github.com/Textualize/rich" + query = "Delete dead code" + + repo_path = clone_repo(repo_url) + + # Setup vector search + index = setup_vector_search(repo_path) + + # Find relevant files + print("\nSearching for relevant files...") + results = find_relevant_files(index, query) + + # Display results + print("\nMost relevant files:") + print("-" * 80) + for filepath, similarity, preview in results: + print(f"\n📄 {filepath}") + print(f"Similarity: {similarity:.2f}") + print(f"Preview: {preview}") + print("-" * 80) + + # Store original file contents for diff + original_contents = {} + for filepath, _, _ in results: + if index.codebase.has_file(filepath): + original_contents[filepath] = index.codebase.get_file(filepath).content + + # Process files with AI agent + process_files_with_agent(index.codebase, results, query) + + # Print diffs for modified files + print("\nFile modifications:") + print("=" * 80) + for filepath in original_contents: + if index.codebase.has_file(filepath): + new_content = index.codebase.get_file(filepath).content + if new_content != original_contents[filepath]: + from difflib import unified_diff + + diff = unified_diff( + original_contents[filepath].splitlines(keepends=True), + new_content.splitlines(keepends=True), + fromfile=f"a/{filepath}", + tofile=f"b/{filepath}", + ) + print(f"\nChanges in {filepath}:") + print("".join(diff)) + else: + print(f"\nNo changes made to {filepath}") + + +if __name__ == "__main__": + main() From 6f9c9958a8414a5a3733d29859d7bfdcfac602c0 Mon Sep 17 00:00:00 2001 From: Vishal Shenoy Date: Tue, 11 Feb 2025 11:32:49 -0800 Subject: [PATCH 2/3] delete my tinygen --- tinygen.py | 213 ----------------------------------------------------- 1 file changed, 213 deletions(-) delete mode 100644 tinygen.py diff --git a/tinygen.py b/tinygen.py deleted file mode 100644 index 40fe5da56..000000000 --- a/tinygen.py +++ /dev/null @@ -1,213 +0,0 @@ -from typing import List, Tuple -from codegen import Codebase -from codegen.extensions.vector_index import VectorIndex -from codegen.extensions.langchain.agent import Agent, create_codebase_agent -import shutil -import tempfile -import git -from dotenv import load_dotenv - -load_dotenv() - - -def setup_vector_search(repo_path: str) -> VectorIndex: - """Initialize and create vector index for a codebase.""" - # Initialize codebase - codebase = Codebase(repo_path) - - # Create vector index - index = VectorIndex(codebase) - - # Try to load existing index, create if not found - try: - index.load() - print("✓ Loaded existing vector index") - except FileNotFoundError: - print("⚡ Creating new vector index...") - index.create() - index.save() - print("✓ Created and saved vector index") - - return index - - -def find_relevant_files( - index: VectorIndex, query: str, k: int = 10, min_similarity: float = 0.1 -) -> List[Tuple[str, float, str]]: - """Find most relevant files for a query with previews.""" - # Perform semantic search - results = index.similarity_search(query, k=k) - - relevant_files = [] - for filepath, similarity in results: - print(filepath, similarity) - # Skip if similarity is too low - if similarity < min_similarity: - continue - - # Get file content preview - try: - file = index.codebase.get_file(filepath) - preview = file.content[:200].replace("\n", " ").strip() - if len(file.content) > 200: - preview += "..." - - relevant_files.append((filepath, similarity, preview)) - except Exception as e: - print(f"Warning: Could not read file {filepath}: {e}") - - return relevant_files - - -def clone_repo(repo_url: str) -> str: - temp_dir = tempfile.mkdtemp() - try: - print(f"Cloning repository from {repo_url} to {temp_dir}") - git.Repo.clone_from(repo_url, temp_dir) - except git.exc.GitError as e: - shutil.rmtree(temp_dir) - raise ValueError(f"Failed to clone repository: {e}") - return temp_dir - - -def process_files_with_agent( - codebase: Codebase, - files_to_process: List[Tuple[str, float, str]], - query: str, - model_name: str = "gpt-4", - temperature: float = 0, -) -> None: - """Process relevant files using the LangChain agent. - - Args: - codebase: The codebase object containing the files - files_to_process: List of tuples containing (filepath, similarity, preview) - query: The original query/prompt describing the changes to make - model_name: Name of the model to use - temperature: Model temperature - """ - # Create the agent with the codebase tools - print("\nInitializing AI agent...") - agent = create_codebase_agent( - codebase=codebase, - model_name=model_name, - temperature=temperature, - verbose=True # Enable verbose mode to see agent's thought process - ) - - print("\nProcessing files with AI agent...") - print("=" * 80) - - modifications_made = False - - # Process each file - for filepath, similarity, preview in files_to_process: - if not codebase.has_file(filepath): - print(f"⚠️ File not found: {filepath}") - continue - - print(f"\nProcessing file: {filepath}") - print(f"Similarity score: {similarity:.2f}") - print(f"Preview: {preview[:100]}...") - - # Create a specific prompt for this file - file_prompt = f""" -Analyze and modify the file {filepath} based on this request: "{query}" - -Follow these steps: -1. First use the view_file tool to see the current content -2. Analyze the changes needed based on the request -3. Use the edit_file tool to make the necessary modifications -4. Explain what changes were made and why - -Guidelines: -- Preserve the overall structure and functionality -- Only make changes that align with the request -- Ensure code quality and consistency -""" - - try: - # Invoke the agent with the file-specific prompt - result = agent.invoke( - { - "input": file_prompt, - "config": {"configurable": {"session_id": filepath}}, - } - ) - - print(f"✅ Processed {filepath}") - print(f"Agent output: {result['output']}") - print("-" * 80) - modifications_made = True - - except Exception as e: - print(f"❌ Error processing {filepath}: {str(e)}") - print("-" * 80) - - # Only commit if changes were made - if modifications_made: - print("\nCommitting changes to disk...") - try: - codebase.commit() - print("✅ Changes committed successfully") - except Exception as e: - print(f"❌ Error committing changes: {str(e)}") - else: - print("\nℹ️ No changes were made to commit") - - -def main(): - # Example usage - repo_url = "https://github.com/Textualize/rich" - query = "Delete dead code" - - repo_path = clone_repo(repo_url) - - # Setup vector search - index = setup_vector_search(repo_path) - - # Find relevant files - print("\nSearching for relevant files...") - results = find_relevant_files(index, query) - - # Display results - print("\nMost relevant files:") - print("-" * 80) - for filepath, similarity, preview in results: - print(f"\n📄 {filepath}") - print(f"Similarity: {similarity:.2f}") - print(f"Preview: {preview}") - print("-" * 80) - - # Store original file contents for diff - original_contents = {} - for filepath, _, _ in results: - if index.codebase.has_file(filepath): - original_contents[filepath] = index.codebase.get_file(filepath).content - - # Process files with AI agent - process_files_with_agent(index.codebase, results, query) - - # Print diffs for modified files - print("\nFile modifications:") - print("=" * 80) - for filepath in original_contents: - if index.codebase.has_file(filepath): - new_content = index.codebase.get_file(filepath).content - if new_content != original_contents[filepath]: - from difflib import unified_diff - - diff = unified_diff( - original_contents[filepath].splitlines(keepends=True), - new_content.splitlines(keepends=True), - fromfile=f"a/{filepath}", - tofile=f"b/{filepath}", - ) - print(f"\nChanges in {filepath}:") - print("".join(diff)) - else: - print(f"\nNo changes made to {filepath}") - - -if __name__ == "__main__": - main() From 7ec8c50aab4b6f59e31a86704ba8db1590fda844 Mon Sep 17 00:00:00 2001 From: vishalshenoy <34020235+vishalshenoy@users.noreply.github.com> Date: Tue, 11 Feb 2025 19:35:35 +0000 Subject: [PATCH 3/3] Automated pre-commit update --- src/codegen/extensions/langchain/agent.py | 2 +- src/codegen/extensions/langchain/tools.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/codegen/extensions/langchain/agent.py b/src/codegen/extensions/langchain/agent.py index 57a9b146f..dc1a1b47d 100644 --- a/src/codegen/extensions/langchain/agent.py +++ b/src/codegen/extensions/langchain/agent.py @@ -20,8 +20,8 @@ RevealSymbolTool, SearchTool, SemanticEditTool, - ViewFileTool, SemanticSearchTool, + ViewFileTool, ) diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py index 1b9d4c752..f00b193a3 100644 --- a/src/codegen/extensions/langchain/tools.py +++ b/src/codegen/extensions/langchain/tools.py @@ -1,7 +1,7 @@ """Langchain tools for workspace operations.""" import json -from typing import ClassVar, Literal, Optional, Type +from typing import ClassVar, Literal, Optional from langchain.tools import BaseTool from pydantic import BaseModel, Field @@ -233,8 +233,6 @@ def new_function(): ) - - class SemanticEditTool(BaseTool): """Tool for semantic editing of files.""" @@ -319,12 +317,13 @@ class SemanticSearchInput(BaseModel): k: int = Field(default=5, description="Number of results to return") preview_length: int = Field(default=200, description="Length of content preview in characters") + class SemanticSearchTool(BaseTool): """Tool for semantic code search.""" name: ClassVar[str] = "semantic_search" description: ClassVar[str] = "Search the codebase using natural language queries and semantic similarity" - args_schema: ClassVar[Type[BaseModel]] = SemanticSearchInput + args_schema: ClassVar[type[BaseModel]] = SemanticSearchInput codebase: Codebase = Field(exclude=True) def __init__(self, codebase: Codebase) -> None: @@ -332,4 +331,4 @@ def __init__(self, codebase: Codebase) -> None: def _run(self, query: str, k: int = 5, preview_length: int = 200) -> str: result = semantic_search(self.codebase, query, k=k, preview_length=preview_length) - return json.dumps(result, indent=2) \ No newline at end of file + return json.dumps(result, indent=2)