https://claude.ai/chat/4fd9b8d3-454b-49ba-9857-0b8e8ecbc619

Summary: Cell 2 (generated by Claude) just produces one huge flat file of all classes and functions.

Improved Cell 3: maps only one particular module.

AI prompt for NEXT notebook
Is it possible to change the logic so that, instead of parsing the whole repo (which is big), the user can input the directory of a particular module, and the tool parses that module first and then only the related classes and functions that are referenced from that code? (Example module directory: C:\python\erpnext\erpnext\projects)

In [4]:
# Python Repository Map Generator for AI Processing
# Inspired by aider's approach to create concise code context for LLMs

import os
import ast
import json
import re
from typing import Dict, List, Any, Optional, Union
import sys
import importlib.util
from collections import defaultdict, Counter

# Print the running interpreter's version string as the first line of cell output.
print(f"Python version: {sys.version}")

class RepoMapGenerator:
    """Build a compact, token-budgeted map of a Python repository.

    The generator walks ``root_dir``, parses every ``.py`` file with ``ast``,
    records top-level classes and functions (plus the methods directly inside
    those classes), counts name references across all files, ranks symbols by
    reference count, and keeps the highest-ranked definitions until the
    estimated token budget ``max_tokens`` is used up.
    """

    # Both sync and async definitions are treated as functions/methods.
    _FUNCTION_NODES = (ast.FunctionDef, ast.AsyncFunctionDef)

    def __init__(self, root_dir, max_tokens=1000):
        """Store the configuration and create the empty bookkeeping tables.

        Args:
            root_dir: Directory that is walked recursively for ``.py`` files.
            max_tokens: Estimated token budget for the generated map.
        """
        self.root_dir = root_dir
        self.max_tokens = max_tokens
        self.files_data = []
        self.symbol_references = defaultdict(int)  # referenced name -> occurrence count
        self.file_dependencies = defaultdict(set)  # rel path -> top-level imported names
        self.symbol_definitions = {}  # rel path -> symbol name -> definition info
        # Created here (not only in compute_symbol_importance) so that
        # build_repo_map can be called on its own without an AttributeError.
        self.symbol_importance = {}

    def process_repository(self):
        """Scan the whole repository and return the finished repo map.

        Returns:
            Dict mapping a root-relative file path to the list of definition
            info dicts selected for that file by ``build_repo_map``.
        """
        print(f"Processing repository: {self.root_dir}")

        # First pass: collect all symbol definitions and references
        last_width = 0
        for root, dirs, files in os.walk(self.root_dir):
            dirs.sort()  # in-place sort makes os.walk descend in a stable order
            for filename in sorted(files):
                if not filename.endswith(".py"):
                    continue
                filepath = os.path.join(root, filename)
                rel_path = os.path.relpath(filepath, self.root_dir)

                # Pad to the previous message width: "\r" only moves the
                # cursor, so a shorter path would otherwise leave the tail of
                # the longer one on screen.
                message = f"Processing: {rel_path}"
                print(message.ljust(last_width), end="\r")
                last_width = len(message)

                file_data = self.parse_python_file(filepath)
                if file_data:
                    self.files_data.append(file_data)
                    self.extract_symbols_and_references(file_data)

        # Second pass: compute importance scores
        self.compute_symbol_importance()

        # Third pass: build the repo map
        repo_map = self.build_repo_map()

        print(f"\nFinished processing repository. Map contains {len(repo_map)} files.")
        return repo_map

    def parse_python_file(self, filepath: str) -> Optional[Dict[str, Any]]:
        """Parse one Python file and collect its symbols, imports and references.

        Args:
            filepath: Path of the file to read.

        Returns:
            Dict with the keys ``filepath``, ``rel_filepath``, ``source_lines``,
            ``symbols``, ``imports`` and ``references``; ``None`` when the file
            cannot be read or parsed (the error is printed, not raised).
        """
        try:
            # "utf-8-sig" strips a leading byte-order mark, which ast.parse
            # would otherwise reject as an invalid character.
            with open(filepath, 'r', encoding='utf-8-sig') as file:
                source_code = file.read()

            # Split on "\n" only: text-mode reading already normalised the
            # newlines, while str.splitlines() also splits on form feeds and
            # Unicode separators that the parser does not count as line ends,
            # which would shift every later line number.
            source_lines = source_code.split("\n")

            tree = ast.parse(source_code, filename=filepath)

            return {
                "filepath": filepath,
                "rel_filepath": os.path.relpath(filepath, self.root_dir),
                "source_lines": source_lines,
                "symbols": self.extract_symbols(tree, source_lines),
                "imports": self.extract_imports(tree),
                "references": self.extract_references(tree)
            }

        except SyntaxError as e:
            print(f"Syntax error in {filepath}: {e}")
            return None
        except Exception as e:
            # Deliberate best effort: one unreadable file must not stop the scan.
            print(f"Error parsing {filepath}: {e}")
            return None

    def extract_symbols(self, tree: ast.AST, source_lines: List[str]) -> List[Dict[str, Any]]:
        """Collect top-level classes and functions (sync or async) of a module.

        Args:
            tree: Parsed module.
            source_lines: Source text of the module, one entry per line.

        Returns:
            One dict per top-level class or function. Class dicts carry a
            ``methods`` list describing the functions directly in the class body.
        """
        symbols = []
        for node in ast.iter_child_nodes(tree):
            if isinstance(node, ast.ClassDef):
                symbols.append({
                    "type": "class",
                    "name": node.name,
                    "line_start": node.lineno,
                    "line_end": getattr(node, 'end_lineno', node.lineno),
                    "code_text": self.get_definition_text(source_lines, node),
                    "methods": [
                        self._function_symbol(item, source_lines, owner=node.name)
                        for item in node.body
                        if isinstance(item, self._FUNCTION_NODES)
                    ],
                })
            elif isinstance(node, self._FUNCTION_NODES):
                symbols.append(self._function_symbol(node, source_lines))
        return symbols

    def _function_symbol(self, node, source_lines: List[str], owner: Optional[str] = None) -> Dict[str, Any]:
        """Describe one function, or one method when ``owner`` names its class."""
        if owner:
            symbol = {
                "type": "method",
                "name": f"{owner}.{node.name}",
                "method_name": node.name,
            }
        else:
            symbol = {"type": "function", "name": node.name}
        symbol["line_start"] = node.lineno
        symbol["line_end"] = getattr(node, 'end_lineno', node.lineno)
        symbol["code_text"] = self.get_definition_text(source_lines, node)
        symbol["args"] = self.get_function_args(node)
        return symbol

    def get_definition_text(self, source_lines: List[str], node: Union[ast.ClassDef, ast.FunctionDef]) -> str:
        """Return the source text that represents a class or function.

        Definitions spanning at most six lines are returned in full. Longer
        ones are reduced to their header: every line from the ``class``/``def``
        keyword up to, but not including, the first body statement.

        Args:
            source_lines: Source text of the module, one entry per line.
            node: Class or (async) function definition from that module.

        Returns:
            The selected lines joined with newlines.
        """
        start = node.lineno - 1  # 0-indexed
        end_lineno = getattr(node, 'end_lineno', None)

        # Short definitions are cheap enough to show completely.
        if end_lineno is not None and end_lineno - node.lineno <= 5:
            return "\n".join(source_lines[start:end_lineno])

        # The header ends where the body starts. Taking that position from the
        # AST avoids guessing from ':' or bracket characters, which also occur
        # in annotations, default values and string literals.
        first = node.body[0]
        decorators = getattr(first, 'decorator_list', None)
        if decorators:
            # A decorated first statement starts at its first decorator line.
            stop = min(decorator.lineno for decorator in decorators) - 1
        else:
            stop = first.lineno - 1
            if stop < len(source_lines) and source_lines[stop][:first.col_offset].strip():
                # Text other than indentation precedes the statement, so the
                # body shares this line with the end of the header.
                stop += 1
        stop = max(stop, start + 1)

        header = source_lines[start:stop]
        # Drop blank lines and comments sitting between header and body.
        while len(header) > 1 and (not header[-1].strip() or header[-1].lstrip().startswith("#")):
            header.pop()
        return "\n".join(header)

    def get_function_args(self, node: ast.FunctionDef) -> List[str]:
        """List the parameters of a function in declaration order.

        Positional-only, regular and keyword-only parameters are rendered as
        ``name`` or ``name: annotation``; ``*args`` and ``**kwargs`` are
        rendered with their stars and without annotation.

        Args:
            node: Function definition (sync or async).

        Returns:
            One string per parameter.
        """
        arguments = node.args
        positional = list(getattr(arguments, 'posonlyargs', [])) + list(arguments.args)
        args = [self._format_arg(arg) for arg in positional]

        if arguments.vararg:
            args.append(f"*{arguments.vararg.arg}")

        args.extend(self._format_arg(arg) for arg in arguments.kwonlyargs)

        if arguments.kwarg:
            args.append(f"**{arguments.kwarg.arg}")

        return args

    def _format_arg(self, arg: ast.arg) -> str:
        """Render one parameter as ``name`` or ``name: annotation``."""
        annotation = arg.annotation
        if annotation is None:
            return arg.arg
        if isinstance(annotation, ast.Name):
            text = annotation.id
        elif isinstance(annotation, ast.Attribute):
            text = self.get_attribute_name(annotation)
        else:
            # Subscripts, unions, string annotations, ...: let ast render them
            # when the interpreter provides ast.unparse (Python 3.9+).
            unparse = getattr(ast, 'unparse', None)
            text = unparse(annotation) if unparse else ""
        return f"{arg.arg}: {text}" if text else arg.arg

    def get_attribute_name(self, node: ast.Attribute) -> str:
        """Return the dotted name of an attribute chain, e.g. ``a.b.c``.

        A chain whose innermost value is not a plain name is rooted at ``?``.
        """
        if isinstance(node.value, ast.Name):
            return f"{node.value.id}.{node.attr}"
        if isinstance(node.value, ast.Attribute):
            return f"{self.get_attribute_name(node.value)}.{node.attr}"
        return f"?.{node.attr}"

    def extract_imports(self, tree: ast.AST) -> List[Dict[str, Any]]:
        """Collect the module-level ``import`` and ``from ... import`` statements.

        Only direct children of ``tree`` are inspected; imports nested inside
        functions, classes or ``try`` blocks are not reported.
        """
        imports = []
        for node in ast.iter_child_nodes(tree):
            if isinstance(node, ast.Import):
                imports.extend(
                    {"type": "import", "name": alias.name, "asname": alias.asname}
                    for alias in node.names
                )
            elif isinstance(node, ast.ImportFrom):
                module = node.module or ""  # "from . import x" has no module name
                imports.extend(
                    {
                        "type": "import_from",
                        "module": module,
                        "name": alias.name,
                        "asname": alias.asname
                    }
                    for alias in node.names
                )
        return imports

    def extract_references(self, tree: ast.AST) -> List[str]:
        """Collect every name that is read anywhere in the module.

        Returns:
            Plain names used in a load context, plus ``base.attr`` strings for
            attribute accesses whose base is a plain name. Repeated uses appear
            repeatedly.
        """
        class ReferenceVisitor(ast.NodeVisitor):
            def __init__(self):
                self.refs = []

            def visit_Name(self, node):
                if isinstance(node.ctx, ast.Load):
                    self.refs.append(node.id)
                self.generic_visit(node)

            def visit_Attribute(self, node):
                if isinstance(node.value, ast.Name):
                    self.refs.append(f"{node.value.id}.{node.attr}")
                self.generic_visit(node)

        visitor = ReferenceVisitor()
        visitor.visit(tree)
        return visitor.refs

    def extract_symbols_and_references(self, file_data: Dict[str, Any]):
        """Merge one parsed file into the repository-wide tables.

        Fills ``symbol_definitions`` (keyed by the file's root-relative path),
        adds the file's references to ``symbol_references`` and records the
        top-level package of each import in ``file_dependencies``.
        """
        filepath = file_data["rel_filepath"]

        # Store symbol definitions
        definitions = {}
        for symbol in file_data["symbols"]:
            if symbol["type"] == "class":
                definitions[symbol["name"]] = {
                    "type": "class",
                    "line_start": symbol["line_start"],
                    "code_text": symbol["code_text"]
                }
                for method in symbol["methods"]:
                    definitions[method["name"]] = {
                        "type": "method",
                        "line_start": method["line_start"],
                        "code_text": method["code_text"],
                        "args": method["args"]
                    }
            elif symbol["type"] == "function":
                definitions[symbol["name"]] = {
                    "type": "function",
                    "line_start": symbol["line_start"],
                    "code_text": symbol["code_text"],
                    "args": symbol["args"]
                }
        self.symbol_definitions[filepath] = definitions

        # Count references
        for ref in file_data["references"]:
            self.symbol_references[ref] += 1

        # Track file dependencies based on imports
        for imp in file_data["imports"]:
            if imp["type"] == "import":
                qualified = imp["name"]
            elif imp["type"] == "import_from":
                qualified = imp["module"]
            else:
                continue
            top_level = qualified.split(".")[0]
            # Purely relative imports ("from . import x") have no module name;
            # an empty string is not a usable dependency.
            if top_level:
                self.file_dependencies[filepath].add(top_level)

    def compute_symbol_importance(self):
        """Score every symbol and store the result in ``self.symbol_importance``.

        The base score of a name is its reference count. Symbols defined in a
        file whose base name equals an imported top-level module name receive
        an extra boost equal to the number of files importing that name.
        Scores are keyed by bare symbol name, so equally named symbols in
        different files share one score.
        """
        self.symbol_importance = dict(self.symbol_references)

        dependency_counts = Counter(
            dep for deps in self.file_dependencies.values() for dep in deps
        )

        # Every parsed file is already a key of symbol_definitions, so there
        # is no need to walk the file system a second time.
        for filepath, definitions in self.symbol_definitions.items():
            basename = os.path.splitext(os.path.basename(filepath))[0]
            boost = dependency_counts.get(basename, 0)
            if not boost:
                continue
            for symbol in definitions:
                self.symbol_importance[symbol] = self.symbol_importance.get(symbol, 0) + boost

    def estimate_token_count(self, text):
        """Roughly estimate tokens: one per word and one per punctuation mark."""
        return len(re.findall(r'\w+|[^\w\s]', text))

    def build_repo_map(self):
        """Select the most important definitions that fit the token budget.

        Files are visited by descending summed symbol importance and symbols
        within a file by descending importance. A file that would not fit at
        all still contributes its single top symbol, so the result can exceed
        ``max_tokens`` slightly.

        Returns:
            Dict mapping root-relative file path to the selected definition
            info dicts.
        """
        importance = self.symbol_importance

        def rank(symbol):
            return importance.get(symbol, 0)

        file_scores = {
            filepath: sum(rank(symbol) for symbol in definitions)
            for filepath, definitions in self.symbol_definitions.items()
        }

        repo_map = {}
        total_tokens = 0

        for filepath in sorted(file_scores, key=file_scores.get, reverse=True):
            definitions = self.symbol_definitions[filepath]
            selected = []
            file_tokens = self.estimate_token_count(filepath) + 10  # Base tokens for the file path

            for symbol in sorted(definitions, key=rank, reverse=True):
                info = definitions[symbol]
                cost = self.estimate_token_count(info["code_text"]) + 5  # Extra tokens for formatting

                if total_tokens + file_tokens + cost > self.max_tokens:
                    # Keep at least the top symbol of a file we started on.
                    if not selected:
                        selected.append(info)
                        total_tokens += cost
                    break

                selected.append(info)
                file_tokens += cost

            if selected:
                repo_map[filepath] = selected
                total_tokens += file_tokens

            # Stop adding files if we're near the token budget
            if total_tokens >= self.max_tokens * 0.95:
                break

        return repo_map

    def format_repo_map_for_ai(self, repo_map):
        """Render the repo map as text: a path line, then ``│``-prefixed code.

        Each definition is followed by a ``⋮...`` line marking omitted code.
        """
        sections = []
        for filepath, symbols in repo_map.items():
            lines = [f"{filepath}:"]
            for symbol in symbols:
                lines.append("\n".join(f"│{line}" for line in symbol["code_text"].split("\n")))
                lines.append("⋮...")  # Ellipsis to indicate there's more code in the file
            sections.append("\n".join(lines))
        return "\n".join(sections)

    def output_repo_map(self, repo_map, format_type="ai", output_file=None):
        """Serialise the repo map and optionally write it to a file.

        Args:
            repo_map: Result of ``build_repo_map``.
            format_type: ``"ai"`` for the text rendering, ``"json"`` for JSON.
            output_file: When given, the serialised text is written there (UTF-8).

        Returns:
            The serialised text.

        Raises:
            ValueError: If ``format_type`` is neither ``"ai"`` nor ``"json"``.
        """
        if format_type == "ai":
            text = self.format_repo_map_for_ai(repo_map)
        elif format_type == "json":
            text = json.dumps(repo_map, indent=2)
        else:
            raise ValueError(f"Unknown format type: {format_type}")

        if output_file:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(text)
        return text

# Main execution
if __name__ == "__main__":
    # For notebook execution
    from IPython.display import display
    import ipywidgets as widgets

    # Default repository path - adjust as needed
    default_root = r"C:\python\erpnext\erpnext"

    # Input widget for repository path
    dir_input = widgets.Text(
        value='',
        placeholder='Enter the path to your Python repository',
        description='Repository Path:',
        disabled=False,
        layout=widgets.Layout(width='50%')
    )
    display(dir_input)

    # Use whatever the widget holds and fall back to the default when it is
    # empty. Previously the widget was displayed but its value was never read.
    # NOTE(review): the value is read immediately after display, so in a
    # normal top-to-bottom cell run it is still empty - confirm whether the
    # generation should move to a second cell or a button callback.
    root_directory = dir_input.value.strip() or default_root

    # os.walk silently yields nothing for a missing directory, which would
    # produce an empty map without any hint about the cause.
    if not os.path.isdir(root_directory):
        raise FileNotFoundError(f"Repository path is not a directory: {root_directory}")

    # Maximum tokens for the repo map (adjust based on your AI system's limitations)
    max_tokens = 1000000

    # Create and run the repo map generator
    generator = RepoMapGenerator(root_directory, max_tokens)
    repo_map = generator.process_repository()

    # Output the repo map in both formats
    ai_format = generator.output_repo_map(repo_map, "ai", "repo_map_ai.txt")
    json_format = generator.output_repo_map(repo_map, "json", "repo_map.json")

    print("\nRepository map has been generated.")
    print("AI-friendly format saved to: repo_map_ai.txt")
    print("JSON format saved to: repo_map.json")

    # Show a preview of the AI-friendly format (first 20 lines)
    print("\nPreview of the AI-friendly format:")
    all_lines = ai_format.split("\n")
    preview_lines = all_lines[:20]
    print("\n".join(preview_lines))
    if len(preview_lines) < len(all_lines):
        print("... (more content in repo_map_ai.txt)")

Python version: 3.12.7 (tags/v3.12.7:0b05ead, Oct  1 2024, 03:06:41) [MSC v.1941 64 bit (AMD64)]


Text(value='', description='Repository Path:', layout=Layout(width='50%'), placeholder='Enter the path to your…

Processing repository: C:\python\erpnext\erpnext
Processing: www\support\__init__.pyinit__.pyit__.pyyynit__.pyeractions.pys.pyngs.pyg_schedule.pyupplied_item.pyparison.pyem_group.pypy
Finished processing repository. Map contains 1524 files.

Repository map has been generated.
AI-friendly format saved to: repo_map_ai.txt
JSON format saved to: repo_map.json

Preview of the AI-friendly format:
setup\setup_wizard\operations\install_fixtures.py:
│def _(x, *args, **kwargs):
│	"""Redefine the translation function to return the string as is.
│
│	We want to create english records but still mark the strings as translatable.
│	The respective DocTypes have 'Translate Link Fields' enabled."""
│	return x
⋮...
│def create_bank_account(args):
⋮...
│def add_uom_data():
⋮...
│def get_fy_details(fy_start_date, fy_end_date):
⋮...
│def read_lines(filename: str) -> list[str]:
│	"""Return a list of lines from a file in the data directory."""
│	return (Path(__file__).parent.parent / "data" / filename).read_te

In [6]:
# Python Repository Map Generator for AI Processing
# Optimized for processing specific modules within larger repositories
import os
import ast
import json
import re
from typing import Dict, List, Any, Optional, Union, Set
import sys
from collections import defaultdict, Counter

# Print the running interpreter's version string as the first line of cell output.
print(f"Python version: {sys.version}")

class RepoMapGenerator:
    def __init__(self, root_dir, module_dir=None, max_tokens=1000):
        self.root_dir = os.path.abspath(root_dir)
        self.module_dir = os.path.abspath(module_dir) if module_dir else self.root_dir
        self.max_tokens = max_tokens
        self.files_data = []
        self.symbol_references = defaultdict(int)
        self.file_dependencies = defaultdict(set)
        self.symbol_definitions = {}  # filepath -> symbol -> line info
        self.module_files = set()  # Files that are part of the target module
        self.processed_files = set()  # Track files we've already processed
        self.relevant_symbols = set()  # Symbols that are relevant to the module
        
    def process_repository(self):
        """Run all passes for the target module and return the repo map.

        The passes run in this order: parse the module's own files, parse
        related files, score the symbols, build the map.

        Returns:
            The value produced by ``build_repo_map``.
        """
        for banner in (
            f"Processing repository: {self.root_dir}",
            f"Target module: {self.module_dir}",
        ):
            print(banner)

        # Passes one to three only update state on the instance.
        preparation_passes = (
            self.process_module_files,       # target module files
            self.process_related_files,      # files reached through references
            self.compute_symbol_importance,  # importance scores
        )
        for run_pass in preparation_passes:
            run_pass()

        # Fourth pass: build the repo map
        result = self.build_repo_map()
        file_total = len(result)
        print(f"\nFinished processing. Map contains {file_total} files.")
        return result
    
    def process_module_files(self):
        """First pass: parse every Python file under the target module directory.

        Each ``.py`` file below ``self.module_dir`` is added to
        ``self.module_files``. Files that parse successfully are appended to
        ``self.files_data``, merged via ``extract_symbols_and_references``,
        added to ``self.processed_files``, and their defined symbol names are
        added to ``self.relevant_symbols``.
        """
        last_width = 0
        for root, dirs, files in os.walk(self.module_dir):
            dirs.sort()  # in-place sort makes os.walk descend in a stable order
            for filename in sorted(files):
                if not filename.endswith(".py"):
                    continue
                filepath = os.path.join(root, filename)
                rel_path = os.path.relpath(filepath, self.root_dir)

                # Pad to the previous message width: "\r" only moves the
                # cursor, so a shorter path would otherwise leave the tail of
                # the longer one on screen.
                message = f"Processing module file: {rel_path}"
                print(message.ljust(last_width), end="\r")
                last_width = len(message)

                self.module_files.add(filepath)
                file_data = self.parse_python_file(filepath)
                if not file_data:
                    continue

                self.files_data.append(file_data)
                self.extract_symbols_and_references(file_data)
                self.processed_files.add(filepath)

                # Collect all symbols defined in this module.
                # NOTE(review): symbol_definitions appears to be keyed by the
                # root-relative path (process_related_files looks up
                # file_dependencies that way), so the former lookup by
                # absolute path would never match. The relative key is tried
                # first and the absolute path kept as a fallback - confirm
                # against extract_symbols_and_references.
                definitions = (
                    self.symbol_definitions.get(file_data["rel_filepath"])
                    or self.symbol_definitions.get(filepath)
                    or {}
                )
                self.relevant_symbols.update(definitions)
    
    def process_related_files(self):
        """Second pass: parse files related to the module through imports or names.

        A repository file is pulled in when

        * its base name, or the name of its directory, equals a pending
          dependency name, or
        * its base name equals one of the identifiers referenced so far,
          compared case-insensitively and ignoring underscores (so
          ``project_update.py`` matches a reference to ``ProjectUpdate``).

        Dependencies of newly parsed files are followed for at most three
        rounds so that the search does not grow to the whole repository.
        """
        def normalise(name):
            return name.replace("_", "").lower()

        # Start with the dependencies recorded for the files processed so far.
        to_process = set()
        for file_path in self.processed_files:
            rel_path = os.path.relpath(file_path, self.root_dir)
            to_process.update(self.file_dependencies.get(rel_path, set()))

        # Identifier parts referenced by the module ("pkg.func" contributes
        # "pkg" and "func"). Whole identifiers are compared for equality:
        # the former substring test let one-letter names such as "d" or "_"
        # match almost every file in the repository.
        referenced_parts = set()
        for symbol in self.symbol_references:
            for part in symbol.split("."):
                key = normalise(part)
                if key:
                    referenced_parts.add(key)

        # Walk the tree once instead of once per dependency name.
        candidates = []  # (absolute path, base name, parent directory name)
        for root, dirs, files in os.walk(self.root_dir):
            dirs.sort()
            parent = os.path.basename(root)
            for filename in sorted(files):
                if filename.endswith(".py"):
                    stem = os.path.splitext(filename)[0]
                    candidates.append((os.path.join(root, filename), stem, parent))

        seen_deps = set()    # dependency names that were already looked up
        unparsable = set()   # files that failed to parse; not retried
        iteration = 1
        while to_process and iteration <= 3:  # Limit depth to prevent processing the entire repo
            print(f"\nIteration {iteration}: Processing {len(to_process)} related dependencies")
            iteration += 1
            seen_deps |= to_process

            new_files = [
                filepath
                for filepath, stem, parent in candidates
                if filepath not in self.processed_files
                and filepath not in unparsable
                and (
                    stem in to_process
                    or parent in to_process
                    or normalise(stem) in referenced_parts
                )
            ]

            # Process the new files
            new_deps = set()
            last_width = 0
            for filepath in new_files:
                rel_path = os.path.relpath(filepath, self.root_dir)
                # Pad so a shorter path fully overwrites the previous one.
                message = f"Processing related file: {rel_path}"
                print(message.ljust(last_width), end="\r")
                last_width = len(message)

                file_data = self.parse_python_file(filepath)
                if not file_data:
                    unparsable.add(filepath)
                    continue

                self.files_data.append(file_data)
                self.extract_symbols_and_references(file_data)
                self.processed_files.add(filepath)

                # NOTE(review): symbol_definitions appears to be keyed by the
                # root-relative path, like file_dependencies below; the
                # absolute path is kept as a fallback - confirm against
                # extract_symbols_and_references.
                rel_key = file_data["rel_filepath"]
                definitions = (
                    self.symbol_definitions.get(rel_key)
                    or self.symbol_definitions.get(filepath)
                    or {}
                )
                self.relevant_symbols.update(definitions)

                # Collect new dependencies
                new_deps.update(self.file_dependencies.get(rel_key, set()))

            # A name that was already looked up cannot yield further files.
            to_process = new_deps - seen_deps
    
    def parse_python_file(self, filepath: str) -> Optional[Dict[str, Any]]:
        """Parse one Python file and collect its symbols, imports and references.

        Args:
            filepath: Path of the file to read.

        Returns:
            Dict with the keys ``filepath``, ``rel_filepath``, ``source_lines``,
            ``symbols``, ``imports`` and ``references``; ``None`` when the file
            cannot be read or parsed (the error is printed, not raised).
        """
        try:
            # "utf-8-sig" strips a leading byte-order mark, which ast.parse
            # would otherwise reject as an invalid character.
            with open(filepath, 'r', encoding='utf-8-sig') as file:
                source_code = file.read()

            # Split on "\n" only: text-mode reading already normalised the
            # newlines, while str.splitlines() also splits on form feeds and
            # Unicode separators that the parser does not count as line ends,
            # which would shift every later line number.
            source_lines = source_code.split("\n")

            # Parse the source code into an AST
            tree = ast.parse(source_code, filename=filepath)

            return {
                "filepath": filepath,
                "rel_filepath": os.path.relpath(filepath, self.root_dir),
                "source_lines": source_lines,
                "symbols": self.extract_symbols(tree, source_lines),
                "imports": self.extract_imports(tree),
                "references": self.extract_references(tree)
            }

        except SyntaxError as e:
            print(f"Syntax error in {filepath}: {e}")
            return None
        except Exception as e:
            # Deliberate best effort: one unreadable file must not stop the scan.
            print(f"Error parsing {filepath}: {e}")
            return None
    
    def extract_symbols(self, tree: ast.AST, source_lines: List[str]) -> List[Dict[str, Any]]:
        """Extract symbol definitions from AST"""
        symbols = []
        
        for node in ast.iter_child_nodes(tree):
            if isinstance(node, ast.ClassDef):
                # Extract class definition
                class_symbol = {
                    "type": "class",
                    "name": node.name,
                    "line_start": node.lineno,
                    "line_end": getattr(node, 'end_lineno', node.lineno),
                    "code_text": self.get_definition_text(source_lines, node),
                    "methods": []
                }
                
                # Extract class methods
                for item in node.body:
                    if isinstance(item, ast.FunctionDef):
                        method_symbol = {
                            "type": "method",
                            "name": f"{node.name}.{item.name}",
                            "method_name": item.name,
                            "line_start": item.lineno,
                            "line_end": getattr(item, 'end_lineno', item.lineno),
                            "code_text": self.get_definition_text(source_lines, item),
                            "args": self.get_function_args(item)
                        }
                        class_symbol["methods"].append(method_symbol)
                
                symbols.append(class_symbol)
            
            elif isinstance(node, ast.FunctionDef):
                # Extract function definition
                func_symbol = {
                    "type": "function",
                    "name": node.name,
                    "line_start": node.lineno,
                    "line_end": getattr(node, 'end_lineno', node.lineno),
                    "code_text": self.get_definition_text(source_lines, node),
                    "args": self.get_function_args(node)
                }
                symbols.append(func_symbol)
        
        return symbols
    
    def get_definition_text(self, source_lines: List[str], node: Union[ast.ClassDef, ast.FunctionDef]) -> str:
        """Return the source text that declares a class or function.

        A definition spanning at most six source lines
        (``end_lineno - lineno <= 5``) is returned in full. A longer one is
        reduced to its header: the ``class`` statement up to the first line
        containing a colon, or the ``def`` signature up to the line that closes
        its parameter list.

        Args:
            source_lines: The file's source split into lines.
            node: The class or function definition node (async functions are
                accepted as well).

        Returns:
            The selected source lines joined with newlines.
        """
        start_line = node.lineno - 1  # AST line numbers are 1-based
        end_lineno = getattr(node, 'end_lineno', None)

        # Short definitions are cheap enough to include whole.
        if end_lineno is not None and end_lineno - node.lineno <= 5:
            return "\n".join(source_lines[start_line:end_lineno])

        # Longer definitions (or nodes without end_lineno): header only.
        if isinstance(node, ast.ClassDef):
            # Include continuation lines so the base-class list is complete.
            line = source_lines[start_line]
            current_line = start_line + 1
            while current_line < len(source_lines) and ':' not in line:
                line += "\n" + source_lines[current_line]
                current_line += 1
            return line

        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            first_line = source_lines[start_line]
            signature_lines = [first_line]
            paren_count = first_line.count('(') - first_line.count(')')

            # Never read past the first body statement: parentheses inside
            # string defaults or comments can leave the naive count unbalanced.
            body = getattr(node, 'body', None)
            limit = min(len(source_lines), body[0].lineno) if body else len(source_lines)

            # Only the parenthesis balance ends the signature. A ':' is not a
            # usable terminator because annotated parameters ("a: int,")
            # contain one on every line.
            current_line = start_line + 1
            while paren_count > 0 and current_line < limit:
                next_line = source_lines[current_line]
                signature_lines.append(next_line)
                paren_count += next_line.count('(') - next_line.count(')')
                current_line += 1

            return "\n".join(signature_lines)

        # Any other node type: fall back to its first line.
        return source_lines[start_line]
    
    def get_function_args(self, node: ast.FunctionDef) -> List[str]:
        """Describe every parameter of a function definition.

        Parameters are listed in signature order: positional-only parameters
        (followed by a ``/`` marker), regular parameters, ``*args`` (or a bare
        ``*`` marker when keyword-only parameters exist without it),
        keyword-only parameters, then ``**kwargs``. Annotated parameters are
        rendered as ``name: annotation``.

        Args:
            node: The function definition node to inspect.

        Returns:
            One string per parameter or marker.
        """
        def describe(arg: ast.arg) -> str:
            # Render a single parameter, appending its annotation when one
            # can be turned back into source text.
            annotation = arg.annotation
            if annotation is None:
                return arg.arg
            if isinstance(annotation, ast.Name):
                rendered = annotation.id
            elif isinstance(annotation, ast.Attribute):
                rendered = self.get_attribute_name(annotation)
            elif hasattr(ast, "unparse"):
                # Subscripts, unions, string annotations, ... (Python 3.9+).
                rendered = ast.unparse(annotation)
            else:
                return arg.arg
            return f"{arg.arg}: {rendered}"

        spec = node.args
        args = []

        # posonlyargs only exists on Python 3.8+.
        positional_only = getattr(spec, "posonlyargs", [])
        if positional_only:
            args.extend(describe(arg) for arg in positional_only)
            args.append("/")

        args.extend(describe(arg) for arg in spec.args)

        # Add *args if present; otherwise a bare "*" introduces keyword-only
        # parameters, exactly as in the source signature.
        if spec.vararg:
            args.append(f"*{spec.vararg.arg}")
        elif spec.kwonlyargs:
            args.append("*")

        args.extend(describe(arg) for arg in spec.kwonlyargs)

        # Add **kwargs if present
        if spec.kwarg:
            args.append(f"**{spec.kwarg.arg}")

        return args
    
    def get_attribute_name(self, node: ast.Attribute) -> str:
        """Get the full name of an attribute node"""
        if isinstance(node.value, ast.Name):
            return f"{node.value.id}.{node.attr}"
        elif isinstance(node.value, ast.Attribute):
            return f"{self.get_attribute_name(node.value)}.{node.attr}"
        return f"?.{node.attr}"
    
    def extract_imports(self, tree: ast.AST) -> List[Dict[str, Any]]:
        """Collect the import statements that are direct children of ``tree``.

        Each imported name yields one dict: ``import x`` gives type ``import``
        with ``name``/``asname``; ``from m import x`` gives type
        ``import_from`` with ``module`` (empty string when absent), ``name``
        and ``asname``.
        """
        found = []

        for stmt in ast.iter_child_nodes(tree):
            if isinstance(stmt, ast.ImportFrom):
                source_module = stmt.module if stmt.module else ""
                found.extend(
                    {
                        "type": "import_from",
                        "module": source_module,
                        "name": alias.name,
                        "asname": alias.asname,
                    }
                    for alias in stmt.names
                )
            elif isinstance(stmt, ast.Import):
                found.extend(
                    {
                        "type": "import",
                        "name": alias.name,
                        "asname": alias.asname,
                    }
                    for alias in stmt.names
                )

        return found
    
    def extract_references(self, tree: ast.AST) -> List[str]:
        """List the symbol references found anywhere in ``tree``.

        A name read in load context is recorded as ``name``; an attribute
        access on a plain name is recorded as ``name.attr`` (the base name is
        then recorded too when the traversal reaches it). Entries appear in
        depth-first, pre-order traversal order.
        """
        collected = []

        def descend(current):
            # Record the node itself first, then walk its children in field order.
            if isinstance(current, ast.Name):
                if isinstance(current.ctx, ast.Load):
                    collected.append(current.id)
            elif isinstance(current, ast.Attribute):
                if isinstance(current.value, ast.Name):
                    collected.append(f"{current.value.id}.{current.attr}")
            for child in ast.iter_child_nodes(current):
                descend(child)

        descend(tree)
        return collected
    
    def extract_symbols_and_references(self, file_data: Dict[str, Any]):
        """Extract symbols and references from file data"""
        filepath = file_data["rel_filepath"]
        
        # Store symbol definitions
        self.symbol_definitions[filepath] = {}
        for symbol in file_data["symbols"]:
            if symbol["type"] == "class":
                self.symbol_definitions[filepath][symbol["name"]] = {
                    "type": "class",
                    "line_start": symbol["line_start"],
                    "code_text": symbol["code_text"]
                }
                
                # Add methods
                for method in symbol["methods"]:
                    self.symbol_definitions[filepath][method["name"]] = {
                        "type": "method",
                        "line_start": method["line_start"],
                        "code_text": method["code_text"],
                        "args": method["args"]
                    }
            
            elif symbol["type"] == "function":
                self.symbol_definitions[filepath][symbol["name"]] = {
                    "type": "function",
                    "line_start": symbol["line_start"],
                    "code_text": symbol["code_text"],
                    "args": symbol["args"]
                }
        
        # Count references
        for ref in file_data["references"]:
            self.symbol_references[ref] += 1
        
        # Track file dependencies based on imports
        for imp in file_data["imports"]:
            if imp["type"] == "import":
                self.file_dependencies[filepath].add(imp["name"].split(".")[0])
            elif imp["type"] == "import_from":
                self.file_dependencies[filepath].add(imp["module"].split(".")[0])
    
    def compute_symbol_importance(self):
        """Compute importance scores for symbols with focus on module relevance"""
        self.symbol_importance = {}
        
        # Start with reference counts as base importance
        for symbol, count in self.symbol_references.items():
            self.symbol_importance[symbol] = count
        
        # Boost importance for symbols defined in our target module
        for filepath in self.symbol_definitions:
            # Check if this file is in our target module
            abs_path = os.path.join(self.root_dir, filepath)
            is_module_file = abs_path in self.module_files
            
            for symbol in self.symbol_definitions[filepath]:
                # Boost module symbols significantly
                if is_module_file:
                    if symbol in self.symbol_importance:
                        self.symbol_importance[symbol] += 10  # Higher boost for module symbols
                    else:
                        self.symbol_importance[symbol] = 10
                
                # Ensure all symbols have a base importance
                elif symbol not in self.symbol_importance:
                    self.symbol_importance[symbol] = 1
        
        # Add importance based on dependency graph
        dependency_counts = Counter()
        for file, deps in self.file_dependencies.items():
            for dep in deps:
                dependency_counts[dep] += 1
        
        # Find Python files that match dependency names and boost their symbols
        for filepath in self.symbol_definitions:
            basename = os.path.splitext(os.path.basename(filepath))[0]
            dirname = os.path.basename(os.path.dirname(os.path.join(self.root_dir, filepath)))
            
            boost = dependency_counts.get(basename, 0) + dependency_counts.get(dirname, 0)
            
            if boost > 0:
                for symbol in self.symbol_definitions[filepath]:
                    if symbol in self.symbol_importance:
                        self.symbol_importance[symbol] += boost
    
    def estimate_token_count(self, text):
        """Roughly estimate how many LLM tokens ``text`` occupies.

        Every run of word characters and every single non-space punctuation
        character counts as one token. This is a heuristic, not a tokenizer.
        """
        token_pattern = r'\w+|[^\w\s]'
        return sum(1 for _ in re.finditer(token_pattern, text))
    
    def build_repo_map(self):
        """Build the repository map optimized for token count with focus on module relevance"""
        repo_map = {}
        total_tokens = 0
        
        # Ensure module files are included first
        module_filepaths = []
        related_filepaths = []
        
        for filepath in self.symbol_definitions:
            abs_path = os.path.join(self.root_dir, filepath)
            if abs_path in self.module_files:
                module_filepaths.append(filepath)
            else:
                related_filepaths.append(filepath)
        
        # Sort files by importance within their category
        file_importance = {}
        for filepath in self.symbol_definitions:
            # Calculate file importance based on symbols it contains
            importance = sum(
                self.symbol_importance.get(symbol, 0)
                for symbol in self.symbol_definitions[filepath]
            )
            file_importance[filepath] = importance
        
        sorted_module_files = sorted(module_filepaths, key=lambda f: file_importance[f], reverse=True)
        sorted_related_files = sorted(related_filepaths, key=lambda f: file_importance[f], reverse=True)
        
        # Combine the lists with module files first
        sorted_files = sorted_module_files + sorted_related_files
        
        # Build map with the most important files first
        for filepath in sorted_files:
            file_symbols = self.symbol_definitions[filepath]
            
            # Sort symbols by importance
            sorted_symbols = sorted(
                file_symbols.keys(),
                key=lambda s: self.symbol_importance.get(s, 0),
                reverse=True
            )
            
            file_map = []
            file_token_count = self.estimate_token_count(filepath) + 10  # Base tokens for the file path
            
            for symbol in sorted_symbols:
                symbol_info = file_symbols[symbol]
                symbol_text = symbol_info["code_text"]
                symbol_tokens = self.estimate_token_count(symbol_text) + 5  # Extra tokens for formatting
                
                # Check if adding this symbol would exceed token budget
                if total_tokens + file_token_count + symbol_tokens > self.max_tokens:
                    # If we haven't added any symbols to this file yet, add at least the most important one
                    if not file_map and symbol == sorted_symbols[0]:
                        file_map.append(symbol_info)
                        total_tokens += symbol_tokens
                    break
                
                file_map.append(symbol_info)
                file_token_count += symbol_tokens
            
            if file_map:
                repo_map[filepath] = file_map
                total_tokens += file_token_count
            
            # Stop adding files if we're near the token budget
            if total_tokens >= self.max_tokens * 0.95:
                break
        
        return repo_map
    
    def format_repo_map_for_ai(self, repo_map):
        """Format the repository map for AI consumption"""
        formatted_map = []
        
        # First add module files
        module_files = []
        related_files = []
        
        for filepath, symbols in repo_map.items():
            abs_path = os.path.join(self.root_dir, filepath) 
            is_module_file = abs_path in self.module_files
            
            file_section = [f"{filepath}:"]
            
            # Add indentation to show file structure
            for symbol in symbols:
                code_text = symbol["code_text"]
                # Add indentation and pipe character to every line
                indented_code = "\n".join(f"│{line}" for line in code_text.split("\n"))
                file_section.append(indented_code)
                file_section.append("⋮...")  # Ellipsis to indicate there's more code in the file
            
            formatted_section = "\n".join(file_section)
            
            if is_module_file:
                module_files.append(formatted_section)
            else:
                related_files.append(formatted_section)
        
        # Add module files first, then related files
        formatted_map.extend(module_files)
        
        if module_files and related_files:
            formatted_map.append("\n--- Related Dependencies ---\n")
        
        formatted_map.extend(related_files)
        
        return "\n\n".join(formatted_map)
    
    def output_repo_map(self, repo_map, format_type="ai", output_file=None):
        """Output the repository map in the specified format"""
        if format_type == "ai":
            formatted_map = self.format_repo_map_for_ai(repo_map)
            if output_file:
                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(formatted_map)
            return formatted_map
        elif format_type == "json":
            if output_file:
                with open(output_file, "w", encoding="utf-8") as f:
                    json.dump(repo_map, f, indent=2)
            return json.dumps(repo_map, indent=2)
        else:
            raise ValueError(f"Unknown format type: {format_type}")

# For Jupyter notebook execution - no argparse required
from IPython.display import display
import ipywidgets as widgets

# Notebook form controls: repository path, optional module path, token budget,
# and the two output file names.
repo_input = widgets.Text(
    description='Repository Path:',
    placeholder='Enter the path to your Python repository',
    value='',
    layout=widgets.Layout(width='80%'),
    disabled=False,
)

# Leaving the module path empty means the whole repository is the target.
module_input = widgets.Text(
    description='Module Path:',
    placeholder='Enter the path to specific module (leave empty for entire repo)',
    value='',
    layout=widgets.Layout(width='80%'),
    disabled=False,
)

# Token budget slider: 10,000 to 1,000,000 in steps of 10,000; the value is
# only updated when the handle is released.
max_tokens_input = widgets.IntSlider(
    description='Max Tokens:',
    value=100000,
    min=10000,
    max=1000000,
    step=10000,
    orientation='horizontal',
    continuous_update=False,
    readout=True,
    readout_format='d',
    disabled=False,
)

output_ai_input = widgets.Text(
    description='AI Output:',
    placeholder='Path for AI-friendly output file',
    value='repo_map_ai.txt',
    layout=widgets.Layout(width='80%'),
    disabled=False,
)

output_json_input = widgets.Text(
    description='JSON Output:',
    placeholder='Path for JSON output file',
    value='repo_map.json',
    layout=widgets.Layout(width='80%'),
    disabled=False,
)

# Render every control, in form order, with one display call.
display(
    repo_input,
    module_input,
    max_tokens_input,
    output_ai_input,
    output_json_input,
)

def run_generator(b):
    """Button callback: build the repository map and write both output files.

    Reads the repository path, module path, token budget and output file names
    from the input widgets, runs ``RepoMapGenerator``, saves the AI-friendly
    and JSON renderings, and prints the first 20 lines of the AI-friendly
    text as a preview.

    Args:
        b: The argument supplied by ``Button.on_click``; unused.
    """
    repo_dir = repo_input.value
    module_dir = module_input.value
    max_tokens = max_tokens_input.value
    output_ai = output_ai_input.value
    output_json = output_json_input.value

    if not repo_dir:
        print("Please enter a repository path")
        return

    # Validate the repository path the same way as the module path, so a
    # mistyped path is reported here instead of surfacing inside the generator.
    if not os.path.isdir(repo_dir):
        print(f"Repository directory {repo_dir} not found")
        return

    if module_dir and not os.path.isdir(module_dir):
        print(f"Module directory {module_dir} not found")
        return

    print(f"Processing repo: {repo_dir}")
    print(f"Target module: {module_dir or 'Entire repository'}")
    print(f"Max tokens: {max_tokens}")

    generator = RepoMapGenerator(repo_dir, module_dir, max_tokens)
    repo_map = generator.process_repository()

    ai_format = generator.output_repo_map(repo_map, "ai", output_ai)
    # Return value not needed; the call writes the JSON file.
    generator.output_repo_map(repo_map, "json", output_json)

    print("\nRepository map has been generated.")
    print(f"AI-friendly format saved to: {output_ai}")
    print(f"JSON format saved to: {output_json}")

    # Show a preview
    print("\nPreview of the AI-friendly format:")
    all_lines = ai_format.split("\n")
    preview_lines = all_lines[:20]
    print("\n".join(preview_lines))
    if len(preview_lines) < len(all_lines):
        print("... (more content in the output file)")

# Create the trigger button, register run_generator as its click handler,
# and render it in the notebook output.
run_button = widgets.Button(description="Generate Repo Map")
run_button.on_click(run_generator)
display(run_button)

Python version: 3.12.7 (tags/v3.12.7:0b05ead, Oct  1 2024, 03:06:41) [MSC v.1941 64 bit (AMD64)]


Text(value='', description='Repository Path:', layout=Layout(width='80%'), placeholder='Enter the path to your…

Text(value='', description='Module Path:', layout=Layout(width='80%'), placeholder='Enter the path to specific…

IntSlider(value=100000, continuous_update=False, description='Max Tokens:', max=1000000, min=10000, step=10000…

Text(value='repo_map_ai.txt', description='AI Output:', layout=Layout(width='80%'), placeholder='Path for AI-f…

Text(value='repo_map.json', description='JSON Output:', layout=Layout(width='80%'), placeholder='Path for JSON…

Button(description='Generate Repo Map', style=ButtonStyle())

In [8]:
import os
import ast

# Root directory passed to build_tree (raw string, so the backslashes are literal).
REPO_PATH = r"C:\python\frappe"
# File the rendered tree is written to.
OUTPUT_FILE = "frappe_tree.txt"

def get_definitions(filepath):
    """Return the names of all classes and functions defined in a Python file.

    The whole syntax tree is walked, so nested definitions (methods, inner
    classes and functions) are included. Both ``def`` and ``async def``
    functions are reported.

    Args:
        filepath: Path of the Python source file to parse.

    Returns:
        A ``(classes, funcs)`` tuple of name lists. Both lists are empty when
        the file cannot be read or parsed.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            tree = ast.parse(f.read(), filename=filepath)
    except Exception:
        # Deliberate best effort: an unreadable or unparsable file must not
        # abort the scan of the rest of the tree, so it contributes nothing.
        return [], []

    classes, funcs = [], []
    # Single pass over the tree instead of one walk per node kind.
    for node in ast.walk(tree):
        if isinstance(node, ast.ClassDef):
            classes.append(node.name)
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            funcs.append(node.name)
    return classes, funcs

def build_tree(startpath):
    """Render a directory tree of Python files with their classes and functions.

    Every directory under ``startpath`` is listed, indented by its depth. Each
    ``.py`` file is listed under its directory, followed by the classes and
    functions that ``get_definitions`` reports for it. Directories and files
    are visited in alphabetical order.

    Args:
        startpath: Root directory to walk; a trailing path separator is
            tolerated.

    Returns:
        The tree as a list of text lines.
    """
    lines = []
    for root, dirs, files in os.walk(startpath):
        # Sorting in place makes os.walk descend in a stable order, matching
        # the sorted file listing below.
        dirs.sort()

        # Depth relative to the start directory. A relative path is used
        # instead of string replacement so a trailing separator on startpath
        # (or startpath recurring inside a deeper path) cannot skew the count.
        rel = os.path.relpath(root, startpath)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1

        indent_str = '│   ' * level + '├── '
        # normpath drops a trailing separator, which would otherwise make
        # basename return an empty directory name.
        dir_name = os.path.basename(os.path.normpath(root))
        lines.append(f"{indent_str}{dir_name}/")

        sub_indent = '│   ' * (level + 1)
        for f in sorted(files):
            if f.endswith('.py'):
                file_path = os.path.join(root, f)
                lines.append(f"{sub_indent}├── {f}")
                classes, funcs = get_definitions(file_path)
                for cls in classes:
                    lines.append(f"{sub_indent}│   ├── class {cls}")
                for func in funcs:
                    lines.append(f"{sub_indent}│   ├── def {func}()")
    return lines

# Run and save output: build the tree for REPO_PATH and write it to OUTPUT_FILE,
# one entry per line, as UTF-8 (the tree lines contain box-drawing characters).
tree_lines = build_tree(REPO_PATH)
with open(OUTPUT_FILE, 'w', encoding='utf-8') as out_file:
    out_file.write('\n'.join(tree_lines))
