
Commit 3e51a6d

wip context algorithm
1 parent 6b606bf commit 3e51a6d

File tree

2 files changed: 271 additions & 5 deletions


src/gitingest/output_formatter.py

Lines changed: 263 additions & 2 deletions
@@ -62,6 +62,66 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str,
     return summary, tree, content
 
 
+def format_node_with_context_limit(
+    node: FileSystemNode,
+    query: IngestionQuery,
+    max_tokens: int
+) -> tuple[str, str, str]:
+    """Generate optimized content that fits within a token limit using a greedy knapsack algorithm.
+
+    Uses relevance scores to prioritize files and maximize value within the token constraint.
+
+    Parameters
+    ----------
+    node : FileSystemNode
+        The file system node to be summarized.
+    query : IngestionQuery
+        The parsed query object containing information about the repository and query parameters.
+    max_tokens : int
+        Maximum number of tokens allowed for the output.
+
+    Returns
+    -------
+    tuple[str, str, str]
+        A tuple containing the summary, directory structure, and optimized file contents.
+    """
+    is_single_file = node.type == FileSystemNodeType.FILE
+    summary = _create_summary_prefix(query, single_file=is_single_file)
+
+    # Generate the tree structure (always included)
+    tree = "Directory structure:\n" + _create_tree_structure(query, node=node)
+    tree_tokens = _count_tokens(tree)
+
+    # Reserve tokens for the summary and the tree
+    summary_base_tokens = _count_tokens(summary) + 100  # 100-token buffer for final summary additions
+    available_tokens = max_tokens - tree_tokens - summary_base_tokens
+
+    if available_tokens <= 0:
+        # Not enough space even for the tree; return minimal content
+        content = "[Content omitted - insufficient token space]"
+        summary += f"\nEstimated tokens: {_format_token_count(summary + tree + content)}"
+        return summary, tree, content
+
+    # Apply the greedy knapsack algorithm to select optimal file contents
+    optimized_content = _optimize_content_with_knapsack(node, available_tokens)
+
+    # Update the summary with final info
+    if node.type == FileSystemNodeType.DIRECTORY:
+        # Count how many files were actually included
+        included_files = len([line for line in optimized_content.split('\n') if line.startswith('=' * 48)])
+        summary += f"Files included: {included_files} (optimized for {max_tokens:,} tokens)\n"
+    elif node.type == FileSystemNodeType.FILE:
+        summary += f"File: {node.name}\n"
+        summary += f"Lines: {len(node.content.splitlines()):,}\n"
+
+    final_content = summary + "\n" + tree + "\n" + optimized_content
+    token_estimate = _format_token_count(final_content)
+    if token_estimate:
+        summary += f"\nEstimated tokens: {token_estimate}"
+
+    return summary, tree, optimized_content
+
+
 def _create_summary_prefix(query: IngestionQuery, *, single_file: bool = False) -> str:
     """Create a prefix string for summarizing a repository or local directory.
 
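
As a quick sanity check of the token-budget split above, the reservation arithmetic can be run standalone (all numbers below are invented for illustration and are not taken from the repository):

    max_tokens = 8_000
    tree_tokens = 1_200              # stand-in for _count_tokens(tree)
    summary_base_tokens = 150 + 100  # stand-in for _count_tokens(summary) plus the 100-token buffer
    available_tokens = max_tokens - tree_tokens - summary_base_tokens
    print(available_tokens)          # 6550 tokens left for file contents
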
@@ -191,6 +251,27 @@ def _create_tree_structure(
     return tree_str
 
 
+def _count_tokens(text: str) -> int:
+    """Count actual tokens in text using tiktoken.
+
+    Parameters
+    ----------
+    text : str
+        The text to count tokens for.
+
+    Returns
+    -------
+    int
+        Number of tokens, or character/4 estimate if tiktoken fails.
+    """
+    try:
+        encoding = tiktoken.get_encoding("o200k_base")
+        return len(encoding.encode(text, disallowed_special=()))
+    except Exception:
+        # Fallback to character-based estimation
+        return len(text) // 4
+
+
 def _format_token_count(text: str) -> str | None:
     """Return a human-readable token-count string (e.g. 1.2k, 1.2 M).
 
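
The exact-count path and the character-based fallback can be compared with a small standalone snippet (not part of the commit; assumes tiktoken is installed):

    import tiktoken

    text = "def hello():\n    return 'world'\n"
    encoding = tiktoken.get_encoding("o200k_base")
    exact = len(encoding.encode(text, disallowed_special=()))
    estimate = len(text) // 4  # the fallback used when tiktoken raises
    print(exact, estimate)
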
@@ -206,8 +287,7 @@ def _format_token_count(text: str) -> str | None:
 
     """
     try:
-        encoding = tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini
-        total_tokens = len(encoding.encode(text, disallowed_special=()))
+        total_tokens = _count_tokens(text)
     except (ValueError, UnicodeEncodeError) as exc:
         logger.warning("Failed to estimate token size", extra={"error": str(exc)})
         return None
@@ -221,3 +301,184 @@ def _format_token_count(text: str) -> str | None:
             return f"{total_tokens / threshold:.1f}{suffix}"
 
     return str(total_tokens)
+
+
+def _optimize_content_with_knapsack(node: FileSystemNode, max_tokens: int) -> str:
+    """Apply a greedy knapsack algorithm to select optimal file contents within the token limit.
+
+    Parameters
+    ----------
+    node : FileSystemNode
+        Root node to extract files from.
+    max_tokens : int
+        Maximum tokens available for content.
+
+    Returns
+    -------
+    str
+        Optimized content string with the selected files.
+    """
+    # Collect all files with their metadata
+    file_items = []
+    _collect_file_items(node, file_items)
+
+    if not file_items:
+        return "[No files found]"
+
+    # Calculate the value/cost ratio for each file
+    for item in file_items:
+        relevance_score = max(item['relevance'], 1)  # Avoid division by zero
+        file_type_multiplier = _get_file_type_multiplier(item['path'])
+
+        # Value = relevance * type_multiplier * content_quality
+        content_quality = _estimate_content_quality(item['content'])
+        value = relevance_score * file_type_multiplier * content_quality
+
+        # Cost = token count
+        cost = item['tokens']
+
+        # Ratio = value per token (higher is better)
+        item['ratio'] = value / max(cost, 1)
+
+    # Sort by ratio (descending - best value first)
+    sorted_items = sorted(file_items, key=lambda x: x['ratio'], reverse=True)
+
+    # Greedy selection: pick the highest-ratio items that still fit
+    selected_items = []
+    total_tokens = 0
+
+    for item in sorted_items:
+        if total_tokens + item['tokens'] <= max_tokens:
+            selected_items.append(item)
+            total_tokens += item['tokens']
+
+    # Build the final content string
+    if not selected_items:
+        return "[No files fit within token limit]"
+
+    content_parts = []
+    for item in selected_items:
+        content_parts.append(item['content_string'])
+
+    result = "\n".join(content_parts)
+
+    logger.info(
+        f"Knapsack optimization: selected {len(selected_items)}/{len(file_items)} files, "
+        f"using {total_tokens}/{max_tokens} tokens"
+    )
+
+    return result
+
+
+def _collect_file_items(node: FileSystemNode, items: list) -> None:
+    """Recursively collect file metadata for knapsack optimization.
+
+    Parameters
+    ----------
+    node : FileSystemNode
+        Current node to process.
+    items : list
+        List to append file items to.
+    """
+    if node.type == FileSystemNodeType.FILE:
+        content_string = node.content_string
+        tokens = _count_tokens(content_string)
+
+        items.append({
+            'path': node.path_str or node.name,
+            'content': node.content,
+            'content_string': content_string,
+            'tokens': tokens,
+            'relevance': node.likelihood_score,
+            'size': node.size,
+            'node': node
+        })
+
+    elif node.type == FileSystemNodeType.DIRECTORY and node.children:
+        for child in node.children:
+            _collect_file_items(child, items)
+
+
+def _get_file_type_multiplier(file_path: str) -> float:
+    """Get a relevance multiplier based on the file type/name.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the file.
+
+    Returns
+    -------
+    float
+        Multiplier for this file type (higher = more important).
+    """
+    from pathlib import Path
+
+    path = Path(file_path)
+    name_lower = path.name.lower()
+    ext_lower = path.suffix.lower()
+
+    # High-priority files
+    if any(pattern in name_lower for pattern in ['readme', 'main', 'index', 'app', 'server', '__init__']):
+        return 2.0
+
+    # Important code files
+    if ext_lower in {'.py', '.js', '.ts', '.java', '.cpp', '.c', '.go', '.rs', '.rb'}:
+        return 1.5
+
+    # Config and setup files
+    if ext_lower in {'.json', '.yaml', '.yml', '.toml', '.ini', '.env'} or name_lower in {'dockerfile', 'makefile'}:
+        return 1.3
+
+    # Documentation
+    if ext_lower in {'.md', '.txt', '.rst'}:
+        return 1.1
+
+    # Default
+    return 1.0
+
+
+def _estimate_content_quality(content: str) -> float:
+    """Estimate content quality/informativeness.
+
+    Parameters
+    ----------
+    content : str
+        File content to analyze.
+
+    Returns
+    -------
+    float
+        Quality score (higher = more informative).
+    """
+    if not content or content.strip() in ['[Binary file]', '[Empty file]', 'Error reading file']:
+        return 0.1
+
+    lines = content.splitlines()
+    non_empty_lines = [line for line in lines if line.strip()]
+
+    if not non_empty_lines:
+        return 0.2
+
+    # Base score from content density
+    density = len(non_empty_lines) / max(len(lines), 1)
+
+    # Bonus for code-like content
+    code_indicators = 0
+    for line in non_empty_lines[:50]:  # Check the first 50 lines
+        line_stripped = line.strip()
+        if any(indicator in line_stripped for indicator in ['def ', 'class ', 'function ', 'import ', 'from ', 'const ', 'let ', 'var ']):
+            code_indicators += 1
+        if any(char in line_stripped for char in ['{', '}', '(', ')', ';', ':']):
+            code_indicators += 0.5
+
+    code_bonus = min(code_indicators / 10, 1.0)
+
+    # Penalty for very long files (diminishing returns); check the larger threshold first
+    length_penalty = 1.0
+    if len(lines) > 2000:
+        length_penalty = 0.6
+    elif len(lines) > 1000:
+        length_penalty = 0.8
+
+    return (density + code_bonus) * length_penalty
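
The selection logic above is an ordinary greedy knapsack heuristic: rank items by value per token, then take whatever still fits. A minimal standalone sketch with invented file data (paths, token counts, and scores are illustrative only, not derived from the repository):

    budget = 7_000
    files = [
        {"path": "README.md", "tokens": 300,   "value": 2.0 * 1.1 * 0.9},
        {"path": "main.py",   "tokens": 1_200, "value": 2.0 * 1.5 * 0.8},
        {"path": "utils.py",  "tokens": 5_000, "value": 1.0 * 1.5 * 0.7},
        {"path": "data.json", "tokens": 9_000, "value": 1.0 * 1.3 * 0.3},
    ]

    # Rank by value per token, then greedily take whatever still fits the budget.
    for f in files:
        f["ratio"] = f["value"] / max(f["tokens"], 1)

    selected, used = [], 0
    for f in sorted(files, key=lambda f: f["ratio"], reverse=True):
        if used + f["tokens"] <= budget:
            selected.append(f["path"])
            used += f["tokens"]

    print(selected, used)  # ['README.md', 'main.py', 'utils.py'] 6500

Like any greedy knapsack heuristic, this does not guarantee an optimal packing, but it is fast and behaves well when no single file dominates the budget.
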

src/server/ai_ingestion.py

Lines changed: 8 additions & 3 deletions
@@ -98,9 +98,14 @@ async def ai_ingest_query(
     # Generate digest using existing mechanism
     logger.info("Generating digest with AI-selected files")
 
-    # Create filtered query with selected files
-    filtered_query = _create_filtered_query(query, selected_files)
-    final_summary, final_tree, final_content = ingest_query(filtered_query)
+    # Parse context size to tokens for optimization
+    context_tokens = _parse_context_size_to_tokens(context_size)
+
+    # Use context-aware formatting instead of regular ingestion
+    from gitingest.output_formatter import format_node_with_context_limit
+    final_summary, final_tree, final_content = format_node_with_context_limit(
+        root_node, query, context_tokens
+    )
     final_selected_files = selected_files
 
     # Update summary with AI selection info
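
_parse_context_size_to_tokens is referenced here but not included in this diff. A purely hypothetical sketch of the kind of mapping the call implies (labels, values, and the default are guesses, not the project's actual implementation):

    def _parse_context_size_to_tokens(context_size: str) -> int:
        # Hypothetical mapping from a context-size label to a token budget.
        sizes = {"small": 16_000, "medium": 64_000, "large": 128_000}
        return sizes.get(str(context_size).lower(), 32_000)  # invented default
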
