
Commit 3e51a6d

wip context algorithm
1 parent 6b606bf commit 3e51a6d

File tree

2 files changed: 271 additions & 5 deletions


src/gitingest/output_formatter.py

Lines changed: 263 additions & 2 deletions
@@ -62,6 +62,66 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str,
     return summary, tree, content
 
 
+def format_node_with_context_limit(
+    node: FileSystemNode,
+    query: IngestionQuery,
+    max_tokens: int
+) -> tuple[str, str, str]:
+    """Generate optimized content that fits within a token limit using a greedy knapsack algorithm.
+
+    Uses relevance scores to prioritize files and maximize value within the token constraint.
+
+    Parameters
+    ----------
+    node : FileSystemNode
+        The file system node to be summarized.
+    query : IngestionQuery
+        The parsed query object containing information about the repository and query parameters.
+    max_tokens : int
+        Maximum number of tokens allowed for the output.
+
+    Returns
+    -------
+    tuple[str, str, str]
+        A tuple containing the summary, directory structure, and optimized file contents.
+    """
+    is_single_file = node.type == FileSystemNodeType.FILE
+    summary = _create_summary_prefix(query, single_file=is_single_file)
+
+    # Generate the tree structure (always included)
+    tree = "Directory structure:\n" + _create_tree_structure(query, node=node)
+    tree_tokens = _count_tokens(tree)
+
+    # Reserve tokens for the summary and the tree
+    summary_base_tokens = _count_tokens(summary) + 100  # 100-token buffer for final summary additions
+    available_tokens = max_tokens - tree_tokens - summary_base_tokens
+
+    if available_tokens <= 0:
+        # Not enough space even for the tree; return minimal content
+        content = "[Content omitted - insufficient token space]"
+        summary += f"\nEstimated tokens: {_format_token_count(summary + tree + content)}"
+        return summary, tree, content
+
+    # Apply the greedy knapsack algorithm to select optimal file contents
+    optimized_content = _optimize_content_with_knapsack(node, available_tokens)
+
+    # Update the summary with final info
+    if node.type == FileSystemNodeType.DIRECTORY:
+        # Count how many files were actually included
+        included_files = len([line for line in optimized_content.split('\n') if line.startswith('=' * 48)])
+        summary += f"Files included: {included_files} (optimized for {max_tokens:,} tokens)\n"
+    elif node.type == FileSystemNodeType.FILE:
+        summary += f"File: {node.name}\n"
+        summary += f"Lines: {len(node.content.splitlines()):,}\n"
+
+    final_content = summary + "\n" + tree + "\n" + optimized_content
+    token_estimate = _format_token_count(final_content)
+    if token_estimate:
+        summary += f"\nEstimated tokens: {token_estimate}"
+
+    return summary, tree, optimized_content
+
+
 def _create_summary_prefix(query: IngestionQuery, *, single_file: bool = False) -> str:
     """Create a prefix string for summarizing a repository or local directory.
 
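
As a quick sanity check of the token-budget split above, the reservation arithmetic can be run standalone (all numbers below are invented for illustration and are not taken from the repository):

    max_tokens = 8_000
    tree_tokens = 1_200              # stand-in for _count_tokens(tree)
    summary_base_tokens = 150 + 100  # stand-in for _count_tokens(summary) plus the 100-token buffer
    available_tokens = max_tokens - tree_tokens - summary_base_tokens
    print(available_tokens)          # 6550 tokens left for file contents
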
@@ -191,6 +251,27 @@ def _create_tree_structure(
     return tree_str
 
 
+def _count_tokens(text: str) -> int:
+    """Count actual tokens in text using tiktoken.
+
+    Parameters
+    ----------
+    text : str
+        The text to count tokens for.
+
+    Returns
+    -------
+    int
+        Number of tokens, or character/4 estimate if tiktoken fails.
+    """
+    try:
+        encoding = tiktoken.get_encoding("o200k_base")
+        return len(encoding.encode(text, disallowed_special=()))
+    except Exception:
+        # Fallback to character-based estimation
+        return len(text) // 4
+
+
 def _format_token_count(text: str) -> str | None:
     """Return a human-readable token-count string (e.g. 1.2k, 1.2 M).
 
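
The exact-count path and the character-based fallback can be compared with a small standalone snippet (not part of the commit; assumes tiktoken is installed):

    import tiktoken

    text = "def hello():\n    return 'world'\n"
    encoding = tiktoken.get_encoding("o200k_base")
    exact = len(encoding.encode(text, disallowed_special=()))
    estimate = len(text) // 4  # the fallback used when tiktoken raises
    print(exact, estimate)
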
@@ -206,8 +287,7 @@ def _format_token_count(text: str) -> str | None:
 
     """
     try:
-        encoding = tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini
-        total_tokens = len(encoding.encode(text, disallowed_special=()))
+        total_tokens = _count_tokens(text)
     except (ValueError, UnicodeEncodeError) as exc:
         logger.warning("Failed to estimate token size", extra={"error": str(exc)})
         return None
@@ -221,3 +301,184 @@ def _format_token_count(text: str) -> str | None:
             return f"{total_tokens / threshold:.1f}{suffix}"
 
     return str(total_tokens)
+
+
+def _optimize_content_with_knapsack(node: FileSystemNode, max_tokens: int) -> str:
+    """Apply a greedy knapsack algorithm to select optimal file contents within the token limit.
+
+    Parameters
+    ----------
+    node : FileSystemNode
+        Root node to extract files from.
+    max_tokens : int
+        Maximum tokens available for content.
+
+    Returns
+    -------
+    str
+        Optimized content string with the selected files.
+    """
+    # Collect all files with their metadata
+    file_items = []
+    _collect_file_items(node, file_items)
+
+    if not file_items:
+        return "[No files found]"
+
+    # Calculate the value/cost ratio for each file
+    for item in file_items:
+        relevance_score = max(item['relevance'], 1)  # Avoid division by zero
+        file_type_multiplier = _get_file_type_multiplier(item['path'])
+
+        # Value = relevance * type_multiplier * content_quality
+        content_quality = _estimate_content_quality(item['content'])
+        value = relevance_score * file_type_multiplier * content_quality
+
+        # Cost = token count
+        cost = item['tokens']
+
+        # Ratio = value per token (higher is better)
+        item['ratio'] = value / max(cost, 1)
+
+    # Sort by ratio (descending - best value first)
+    sorted_items = sorted(file_items, key=lambda x: x['ratio'], reverse=True)
+
+    # Greedy selection: pick the highest-ratio items that still fit
+    selected_items = []
+    total_tokens = 0
+
+    for item in sorted_items:
+        if total_tokens + item['tokens'] <= max_tokens:
+            selected_items.append(item)
+            total_tokens += item['tokens']
+
+    # Build the final content string
+    if not selected_items:
+        return "[No files fit within token limit]"
+
+    content_parts = []
+    for item in selected_items:
+        content_parts.append(item['content_string'])
+
+    result = "\n".join(content_parts)
+
+    logger.info(
+        f"Knapsack optimization: selected {len(selected_items)}/{len(file_items)} files, "
+        f"using {total_tokens}/{max_tokens} tokens"
+    )
+
+    return result
+
+
+def _collect_file_items(node: FileSystemNode, items: list) -> None:
+    """Recursively collect file metadata for knapsack optimization.
+
+    Parameters
+    ----------
+    node : FileSystemNode
+        Current node to process.
+    items : list
+        List to append file items to.
+    """
+    if node.type == FileSystemNodeType.FILE:
+        content_string = node.content_string
+        tokens = _count_tokens(content_string)
+
+        items.append({
+            'path': node.path_str or node.name,
+            'content': node.content,
+            'content_string': content_string,
+            'tokens': tokens,
+            'relevance': node.likelihood_score,
+            'size': node.size,
+            'node': node
+        })
+
+    elif node.type == FileSystemNodeType.DIRECTORY and node.children:
+        for child in node.children:
+            _collect_file_items(child, items)
+
+
+def _get_file_type_multiplier(file_path: str) -> float:
+    """Get a relevance multiplier based on the file type/name.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the file.
+
+    Returns
+    -------
+    float
+        Multiplier for this file type (higher = more important).
+    """
+    from pathlib import Path
+
+    path = Path(file_path)
+    name_lower = path.name.lower()
+    ext_lower = path.suffix.lower()
+
+    # High-priority files
+    if any(pattern in name_lower for pattern in ['readme', 'main', 'index', 'app', 'server', '__init__']):
+        return 2.0
+
+    # Important code files
+    if ext_lower in {'.py', '.js', '.ts', '.java', '.cpp', '.c', '.go', '.rs', '.rb'}:
+        return 1.5
+
+    # Config and setup files
+    if ext_lower in {'.json', '.yaml', '.yml', '.toml', '.ini', '.env'} or name_lower in {'dockerfile', 'makefile'}:
+        return 1.3
+
+    # Documentation
+    if ext_lower in {'.md', '.txt', '.rst'}:
+        return 1.1
+
+    # Default
+    return 1.0
+
+
+def _estimate_content_quality(content: str) -> float:
+    """Estimate content quality/informativeness.
+
+    Parameters
+    ----------
+    content : str
+        File content to analyze.
+
+    Returns
+    -------
+    float
+        Quality score (higher = more informative).
+    """
+    if not content or content.strip() in ['[Binary file]', '[Empty file]', 'Error reading file']:
+        return 0.1
+
+    lines = content.splitlines()
+    non_empty_lines = [line for line in lines if line.strip()]
+
+    if not non_empty_lines:
+        return 0.2
+
+    # Base score from content density
+    density = len(non_empty_lines) / max(len(lines), 1)
+
+    # Bonus for code-like content
+    code_indicators = 0
+    for line in non_empty_lines[:50]:  # Check the first 50 lines
+        line_stripped = line.strip()
+        if any(indicator in line_stripped for indicator in ['def ', 'class ', 'function ', 'import ', 'from ', 'const ', 'let ', 'var ']):
+            code_indicators += 1
+        if any(char in line_stripped for char in ['{', '}', '(', ')', ';', ':']):
+            code_indicators += 0.5
+
+    code_bonus = min(code_indicators / 10, 1.0)
+
+    # Penalty for very long files (diminishing returns); check the larger threshold first
+    length_penalty = 1.0
+    if len(lines) > 2000:
+        length_penalty = 0.6
+    elif len(lines) > 1000:
+        length_penalty = 0.8
+
+    return (density + code_bonus) * length_penalty
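
The selection logic above is an ordinary greedy knapsack heuristic: rank items by value per token, then take whatever still fits. A minimal standalone sketch with invented file data (paths, token counts, and scores are illustrative only, not derived from the repository):

    budget = 7_000
    files = [
        {"path": "README.md", "tokens": 300,   "value": 2.0 * 1.1 * 0.9},
        {"path": "main.py",   "tokens": 1_200, "value": 2.0 * 1.5 * 0.8},
        {"path": "utils.py",  "tokens": 5_000, "value": 1.0 * 1.5 * 0.7},
        {"path": "data.json", "tokens": 9_000, "value": 1.0 * 1.3 * 0.3},
    ]

    # Rank by value per token, then greedily take whatever still fits the budget.
    for f in files:
        f["ratio"] = f["value"] / max(f["tokens"], 1)

    selected, used = [], 0
    for f in sorted(files, key=lambda f: f["ratio"], reverse=True):
        if used + f["tokens"] <= budget:
            selected.append(f["path"])
            used += f["tokens"]

    print(selected, used)  # ['README.md', 'main.py', 'utils.py'] 6500

Like any greedy knapsack heuristic, this does not guarantee an optimal packing, but it is fast and behaves well when no single file dominates the budget.
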

src/server/ai_ingestion.py

Lines changed: 8 additions & 3 deletions
@@ -98,9 +98,14 @@ async def ai_ingest_query(
     # Generate digest using existing mechanism
     logger.info("Generating digest with AI-selected files")
 
-    # Create filtered query with selected files
-    filtered_query = _create_filtered_query(query, selected_files)
-    final_summary, final_tree, final_content = ingest_query(filtered_query)
+    # Parse context size to tokens for optimization
+    context_tokens = _parse_context_size_to_tokens(context_size)
+
+    # Use context-aware formatting instead of regular ingestion
+    from gitingest.output_formatter import format_node_with_context_limit
+    final_summary, final_tree, final_content = format_node_with_context_limit(
+        root_node, query, context_tokens
+    )
     final_selected_files = selected_files
 
     # Update summary with AI selection info
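
_parse_context_size_to_tokens is referenced here but not included in this diff. A purely hypothetical sketch of the kind of mapping the call implies (labels, values, and the default are guesses, not the project's actual implementation):

    def _parse_context_size_to_tokens(context_size: str) -> int:
        # Hypothetical mapping from a context-size label to a token budget.
        sizes = {"small": 16_000, "medium": 64_000, "large": 128_000}
        return sizes.get(str(context_size).lower(), 32_000)  # invented default
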
