@@ -62,6 +62,66 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str,
     return summary, tree, content


+def format_node_with_context_limit(
+    node: FileSystemNode,
+    query: IngestionQuery,
+    max_tokens: int,
+) -> tuple[str, str, str]:
+    """Generate optimized content that fits within a token limit, using a greedy knapsack heuristic.
+
+    Uses relevance scores to prioritize files and maximize value within the token constraint.
+
+    Parameters
+    ----------
+    node : FileSystemNode
+        The file system node to be summarized.
+    query : IngestionQuery
+        The parsed query object containing information about the repository and query parameters.
+    max_tokens : int
+        Maximum number of tokens allowed for the output.
+
+    Returns
+    -------
+    tuple[str, str, str]
+        A tuple containing the summary, directory structure, and optimized file contents.
+    """
+    is_single_file = node.type == FileSystemNodeType.FILE
+    summary = _create_summary_prefix(query, single_file=is_single_file)
+
+    # Generate the tree structure (always included)
+    tree = "Directory structure:\n" + _create_tree_structure(query, node=node)
+    tree_tokens = _count_tokens(tree)
+
+    # Reserve tokens for the summary and tree
+    summary_base_tokens = _count_tokens(summary) + 100  # 100-token buffer for final summary additions
+    available_tokens = max_tokens - tree_tokens - summary_base_tokens
+
+    if available_tokens <= 0:
+        # No token budget left for file contents after the summary and tree; return a placeholder
+        content = "[Content omitted - insufficient token space]"
+        summary += f"\nEstimated tokens: {_format_token_count(summary + tree + content)}"
+        return summary, tree, content
+
+    # Apply the greedy knapsack heuristic to select file contents
+    optimized_content = _optimize_content_with_knapsack(node, available_tokens)
+
+    # Update the summary with final info
+    if node.type == FileSystemNodeType.DIRECTORY:
+        # Count how many files were actually included
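+        # Relies on content_string framing each file with a 48-character '=' separator line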
+        included_files = len([line for line in optimized_content.split('\n') if line.startswith('=' * 48)])
+        summary += f"Files included: {included_files} (optimized for {max_tokens:,} tokens)\n"
+    elif node.type == FileSystemNodeType.FILE:
+        summary += f"File: {node.name}\n"
+        summary += f"Lines: {len(node.content.splitlines()):,}\n"
+
+    final_content = summary + "\n" + tree + "\n" + optimized_content
+    token_estimate = _format_token_count(final_content)
+    if token_estimate:
+        summary += f"\nEstimated tokens: {token_estimate}"
+
+    return summary, tree, optimized_content
+
+
 def _create_summary_prefix(query: IngestionQuery, *, single_file: bool = False) -> str:
     """Create a prefix string for summarizing a repository or local directory.

@@ -191,6 +251,27 @@ def _create_tree_structure(
     return tree_str


+def _count_tokens(text: str) -> int:
+    """Count actual tokens in text using tiktoken.
+
+    Parameters
+    ----------
+    text : str
+        The text to count tokens for.
+
+    Returns
+    -------
+    int
+        Number of tokens, or a character-count // 4 estimate if tiktoken fails.
+    """
+    try:
+        encoding = tiktoken.get_encoding("o200k_base")
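+        # disallowed_special=() encodes special-token markers as plain text instead of raising ValueError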
+        return len(encoding.encode(text, disallowed_special=()))
+    except Exception:
+        # Fallback to character-based estimation
+        return len(text) // 4
+
+
 def _format_token_count(text: str) -> str | None:
     """Return a human-readable token-count string (e.g. 1.2k, 1.2M).

@@ -206,8 +287,7 @@ def _format_token_count(text: str) -> str | None:
 
     """
     try:
-        encoding = tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini
-        total_tokens = len(encoding.encode(text, disallowed_special=()))
+        total_tokens = _count_tokens(text)
     except (ValueError, UnicodeEncodeError) as exc:
         logger.warning("Failed to estimate token size", extra={"error": str(exc)})
         return None
@@ -221,3 +301,184 @@ def _format_token_count(text: str) -> str | None:
             return f"{total_tokens / threshold:.1f}{suffix}"

     return str(total_tokens)
+
+
+def _optimize_content_with_knapsack(node: FileSystemNode, max_tokens: int) -> str:
+    """Select file contents within the token limit using a greedy knapsack heuristic.
+
+    Parameters
+    ----------
+    node : FileSystemNode
+        Root node to extract files from.
+    max_tokens : int
+        Maximum tokens available for content.
+
+    Returns
+    -------
+    str
+        Optimized content string with the selected files.
+    """
+    # Collect all files with their metadata
+    file_items = []
+    _collect_file_items(node, file_items)
+
+    if not file_items:
+        return "[No files found]"
+
+    # Calculate a value/cost ratio for each file and sort by it
+    for item in file_items:
+        relevance_score = max(item['relevance'], 1)  # Clamp to at least 1 so zero-scored files keep some value
+        file_type_multiplier = _get_file_type_multiplier(item['path'])
+
+        # Value = relevance * type_multiplier * content_quality
+        content_quality = _estimate_content_quality(item['content'])
+        value = relevance_score * file_type_multiplier * content_quality
+
+        # Cost = token count
+        cost = item['tokens']
+
+        # Ratio = value per token (higher is better); max() guards against division by zero
+        item['ratio'] = value / max(cost, 1)
+
+    # Sort by ratio (descending - best value first)
+    sorted_items = sorted(file_items, key=lambda x: x['ratio'], reverse=True)
+
+    # Greedy selection: pick the highest-ratio items that fit
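+    # Sorting by value-per-token is the classic greedy approximation for 0/1 knapsack:
+    # fast and usually close to optimal, but not guaranteed to maximize total value.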
+    selected_items = []
+    total_tokens = 0
+
+    for item in sorted_items:
+        if total_tokens + item['tokens'] <= max_tokens:
+            selected_items.append(item)
+            total_tokens += item['tokens']
+
+    # Build the final content string
+    if not selected_items:
+        return "[No files fit within token limit]"
+
+    content_parts = []
+    for item in selected_items:
+        content_parts.append(item['content_string'])
+
+    result = "\n".join(content_parts)
+
+    logger.info(
+        f"Knapsack optimization: selected {len(selected_items)}/{len(file_items)} files, "
+        f"using {total_tokens}/{max_tokens} tokens"
+    )
+
+    return result
+
+
+def _collect_file_items(node: FileSystemNode, items: list) -> None:
+    """Recursively collect file metadata for knapsack optimization.
+
+    Parameters
+    ----------
+    node : FileSystemNode
+        Current node to process.
+    items : list
+        List to append file items to.
+    """
+    if node.type == FileSystemNodeType.FILE:
+        content_string = node.content_string
+        tokens = _count_tokens(content_string)
+
+        items.append({
+            'path': node.path_str or node.name,
+            'content': node.content,
+            'content_string': content_string,
+            'tokens': tokens,
+            'relevance': node.likelihood_score,
+            'size': node.size,
+            'node': node,
+        })
+
+    elif node.type == FileSystemNodeType.DIRECTORY and node.children:
+        for child in node.children:
+            _collect_file_items(child, items)
+
+
+def _get_file_type_multiplier(file_path: str) -> float:
+    """Get a relevance multiplier based on file type/name.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the file.
+
+    Returns
+    -------
+    float
+        Multiplier for this file type (higher = more important).
+    """
+    from pathlib import Path
+
+    path = Path(file_path)
+    name_lower = path.name.lower()
+    ext_lower = path.suffix.lower()
+
+    # High priority files
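+    # Substring match, so e.g. 'domain.py' also matches 'main'; kept deliberately coarse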
+    if any(pattern in name_lower for pattern in ['readme', 'main', 'index', 'app', 'server', '__init__']):
+        return 2.0
+
+    # Important code files
+    if ext_lower in {'.py', '.js', '.ts', '.java', '.cpp', '.c', '.go', '.rs', '.rb'}:
+        return 1.5
+
+    # Config and setup files
+    if ext_lower in {'.json', '.yaml', '.yml', '.toml', '.ini', '.env'} or name_lower in {'dockerfile', 'makefile'}:
+        return 1.3
+
+    # Documentation
+    if ext_lower in {'.md', '.txt', '.rst'}:
+        return 1.1
+
+    # Default
+    return 1.0
+
+
+def _estimate_content_quality(content: str) -> float:
+    """Estimate content quality/informativeness.
+
+    Parameters
+    ----------
+    content : str
+        File content to analyze.
+
+    Returns
+    -------
+    float
+        Quality score (higher = more informative).
+    """
+    if not content or content.strip() in ['[Binary file]', '[Empty file]', 'Error reading file']:
+        return 0.1
+
+    lines = content.splitlines()
+    non_empty_lines = [line for line in lines if line.strip()]
+
+    if not non_empty_lines:
+        return 0.2
+
+    # Base score from content density
+    density = len(non_empty_lines) / max(len(lines), 1)
+
+    # Bonus for code-like content
+    code_indicators = 0
+    for line in non_empty_lines[:50]:  # Check the first 50 lines only
+        line_stripped = line.strip()
+        if any(indicator in line_stripped for indicator in ['def ', 'class ', 'function ', 'import ', 'from ', 'const ', 'let ', 'var ']):
+            code_indicators += 1
+        if any(char in line_stripped for char in ['{', '}', '(', ')', ';', ':']):
+            code_indicators += 0.5
+
+    code_bonus = min(code_indicators / 10, 1.0)
+
+    # Penalty for very long files (diminishing returns)
+    length_penalty = 1.0
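+    # Check the larger threshold first so the stronger penalty actually applies to very long files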
+    if len(lines) > 2000:
+        length_penalty = 0.6
+    elif len(lines) > 1000:
+        length_penalty = 0.8
+
+    return (density + code_bonus) * length_penalty