From ca83835da18fca30da40449258f60dd5c6e7c7c6 Mon Sep 17 00:00:00 2001
From: Meeran Tofiq <101665499+Meeran-Tofiq@users.noreply.github.com>
Date: Thu, 5 Feb 2026 15:13:20 +0300
Subject: [PATCH] fix aggregate data step missing some entities

Change 1 (HIGH IMPACT): Harvest tables from ALL files' touches_data
File: nodes.py, insert after line 4022 (after the for result in results: loop)
Add a new step that collects table references from touches_data of ALL files
across ALL components - not just core-kind files. This catches tables from
migrations, schemas, configs, etc.

Change 2: Better dedup that merges operations
File: nodes.py, line 4025
Replace the last-wins dict comprehension with a loop that merges operations
and from_component when the same table appears from multiple sources.

Change 3: Expand core_kinds and raise limits
File: nodes.py
- Line 3430: Add migration, schema, config, seed, factory, type, interface,
  middleware to core_kinds
- Line 3443: Raise cap from 25 to 50
- Line 3470: Raise truncation from 30K to 80K chars (or slim down per-file
  payload to just path/kind/summary/touches_data)

Change 4: Make LLM cleanup less aggressive
File: nodes.py, in _cleanup_extracted_tables prompt (~line 3584)
Add instruction: "When in doubt, KEEP the table. Only filter entries that are
clearly NOT data storage identifiers."

Change 5 (OPTIONAL): Persist data_structures in SummarizeFiles
File: nodes.py, ~line 1867
Currently data_structures is requested from the LLM but silently dropped from
the result. Adding it would enable downstream nodes to use it. Requires
re-running SummarizeFiles for existing projects.
--- nodes.py | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 5 deletions(-) diff --git a/nodes.py b/nodes.py index 2ec05217..08db930f 100644 --- a/nodes.py +++ b/nodes.py @@ -3427,7 +3427,11 @@ def prep(self, shared): def _get_core_files_for_component(self, component, file_summaries): """Get file summaries for component's CRUD-relevant files.""" - core_kinds = {'handler', 'service', 'repository', 'model', 'crud', 'controller', 'api', 'route', 'endpoint'} + # Expanded core_kinds to include more file types that may reference tables + core_kinds = { + 'handler', 'service', 'repository', 'model', 'crud', 'controller', 'api', 'route', 'endpoint', + 'migration', 'schema', 'config', 'seed', 'factory', 'type', 'interface', 'middleware' + } core_files = [] for file_idx in component.get("files", []): @@ -3440,7 +3444,7 @@ def _get_core_files_for_component(self, component, file_summaries): core_files = [file_summaries.get(idx) for idx in component.get("files", []) if file_summaries.get(idx)] - return core_files[:25] # Increased limit - context is just file summaries + return core_files[:50] # Raised limit from 25 to 50 def _extract_crud_entities(self, component, core_files, use_cache): """Extract CRUD entities from a component's core files.""" @@ -3467,7 +3471,7 @@ def _extract_crud_entities(self, component, core_files, use_cache): COMPONENT TYPE: {component.get("type", "unknown")} FILES: -{json.dumps(file_context, indent=2)[:30000]} +{json.dumps(file_context, indent=2)[:80000]} TASK 1 - CRUD ENTITIES: Identify entities with Create, Read, Update, or Delete operations. @@ -3586,6 +3590,9 @@ def _cleanup_extracted_tables(self, tables, use_cache=True): - API endpoints or URLs - Configuration stores +IMPORTANT: When in doubt, KEEP the table. Only filter entries that are clearly NOT data storage identifiers. +Be CONSERVATIVE - it's better to include a questionable table than to lose a valid one. 
+ OUTPUT FORMAT (JSON array): [ {{"name": "cleaned_table_name", "type": "database_type"}} @@ -3780,6 +3787,7 @@ def _classify_entities(self, merged_entities, use_cache): - Usually have high CRUD coverage (create, read, update, delete) - Used by many components - Represent the primary data this system manages + - You may read the file summary, look at relationships and entities it interacts with. If any of them appear as system entities then you MUST immediately classify this as core 2. "supporting": Important but not central entities - Support core workflows @@ -4021,8 +4029,74 @@ def extract_for_component(comp): all_entities.extend(result.get("entities", [])) all_tables.extend(result.get("all_tables", [])) - # Dedupe tables by name, then filter noise - unique_tables = list({t["name"]: t for t in all_tables if t.get("name")}.values()) + # CHANGE 1: Harvest tables from ALL files' touches_data (not just core-kind files) + # This catches tables from migrations, schemas, configs, etc. that core_kinds may miss + print(" - Step 1b: Harvesting tables from ALL files' touches_data...") + for file_idx, summary in file_summaries.items(): + touches_data = summary.get("touches_data", {}) + if not touches_data: + continue + + # Extract tables/stores from touches_data + for store_type in ["reads", "writes", "tables", "collections", "stores"]: + stores = touches_data.get(store_type, []) + if isinstance(stores, list): + for store in stores: + if isinstance(store, str) and store.strip(): + all_tables.append({ + "name": store.strip(), + "type": "database", + "operations": ["read"] if store_type == "reads" else ["write"] if store_type == "writes" else ["read", "write"], + "from_component": summary.get("component_id", "unknown"), + "source_file": summary.get("path", "unknown") + }) + elif isinstance(store, dict) and store.get("name"): + all_tables.append({ + "name": store["name"], + "type": store.get("type", "database"), + "operations": store.get("operations", ["read", "write"]), + 
"from_component": summary.get("component_id", "unknown"), + "source_file": summary.get("path", "unknown") + }) + + print(f" Total tables after touches_data harvest: {len(all_tables)}") + + # CHANGE 2: Better dedup that merges operations instead of last-wins + table_map = {} + for t in all_tables: + name = t.get("name") + if not name: + continue + + if name not in table_map: + table_map[name] = { + "name": name, + "type": t.get("type", "database"), + "operations": set(t.get("operations", [])), + "from_components": [t.get("from_component")] if t.get("from_component") else [], + "source_files": [t.get("source_file")] if t.get("source_file") else [] + } + else: + # Merge operations + table_map[name]["operations"].update(t.get("operations", [])) + # Merge components + comp = t.get("from_component") + if comp and comp not in table_map[name]["from_components"]: + table_map[name]["from_components"].append(comp) + # Merge source files + src = t.get("source_file") + if src and src not in table_map[name]["source_files"]: + table_map[name]["source_files"].append(src) + + # Convert back to list format + unique_tables = [] + for name, data in table_map.items(): + unique_tables.append({ + "name": data["name"], + "type": data["type"], + "operations": list(data["operations"]), + "from_component": data["from_components"][0] if data["from_components"] else "unknown" + }) # Post-filter: ONLY formatting cleanup - no semantic filtering (that's the LLM's job) def is_valid_table(table):