25 changes: 25 additions & 0 deletions atomic_agents/agent.py
@@ -83,6 +83,11 @@ def __init__(
# populated for paths shaped <system>/projects/<project>/agents/<role>/.
self.cascade: _cascade.CascadePaths | None = _cascade.detect_cascade(self.agent_root)

        # Per-call helper-provenance rollup (spec/13 Layer 3). Reset at the
        # start of each call(); appended to by helper_call(). An empty list
        # means either no helpers ran or helper_call() was invoked outside call().
self._helpers_this_run: list[dict] = []

# Loaded later via load() — populated in __init__ for clarity
self._persona_text: str = ""
self._tools_text: str = ""
@@ -353,6 +358,8 @@ def call(
raise

try:
# Reset helper-provenance rollup for this run (spec/13 Layer 3)
self._helpers_this_run = []
# Cost guardrails check
check = self._check_cost_guardrails(critical=critical)
if not check.allow:
@@ -458,6 +465,11 @@ def call(
log_record["critical"] = True
if parse_failures:
log_record["capture_parse_failures"] = len(parse_failures)
if self._helpers_this_run:
# Spec/13 Layer 3 — research log: roll up helper provenance
# into the parent run record so an audit can trace every fact
# back to the helper invocation that produced it.
log_record["helper_provenance"] = list(self._helpers_this_run)
self._log(log_record)

return response
@@ -534,6 +546,19 @@ def helper_call(
log_record["provenance_preserved"] = provenance_preserved
self._log(log_record)

# Append to the in-memory rollup for spec/13 Layer 3 (research log).
# The parent run's log record will include this list at end-of-call.
rollup_entry = {
"model": actual_model,
"summary": summary or "helper call",
"cost_usd": cost,
"latency_ms": latency_ms,
}
if sources_list:
rollup_entry["sources_summarized"] = sources_list
rollup_entry["provenance_preserved"] = provenance_preserved
self._helpers_this_run.append(rollup_entry)

return HelperResult(
text=raw.text,
model=actual_model,
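Taken together, the rollup entries appended in helper_call() surface in the parent run's log line under `helper_provenance`. A minimal sketch of one such record follows, using only key names that appear in this diff; the `event` field and all concrete values are illustrative placeholders, not part of the PR:

```python
# Hypothetical run-log record emitted at the end of call() after two
# helper invocations. Only "critical" and "helper_provenance" (and the
# keys inside each entry) come from this diff; everything else is a
# placeholder for illustration.
log_record = {
    "event": "call",  # illustrative; the real field names aren't shown in these hunks
    "critical": True,
    "helper_provenance": [
        {
            "model": "helper-model-a",          # actual_model reported by the helper
            "summary": "summarize source docs",
            "cost_usd": 0.0004,
            "latency_ms": 812,
            "sources_summarized": ["https://example.com/report"],
            "provenance_preserved": True,
        },
        {
            "model": "helper-model-b",
            "summary": "helper call",           # default when no summary was given
            "cost_usd": 0.0002,
            "latency_ms": 350,
        },
    ],
}
```

Per the `if sources_list:` guard above, `sources_summarized` and `provenance_preserved` only appear on entries whose helper reported sources.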
120 changes: 112 additions & 8 deletions atomic_agents/eval.py
@@ -459,28 +459,78 @@ def run_suite(
# Internals

def _build_judge_prompt(self, test: EvalTest, agent_response: str) -> str:
"""Render the judge.md template with this test's content."""
# The judge template uses {placeholder} substitutions
"""Render the judge.md template with this test's content.

When the test declares ``expected_facts`` (per spec/13 Layer 2), an
additional "Factual accuracy check" section is appended to the prompt
instructing the judge to verify each fact and emit a ``factual_checks``
array in its JSON response.
"""
try:
return self.judge_template.format(
base = self.judge_template.format(
rubric=self.rubric_body,
test_input=test.input,
expected_behavior=test.expected_behavior,
pass_criteria=test.pass_criteria,
agent_response=agent_response,
trajectory="(trajectory capture not implemented in v0.2)",
)
except KeyError as e:
# Template has a placeholder we don't provide — just return the
# template + appended content so the judge has something to work with
return (
except KeyError:
base = (
f"{self.judge_template}\n\n"
f"---\n\n## Rubric\n\n{self.rubric_body}\n\n"
f"## Test input\n\n{test.input}\n\n"
f"## Expected behavior\n\n{test.expected_behavior}\n\n"
f"## Pass criteria\n\n{test.pass_criteria}\n\n"
f"## Agent's response\n\n{agent_response}"
)
if test.expected_facts:
base = base + "\n\n" + self._render_factual_check_section(test.expected_facts)
return base

@staticmethod
def _render_factual_check_section(expected_facts: list[dict]) -> str:
"""Build the spec/13 Layer-2 'Factual accuracy check' addendum.

Instructs the judge to emit a ``factual_checks: [...]`` array
alongside its rubric scores, with per-fact verdicts on whether
the agent stated the claim, used the correct value, and cited
a source.
"""
bullets = []
for f in expected_facts:
claim = f.get("claim", "")
source = f.get("source", "")
expected = f.get("expected_value", "")
bullets.append(
f'- claim: "{claim}"\n'
f' source: {source}\n'
f' expected_value: "{expected}"'
)
bullet_text = "\n".join(bullets)
return (
"## Factual accuracy check\n\n"
"In addition to scoring rubric dimensions, verify these facts in the\n"
"agent's response. For each expected_fact:\n\n"
"1. Did the agent state this claim?\n"
"2. If yes, did the agent's value match expected_value?\n"
"3. If yes, did the agent cite a source?\n\n"
"Add a `factual_checks` array to your JSON response with one entry\n"
"per expected_fact:\n\n"
"```json\n"
'"factual_checks": [\n'
" {\n"
' "claim": "<claim text>",\n'
' "stated_in_response": true|false,\n'
' "value_correct": true|false|null,\n'
' "cited": true|false|null\n'
" }\n"
"]\n"
"```\n\n"
"Use `null` for value_correct/cited when stated_in_response is false.\n\n"
"Expected facts:\n\n"
f"{bullet_text}"
)
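For orientation, this is the shape of data the new section consumes; a sketch only, since these hunks don't show how a suite file declares the facts. Only the `claim`, `source`, and `expected_value` keys are taken from the code above; the concrete values are invented:

```python
# Illustrative expected_facts payload as it would arrive on
# test.expected_facts (a list of dicts with the keys read above).
expected_facts = [
    {
        "claim": "The service's p99 latency is under 200 ms",  # invented example
        "source": "load-test report",                          # invented example
        "expected_value": "under 200 ms",
    },
]

# _render_factual_check_section(expected_facts) appends one bullet per fact:
# - claim: "The service's p99 latency is under 200 ms"
#   source: load-test report
#   expected_value: "under 200 ms"
```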

@staticmethod
def _parse_judge_response(text: str) -> dict:
@@ -496,7 +546,29 @@ def _parse_judge_response(text: str) -> dict:
return json.loads(text)

def _compute_weighted_score(self, scores_dict: dict) -> float:
"""Apply rubric weights to the judge's per-dimension scores."""
"""Apply rubric weights to the judge's per-dimension scores.

Per spec/13 Layer 2: when the rubric declares ``factual_accuracy`` as
a weighted dimension and the judge returned ``factual_checks``, the
runner derives the dimension's score from the checks (proportion of
verified facts × 5, on the same 1–5 scale as other dimensions). If
the judge already returned a numeric score for ``factual_accuracy``,
the judge's score takes priority (the LLM may apply nuance the bare
proportion misses).
"""
# Inject a derived factual_accuracy score if the rubric expects one
# but the judge didn't return a numeric score for it.
if "factual_accuracy" in self.weights:
existing = scores_dict.get("factual_accuracy")
if not (isinstance(existing, dict) and "score" in existing):
checks = scores_dict.get("factual_checks", [])
derived = compute_factual_accuracy_from_checks(checks)
if derived is not None:
scores_dict["factual_accuracy"] = {
"score": derived,
"justification": "derived from factual_checks proportion",
}

total = 0.0
weight_sum = 0.0
for dim, weight_pct in self.weights.items():
@@ -551,6 +623,38 @@ def _write_run_log(self, result: EvalResult) -> None:
atomic_append_jsonl(log_path, json.dumps(line))


# ──────────────────────────────────────────────────────────────────
# Layer-2 factual accuracy helper (module-level for testability)


def compute_factual_accuracy_from_checks(checks: list[dict]) -> float | None:
"""Compute a 1-5 dimension score from a list of ``factual_checks`` entries.

Per spec/13 Layer 2:
- A check is "verified" iff stated_in_response AND value_correct AND cited.
- The dimension score is ``round(5 * verified / total)`` clamped to 1.
- Returns ``None`` when ``checks`` is empty (no signal to score from).

A claim that's correctly stated but uncited counts as half-verified
(we still want some signal — the value is right, but it's not auditable).
"""
if not checks:
return None
total = len(checks)
verified = 0.0
for c in checks:
stated = bool(c.get("stated_in_response"))
value_ok = bool(c.get("value_correct"))
cited = bool(c.get("cited"))
if stated and value_ok and cited:
verified += 1.0
elif stated and value_ok:
verified += 0.5 # right value, uncited — partial credit
proportion = verified / total
score = round(5 * proportion)
return max(1, min(5, int(score)))
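A quick worked example of the arithmetic above; a sketch assuming the module-level function is importable from `atomic_agents.eval` as added in this diff:

```python
from atomic_agents.eval import compute_factual_accuracy_from_checks

# One fully verified check, one correct-but-uncited check (half credit),
# and one claim the agent never stated.
checks = [
    {"stated_in_response": True, "value_correct": True, "cited": True},
    {"stated_in_response": True, "value_correct": True, "cited": False},
    {"stated_in_response": False, "value_correct": None, "cited": None},
]

# verified = 1.0 + 0.5 = 1.5, proportion = 0.5,
# round(5 * 0.5) = round(2.5) = 2 (Python rounds half to even).
assert compute_factual_accuracy_from_checks(checks) == 2

# No checks -> no derived score; _compute_weighted_score then leaves
# factual_accuracy to whatever the judge returned (which in any case
# takes priority whenever it is a numeric score).
assert compute_factual_accuracy_from_checks([]) is None
```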


# ──────────────────────────────────────────────────────────────────
# Helpers
