Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/deploy-docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ jobs:
# - name: Validate notebook structure
# run: python docs-site/scripts/validate-notebooks.py

- name: Validate notebook patterns
run: python docs-site/scripts/validate-notebook-patterns.py

- name: Convert notebooks to HTML
run: uv run --group case-studies python docs-site/scripts/convert-notebooks.py

Expand Down
214 changes: 214 additions & 0 deletions docs-site/scripts/validate-notebook-patterns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
#!/usr/bin/env python3
"""Validate that case study notebooks follow required patterns.

Notebooks that call any everyrow operation (merge, agent_map, screen, rank,
dedupe) must:
1. Conditionally install everyrow (try/except ImportError + pip install)
2. Conditionally set EVERYROW_API_KEY (check os.environ before setting)
3. Wrap all tool calls inside `async with create_session(name="...") as session:`
blocks, with `session.get_url()` printed for observability.

Notebooks that don't use any everyrow ops are skipped.
"""

import json
import re
import sys
from pathlib import Path

# Resolve repository-relative paths from this script's own location so the
# validator works regardless of the current working directory.
SCRIPT_DIR = Path(__file__).resolve().parent
DOCS_SITE_DIR = SCRIPT_DIR.parent
REPO_ROOT = DOCS_SITE_DIR.parent
NOTEBOOKS_DIR = REPO_ROOT / "docs" / "case_studies"

# everyrow operations that must be wrapped in create_session
EVERYROW_OPS = {"merge", "agent_map", "screen", "rank", "dedupe"}

# Pattern: function call like `await merge(`, `await screen(`, etc.
# Also matches direct calls without await, and _async variants.
# NOTE(review): `\b` anchors the start of the name, so method calls such as
# `df.merge(` still match — an accepted false positive for a lint heuristic.
# The ops are sorted so the compiled pattern is deterministic across runs
# (set iteration order varies under hash randomization).
OP_CALL_RE = re.compile(
    r"\b(?:await\s+)?(?:" + "|".join(sorted(EVERYROW_OPS)) + r")(?:_async)?\s*\("
)


def get_code_cells(notebook_path: Path) -> list[str]:
    """Return the source text of every code cell in *notebook_path*.

    The .ipynb format stores a cell's source either as a list of line
    strings or as a single string; both forms are normalized to one
    string per cell. Non-code cells (markdown, raw) are ignored.
    """
    nb = json.loads(notebook_path.read_text())
    sources: list[str] = []
    for cell in nb.get("cells", []):
        if cell.get("cell_type") != "code":
            continue
        raw = cell.get("source", [])
        sources.append(raw if isinstance(raw, str) else "".join(raw))
    return sources


def check_conditional_pip_install(code_cells: list[str]) -> list[str]:
    """Check for conditional pip install of everyrow.

    Accepted patterns:
        try:
            import everyrow
        except ImportError:
            %pip install everyrow   (or !pip install everyrow)

    Returns a list of error messages; empty when the pattern is present.
    """
    joined = "\n".join(code_cells)

    pip_install_found = re.search(r"[%!]pip install\b.*\beveryrow\b", joined) is not None
    guarded_import_found = (
        re.search(
            r"try\s*:.*?import\s+everyrow.*?except\s+(?:Import|Module)(?:Error|NotFoundError)",
            joined,
            re.DOTALL,
        )
        is not None
    )

    if not pip_install_found:
        return [
            "Missing `%pip install everyrow`. "
            "Add a setup cell with: try/except ImportError -> %pip install everyrow"
        ]
    if not guarded_import_found:
        return [
            "pip install everyrow is not conditional. "
            "Wrap it in: try: import everyrow / except ImportError: %pip install everyrow"
        ]
    return []


def check_conditional_api_key(code_cells: list[str]) -> list[str]:
    """Check for conditional EVERYROW_API_KEY setup.

    Accepted pattern:
        if "EVERYROW_API_KEY" not in os.environ:
            os.environ["EVERYROW_API_KEY"] = "..."

    Returns a list of error messages; empty when the pattern is present.
    """
    joined = "\n".join(code_cells)

    if "EVERYROW_API_KEY" not in joined:
        return [
            "Missing EVERYROW_API_KEY setup. "
            'Add: if "EVERYROW_API_KEY" not in os.environ: os.environ["EVERYROW_API_KEY"] = "..."'
        ]

    conditional = re.search(
        r'if\s+["\']EVERYROW_API_KEY["\']\s+not\s+in\s+os\.environ',
        joined,
    )
    if conditional is None:
        return [
            "EVERYROW_API_KEY is not set conditionally. "
            'Use: if "EVERYROW_API_KEY" not in os.environ: os.environ["EVERYROW_API_KEY"] = "..."'
        ]
    return []


def check_create_session_wrapping(code_cells: list[str]) -> list[str]:
    """Check that everyrow tool calls are wrapped in create_session.

    Requirements:
    - If any everyrow op is called, `create_session(name=` must appear in the notebook
    - `session.get_url()` or `task_id` must be printed for observability

    Returns a list of error messages; empty when compliant, or when the
    notebook uses no everyrow ops at all.
    """
    errors = []
    all_code = "\n".join(code_cells)

    # `search` is sufficient: we only need to know whether any op is called,
    # so materializing every match with `findall` was wasted work.
    if not OP_CALL_RE.search(all_code):
        return []  # No everyrow ops used, nothing to check

    # Check that create_session is used with a name
    if not re.search(r"create_session\s*\(\s*name\s*=", all_code):
        errors.append(
            "everyrow operations found but not wrapped in "
            '`async with create_session(name="...") as session:`. '
            "All tool calls must run inside a named session."
        )

    # Check for observability: session.get_url() or task_id/session_id referenced
    if not re.search(r"session\.get_url\(\)|\.task_id|\.session_id", all_code):
        errors.append(
            "Missing session observability. "
            "Add `print(f\"Session URL: {session.get_url()}\")` inside the create_session block."
        )

    return errors


def uses_everyrow_ops(code_cells: list[str]) -> bool:
    """Return True when any everyrow operation is called in the notebook.

    Cells are joined before matching so the result agrees with the other
    whole-notebook checks (a match may span a cell boundary).
    """
    joined = "\n".join(code_cells)
    return OP_CALL_RE.search(joined) is not None


def validate_notebook(notebook_path: Path) -> list[str]:
    """Validate a notebook's patterns. Returns list of error messages.

    The notebook's parent directory name is used as its slug in messages.
    A notebook that is not valid JSON is reported as a validation error
    instead of crashing the entire run with a traceback.
    """
    slug = notebook_path.parent.name

    try:
        code_cells = get_code_cells(notebook_path)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        return [f"{slug}: Could not parse notebook: {exc}"]

    if not code_cells:
        return [f"{slug}: No code cells found"]

    # Only enforce setup and session checks if notebook actually calls everyrow ops
    if not uses_everyrow_ops(code_cells):
        return []

    checks = [
        check_conditional_pip_install,
        check_conditional_api_key,
        check_create_session_wrapping,
    ]
    return [f"{slug}: {error}" for check_fn in checks for error in check_fn(code_cells)]


def main() -> int:
    """Run pattern validation over every case-study notebook.

    Returns 0 when all notebooks pass, 1 when any notebook fails or
    when no notebooks are found at all (a likely path misconfiguration).
    """
    notebooks = sorted(NOTEBOOKS_DIR.glob("*/notebook.ipynb"))
    if not notebooks:
        print(f"No notebooks found in {NOTEBOOKS_DIR}")
        return 1

    per_notebook = [validate_notebook(nb) for nb in notebooks]
    all_errors = [err for errs in per_notebook for err in errs]
    passed = sum(1 for errs in per_notebook if not errs)

    if all_errors:
        print("Notebook pattern validation failed:\n")
        for err in all_errors:
            print(f" - {err}")
        print(f"\n{len(all_errors)} error(s) across {len(notebooks)} notebooks")
        print(f"{passed}/{len(notebooks)} notebooks passed all checks")
        return 1

    print(f"All {len(notebooks)} notebooks pass pattern checks")
    return 0


# Script entry point: propagate the validator's result as the process
# exit code so CI fails the job when any notebook violates the patterns.
if __name__ == "__main__":
    sys.exit(main())
57 changes: 24 additions & 33 deletions docs/case_studies/dedupe-crm-company-records/notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,24 @@
{
"cell_type": "markdown",
"metadata": {},
"source": "# How to use LLMs to deduplicate CRM Data\n\nThis notebook demonstrates how to use the everyrow SDK's `dedupe` operation to deduplicate messy CRM data using AI-powered semantic matching."
},
{
"cell_type": "code",
"execution_count": null,
"id": "setup-5c9f444a",
"metadata": {},
"outputs": [],
"source": [
"# How to use LLMs to deduplicate CRM Data\n",
"# Setup: install everyrow if needed and configure API key\n",
"try:\n",
" import everyrow\n",
"except ImportError:\n",
" %pip install everyrow\n",
"\n",
"This notebook demonstrates how to use the everyrow SDK's `dedupe` operation to deduplicate messy CRM data using AI-powered semantic matching."
"import os\n",
"if \"EVERYROW_API_KEY\" not in os.environ:\n",
" os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n"
]
},
{
Comment on lines 20 to 26
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: The call to await client.__aexit__() is incorrect as it's missing the three required arguments for an async context manager, which will cause a TypeError on exit.
Severity: HIGH

Suggested Fix

The __aexit__ method should be called with the standard three None arguments to signify a clean exit from the context manager. Change await client.__aexit__() to await client.__aexit__(None, None, None) in src/everyrow/session.py.

Prompt for AI Agent
Review the code at the location below. A potential bug has been identified by an AI
agent.
Verify if this is a real issue. If it is, propose a fix; if not, explain why it's not
valid.

Location: docs/case_studies/dedupe-crm-company-records/notebook.ipynb#L20-L26

Potential issue: The `__aexit__` method in `src/everyrow/session.py` is called without
any arguments. The async context manager protocol requires `__aexit__` to be called with
three arguments: `exc_type`, `exc_val`, and `exc_tb`. The `AuthenticatedClient` forwards
this call to the underlying `httpx.AsyncClient`, whose `__aexit__` method will raise a
`TypeError` because it's missing the required positional arguments. This will cause an
unhandled exception whenever a session is created and closed using the `create_session`
context manager, such as when `single_agent()` is called without an explicit session.

Did we get this right? 👍 / 👎 to inform future reviews.

Expand All @@ -23,19 +37,7 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install everyrow\n",
"from datetime import datetime\n",
"from textwrap import dedent\n",
"\n",
"import pandas as pd\n",
"# load API key from environment/.env file or set it directly in the notebook\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"# import os\n",
"# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. $20 free\"\n",
"from everyrow.ops import dedupe\n"
]
"source": "from datetime import datetime\nfrom textwrap import dedent\n\nimport pandas as pd\nfrom dotenv import load_dotenv\n\nfrom everyrow import create_session\nfrom everyrow.ops import dedupe\n\nload_dotenv()"
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -204,7 +206,7 @@
}
],
"source": [
"data = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/data/case_01_crm_data.csv\", engine=\"pyarrow\")\n",
"data = pd.read_csv(\"../data/case_01_crm_data.csv\", engine=\"pyarrow\")\n",
"\n",
"print(f\"Total records: {len(data)}\")\n",
"data.sort_values(by=\"company_name\").head(15)"
Expand Down Expand Up @@ -235,18 +237,7 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"equivalence_relation = dedent(\"\"\"\n",
" Two entries are duplicates if they include data for the same legal entity.\n",
"\"\"\")\n",
"\n",
"print(\"Deduplicating CRM data...\\n\")\n",
"\n",
"result = await dedupe(\n",
" input=data,\n",
" equivalence_relation=equivalence_relation,\n",
")"
]
"source": "equivalence_relation = dedent(\"\"\"\n Two entries are duplicates if they include data for the same legal entity.\n\"\"\")\n\nprint(\"Deduplicating CRM data...\\n\")\n\nasync with create_session(name=\"CRM Deduplication\") as session:\n print(f\"Session URL: {session.get_url()}\")\n result = await dedupe(\n session=session,\n input=data,\n equivalence_relation=equivalence_relation,\n )"
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -611,9 +602,6 @@
}
],
"metadata": {
"everyrow": {
"description": "Python notebook cleaning 500 CRM records with inconsistent company names, missing contacts, and partial email matches. Uses everyrow's dedupe() with a plain-English equivalence relation to find and group semantic duplicates."
},
"kernelspec": {
"display_name": ".venv",
"language": "python",
Expand All @@ -631,8 +619,11 @@
"pygments_lexer": "ipython3",
"version": "3.12.11"
},
"language_version": "3.12"
"language_version": "3.12",
"everyrow": {
"description": "Python notebook cleaning 500 CRM records with inconsistent company names, missing contacts, and partial email matches. Uses everyrow's dedupe() with a plain-English equivalence relation to find and group semantic duplicates."
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,24 @@
"`MEDIUM` and `HIGH` run full research agents that search, read, and cross-reference sources. For these, model selection matters a lot — and we choose models based on their position on the **Pareto frontier** of accuracy, cost, and speed. This notebook shows how."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "setup-8e478d18",
"metadata": {},
"outputs": [],
"source": [
"# Setup: install everyrow if needed and configure API key\n",
"try:\n",
" import everyrow\n",
"except ImportError:\n",
" %pip install everyrow\n",
"\n",
"import os\n",
"if \"EVERYROW_API_KEY\" not in os.environ:\n",
" os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -667,7 +685,7 @@
{
"cell_type": "markdown",
"metadata": {},
"source": "## 6. Choosing the right effort level\n\n**`LOW`** is the default, and it's the right choice for most tasks that don't require web research — classifying rows, extracting fields, reformatting data. It runs a single LLM call with no tool use, so it's fast and cheap. Because DRB measures agentic information retrieval, the DRB score for the `LOW` model isn't very meaningful here: in practice `LOW` doesn't do research at all.\n\n**`MEDIUM`** turns on the research agent. Gemini 3 Flash (low) sits on the cost Pareto frontier — it's the cheapest model that delivers strong research accuracy. Use this when you need agents to look things up on the web but want to keep costs down.\n\n**`HIGH`** uses Claude 4.6 Opus (low), which sits on both the cost and speed Pareto frontiers. It's the fastest high-accuracy model on DRB and delivers the best score-per-dollar among top-tier models. Use this when accuracy matters and you're willing to pay more per row.\n\n**Want the absolute best accuracy?** You can override the model directly by setting `effort_level=None` and specifying all parameters explicitly:\n\n```python\nfrom everyrow.ops import agent_map\nfrom everyrow.task import LLM\n\nresult = await agent_map(\n task=\"Find each company's latest funding round\",\n input=companies_df,\n effort_level=None,\n llm=LLM.CLAUDE_4_6_OPUS_HIGH,\n iteration_budget=10,\n include_reasoning=True,\n)\n```\n\nClaude 4.6 Opus (high) is the top-scoring model on DRB, but it costs roughly twice as much and takes about three times as long as the `HIGH` default. For most workloads the `HIGH` preset already captures the bulk of that accuracy at a fraction of the price — but the option is there when you need it.\n\nWe re-run these benchmarks as new models launch, so the model behind each effort level may change over time. You always get the current best trade-off without changing your code."
"source": "## 6. Choosing the right effort level\n\n**`LOW`** is the default, and it's the right choice for most tasks that don't require web research — classifying rows, extracting fields, reformatting data. It runs a single LLM call with no tool use, so it's fast and cheap. Because DRB measures agentic information retrieval, the DRB score for the `LOW` model isn't very meaningful here: in practice `LOW` doesn't do research at all.\n\n**`MEDIUM`** turns on the research agent. Gemini 3 Flash (low) sits on the cost Pareto frontier — it's the cheapest model that delivers strong research accuracy. Use this when you need agents to look things up on the web but want to keep costs down.\n\n**`HIGH`** uses Claude 4.6 Opus (low), which sits on both the cost and speed Pareto frontiers. It's the fastest high-accuracy model on DRB and delivers the best score-per-dollar among top-tier models. Use this when accuracy matters and you're willing to pay more per row.\n\n**Want the absolute best accuracy?** You can override the model directly by setting `effort_level=None` and specifying all parameters explicitly:\n\n```python\nfrom everyrow.ops import agent_map\nfrom everyrow.task import LLM\n\nresult = await agent_map(\n task=\"Find each company's latest funding round\",\n input=companies_df,\n effort_level=None,\n llm=LLM.CLAUDE_4_6_OPUS_HIGH,\n iteration_budget=10,\n include_research=True,\n)\n```\n\nClaude 4.6 Opus (high) is the top-scoring model on DRB, but it costs roughly twice as much and takes about three times as long as the `HIGH` default. For most workloads the `HIGH` preset already captures the bulk of that accuracy at a fraction of the price — but the option is there when you need it.\n\nWe re-run these benchmarks as new models launch, so the model behind each effort level may change over time. You always get the current best trade-off without changing your code."
}
],
"metadata": {
Expand All @@ -691,4 +709,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
Loading