diff --git a/.github/workflows/deploy-docs.yaml b/.github/workflows/deploy-docs.yaml index 42da7dbe..2e208244 100644 --- a/.github/workflows/deploy-docs.yaml +++ b/.github/workflows/deploy-docs.yaml @@ -51,6 +51,9 @@ jobs: # - name: Validate notebook structure # run: python docs-site/scripts/validate-notebooks.py + - name: Validate notebook patterns + run: python docs-site/scripts/validate-notebook-patterns.py + - name: Convert notebooks to HTML run: uv run --group case-studies python docs-site/scripts/convert-notebooks.py diff --git a/docs-site/scripts/validate-notebook-patterns.py b/docs-site/scripts/validate-notebook-patterns.py new file mode 100644 index 00000000..ee805a73 --- /dev/null +++ b/docs-site/scripts/validate-notebook-patterns.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +"""Validate that case study notebooks follow required patterns. + +Notebooks that call any everyrow operation (merge, agent_map, screen, rank, +dedupe) must: +1. Conditionally install everyrow (try/except ImportError + pip install) +2. Conditionally set EVERYROW_API_KEY (check os.environ before setting) +3. Wrap all tool calls inside `async with create_session(name="...") as session:` + blocks, with `session.get_url()` printed for observability. + +Notebooks that don't use any everyrow ops are skipped. +""" + +import json +import re +import sys +from pathlib import Path + +SCRIPT_DIR = Path(__file__).resolve().parent +DOCS_SITE_DIR = SCRIPT_DIR.parent +REPO_ROOT = DOCS_SITE_DIR.parent +NOTEBOOKS_DIR = REPO_ROOT / "docs" / "case_studies" + +# everyrow operations that must be wrapped in create_session +EVERYROW_OPS = {"merge", "agent_map", "screen", "rank", "dedupe"} + +# Pattern: function call like `await merge(`, `await screen(`, etc. 
+# Also matches direct calls without await, and _async variants +OP_CALL_RE = re.compile( + r"\b(?:await\s+)?(?:" + "|".join(EVERYROW_OPS) + r")(?:_async)?\s*\(" +) + + +def get_code_cells(notebook_path: Path) -> list[str]: + """Extract source code from all code cells in a notebook.""" + with open(notebook_path) as f: + nb = json.load(f) + cells = [] + for cell in nb.get("cells", []): + if cell.get("cell_type") == "code": + source = cell.get("source", []) + if isinstance(source, list): + cells.append("".join(source)) + else: + cells.append(source) + return cells + + +def check_conditional_pip_install(code_cells: list[str]) -> list[str]: + """Check for conditional pip install of everyrow. + + Accepted patterns: + try: + import everyrow + except ImportError: + %pip install everyrow (or !pip install everyrow) + """ + errors = [] + all_code = "\n".join(code_cells) + + has_pip_install = bool(re.search(r"[%!]pip install\b.*\beveryrow\b", all_code)) + has_try_except = bool( + re.search( + r"try\s*:.*?import\s+everyrow.*?except\s+(?:Import|Module)(?:Error|NotFoundError)", + all_code, + re.DOTALL, + ) + ) + + if not has_pip_install: + errors.append( + "Missing `%pip install everyrow`. " + "Add a setup cell with: try/except ImportError -> %pip install everyrow" + ) + elif not has_try_except: + errors.append( + "pip install everyrow is not conditional. " + "Wrap it in: try: import everyrow / except ImportError: %pip install everyrow" + ) + + return errors + + +def check_conditional_api_key(code_cells: list[str]) -> list[str]: + """Check for conditional EVERYROW_API_KEY setup. + + Accepted pattern: + if "EVERYROW_API_KEY" not in os.environ: + os.environ["EVERYROW_API_KEY"] = "..." 
+ """ + errors = [] + all_code = "\n".join(code_cells) + + has_key_reference = "EVERYROW_API_KEY" in all_code + has_conditional = bool( + re.search( + r'if\s+["\']EVERYROW_API_KEY["\']\s+not\s+in\s+os\.environ', + all_code, + ) + ) + + if not has_key_reference: + errors.append( + "Missing EVERYROW_API_KEY setup. " + 'Add: if "EVERYROW_API_KEY" not in os.environ: os.environ["EVERYROW_API_KEY"] = "..."' + ) + elif not has_conditional: + errors.append( + "EVERYROW_API_KEY is not set conditionally. " + 'Use: if "EVERYROW_API_KEY" not in os.environ: os.environ["EVERYROW_API_KEY"] = "..."' + ) + + return errors + + +def check_create_session_wrapping(code_cells: list[str]) -> list[str]: + """Check that everyrow tool calls are wrapped in create_session. + + Requirements: + - If any everyrow op is called, `create_session(name=` must appear in the notebook + - `session.get_url()` or `task_id` must be printed for observability + """ + errors = [] + all_code = "\n".join(code_cells) + + # Find all everyrow op calls + op_calls = OP_CALL_RE.findall(all_code) + if not op_calls: + return [] # No everyrow ops used, nothing to check + + # Check that create_session is used with a name + has_create_session = bool( + re.search(r"create_session\s*\(\s*name\s*=", all_code) + ) + if not has_create_session: + errors.append( + "everyrow operations found but not wrapped in " + '`async with create_session(name="...") as session:`. ' + "All tool calls must run inside a named session." + ) + + # Check for observability: session.get_url() or task_id printed + has_observability = bool( + re.search(r"session\.get_url\(\)|\.task_id|\.session_id", all_code) + ) + if not has_observability: + errors.append( + "Missing session observability. " + "Add `print(f\"Session URL: {session.get_url()}\")` inside the create_session block." 
+ ) + + return errors + + +def uses_everyrow_ops(code_cells: list[str]) -> bool: + """Check if any everyrow operations are called in the notebook.""" + all_code = "\n".join(code_cells) + return bool(OP_CALL_RE.search(all_code)) + + +def validate_notebook(notebook_path: Path) -> list[str]: + """Validate a notebook's patterns. Returns list of error messages.""" + slug = notebook_path.parent.name + code_cells = get_code_cells(notebook_path) + + if not code_cells: + return [f"{slug}: No code cells found"] + + # Only enforce setup and session checks if notebook actually calls everyrow ops + if not uses_everyrow_ops(code_cells): + return [] + + all_errors = [] + for check_fn in [ + check_conditional_pip_install, + check_conditional_api_key, + check_create_session_wrapping, + ]: + for error in check_fn(code_cells): + all_errors.append(f"{slug}: {error}") + + return all_errors + + +def main() -> int: + notebooks = sorted(NOTEBOOKS_DIR.glob("*/notebook.ipynb")) + + if not notebooks: + print(f"No notebooks found in {NOTEBOOKS_DIR}") + return 1 + + all_errors = [] + passed = 0 + for notebook in notebooks: + errors = validate_notebook(notebook) + if errors: + all_errors.extend(errors) + else: + passed += 1 + + if all_errors: + print("Notebook pattern validation failed:\n") + for error in all_errors: + print(f" - {error}") + print(f"\n{len(all_errors)} error(s) across {len(notebooks)} notebooks") + print(f"{passed}/{len(notebooks)} notebooks passed all checks") + return 1 + + print(f"All {len(notebooks)} notebooks pass pattern checks") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/docs/case_studies/dedupe-crm-company-records/notebook.ipynb b/docs/case_studies/dedupe-crm-company-records/notebook.ipynb index eda7ea44..69f24693 100644 --- a/docs/case_studies/dedupe-crm-company-records/notebook.ipynb +++ b/docs/case_studies/dedupe-crm-company-records/notebook.ipynb @@ -3,10 +3,24 @@ { "cell_type": "markdown", "metadata": {}, + "source": "# How to use 
LLMs to deduplicate CRM Data\n\nThis notebook demonstrates how to use the everyrow SDK's `dedupe` operation to deduplicate messy CRM data using AI-powered semantic matching." + }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup-5c9f444a", + "metadata": {}, + "outputs": [], "source": [ - "# How to use LLMs to deduplicate CRM Data\n", + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", "\n", - "This notebook demonstrates how to use the everyrow SDK's `dedupe` operation to deduplicate messy CRM data using AI-powered semantic matching." + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" ] }, { @@ -23,19 +37,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# !pip install everyrow\n", - "from datetime import datetime\n", - "from textwrap import dedent\n", - "\n", - "import pandas as pd\n", - "# load API key from environment/.env file or set it directly in the notebook\n", - "from dotenv import load_dotenv\n", - "load_dotenv()\n", - "# import os\n", - "# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. 
$20 free\"\n", - "from everyrow.ops import dedupe\n" - ] + "source": "from datetime import datetime\nfrom textwrap import dedent\n\nimport pandas as pd\nfrom dotenv import load_dotenv\n\nfrom everyrow import create_session\nfrom everyrow.ops import dedupe\n\nload_dotenv()" }, { "cell_type": "markdown", @@ -204,7 +206,7 @@ } ], "source": [ - "data = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/data/case_01_crm_data.csv\", engine=\"pyarrow\")\n", + "data = pd.read_csv(\"../data/case_01_crm_data.csv\", engine=\"pyarrow\")\n", "\n", "print(f\"Total records: {len(data)}\")\n", "data.sort_values(by=\"company_name\").head(15)" @@ -235,18 +237,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "equivalence_relation = dedent(\"\"\"\n", - " Two entries are duplicates if they include data for the same legal entity.\n", - "\"\"\")\n", - "\n", - "print(\"Deduplicating CRM data...\\n\")\n", - "\n", - "result = await dedupe(\n", - " input=data,\n", - " equivalence_relation=equivalence_relation,\n", - ")" - ] + "source": "equivalence_relation = dedent(\"\"\"\n Two entries are duplicates if they include data for the same legal entity.\n\"\"\")\n\nprint(\"Deduplicating CRM data...\\n\")\n\nasync with create_session(name=\"CRM Deduplication\") as session:\n print(f\"Session URL: {session.get_url()}\")\n result = await dedupe(\n session=session,\n input=data,\n equivalence_relation=equivalence_relation,\n )" }, { "cell_type": "markdown", @@ -611,9 +602,6 @@ } ], "metadata": { - "everyrow": { - "description": "Python notebook cleaning 500 CRM records with inconsistent company names, missing contacts, and partial email matches. Uses everyrow's dedupe() with a plain-English equivalence relation to find and group semantic duplicates." 
- }, "kernelspec": { "display_name": ".venv", "language": "python", @@ -631,8 +619,11 @@ "pygments_lexer": "ipython3", "version": "3.12.11" }, - "language_version": "3.12" + "language_version": "3.12", + "everyrow": { + "description": "Python notebook cleaning 500 CRM records with inconsistent company names, missing contacts, and partial email matches. Uses everyrow's dedupe() with a plain-English equivalence relation to find and group semantic duplicates." + } }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/docs/case_studies/deep-research-bench-pareto-analysis/notebook.ipynb b/docs/case_studies/deep-research-bench-pareto-analysis/notebook.ipynb index b50c692e..efcddc9f 100644 --- a/docs/case_studies/deep-research-bench-pareto-analysis/notebook.ipynb +++ b/docs/case_studies/deep-research-bench-pareto-analysis/notebook.ipynb @@ -19,6 +19,24 @@ "`MEDIUM` and `HIGH` run full research agents that search, read, and cross-reference sources. For these, model selection matters a lot — and we choose models based on their position on the **Pareto frontier** of accuracy, cost, and speed. This notebook shows how." ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup-8e478d18", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", + "\n", + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -667,7 +685,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": "## 6. Choosing the right effort level\n\n**`LOW`** is the default, and it's the right choice for most tasks that don't require web research — classifying rows, extracting fields, reformatting data. 
It runs a single LLM call with no tool use, so it's fast and cheap. Because DRB measures agentic information retrieval, the DRB score for the `LOW` model isn't very meaningful here: in practice `LOW` doesn't do research at all.\n\n**`MEDIUM`** turns on the research agent. Gemini 3 Flash (low) sits on the cost Pareto frontier — it's the cheapest model that delivers strong research accuracy. Use this when you need agents to look things up on the web but want to keep costs down.\n\n**`HIGH`** uses Claude 4.6 Opus (low), which sits on both the cost and speed Pareto frontiers. It's the fastest high-accuracy model on DRB and delivers the best score-per-dollar among top-tier models. Use this when accuracy matters and you're willing to pay more per row.\n\n**Want the absolute best accuracy?** You can override the model directly by setting `effort_level=None` and specifying all parameters explicitly:\n\n```python\nfrom everyrow.ops import agent_map\nfrom everyrow.task import LLM\n\nresult = await agent_map(\n task=\"Find each company's latest funding round\",\n input=companies_df,\n effort_level=None,\n llm=LLM.CLAUDE_4_6_OPUS_HIGH,\n iteration_budget=10,\n include_reasoning=True,\n)\n```\n\nClaude 4.6 Opus (high) is the top-scoring model on DRB, but it costs roughly twice as much and takes about three times as long as the `HIGH` default. For most workloads the `HIGH` preset already captures the bulk of that accuracy at a fraction of the price — but the option is there when you need it.\n\nWe re-run these benchmarks as new models launch, so the model behind each effort level may change over time. You always get the current best trade-off without changing your code." + "source": "## 6. Choosing the right effort level\n\n**`LOW`** is the default, and it's the right choice for most tasks that don't require web research — classifying rows, extracting fields, reformatting data. It runs a single LLM call with no tool use, so it's fast and cheap. 
Because DRB measures agentic information retrieval, the DRB score for the `LOW` model isn't very meaningful here: in practice `LOW` doesn't do research at all.\n\n**`MEDIUM`** turns on the research agent. Gemini 3 Flash (low) sits on the cost Pareto frontier — it's the cheapest model that delivers strong research accuracy. Use this when you need agents to look things up on the web but want to keep costs down.\n\n**`HIGH`** uses Claude 4.6 Opus (low), which sits on both the cost and speed Pareto frontiers. It's the fastest high-accuracy model on DRB and delivers the best score-per-dollar among top-tier models. Use this when accuracy matters and you're willing to pay more per row.\n\n**Want the absolute best accuracy?** You can override the model directly by setting `effort_level=None` and specifying all parameters explicitly:\n\n```python\nfrom everyrow.ops import agent_map\nfrom everyrow.task import LLM\n\nresult = await agent_map(\n task=\"Find each company's latest funding round\",\n input=companies_df,\n effort_level=None,\n llm=LLM.CLAUDE_4_6_OPUS_HIGH,\n iteration_budget=10,\n include_research=True,\n)\n```\n\nClaude 4.6 Opus (high) is the top-scoring model on DRB, but it costs roughly twice as much and takes about three times as long as the `HIGH` default. For most workloads the `HIGH` preset already captures the bulk of that accuracy at a fraction of the price — but the option is there when you need it.\n\nWe re-run these benchmarks as new models launch, so the model behind each effort level may change over time. You always get the current best trade-off without changing your code." 
} ], "metadata": { @@ -691,4 +709,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/docs/case_studies/llm-powered-merging-at-scale/notebook.ipynb b/docs/case_studies/llm-powered-merging-at-scale/notebook.ipynb index fd29c59e..f200a39d 100644 --- a/docs/case_studies/llm-powered-merging-at-scale/notebook.ipynb +++ b/docs/case_studies/llm-powered-merging-at-scale/notebook.ipynb @@ -12,6 +12,24 @@ "Cost grows super linearly with the number of rows. At small scale (100 to 400 rows) the cost is negligible; at 2,246 x 2,246 rows, this cost $26.80." ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup-40d38ee5", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", + "\n", + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" + ] + }, { "cell_type": "markdown", "id": "mkoy1995el", @@ -38,19 +56,7 @@ "id": "my38zwvuk2n", "metadata": {}, "outputs": [], - "source": [ - "# !pip install everyrow\n", - "# load API key from environment/.env file or set it directly in the notebook\n", - "from dotenv import load_dotenv\n", - "load_dotenv()\n", - "# import os\n", - "# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. 
$20 free\"\n", - "import numpy as np\n", - "import pandas as pd\n", - "from everyrow.ops import merge\n", - "\n", - "pd.set_option(\"display.max_colwidth\", None)" - ] + "source": "import numpy as np\nimport pandas as pd\nfrom everyrow import create_session\nfrom everyrow.ops import merge\n\npd.set_option(\"display.max_colwidth\", None)" }, { "cell_type": "code", @@ -121,8 +127,8 @@ } ], "source": [ - "left_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-powered-merging-at-scale/merge_websites_input_left_2246.csv\")\n", - "right_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-powered-merging-at-scale/merge_websites_input_right_2246.csv\")\n", + "left_df = pd.read_csv(\"merge_websites_input_left_2246.csv\")\n", + "right_df = pd.read_csv(\"merge_websites_input_right_2246.csv\")\n", "\n", "print(f\"Left table: {len(left_df)} rows\")\n", "left_df.head(3)" @@ -200,60 +206,8 @@ "execution_count": null, "id": "8f8cf350", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "n=100\n", - "num of matched rows: 100\n", - "num of LLM matches: 95\n", - "num of web search matches: 5\n", - "----------------------------------------------------------------------------------------------------\n", - "\n", - "n=200\n", - "num of matched rows: 200\n", - "num of LLM matches: 196\n", - "num of web search matches: 4\n", - "----------------------------------------------------------------------------------------------------\n", - "\n", - "n=400\n", - "num of matched rows: 400\n", - "num of LLM matches: 386\n", - "num of web search matches: 14\n", - "----------------------------------------------------------------------------------------------------\n", - "\n", - "n=800\n", - "num of matched rows: 800\n", - "num of LLM matches: 780\n", - "num of web search matches: 20\n", - 
"----------------------------------------------------------------------------------------------------\n", - "\n", - "n=1600\n", - "num of matched rows: 1600\n", - "----------------------------------------------------------------------------------------------------\n", - "\n", - "n=2246\n", - "num of matched rows: 2246\n", - "num of LLM matches: 2228\n", - "num of web search matches: 18\n", - "----------------------------------------------------------------------------------------------------\n", - "\n" - ] - } - ], - "source": [ - "for n in [100, 200, 400, 800, 1600, 2246]:\n", - " result = await merge(\n", - " task=\"Match each person to their website(s).\",\n", - " left_table=pd.read_csv(f\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-powered-merging-at-scale/merge_websites_input_left_{n}.csv\"),\n", - " right_table=pd.read_csv(f\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-powered-merging-at-scale/merge_websites_input_right_{n}.csv\"),\n", - " )\n", - " print(f\"n={n}\")\n", - " print(\"num of matched rows:\", len(result.data))\n", - " print(\"-\" * 100)\n", - " print()" - ] + "outputs": [], + "source": "for n in [100, 200, 400, 800, 1600, 2246]:\n async with create_session(name=f\"Website Matching (n={n})\") as session:\n print(f\"Session URL: {session.get_url()}\")\n result = await merge(\n session=session,\n task=\"Match each person to their website(s).\",\n left_table=pd.read_csv(f\"merge_websites_input_left_{n}.csv\"),\n right_table=pd.read_csv(f\"merge_websites_input_right_{n}.csv\"),\n )\n print(f\"n={n}\")\n print(\"num of matched rows:\", len(result.data))\n print(\"-\" * 100)\n print()" }, { "cell_type": "markdown", @@ -313,7 +267,7 @@ "metadata": {}, "outputs": [], "source": [ - "results_df = 
pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-powered-merging-at-scale/merge_websites_output_800.csv\")" + "results_df = pd.read_csv(\"merge_websites_output_800.csv\")" ] }, { @@ -465,4 +419,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/case_studies/llm-powered-screening-at-scale/notebook.ipynb b/docs/case_studies/llm-powered-screening-at-scale/notebook.ipynb index 4972ad37..edd2887c 100644 --- a/docs/case_studies/llm-powered-screening-at-scale/notebook.ipynb +++ b/docs/case_studies/llm-powered-screening-at-scale/notebook.ipynb @@ -10,6 +10,24 @@ "The everyrow `screen()` function filters a dataframe by applying LLMs, and LLM research agents, to every row to determine if the criteria are met. This notebook demonstrates how this scales to screening 10,000 rows. Since tricky rows get LLM agents that themselves make dozens of LLM calls, this results in running vastly more LLM calls than is generally feasible without dedicated orchestration or infrastructure. The total cost is ~$0.001 per row." 
] }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup-08cd9b7f", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", + "\n", + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" + ] + }, { "cell_type": "markdown", "id": "09zeehb0muql", @@ -35,7 +53,7 @@ "metadata": {}, "outputs": [], "source": [ - "# !pip install everyrow\n", + "from dotenv import load_dotenv\n", "import pandas as pd\n", "from everyrow import create_session\n", "from everyrow.ops import screen\n", @@ -43,11 +61,7 @@ "pd.set_option(\"display.max_colwidth\", None)\n", "\n", "\n", - "# load API key from environment/.env file or set it directly in the notebook\n", - "from dotenv import load_dotenv\n", - "load_dotenv()\n", - "# import os\n", - "# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. 
$20 free\"\n" + "load_dotenv()" ] }, { @@ -221,7 +235,7 @@ } ], "source": [ - "fda_product_recalls_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-powered-screening-at-scale/fda_product_recalls.csv\")\n", + "fda_product_recalls_df = pd.read_csv(\"fda_product_recalls.csv\")\n", "\n", "# Filter to recalls where center_classification_date is after 2021-08-01 to get a dataset with ≈10k rows\n", "fda_product_recalls_df[\"center_classification_date\"] = pd.to_datetime(fda_product_recalls_df[\"center_classification_date\"], errors=\"coerce\")\n", @@ -315,7 +329,7 @@ "metadata": {}, "outputs": [], "source": [ - "results_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-powered-screening-at-scale/Screen%20child%20product%20recalls.csv\") # download from https://everyrow.io/sessions/df145a50-2dfd-48c6-97ed-6f82a82bca66" + "results_df = pd.read_csv(\"Screen child product recalls.csv\") # download from https://everyrow.io/sessions/df145a50-2dfd-48c6-97ed-6f82a82bca66" ] }, { diff --git a/docs/case_studies/llm-web-research-agents-at-scale/notebook.ipynb b/docs/case_studies/llm-web-research-agents-at-scale/notebook.ipynb index 0077e8e5..d2b5cbe4 100644 --- a/docs/case_studies/llm-web-research-agents-at-scale/notebook.ipynb +++ b/docs/case_studies/llm-web-research-agents-at-scale/notebook.ipynb @@ -6,6 +6,24 @@ "metadata": {}, "source": "# Run 10,000 LLM Web Research Agents\n\nThe everyrow `agent_map()` function runs an LLM web research agent on every row of a dataframe. In this notebook, I demonstrate scaling this to running 10,000 web agents.\n\nFirst, some numbers. The total cost was ~$0.11/row, using 120k LLM calls, 1.56B input tokens, 20.1M output tokens, executing 338k web searches, and reading 11,726 pages. The whole run took only 3 hours 27 minutes.\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
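<table><tr><th>Model</th><th>Calls</th><th>Input Tokens</th><th>Output Tokens</th><th>Cost</th></tr>
<tr><td>gemini-3-flash-preview</td><td>98,190</td><td>847,115,551</td><td>17,237,847</td><td>$913.85</td></tr>
<tr><td>gemini-2.5-flash</td><td>11,574</td><td>700,327,085</td><td>2,715,535</td><td>$222.01</td></tr>
<tr><td>claude-sonnet-4-20250514</td><td>10,015</td><td>10,912,199</td><td>193,567</td><td>$35.64</td></tr></table>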
\n
\n\nYou'll see that this is reasonably affordable only because the vast majority of the work is done by Gemini-3-Flash (running the agents) and Gemini-2.5-Flash (reading webpages). The SDK supports using higher powered LLMs when it's really worth it.\n\nAlso, you'll see that to process 10,000 rows, each agent executed 34 web searches, but only fully read ~1.2 pages. The rest of its information it got by reading search result snippets, which can be surprisingly informative to an agent trying to answer simple questions, often allowing it save a lot of tokens by not fetching and read any pages at all, and still answer correctly. Gemini-3-Flash is quite good at this, in general, doing nearly the best on [Deep Research Bench](https://evals.futuresearch.ai/), and by far the most cost-efficient model. (Though Opus 4.6, released in Feb 2026, also shows great token efficiency in doing web research, and can be cost competitive even though it's ~9x the price per token!)\n\nA large cost comes from writing output, as this agent produced a few paragraphs of unstructured research output in addition to specifically requested fields (see the dataframe below). Costs could be reduced by minimizing the outputs, but generally we find that output to be very useful in processing the outputs further, and reduces the chance that the agent is unable to report important information given a restrictive schema." 
}, + { + "cell_type": "code", + "execution_count": null, + "id": "setup-7c74dc69", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", + "\n", + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" + ] + }, { "cell_type": "markdown", "id": "1uoefxrbb", @@ -39,7 +57,7 @@ "metadata": {}, "outputs": [], "source": [ - "# !pip install everyrow\n", + "from dotenv import load_dotenv\n", "import pandas as pd\n", "from pydantic import BaseModel, Field\n", "from everyrow import create_session\n", @@ -48,11 +66,7 @@ "pd.set_option(\"display.max_colwidth\", None)\n", "\n", "\n", - "# load API key from environment/.env file or set it directly in the notebook\n", - "from dotenv import load_dotenv\n", - "load_dotenv()\n", - "# import os\n", - "# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. 
$20 free\"\n" + "load_dotenv()" ] }, { @@ -170,7 +184,7 @@ } ], "source": [ - "input_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-web-research-agents-at-scale/regulatory_status_results.csv\", usecols=[\"row_id\", \"trade_name\", \"ingredient\", \"applicant\", \"strength\", \"dosage_form\"])\n", + "input_df = pd.read_csv(\"regulatory_status_results.csv\", usecols=[\"row_id\", \"trade_name\", \"ingredient\", \"applicant\", \"strength\", \"dosage_form\"])\n", "print(f\"{len(input_df):,} drug products\")\n", "print(f\"Columns: {list(input_df.columns)}\")\n", "input_df.head(5)" @@ -360,7 +374,7 @@ } ], "source": [ - "results_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-web-research-agents-at-scale/regulatory_status_results.csv\")\n", + "results_df = pd.read_csv(\"regulatory_status_results.csv\")\n", "print(f\"Total rows: {len(results_df):,}\")\n", "print(f\"Rows with results: {results_df['regulatory_status'].notna().sum():,}\")\n", "print(f\"Failed rows: {results_df['regulatory_status'].isna().sum()}\")\n", diff --git a/docs/case_studies/match-clinical-trials-to-papers/notebook.ipynb b/docs/case_studies/match-clinical-trials-to-papers/notebook.ipynb index e2e002f6..c1855605 100644 --- a/docs/case_studies/match-clinical-trials-to-papers/notebook.ipynb +++ b/docs/case_studies/match-clinical-trials-to-papers/notebook.ipynb @@ -27,6 +27,24 @@ "The key finding: EveryRow dynamically scales its resources to match the problem, maintaining accuracy as datasets grow. Claude Code alone allocates a fixed budget of subagents regardless of dataset size, so its accuracy degrades as the problem gets bigger." 
] }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup-3882842f", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", + "\n", + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" + ] + }, { "cell_type": "markdown", "id": "task-description", @@ -410,21 +428,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "er-merge", "metadata": {}, "outputs": [], - "source": [ - "from everyrow.ops import merge\n", - "\n", - "result = await merge(\n", - " task=(\n", - " \"Match publications to the clinical trial they report results for. A paper matches a trial if the paper describes the results of that trial - look for matching interventions/drugs, conditions/diseases, study design, outcomes, and sponsor/institution. Trial titles may be rewritten in the paper. Drug names may appear as brand or generic. Not every paper has a matching trial and not every trial has a matching paper.\"\n", - " ),\n", - " left_table=papers_df,\n", - " right_table=trials_df,\n", - ")" - ] + "source": "from everyrow import create_session\nfrom everyrow.ops import merge\n\nasync with create_session(name=\"Clinical Trials to Papers Matching\") as session:\n print(f\"Session URL: {session.get_url()}\")\n result = await merge(\n session=session,\n task=(\n \"Match publications to the clinical trial they report results for. A paper matches a trial if the paper describes the results of that trial - look for matching interventions/drugs, conditions/diseases, study design, outcomes, and sponsor/institution. Trial titles may be rewritten in the paper. Drug names may appear as brand or generic. 
Not every paper has a matching trial and not every trial has a matching paper.\"\n ),\n left_table=papers_df,\n right_table=trials_df,\n )" }, { "cell_type": "code", @@ -759,4 +767,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/case_studies/match-software-vendors-to-requirements/notebook.ipynb b/docs/case_studies/match-software-vendors-to-requirements/notebook.ipynb index 680d7a78..f87fe5cc 100644 --- a/docs/case_studies/match-software-vendors-to-requirements/notebook.ipynb +++ b/docs/case_studies/match-software-vendors-to-requirements/notebook.ipynb @@ -5,6 +5,24 @@ "metadata": {}, "source": "# Fuzzy join two Pandas DataFrames using LLMs\n\nThis notebook demonstrates the [everyrow.io SDK](https://github.com/futuresearch/everyrow-sdk) merge capabilities:\n\n1. **Fuzzy String Matching** - Handling typos and corrupted data\n2. **LLM Merge** - Matching without common columns (company ↔ ticker)\n3. **Web Merge** - Dynamic data requiring real-time verification (CEO matching)\n\nThe SDK implements a cascade: **Exact → Fuzzy → LLM → Web**, using the simplest method that works." }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup-26b5f7a2", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", + "\n", + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -25,12 +43,8 @@ "metadata": {}, "outputs": [], "source": [ - "# !pip install everyrow\n", - "# load API key from environment/.env file or set it directly in the notebook\n", - "from dotenv import load_dotenv\n", - "load_dotenv()\n", - "# import os\n", - "# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. 
$20 free\"\n" + "!pip install everyrow\n", + "%env EVERYROW_API_KEY=your_api_key" ] }, { @@ -172,7 +186,7 @@ "from everyrow.generated.models import LLMEnum\n", "\n", "# Load dataset: 438 S&P 500 companies\n", - "data = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/data/companies.csv\")\n", + "data = pd.read_csv(\"../data/companies.csv\")\n", "print(f\"Dataset: {data.shape[0]} companies, {data.shape[1]} columns\")\n", "data.head()" ] diff --git a/docs/case_studies/merge-contacts-with-company-data/notebook.ipynb b/docs/case_studies/merge-contacts-with-company-data/notebook.ipynb index 4a284781..4a23a579 100644 --- a/docs/case_studies/merge-contacts-with-company-data/notebook.ipynb +++ b/docs/case_studies/merge-contacts-with-company-data/notebook.ipynb @@ -5,6 +5,24 @@ "metadata": {}, "source": "# How to merge datasets without common ID in Python\n\nThis notebook demonstrates using everyrow's `merge()` utility to join contact-level data with organization-level data before CRM upload.\n\n**Use Case:** Your data lives across multiple tables—contacts in one, company information in another. Before uploading to HubSpot/Salesforce, you need a flattened export where each contact row includes the associated company context.\n\n**Why everyrow?** Company names may not match exactly between tables (\"Acme Corp\" vs \"Acme Corporation\" vs \"ACME\"). The `merge()` function handles these variations semantically." 
}, + { + "cell_type": "code", + "execution_count": null, + "id": "setup-82613e6a", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", + "\n", + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -18,13 +36,10 @@ }, "outputs": [], "source": [ - "# !pip install everyrow\n", "import asyncio\n", - "# load API key from environment/.env file or set it directly in the notebook\n", "from dotenv import load_dotenv\n", "load_dotenv()\n", - "# import os\n", - "# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. $20 free\"\n", + "\n", "import pandas as pd\n", "from everyrow import create_session\n", "from everyrow.ops import merge" @@ -191,7 +206,7 @@ ], "source": [ "# Contacts table\n", - "contacts_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/data/crm_contacts.csv\")\n", + "contacts_df = pd.read_csv(\"../data/crm_contacts.csv\")\n", "\n", "print(f\"Contacts: {len(contacts_df)}\")\n", "contacts_df" @@ -362,7 +377,7 @@ ], "source": [ "# Funds/Companies table with enriched information\n", - "funds_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/data/crm_funds.csv\")\n", + "funds_df = pd.read_csv(\"../data/crm_funds.csv\")\n", "\n", "print(f\"Funds: {len(funds_df)}\")\n", "funds_df" diff --git a/docs/case_studies/merge-overlapping-contact-lists/notebook.ipynb b/docs/case_studies/merge-overlapping-contact-lists/notebook.ipynb index 8d682af7..2e403769 100644 --- a/docs/case_studies/merge-overlapping-contact-lists/notebook.ipynb +++ b/docs/case_studies/merge-overlapping-contact-lists/notebook.ipynb @@ -5,6 +5,24 @@ "metadata": {}, "source": "# Fuzzy match 
and merge contact lists in Python\n\nThis notebook demonstrates using everyrow's `merge()` utility to combine two overlapping contact lists where records lack exact matches.\n\n**Use Case:** You have candidate lists from two different sources and need to merge them to avoid sending duplicate recruiting emails. The challenge: less than 50% match exactly by name or email due to typos, nicknames, different email domains, and incomplete data.\n\n**Why everyrow?** Traditional approaches (VLOOKUP, fuzzy matching) fail on semantic variations. everyrow's `merge()` uses LLM-powered matching to intelligently identify duplicates despite significant data variations." }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup-43fd66ea", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", + "\n", + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -18,13 +36,10 @@ }, "outputs": [], "source": [ - "# !pip install everyrow\n", "import asyncio\n", - "# load API key from environment/.env file or set it directly in the notebook\n", "from dotenv import load_dotenv\n", "load_dotenv()\n", - "# import os\n", - "# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. 
$20 free\"\n", + "\n", "import pandas as pd\n", "from everyrow import create_session\n", "from everyrow.ops import merge" @@ -209,7 +224,7 @@ ], "source": [ "# List A: From a conference attendee export\n", - "list_a = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/data/contacts_list_a.csv\").fillna(\"\")\n", + "list_a = pd.read_csv(\"../data/contacts_list_a.csv\").fillna(\"\")\n", "\n", "print(f\"List A: {len(list_a)} contacts\")\n", "list_a" @@ -369,7 +384,7 @@ ], "source": [ "# List B: From a research collaboration database\n", - "list_b = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/data/contacts_list_b.csv\").fillna(\"\")\n", + "list_b = pd.read_csv(\"../data/contacts_list_b.csv\").fillna(\"\")\n", "\n", "print(f\"List B: {len(list_b)} contacts\")\n", "list_b" diff --git a/docs/case_studies/multi-stage-lead-qualification/notebook.ipynb b/docs/case_studies/multi-stage-lead-qualification/notebook.ipynb index 5d3fc709..388bdc85 100644 --- a/docs/case_studies/multi-stage-lead-qualification/notebook.ipynb +++ b/docs/case_studies/multi-stage-lead-qualification/notebook.ipynb @@ -5,6 +5,24 @@ "metadata": {}, "source": "# Build an AI lead qualification pipeline in Python\n\nThis notebook demonstrates a **complex, multi-stage screening workflow** that combines multiple everyrow operations with pandas data transformations.\n\n**Use Case:** Qualify investment fund leads for a B2B research tools company. The workflow:\n1. Score funds by \"contrarian\" research approach (likely to adopt new tools)\n2. Filter to high-scoring candidates using pandas\n3. Research team sizes for remaining candidates\n4. Apply nuanced inclusion logic: include funds with strong research signals OR very small teams\n\n**Why this approach?** Traditional tools force binary choices. 
This workflow captures the nuanced mental model: \"I want funds that show research-tool-adoption signals, but I'll also include tiny funds where even weak signals matter.\"" }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup-219c4192", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", + "\n", + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -18,13 +36,10 @@ }, "outputs": [], "source": [ - "# !pip install everyrow\n", "import asyncio\n", - "# load API key from environment/.env file or set it directly in the notebook\n", "from dotenv import load_dotenv\n", "load_dotenv()\n", - "# import os\n", - "# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. 
$20 free\"\n", + "\n", "import pandas as pd\n", "from pydantic import BaseModel, Field\n", "from everyrow import create_session\n", @@ -203,7 +218,7 @@ } ], "source": [ - "funds_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/data/investment_funds.csv\")\n", + "funds_df = pd.read_csv(\"../data/investment_funds.csv\")\n", "\n", "print(f\"Loaded {len(funds_df)} funds\")\n", "funds_df.head(10)" diff --git a/docs/case_studies/research-and-rank-permit-times/notebook.ipynb b/docs/case_studies/research-and-rank-permit-times/notebook.ipynb index be556338..0f2cd93a 100644 --- a/docs/case_studies/research-and-rank-permit-times/notebook.ipynb +++ b/docs/case_studies/research-and-rank-permit-times/notebook.ipynb @@ -5,19 +5,34 @@ "metadata": {}, "source": "# Use LLM Agents to research government data at scale\n\nThis notebook demonstrates using everyrow's `rank()` utility with **web research capabilities** to gather and rank real-world data that isn't available in a structured format.\n\n**Use Case:** Real estate investors need permit processing timelines to evaluate markets—delays directly impact holding costs. But municipalities publish this data inconsistently: some on websites, some in PDFs, some not at all.\n\n**Why everyrow?** The `rank()` function can perform web research to find permit processing times from official sources, contractor reports, and comparable city data—then rank cities by speed." 
}, + { + "cell_type": "code", + "execution_count": null, + "id": "setup-b1988ab7", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", + "\n", + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" + ] + }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "# !pip install everyrow\n", "import asyncio\n", - "# load API key from environment/.env file or set it directly in the notebook\n", "from dotenv import load_dotenv\n", "load_dotenv()\n", - "# import os\n", - "# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. $20 free\"\n", + "\n", "import pandas as pd\n", "from everyrow import create_session\n", "from everyrow.ops import rank" @@ -153,7 +168,7 @@ } ], "source": [ - "texas_cities_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/data/texas_cities.csv\")\n", + "texas_cities_df = pd.read_csv(\"../data/texas_cities.csv\")\n", "\n", "print(f\"Analyzing {len(texas_cities_df)} Texas cities\")\n", "texas_cities_df.head(10)" diff --git a/docs/case_studies/score-leads-from-fragmented-data/notebook.ipynb b/docs/case_studies/score-leads-from-fragmented-data/notebook.ipynb index 80c2358b..da1bfbbc 100644 --- a/docs/case_studies/score-leads-from-fragmented-data/notebook.ipynb +++ b/docs/case_studies/score-leads-from-fragmented-data/notebook.ipynb @@ -5,6 +5,24 @@ "metadata": {}, "source": "# How to score and prioritize leads with AI in Python\n\nThis notebook demonstrates using everyrow's `rank()` utility to score B2B leads by their likelihood of suffering from data fragmentation challenges.\n\n**Use Case:** A data integration SaaS company wants to prioritize leads. 
Companies operating across multiple locations, entities, or point solutions are more likely to need data integration tools.\n\n**Why everyrow?** Traditional enrichment tools provide data fields but can't interpret them. Manual review of 1,000 leads is prohibitively slow. everyrow's `rank()` analyzes each company's operational complexity semantically." }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup-d063f3ff", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", + "\n", + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -18,13 +36,10 @@ }, "outputs": [], "source": [ - "# !pip install everyrow\n", "import asyncio\n", - "# load API key from environment/.env file or set it directly in the notebook\n", "from dotenv import load_dotenv\n", "load_dotenv()\n", - "# import os\n", - "# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. 
$20 free\"\n", + "\n", "import pandas as pd\n", "from everyrow import create_session\n", "from everyrow.ops import rank" @@ -190,7 +205,7 @@ } ], "source": [ - "companies_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/data/b2b_companies.csv\")\n", + "companies_df = pd.read_csv(\"../data/b2b_companies.csv\")\n", "\n", "print(f\"Loaded {len(companies_df)} companies\")\n", "companies_df.head(10)" diff --git a/docs/case_studies/score-leads-without-crm-history/notebook.ipynb b/docs/case_studies/score-leads-without-crm-history/notebook.ipynb index 658a8465..b8747d37 100644 --- a/docs/case_studies/score-leads-without-crm-history/notebook.ipynb +++ b/docs/case_studies/score-leads-without-crm-history/notebook.ipynb @@ -5,6 +5,24 @@ "metadata": {}, "source": "# Score and rank leads without a CRM in Python\n\nThis notebook demonstrates using everyrow's `rank()` utility to score investment firms by their likelihood to purchase research tools—without needing CRM data or prior interactions.\n\n**Use Case:** A research tools company wants to rank investment firms by product fit. Traditional approaches either require expensive CRM integrations or burn credits on enrichment tools that provide data without interpretation.\n\n**Why everyrow?** The `rank()` function can analyze public information (website descriptions, investment focus, team characteristics) and provide both a score AND reasoning for why a firm was scored a certain way." 
}, + { + "cell_type": "code", + "execution_count": null, + "id": "setup-1a1f21fe", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", + "\n", + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -18,13 +36,10 @@ }, "outputs": [], "source": [ - "# !pip install everyrow\n", "import asyncio\n", - "# load API key from environment/.env file or set it directly in the notebook\n", "from dotenv import load_dotenv\n", "load_dotenv()\n", - "# import os\n", - "# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. $20 free\"\n", + "\n", "import pandas as pd\n", "from everyrow import create_session\n", "from everyrow.ops import rank" @@ -235,7 +250,7 @@ } ], "source": [ - "firms_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/data/investment_firms.csv\")\n", + "firms_df = pd.read_csv(\"../data/investment_firms.csv\")\n", "\n", "print(f\"Loaded {len(firms_df)} investment firms\")\n", "firms_df" diff --git a/docs/case_studies/screen-job-postings-by-criteria/notebook.ipynb b/docs/case_studies/screen-job-postings-by-criteria/notebook.ipynb index 8c7e3437..2615fc29 100644 --- a/docs/case_studies/screen-job-postings-by-criteria/notebook.ipynb +++ b/docs/case_studies/screen-job-postings-by-criteria/notebook.ipynb @@ -5,6 +5,24 @@ "metadata": {}, "source": "# How to filter job postings with LLM Agents\n\nThis notebook demonstrates using everyrow's `screen()` utility to filter job postings by semantic criteria that traditional regex/keyword matching struggles with.\n\n**Use Case:** Filter job postings from a \"Who's Hiring\" thread to find only those that meet ALL of:\n1. 
Remote-friendly (explicitly allows remote/hybrid/distributed work)\n2. Senior-level (title or requirements indicate 5+ years experience)\n3. Salary disclosed (specific compensation figures, not \"competitive\" or \"DOE\")\n\n**Why everyrow?** Traditional keyword matching achieves ~68% precision on this task. Semantic screening with everyrow achieves >90% precision by understanding context and intent." }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup-b5c2199e", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", + "\n", + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -18,13 +36,10 @@ }, "outputs": [], "source": [ - "# !pip install everyrow\n", "import asyncio\n", - "# load API key from environment/.env file or set it directly in the notebook\n", "from dotenv import load_dotenv\n", "load_dotenv()\n", - "# import os\n", - "# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. 
$20 free\"\n", + "\n", "import pandas as pd\n", "from pydantic import BaseModel, Field\n", "from everyrow import create_session\n", @@ -146,7 +161,7 @@ } ], "source": [ - "job_postings = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/data/job_postings.csv\")\n", + "job_postings = pd.read_csv(\"../data/job_postings.csv\")\n", "\n", "print(f\"Loaded {len(job_postings)} job postings\")\n", "job_postings.head()" diff --git a/docs/case_studies/screen-stocks-by-investment-thesis/notebook.ipynb b/docs/case_studies/screen-stocks-by-investment-thesis/notebook.ipynb index 2d7281ff..57bca976 100644 --- a/docs/case_studies/screen-stocks-by-investment-thesis/notebook.ipynb +++ b/docs/case_studies/screen-stocks-by-investment-thesis/notebook.ipynb @@ -8,25 +8,28 @@ { "cell_type": "code", "execution_count": null, + "id": "setup-8d585f39", "metadata": {}, "outputs": [], "source": [ - "# !pip install everyrow\n", - "# Setup\n", - "import asyncio\n", - "from pathlib import Path\n", + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", "\n", - "import pandas as pd\n", - "from pydantic import BaseModel, Field\n", - "\n", - "# load API key from environment/.env file or set it directly in the notebook\n", - "from dotenv import load_dotenv\n", - "load_dotenv()\n", - "# import os\n", - "# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. 
$20 free\"\n", - "from everyrow.ops import screen" + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "# Setup\nimport asyncio\nfrom pathlib import Path\n\nimport pandas as pd\nfrom pydantic import BaseModel, Field\nfrom dotenv import load_dotenv\n\n# Load API key from .env\nload_dotenv()\n\nfrom everyrow import create_session\nfrom everyrow.ops import screen" + }, { "cell_type": "code", "execution_count": 2, @@ -160,7 +163,7 @@ ], "source": [ "# Load S&P 500 companies\n", - "stocks = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/data/S%26P%20500%20Companies.csv\")\n", + "stocks = pd.read_csv(\"../data/S&P 500 Companies.csv\")\n", "print(f\"Loaded {len(stocks)} companies\")\n", "stocks.head()" ] @@ -199,7 +202,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Run the screen\nasync def run_screen():\n print(\"Screening... (this will take a few minutes)\\n\")\n \n result = await screen(\n task=SCREENING_TASK,\n input=stocks,\n response_model=ScreenResult,\n )\n \n return result.data\n\n# Run it\nresults = await run_screen()" + "source": "# Run the screen\nasync with create_session(name=\"Stock Screening: Investment Thesis\") as session:\n print(f\"Session URL: {session.get_url()}\\n\")\n print(\"Screening... 
(this will take a few minutes)\\n\")\n\n result = await screen(\n session=session,\n task=SCREENING_TASK,\n input=stocks,\n response_model=ScreenResult,\n )\n\n results = result.data" }, { "cell_type": "code", @@ -414,4 +417,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/docs/case_studies/screen-stocks-by-margin-sensitivity/notebook.ipynb b/docs/case_studies/screen-stocks-by-margin-sensitivity/notebook.ipynb index 03bbcc98..c22504d6 100644 --- a/docs/case_studies/screen-stocks-by-margin-sensitivity/notebook.ipynb +++ b/docs/case_studies/screen-stocks-by-margin-sensitivity/notebook.ipynb @@ -8,25 +8,28 @@ { "cell_type": "code", "execution_count": null, + "id": "setup-f4164b1c", "metadata": {}, "outputs": [], "source": [ - "# !pip install everyrow\n", - "# Setup\n", - "import asyncio\n", - "from pathlib import Path\n", + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", "\n", - "import pandas as pd\n", - "from pydantic import BaseModel, Field\n", - "\n", - "# load API key from environment/.env file or set it directly in the notebook\n", - "from dotenv import load_dotenv\n", - "load_dotenv()\n", - "# import os\n", - "# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. 
$20 free\"\n", - "from everyrow.ops import screen" + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "# Setup\nimport asyncio\nfrom pathlib import Path\n\nimport pandas as pd\nfrom pydantic import BaseModel, Field\nfrom dotenv import load_dotenv\n\n# Load API key from .env\nload_dotenv()\n\nfrom everyrow import create_session\nfrom everyrow.ops import screen" + }, { "cell_type": "code", "execution_count": 18, @@ -160,7 +163,7 @@ ], "source": [ "# Load S&P 500 companies\n", - "stocks = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/data/S%26P%20500%20Companies.csv\")\n", + "stocks = pd.read_csv(\"../data/S&P 500 Companies.csv\")\n", "print(f\"Loaded {len(stocks)} companies\")\n", "stocks.head()" ] @@ -216,7 +219,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Run the screen\nasync def run_screen():\n print(\"Screening... (this will take a few minutes)\\n\")\n \n result = await screen(\n task=SCREENING_TASK,\n input=stocks,\n response_model=ScreenResult,\n )\n \n return result.data\n\n# Run it\nresults = await run_screen()" + "source": "# Run the screen\nasync with create_session(name=\"Stock Screening: Oil Margin Sensitivity\") as session:\n print(f\"Session URL: {session.get_url()}\\n\")\n print(\"Screening... 
(this will take a few minutes)\\n\")\n\n result = await screen(\n session=session,\n task=SCREENING_TASK,\n input=stocks,\n response_model=ScreenResult,\n )\n\n results = result.data" }, { "cell_type": "code", @@ -520,4 +523,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/docs/case_studies/understanding-costs-and-speed-for-merge/notebook.ipynb b/docs/case_studies/understanding-costs-and-speed-for-merge/notebook.ipynb new file mode 100644 index 00000000..c107fbfc --- /dev/null +++ b/docs/case_studies/understanding-costs-and-speed-for-merge/notebook.ipynb @@ -0,0 +1,2334 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "intro-md", + "metadata": {}, + "source": [ + "# Understanding Costs and Speed for Merge\n", + "\n", + "Every data engineer has faced the challenge: you have two tables that *should* join, but the keys don't quite match. Company names are spelled differently. Subsidiaries need to map to parents. Typos have crept in. Abbreviations vary.\n", + "\n", + "The `everyrow.merge()` operation solves this by using a **cost-optimized cascade** of matching strategies:\n", + "\n", + "| Strategy | Cost | Speed | Example |\n", + "|----------|------|-------|--------|\n", + "| Exact match | Free | Instant | \"Apple Inc\" → \"Apple Inc\" |\n", + "| Fuzzy match | Free | Fast | \"Microsft Corp\" → \"Microsoft Corp\" |\n", + "| LLM reasoning | ~$0.002/row | ~1s/row | \"Instagram\" → \"Meta Platforms\" |\n", + "| Web search | ~$0.01/row | ~5s/row | Obscure/stale data |\n", + "\n", + "The key insight: **most real-world matches are cheap or free**. The expensive LLM-based matching only kicks in when simpler methods fail.\n", + "\n", + "This notebook empirically tests these claims with increasing levels of matching difficulty, measuring actual costs and timing at each step." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup-5fde189e", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup: install everyrow if needed and configure API key\n", + "try:\n", + " import everyrow\n", + "except ImportError:\n", + " %pip install everyrow\n", + "\n", + "import os\n", + "if \"EVERYROW_API_KEY\" not in os.environ:\n", + " os.environ[\"EVERYROW_API_KEY\"] = \"your-api-key-here\" # Get one at everyrow.io\n" + ] + }, + { + "cell_type": "markdown", + "id": "setup-md", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "First, let's set up our imports and create helper functions for measuring costs and timing." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "setup-code", + "metadata": {}, + "outputs": [], + "source": [ + "import asyncio\n", + "import time\n", + "from dataclasses import dataclass\n", + "from typing import Literal\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "from everyrow import create_session, get_billing_balance\n", + "from everyrow.ops import merge" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "helpers-code", + "metadata": {}, + "outputs": [], + "source": [ + "@dataclass\n", + "class ExperimentResult:\n", + " \"\"\"Results from a merge experiment.\"\"\"\n", + " name: str\n", + " rows: int\n", + " cost_dollars: float\n", + " duration_seconds: float\n", + " accuracy_pct: float | None = None\n", + " \n", + " def __repr__(self):\n", + " acc = f\", accuracy={self.accuracy_pct:.1f}%\" if self.accuracy_pct is not None else \"\"\n", + " cost_per_row = self.cost_dollars / self.rows if self.rows > 0 else 0\n", + " return (f\"ExperimentResult({self.name}: {self.rows} rows, \"\n", + " f\"${self.cost_dollars:.4f} (${cost_per_row:.5f}/row), \"\n", + " f\"{self.duration_seconds:.1f}s{acc})\")\n", + "\n", + "# Store all experiment results for final comparison\n", + "all_results: 
list[ExperimentResult] = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "measure-fn", + "metadata": {}, + "outputs": [], + "source": "async def measure_merge(\n name: str,\n task: str,\n left_table: pd.DataFrame,\n right_table: pd.DataFrame,\n merge_on_left: str | None = None,\n merge_on_right: str | None = None,\n expected_matches: dict[str, str] | None = None,\n use_web_search: Literal['auto', 'yes', 'no'] | None = None,\n) -> tuple[pd.DataFrame, ExperimentResult]:\n \"\"\"\n Run a merge operation and measure its cost, duration, and accuracy.\n \n Args:\n name: Experiment name for logging\n task: The merge task description\n left_table: Left DataFrame (all rows preserved)\n right_table: Right DataFrame to match from\n merge_on_left: Column name in left table (optional)\n merge_on_right: Column name in right table (optional)\n expected_matches: Dict mapping left values to expected right values (for accuracy)\n use_web_search: \"auto\", \"yes\", or \"no\"\n \n Returns:\n Tuple of (result DataFrame, ExperimentResult)\n \"\"\"\n # Measure billing before\n balance_before = await get_billing_balance()\n start_time = time.time()\n \n # Run the merge inside a named session\n async with create_session(name=name) as session:\n print(f\"Session URL: {session.get_url()}\")\n result = await merge(\n task=task,\n session=session,\n left_table=left_table,\n right_table=right_table,\n merge_on_left=merge_on_left,\n merge_on_right=merge_on_right,\n use_web_search=use_web_search,\n )\n \n # Measure billing after\n end_time = time.time()\n await asyncio.sleep(60) # wait for billing to update\n balance_after = await get_billing_balance()\n \n cost = balance_before.current_balance_dollars - balance_after.current_balance_dollars\n duration = end_time - start_time\n \n # Calculate accuracy if expected matches provided\n accuracy = None\n if expected_matches and merge_on_left and merge_on_right:\n correct = 0\n total = len(expected_matches)\n for left_val, 
expected_right in expected_matches.items():\n row = result.data[result.data[merge_on_left] == left_val]\n if len(row) > 0:\n actual_right = row[merge_on_right].iloc[0]\n if pd.notna(actual_right) and expected_right in str(actual_right):\n correct += 1\n accuracy = (correct / total) * 100 if total > 0 else None\n \n exp_result = ExperimentResult(\n name=name,\n rows=len(left_table),\n cost_dollars=cost,\n duration_seconds=duration,\n accuracy_pct=accuracy,\n )\n all_results.append(exp_result)\n \n print(f\"\\n{exp_result}\")\n return result.data, exp_result" + }, + { + "cell_type": "markdown", + "id": "exp1-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Experiment 1: Exact String Matches Only\n", + "\n", + "Let's start with the simplest case: both tables use identical strings. This should be **instant and free** since the system can do a simple string comparison.\n", + "\n", + "We'll create a realistic scenario: matching a list of Fortune 500 companies to their revenue data." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "exp1-data", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Left table: 10 rows\n", + "Right table: 10 rows\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
companysector
0Apple Inc.Technology
1Microsoft CorporationTechnology
2Amazon.com Inc.Consumer Cyclical
\n", + "
" + ], + "text/plain": [ + " company sector\n", + "0 Apple Inc. Technology\n", + "1 Microsoft Corporation Technology\n", + "2 Amazon.com Inc. Consumer Cyclical" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Fortune 500-style company data with EXACT matching names\n", + "companies_exact = pd.DataFrame([\n", + " {\"company\": \"Apple Inc.\", \"sector\": \"Technology\"},\n", + " {\"company\": \"Microsoft Corporation\", \"sector\": \"Technology\"},\n", + " {\"company\": \"Amazon.com Inc.\", \"sector\": \"Consumer Cyclical\"},\n", + " {\"company\": \"Alphabet Inc.\", \"sector\": \"Technology\"},\n", + " {\"company\": \"Meta Platforms Inc.\", \"sector\": \"Technology\"},\n", + " {\"company\": \"Tesla Inc.\", \"sector\": \"Consumer Cyclical\"},\n", + " {\"company\": \"NVIDIA Corporation\", \"sector\": \"Technology\"},\n", + " {\"company\": \"JPMorgan Chase & Co.\", \"sector\": \"Financial Services\"},\n", + " {\"company\": \"Johnson & Johnson\", \"sector\": \"Healthcare\"},\n", + " {\"company\": \"Visa Inc.\", \"sector\": \"Financial Services\"},\n", + "])\n", + "\n", + "revenue_exact = pd.DataFrame([\n", + " {\"company_name\": \"Apple Inc.\", \"revenue_billions\": 394},\n", + " {\"company_name\": \"Microsoft Corporation\", \"revenue_billions\": 211},\n", + " {\"company_name\": \"Amazon.com Inc.\", \"revenue_billions\": 574},\n", + " {\"company_name\": \"Alphabet Inc.\", \"revenue_billions\": 307},\n", + " {\"company_name\": \"Meta Platforms Inc.\", \"revenue_billions\": 134},\n", + " {\"company_name\": \"Tesla Inc.\", \"revenue_billions\": 96},\n", + " {\"company_name\": \"NVIDIA Corporation\", \"revenue_billions\": 61},\n", + " {\"company_name\": \"JPMorgan Chase & Co.\", \"revenue_billions\": 158},\n", + " {\"company_name\": \"Johnson & Johnson\", \"revenue_billions\": 95},\n", + " {\"company_name\": \"Visa Inc.\", \"revenue_billions\": 32},\n", + "])\n", + "\n", + "expected_exact = {row[\"company\"]: 
row[\"company\"] for _, row in companies_exact.iterrows()}\n", + "\n", + "print(f\"Left table: {len(companies_exact)} rows\")\n", + "print(f\"Right table: {len(revenue_exact)} rows\")\n", + "companies_exact.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "exp1-run", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ExperimentResult(Exact matches only: 10 rows, $0.0000 ($0.00000/row), 12.9s, accuracy=100.0%)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
companysectorcompany_namerevenue_billions
0Apple Inc.TechnologyApple Inc.394
1Microsoft CorporationTechnologyMicrosoft Corporation211
2Amazon.com Inc.Consumer CyclicalAmazon.com Inc.574
3Alphabet Inc.TechnologyAlphabet Inc.307
4Meta Platforms Inc.TechnologyMeta Platforms Inc.134
\n", + "
" + ], + "text/plain": [ + " company sector company_name \\\n", + "0 Apple Inc. Technology Apple Inc. \n", + "1 Microsoft Corporation Technology Microsoft Corporation \n", + "2 Amazon.com Inc. Consumer Cyclical Amazon.com Inc. \n", + "3 Alphabet Inc. Technology Alphabet Inc. \n", + "4 Meta Platforms Inc. Technology Meta Platforms Inc. \n", + "\n", + " revenue_billions \n", + "0 394 \n", + "1 211 \n", + "2 574 \n", + "3 307 \n", + "4 134 " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result_exact, stats_exact = await measure_merge(\n", + " name=\"Exact matches only\",\n", + " task=\"Match companies by name. Names are identical in both tables.\",\n", + " left_table=companies_exact,\n", + " right_table=revenue_exact,\n", + " merge_on_left=\"company\",\n", + " merge_on_right=\"company_name\",\n", + " expected_matches={c: c for c in companies_exact[\"company\"]},\n", + ")\n", + "\n", + "result_exact[[\"company\", \"sector\", \"company_name\", \"revenue_billions\"]].head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "bfdb0326", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ExperimentResult(Exact matches only: 10 rows, $0.0100 ($0.00100/row), 31.4s, accuracy=100.0%)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
companysectorcompany_namerevenue_billions
0Apple Inc.TechnologyApple Inc.394.0
1Microsoft CorporationTechnologyMicrosoft Corporation211.0
2Amazon.com Inc.Consumer CyclicalAmazon.com Inc.574.0
3Alphabet Inc.TechnologyAlphabet Inc.307.0
4Meta Platforms Inc.TechnologyMeta Platforms Inc.134.0
\n", + "
" + ], + "text/plain": [ + " company sector company_name \\\n", + "0 Apple Inc. Technology Apple Inc. \n", + "1 Microsoft Corporation Technology Microsoft Corporation \n", + "2 Amazon.com Inc. Consumer Cyclical Amazon.com Inc. \n", + "3 Alphabet Inc. Technology Alphabet Inc. \n", + "4 Meta Platforms Inc. Technology Meta Platforms Inc. \n", + "\n", + " revenue_billions \n", + "0 394.0 \n", + "1 211.0 \n", + "2 574.0 \n", + "3 307.0 \n", + "4 134.0 " + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result_exact, stats_exact = await measure_merge(\n", + " name=\"Exact matches only\",\n", + " task=\"Match companies by name. Names are identical in both tables.\",\n", + " left_table=companies_exact,\n", + " right_table=revenue_exact.iloc[:-2],\n", + " merge_on_left=\"company\",\n", + " merge_on_right=\"company_name\",\n", + " expected_matches={c: c for c in companies_exact[\"company\"].iloc[:-2]},\n", + " \n", + ")\n", + "\n", + "result_exact[[\"company\", \"sector\", \"company_name\", \"revenue_billions\"]].head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "d7bf981e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
companysectorcompany_namerevenue_billionsresearch
0Apple Inc.TechnologyApple Inc.394.0{'company_name': 'This row was matched due to ...
1Microsoft CorporationTechnologyMicrosoft Corporation211.0{'company_name': 'This row was matched due to ...
2Amazon.com Inc.Consumer CyclicalAmazon.com Inc.574.0{'company_name': 'This row was matched due to ...
3Alphabet Inc.TechnologyAlphabet Inc.307.0{'company_name': 'This row was matched due to ...
4Meta Platforms Inc.TechnologyMeta Platforms Inc.134.0{'company_name': 'This row was matched due to ...
5Tesla Inc.Consumer CyclicalTesla Inc.96.0{'company_name': 'This row was matched due to ...
6NVIDIA CorporationTechnologyNVIDIA Corporation61.0{'company_name': 'This row was matched due to ...
7JPMorgan Chase & Co.Financial ServicesJPMorgan Chase & Co.158.0{'company_name': 'This row was matched due to ...
8Johnson & JohnsonHealthcareNaNNaNNaN
9Visa Inc.Financial ServicesNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " company sector company_name \\\n", + "0 Apple Inc. Technology Apple Inc. \n", + "1 Microsoft Corporation Technology Microsoft Corporation \n", + "2 Amazon.com Inc. Consumer Cyclical Amazon.com Inc. \n", + "3 Alphabet Inc. Technology Alphabet Inc. \n", + "4 Meta Platforms Inc. Technology Meta Platforms Inc. \n", + "5 Tesla Inc. Consumer Cyclical Tesla Inc. \n", + "6 NVIDIA Corporation Technology NVIDIA Corporation \n", + "7 JPMorgan Chase & Co. Financial Services JPMorgan Chase & Co. \n", + "8 Johnson & Johnson Healthcare NaN \n", + "9 Visa Inc. Financial Services NaN \n", + "\n", + " revenue_billions research \n", + "0 394.0 {'company_name': 'This row was matched due to ... \n", + "1 211.0 {'company_name': 'This row was matched due to ... \n", + "2 574.0 {'company_name': 'This row was matched due to ... \n", + "3 307.0 {'company_name': 'This row was matched due to ... \n", + "4 134.0 {'company_name': 'This row was matched due to ... \n", + "5 96.0 {'company_name': 'This row was matched due to ... \n", + "6 61.0 {'company_name': 'This row was matched due to ... \n", + "7 158.0 {'company_name': 'This row was matched due to ... \n", + "8 NaN NaN \n", + "9 NaN NaN " + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result_exact" + ] + }, + { + "cell_type": "markdown", + "id": "exp1-analysis", + "metadata": {}, + "source": [ + "As expected: **zero cost** for exact string matches. The cascade never needed to invoke LLM reasoning." + ] + }, + { + "cell_type": "markdown", + "id": "exp2-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Experiment 2: Exact + Fuzzy Matches (Typos & Variations)\n", + "\n", + "Real-world data is messy. 
Let's introduce realistic variations:\n", + "- **Typos**: \"Microsft\" instead of \"Microsoft\"\n", + "- **Case differences**: \"APPLE INC\" vs \"Apple Inc.\"\n", + "- **Missing punctuation**: \"Johnson Johnson\" vs \"Johnson & Johnson\"\n", + "- **Spacing issues**: \"JP Morgan\" vs \"JPMorgan\"\n", + "\n", + "These should all be handled by **fuzzy matching**, which is still free." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "exp2-data", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sample variations:\n", + " 'APPLE INC' → 'Apple Inc.'\n", + " 'Microsft Corporation' → 'Microsoft Corporation'\n", + " 'Amazon Inc' → 'Amazon.com Inc.'\n", + " 'Alphabet' → 'Alphabet Inc.'\n", + " 'Meta Platforms' → 'Meta Platforms Inc.'\n" + ] + } + ], + "source": [ + "# Same companies but with realistic typos and variations\n", + "companies_fuzzy = pd.DataFrame([\n", + " {\"company\": \"APPLE INC\", \"sector\": \"Technology\"}, # Case difference\n", + " {\"company\": \"Microsft Corporation\", \"sector\": \"Technology\"}, # Typo\n", + " {\"company\": \"Amazon Inc\", \"sector\": \"Consumer Cyclical\"}, # Missing .com\n", + " {\"company\": \"Alphabet\", \"sector\": \"Technology\"}, # Missing Inc.\n", + " {\"company\": \"Meta Platforms\", \"sector\": \"Technology\"}, # Missing Inc.\n", + " {\"company\": \"Telsa Inc.\", \"sector\": \"Consumer Cyclical\"}, # Typo (Telsa)\n", + " {\"company\": \"Nvidia Corp\", \"sector\": \"Technology\"}, # Abbreviation\n", + " {\"company\": \"JP Morgan Chase\", \"sector\": \"Financial Services\"}, # Spacing\n", + " {\"company\": \"Johnson Johnson\", \"sector\": \"Healthcare\"}, # Missing &\n", + " {\"company\": \"Visa\", \"sector\": \"Financial Services\"}, # Missing Inc.\n", + "])\n", + "\n", + "# Expected matches (left company -> right company_name)\n", + "expected_fuzzy = {\n", + " \"APPLE INC\": \"Apple Inc.\",\n", + " \"Microsft Corporation\": \"Microsoft Corporation\",\n", + 
" \"Amazon Inc\": \"Amazon.com Inc.\",\n", + " \"Alphabet\": \"Alphabet Inc.\",\n", + " \"Meta Platforms\": \"Meta Platforms Inc.\",\n", + " \"Telsa Inc.\": \"Tesla Inc.\",\n", + " \"Nvidia Corp\": \"NVIDIA Corporation\",\n", + " \"JP Morgan Chase\": \"JPMorgan Chase & Co.\",\n", + " \"Johnson Johnson\": \"Johnson & Johnson\",\n", + " \"Visa\": \"Visa Inc.\",\n", + "}\n", + "\n", + "print(\"Sample variations:\")\n", + "for left, right in list(expected_fuzzy.items())[:5]:\n", + " print(f\" '{left}' → '{right}'\")" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "exp2-run", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ExperimentResult(Exact + fuzzy (typos): 10 rows, $0.0000 ($0.00000/row), 19.6s, accuracy=100.0%)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
companycompany_namerevenue_billions
0APPLE INCApple Inc.394
1Microsft CorporationMicrosoft Corporation211
2Amazon IncAmazon.com Inc.574
3AlphabetAlphabet Inc.307
4Meta PlatformsMeta Platforms Inc.134
5Telsa Inc.Tesla Inc.96
6Nvidia CorpNVIDIA Corporation61
7JP Morgan ChaseJPMorgan Chase & Co.158
8Johnson JohnsonJohnson & Johnson95
9VisaVisa Inc.32
\n", + "
" + ], + "text/plain": [ + " company company_name revenue_billions\n", + "0 APPLE INC Apple Inc. 394\n", + "1 Microsft Corporation Microsoft Corporation 211\n", + "2 Amazon Inc Amazon.com Inc. 574\n", + "3 Alphabet Alphabet Inc. 307\n", + "4 Meta Platforms Meta Platforms Inc. 134\n", + "5 Telsa Inc. Tesla Inc. 96\n", + "6 Nvidia Corp NVIDIA Corporation 61\n", + "7 JP Morgan Chase JPMorgan Chase & Co. 158\n", + "8 Johnson Johnson Johnson & Johnson 95\n", + "9 Visa Visa Inc. 32" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result_fuzzy, stats_fuzzy = await measure_merge(\n", + " name=\"Exact + fuzzy (typos)\",\n", + " task=\"Match companies by name. Handle typos, case differences, and minor variations.\",\n", + " left_table=companies_fuzzy,\n", + " right_table=revenue_exact,\n", + " merge_on_left=\"company\",\n", + " merge_on_right=\"company_name\",\n", + " expected_matches=expected_fuzzy,\n", + ")\n", + "\n", + "result_fuzzy[[\"company\", \"company_name\", \"revenue_billions\"]]" + ] + }, + { + "cell_type": "markdown", + "id": "exp2-analysis", + "metadata": {}, + "source": [ + "Still **zero (or near-zero) cost**! Fuzzy string matching handles all these variations without needing LLM reasoning." 
+ ] + }, + { + "cell_type": "markdown", + "id": "exp3-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Experiment 3: Mostly Exact + Few LLM Matches (Semantic Relationships)\n", + "\n", + "Now let's introduce cases that **require semantic understanding**, such as:\n", + "- **Subsidiaries**: \"Instagram\" should match \"Meta Platforms\"\n", + "- **Parent companies**: \"YouTube\" should match \"Alphabet\"\n", + "- **Acquisitions**: \"LinkedIn\" should match \"Microsoft\"\n", + "- **Regional names**: \"MSD\" is Merck's name outside the US\n", + "\n", + "These can't be solved by string matching alone—the LLM needs to know that Instagram is owned by Meta. Of these, this experiment tests three subsidiary rows (Instagram, WhatsApp, and YouTube); the MSD regional-name case is exercised in Experiment 4.\n", + "\n", + "**Hypothesis**: With mostly exact matches and only a few semantic ones, costs should be minimal since only the semantic matches invoke the LLM." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "exp3-data", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total rows: 10\n", + " - Exact matches expected: 7 (free)\n", + " - Semantic matches expected: 3 (LLM required)\n" + ] + } + ], + "source": [ + "# Mix of exact matches and semantic relationships\n", + "companies_semantic = pd.DataFrame([\n", + " # Exact matches (7 rows - should be free)\n", + " {\"company\": \"Apple Inc.\", \"sector\": \"Technology\"},\n", + " {\"company\": \"Microsoft Corporation\", \"sector\": \"Technology\"},\n", + " {\"company\": \"Amazon.com Inc.\", \"sector\": \"Consumer Cyclical\"},\n", + " {\"company\": \"Tesla Inc.\", \"sector\": \"Consumer Cyclical\"},\n", + " {\"company\": \"NVIDIA Corporation\", \"sector\": \"Technology\"},\n", + " {\"company\": \"JPMorgan Chase & Co.\", \"sector\": \"Financial Services\"},\n", + " {\"company\": \"Visa Inc.\", \"sector\": \"Financial Services\"},\n", + " # Semantic matches (3 rows - require LLM)\n", + " {\"company\": \"Instagram\", \"sector\": \"Technology\"}, # → Meta Platforms Inc.\n", + " {\"company\": \"YouTube\", 
\"sector\": \"Technology\"}, # → Alphabet Inc.\n", + " {\"company\": \"WhatsApp\", \"sector\": \"Technology\"}, # → Meta Platforms Inc.\n", + "])\n", + "\n", + "expected_semantic = {\n", + " \"Apple Inc.\": \"Apple Inc.\",\n", + " \"Microsoft Corporation\": \"Microsoft Corporation\",\n", + " \"Amazon.com Inc.\": \"Amazon.com Inc.\",\n", + " \"Tesla Inc.\": \"Tesla Inc.\",\n", + " \"NVIDIA Corporation\": \"NVIDIA Corporation\",\n", + " \"JPMorgan Chase & Co.\": \"JPMorgan Chase & Co.\",\n", + " \"Visa Inc.\": \"Visa Inc.\",\n", + " \"Instagram\": \"Meta Platforms Inc.\",\n", + " \"YouTube\": \"Alphabet Inc.\",\n", + " \"WhatsApp\": \"Meta Platforms Inc.\",\n", + "}\n", + "\n", + "print(f\"Total rows: {len(companies_semantic)}\")\n", + "print(f\" - Exact matches expected: 7 (free)\")\n", + "print(f\" - Semantic matches expected: 3 (LLM required)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "exp3-run", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ExperimentResult(Mostly exact + semantic: 10 rows, $0.0300 ($0.00300/row), 67.3s, accuracy=100.0%)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
companycompany_namerevenue_billions
0Apple Inc.Apple Inc.394
1Microsoft CorporationMicrosoft Corporation211
2Amazon.com Inc.Amazon.com Inc.574
3Tesla Inc.Tesla Inc.96
4NVIDIA CorporationNVIDIA Corporation61
5JPMorgan Chase & Co.JPMorgan Chase & Co.158
6Visa Inc.Visa Inc.32
7InstagramMeta Platforms Inc.134
8YouTubeAlphabet Inc.307
9WhatsAppMeta Platforms Inc.134
\n", + "
" + ], + "text/plain": [ + " company company_name revenue_billions\n", + "0 Apple Inc. Apple Inc. 394\n", + "1 Microsoft Corporation Microsoft Corporation 211\n", + "2 Amazon.com Inc. Amazon.com Inc. 574\n", + "3 Tesla Inc. Tesla Inc. 96\n", + "4 NVIDIA Corporation NVIDIA Corporation 61\n", + "5 JPMorgan Chase & Co. JPMorgan Chase & Co. 158\n", + "6 Visa Inc. Visa Inc. 32\n", + "7 Instagram Meta Platforms Inc. 134\n", + "8 YouTube Alphabet Inc. 307\n", + "9 WhatsApp Meta Platforms Inc. 134" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result_semantic, stats_semantic = await measure_merge(\n", + " name=\"Mostly exact + semantic\",\n", + " task=\"\"\"Match companies. Note:\n", + " - Instagram and WhatsApp are owned by Meta Platforms\n", + " - YouTube is owned by Alphabet (Google's parent)\n", + " \"\"\",\n", + " left_table=companies_semantic,\n", + " right_table=revenue_exact,\n", + " merge_on_left=\"company\",\n", + " merge_on_right=\"company_name\",\n", + " expected_matches=expected_semantic,\n", + ")\n", + "\n", + "result_semantic[[\"company\", \"company_name\", \"revenue_billions\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "exp3-analysis-code", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cost per LLM match: $0.0100\n", + "Total cost for 3 LLM matches: $0.0300\n", + "\n", + "The 7 exact matches were FREE.\n" + ] + } + ], + "source": [ + "# Calculate estimated per-semantic-match cost\n", + "if stats_semantic.cost_dollars > 0:\n", + " semantic_matches = 3 # Instagram, YouTube, WhatsApp\n", + " cost_per_llm_match = stats_semantic.cost_dollars / semantic_matches\n", + " print(f\"Cost per LLM match: ${cost_per_llm_match:.4f}\")\n", + " print(f\"Total cost for {semantic_matches} LLM matches: ${stats_semantic.cost_dollars:.4f}\")\n", + " print(f\"\\nThe 7 exact matches were FREE.\")" + ] + }, + { + "cell_type": 
"markdown", + "id": "exp3-summary", + "metadata": {}, + "source": [ + "This demonstrates the cascade in action: **70% of rows matched for free** (exact matches), while only 30% required LLM reasoning." + ] + }, + { + "cell_type": "markdown", + "id": "exp4-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Experiment 4: Non-Trivial Matching (Breakdown by Match Type)\n", + "\n", + "Let's test a more realistic scenario where we need to match pharmaceutical company subsidiaries and regional variations to their parent companies. This is a common real-world challenge in clinical trial data.\n", + "\n", + "We'll create data that tests the full cascade:\n", + "- **Exact matches**: Identical names\n", + "- **Fuzzy matches**: Typos and variations\n", + "- **LLM matches**: Subsidiaries, regional names, abbreviations" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "exp4-data", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total sponsor records: 13\n", + "\n", + "Expected match breakdown:\n", + " - Exact matches: 4 rows (free)\n", + " - Fuzzy matches: 3 rows (free)\n", + " - LLM matches: 6 rows (charged)\n" + ] + } + ], + "source": [ + "# Clinical trial sponsors (left table)\n", + "trial_sponsors = pd.DataFrame([\n", + " # Exact matches (should be free)\n", + " {\"sponsor\": \"Pfizer Inc.\", \"trial_count\": 150},\n", + " {\"sponsor\": \"Novartis AG\", \"trial_count\": 120},\n", + " {\"sponsor\": \"Sanofi S.A.\", \"trial_count\": 100},\n", + " {\"sponsor\": \"AstraZeneca PLC\", \"trial_count\": 95},\n", + " \n", + " # Fuzzy matches (should still be free)\n", + " {\"sponsor\": \"Pfzer Inc\", \"trial_count\": 5}, # Typo\n", + " {\"sponsor\": \"NOVARTIS\", \"trial_count\": 8}, # Case\n", + " {\"sponsor\": \"Astra Zeneca\", \"trial_count\": 12}, # Spacing\n", + " \n", + " # LLM matches - subsidiaries and regional names\n", + " {\"sponsor\": \"Genentech\", \"trial_count\": 45}, # → Roche\n", + " 
{\"sponsor\": \"MSD\", \"trial_count\": 80}, # → Merck (regional name)\n", + " {\"sponsor\": \"BMS\", \"trial_count\": 60}, # → Bristol-Myers Squibb\n", + " {\"sponsor\": \"AbbVie\", \"trial_count\": 70}, # Was part of Abbott\n", + " {\"sponsor\": \"Genzyme\", \"trial_count\": 25}, # → Sanofi (acquired)\n", + " {\"sponsor\": \"Medimmune\", \"trial_count\": 20}, # → AstraZeneca\n", + "])\n", + "\n", + "# Parent pharma companies (right table)\n", + "pharma_parents = pd.DataFrame([\n", + " {\"company\": \"Pfizer Inc.\", \"hq_country\": \"USA\", \"market_cap_b\": 250},\n", + " {\"company\": \"Novartis AG\", \"hq_country\": \"Switzerland\", \"market_cap_b\": 200},\n", + " {\"company\": \"Roche Holding AG\", \"hq_country\": \"Switzerland\", \"market_cap_b\": 280},\n", + " {\"company\": \"Merck & Co.\", \"hq_country\": \"USA\", \"market_cap_b\": 270},\n", + " {\"company\": \"Bristol-Myers Squibb\", \"hq_country\": \"USA\", \"market_cap_b\": 150},\n", + " {\"company\": \"AbbVie Inc.\", \"hq_country\": \"USA\", \"market_cap_b\": 260},\n", + " {\"company\": \"Sanofi S.A.\", \"hq_country\": \"France\", \"market_cap_b\": 130},\n", + " {\"company\": \"AstraZeneca PLC\", \"hq_country\": \"UK\", \"market_cap_b\": 220},\n", + "])\n", + "\n", + "expected_pharma = {\n", + " \"Pfizer Inc.\": \"Pfizer\", \"Novartis AG\": \"Novartis\", \"Sanofi S.A.\": \"Sanofi\",\n", + " \"AstraZeneca PLC\": \"AstraZeneca\", \"Pfzer Inc\": \"Pfizer\", \"NOVARTIS\": \"Novartis\",\n", + " \"Astra Zeneca\": \"AstraZeneca\", \"Genentech\": \"Roche\", \"MSD\": \"Merck\",\n", + " \"BMS\": \"Bristol-Myers\", \"AbbVie\": \"AbbVie\", \"Genzyme\": \"Sanofi\",\n", + " \"Medimmune\": \"AstraZeneca\",\n", + "}\n", + "\n", + "print(f\"Total sponsor records: {len(trial_sponsors)}\")\n", + "print(f\"\\nExpected match breakdown:\")\n", + "print(f\" - Exact matches: 4 rows (free)\")\n", + "print(f\" - Fuzzy matches: 3 rows (free)\")\n", + "print(f\" - LLM matches: 6 rows (charged)\")" + ] + }, + { + "cell_type": 
"code", + "execution_count": 50, + "id": "exp4-run", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ExperimentResult(Pharma non-trivial: 13 rows, $0.0000 ($0.00000/row), 51.6s, accuracy=61.5%)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sponsortrial_countcompanyhq_countrymarket_cap_b
0Pfizer Inc.150Pfizer Inc.USA250.0
1Novartis AG120Novartis AGSwitzerland200.0
2Sanofi S.A.100Sanofi S.A.France130.0
3AstraZeneca PLC95AstraZeneca PLCUK220.0
4Pfzer Inc5NaNNaNNaN
5NOVARTIS8NaNNaNNaN
6Astra Zeneca12NaNNaNNaN
7Genentech45Roche Holding AGSwitzerland280.0
8MSD80Merck & Co.USA270.0
9BMS60Bristol-Myers SquibbUSA150.0
10AbbVie70AbbVie Inc.USA260.0
11Genzyme25NaNNaNNaN
12Medimmune20NaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " sponsor trial_count company hq_country \\\n", + "0 Pfizer Inc. 150 Pfizer Inc. USA \n", + "1 Novartis AG 120 Novartis AG Switzerland \n", + "2 Sanofi S.A. 100 Sanofi S.A. France \n", + "3 AstraZeneca PLC 95 AstraZeneca PLC UK \n", + "4 Pfzer Inc 5 NaN NaN \n", + "5 NOVARTIS 8 NaN NaN \n", + "6 Astra Zeneca 12 NaN NaN \n", + "7 Genentech 45 Roche Holding AG Switzerland \n", + "8 MSD 80 Merck & Co. USA \n", + "9 BMS 60 Bristol-Myers Squibb USA \n", + "10 AbbVie 70 AbbVie Inc. USA \n", + "11 Genzyme 25 NaN NaN \n", + "12 Medimmune 20 NaN NaN \n", + "\n", + " market_cap_b \n", + "0 250.0 \n", + "1 200.0 \n", + "2 130.0 \n", + "3 220.0 \n", + "4 NaN \n", + "5 NaN \n", + "6 NaN \n", + "7 280.0 \n", + "8 270.0 \n", + "9 150.0 \n", + "10 260.0 \n", + "11 NaN \n", + "12 NaN " + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result_pharma, stats_pharma = await measure_merge(\n", + " name=\"Pharma non-trivial\",\n", + " task=\"\"\"Match clinical trial sponsors to their parent pharmaceutical company.\n", + " \n", + " Key relationships to know:\n", + " - Genentech is a subsidiary of Roche\n", + " - MSD is Merck's name outside the United States\n", + " - BMS is the abbreviation for Bristol-Myers Squibb\n", + " - Genzyme was acquired by Sanofi\n", + " - MedImmune is a subsidiary of AstraZeneca\n", + " \"\"\",\n", + " left_table=trial_sponsors,\n", + " right_table=pharma_parents,\n", + " merge_on_left=\"sponsor\",\n", + " merge_on_right=\"company\",\n", + " expected_matches=expected_pharma,\n", + ")\n", + "\n", + "result_pharma[[\"sponsor\", \"trial_count\", \"company\", \"hq_country\", \"market_cap_b\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "exp4-breakdown", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Match Type Breakdown:\n", + " Exact matches: 4 (31%) - FREE\n", + " Fuzzy matches: 3 (23%) - FREE\n", + 
" LLM matches: 6 (46%) - CHARGED\n", + " ─────────────────────\n", + " Total: 13\n", + "\n", + "Free matches: 54%\n", + "Paid matches: 46%\n" + ] + } + ], + "source": [ + "# Analyze match type breakdown\n", + "exact_matches = 4\n", + "fuzzy_matches = 3 \n", + "llm_matches = 6\n", + "total = exact_matches + fuzzy_matches + llm_matches\n", + "\n", + "print(\"Match Type Breakdown:\")\n", + "print(f\" Exact matches: {exact_matches:2d} ({exact_matches/total*100:.0f}%) - FREE\")\n", + "print(f\" Fuzzy matches: {fuzzy_matches:2d} ({fuzzy_matches/total*100:.0f}%) - FREE\")\n", + "print(f\" LLM matches: {llm_matches:2d} ({llm_matches/total*100:.0f}%) - CHARGED\")\n", + "print(f\" ─────────────────────\")\n", + "print(f\" Total: {total:2d}\")\n", + "print(f\"\\nFree matches: {(exact_matches + fuzzy_matches)/total*100:.0f}%\")\n", + "print(f\"Paid matches: {llm_matches/total*100:.0f}%\")\n", + "\n", + "if stats_pharma.cost_dollars > 0:\n", + " print(f\"\\nActual cost: ${stats_pharma.cost_dollars:.4f}\")\n", + " print(f\"Cost per LLM match: ${stats_pharma.cost_dollars/llm_matches:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "exp4-summary", + "metadata": {}, + "source": [ + "Even with complex pharmaceutical relationships, **over half the matches were free**. The cost scales with the number of rows requiring semantic understanding, not the total row count." + ] + }, + { + "cell_type": "markdown", + "id": "exp5-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Experiment 5: LLM-Only Matching (No `merge_on` Parameters)\n", + "\n", + "What happens when you **don't specify** which columns to match? The system must:\n", + "1. Analyze both tables to guess which columns are relevant\n", + "2. 
Use LLM reasoning for every row\n", + "\n", + "This is more expensive but useful when:\n", + "- You're not sure which columns should match\n", + "- Multiple columns might be relevant\n", + "- The matching logic is complex" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "exp5-data", + "metadata": {}, + "outputs": [], + "source": [ + "# Contact data without clear merge keys\n", + "contacts = pd.DataFrame([\n", + " {\"name\": \"John Smith\", \"email\": \"jsmith@acme.com\", \"title\": \"VP Sales\"},\n", + " {\"name\": \"Sarah Johnson\", \"email\": \"sarah.j@techcorp.io\", \"title\": \"CTO\"},\n", + " {\"name\": \"Mike Chen\", \"email\": \"m.chen@globalinc.com\", \"title\": \"Director\"},\n", + " {\"name\": \"Emily Davis\", \"email\": \"emily@startup.co\", \"title\": \"CEO\"},\n", + " {\"name\": \"Tom Wilson\", \"email\": \"twilson@bigco.com\", \"title\": \"Manager\"},\n", + "])\n", + "\n", + "# Company data to match against\n", + "companies = pd.DataFrame([\n", + " {\"company_name\": \"Acme Corporation\", \"domain\": \"acme.com\", \"industry\": \"Manufacturing\"},\n", + " {\"company_name\": \"TechCorp Solutions\", \"domain\": \"techcorp.io\", \"industry\": \"Software\"},\n", + " {\"company_name\": \"Global Industries Inc\", \"domain\": \"globalinc.com\", \"industry\": \"Consulting\"},\n", + " {\"company_name\": \"Startup Co\", \"domain\": \"startup.co\", \"industry\": \"Technology\"},\n", + " {\"company_name\": \"BigCo Enterprises\", \"domain\": \"bigco.com\", \"industry\": \"Finance\"},\n", + "])\n", + "\n", + "print(\"Contacts:\")\n", + "print(contacts.to_string(index=False))\n", + "print(\"\\nCompanies:\")\n", + "print(companies.to_string(index=False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "exp5-run", + "metadata": {}, + "outputs": [], + "source": [ + "# Run WITHOUT specifying merge_on columns\n", + "result_nokeys, stats_nokeys = await measure_merge(\n", + " name=\"LLM-only (no merge_on)\",\n", + " 
task=\"\"\"Match each contact to their company.\n", + " Use the email domain to identify which company each person works for.\n", + " For example, jsmith@acme.com works at Acme Corporation.\n", + " \"\"\",\n", + " left_table=contacts,\n", + " right_table=companies,\n", + " # Note: No merge_on_left or merge_on_right specified!\n", + ")\n", + "\n", + "result_nokeys" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "exp5-compare", + "metadata": {}, + "outputs": [], + "source": [ + "# Compare: same data but WITH merge hints\n", + "result_withkeys, stats_withkeys = await measure_merge(\n", + " name=\"With merge_on hints\",\n", + " task=\"\"\"Match contacts to companies by email domain.\"\"\",\n", + " left_table=contacts,\n", + " right_table=companies,\n", + " merge_on_left=\"email\",\n", + " merge_on_right=\"domain\",\n", + ")\n", + "\n", + "print(f\"\\nComparison:\")\n", + "print(f\" Without merge_on: ${stats_nokeys.cost_dollars:.4f}, {stats_nokeys.duration_seconds:.1f}s\")\n", + "print(f\" With merge_on: ${stats_withkeys.cost_dollars:.4f}, {stats_withkeys.duration_seconds:.1f}s\")\n", + "\n", + "if stats_nokeys.cost_dollars > 0 and stats_withkeys.cost_dollars > 0:\n", + " ratio = stats_nokeys.cost_dollars / stats_withkeys.cost_dollars\n", + " print(f\"\\n LLM-only is {ratio:.1f}x more expensive\")" + ] + }, + { + "cell_type": "markdown", + "id": "exp5-summary", + "metadata": {}, + "source": [ + "**Takeaway**: Providing `merge_on` hints significantly reduces costs when the matching columns are known." + ] + }, + { + "cell_type": "markdown", + "id": "exp6-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Experiment 6: Scaling Analysis\n", + "\n", + "How do costs scale as we increase:\n", + "1. **Number of rows** (10 → 50 → 100 → 200)\n", + "2. **Content per row** (more columns, longer text)\n", + "\n", + "For this experiment, we'll generate synthetic data with controllable characteristics and measure the cost/time relationship." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "exp6-helpers", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_company_data(n_rows: int, add_description: bool = False) -> tuple[pd.DataFrame, pd.DataFrame]:\n", + " \"\"\"\n", + " Generate synthetic company data for scaling tests.\n", + " \n", + " Returns left_table (with variations) and right_table (canonical names).\n", + " Rows cycle through exact, fuzzy, and semantic variants (about one third each; exactly 40/30/30 for 10 rows).\n", + " \"\"\"\n", + " base_companies = [\n", + " (\"Apple Inc.\", \"AAPL\", \"Technology\"),\n", + " (\"Microsoft Corporation\", \"MSFT\", \"Technology\"),\n", + " (\"Amazon.com Inc.\", \"AMZN\", \"E-commerce\"),\n", + " (\"Alphabet Inc.\", \"GOOGL\", \"Technology\"),\n", + " (\"Meta Platforms Inc.\", \"META\", \"Technology\"),\n", + " (\"Tesla Inc.\", \"TSLA\", \"Automotive\"),\n", + " (\"NVIDIA Corporation\", \"NVDA\", \"Technology\"),\n", + " (\"JPMorgan Chase & Co.\", \"JPM\", \"Finance\"),\n", + " (\"Johnson & Johnson\", \"JNJ\", \"Healthcare\"),\n", + " (\"Visa Inc.\", \"V\", \"Finance\"),\n", + " ]\n", + " \n", + " # Variations for left table\n", + " variations = {\n", + " \"Apple Inc.\": [\"Apple Inc.\", \"APPLE INC\", \"Apple\"], # exact, case, short\n", + " \"Microsoft Corporation\": [\"Microsoft Corporation\", \"Microsft Corp\", \"MSFT\"],\n", + " \"Amazon.com Inc.\": [\"Amazon.com Inc.\", \"Amazon Inc\", \"AWS\"], # exact, fuzzy, semantic\n", + " \"Alphabet Inc.\": [\"Alphabet Inc.\", \"Alphabet\", \"Google\"],\n", + " \"Meta Platforms Inc.\": [\"Meta Platforms Inc.\", \"Meta Platforms\", \"Facebook\"],\n", + " \"Tesla Inc.\": [\"Tesla Inc.\", \"Telsa Inc\", \"Tesla Motors\"],\n", + " \"NVIDIA Corporation\": [\"NVIDIA Corporation\", \"Nvidia Corp\", \"GeForce\"],\n", + " \"JPMorgan Chase & Co.\": [\"JPMorgan Chase & Co.\", \"JP Morgan\", \"Chase Bank\"],\n", + " \"Johnson & Johnson\": [\"Johnson & Johnson\", \"Johnson Johnson\", \"J&J\"],\n", + " \"Visa Inc.\": [\"Visa Inc.\", \"Visa\", \"Visa 
Card\"],\n", + " }\n", + " \n", + " left_rows = []\n", + " for i in range(n_rows):\n", + " base = base_companies[i % len(base_companies)]\n", + " company_name = base[0]\n", + " var_list = variations[company_name]\n", + " # Cycle through exact, fuzzy, semantic (about one third each)\n", + " var_idx = i % 3 # 0=exact, 1=fuzzy, 2=semantic\n", + " var_name = var_list[min(var_idx, len(var_list)-1)]\n", + " \n", + " row = {\n", + " \"company\": var_name,\n", + " \"record_id\": f\"REC-{i:04d}\",\n", + " \"sector\": base[2],\n", + " }\n", + " if add_description:\n", + " row[\"description\"] = f\"Company record {i} for {var_name}. \" * 5\n", + " left_rows.append(row)\n", + " \n", + " right_rows = [\n", + " {\"company_name\": c[0], \"ticker\": c[1], \"industry\": c[2], \"employees\": (i+1)*10000}\n", + " for i, c in enumerate(base_companies)\n", + " ]\n", + " \n", + " return pd.DataFrame(left_rows), pd.DataFrame(right_rows)\n", + "\n", + "# Test the generator\n", + "test_left, test_right = generate_company_data(10)\n", + "print(\"Sample left table:\")\n", + "print(test_left.head())" + ] + }, + { + "cell_type": "markdown", + "id": "exp6-scaling-rows-md", + "metadata": {}, + "source": [ + "### 6.1 Scaling with Number of Rows\n", + "\n", + "Let's measure how costs grow as we increase row count." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "exp6-scaling-rows", + "metadata": {}, + "outputs": [], + "source": [ + "row_counts = [10, 30, 50, 100]\n", + "scaling_results = []\n", + "\n", + "for n_rows in row_counts:\n", + " left_df, right_df = generate_company_data(n_rows)\n", + " \n", + " _, result = await measure_merge(\n", + " name=f\"Scale test: {n_rows} rows\",\n", + " task=\"\"\"Match companies. 
Handle variations like:\n", + " - Google is Alphabet's main product\n", + " - Facebook is now Meta Platforms\n", + " - AWS is part of Amazon\n", + " - Chase Bank is part of JPMorgan\n", + " \"\"\",\n", + " left_table=left_df,\n", + " right_table=right_df,\n", + " merge_on_left=\"company\",\n", + " merge_on_right=\"company_name\",\n", + " )\n", + " scaling_results.append(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "exp6-scaling-analysis", + "metadata": {}, + "outputs": [], + "source": [ + "# Analyze scaling results\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(\"ROW SCALING ANALYSIS\")\n", + "print(\"=\"*60)\n", + "print(f\"{'Rows':<10} {'Cost':>10} {'Time (s)':>10} {'$/row':>12}\")\n", + "print(\"-\"*42)\n", + "\n", + "for r in scaling_results:\n", + " cost_per_row = r.cost_dollars / r.rows if r.rows > 0 else 0\n", + " print(f\"{r.rows:<10} ${r.cost_dollars:>8.4f} {r.duration_seconds:>10.1f} ${cost_per_row:>10.5f}\")\n", + "\n", + "# Check if cost scales linearly\n", + "if len(scaling_results) >= 2 and scaling_results[0].cost_dollars > 0:\n", + " first = scaling_results[0]\n", + " last = scaling_results[-1]\n", + " row_ratio = last.rows / first.rows\n", + " cost_ratio = last.cost_dollars / first.cost_dollars if first.cost_dollars > 0 else 0\n", + " print(f\"\\nScaling factor: {row_ratio:.0f}x rows → {cost_ratio:.1f}x cost\")\n", + " if cost_ratio > 0:\n", + " print(f\"Cost scales {'linearly' if 0.8 < cost_ratio/row_ratio < 1.2 else 'sub-linearly' if cost_ratio/row_ratio < 0.8 else 'super-linearly'}\")" + ] + }, + { + "cell_type": "markdown", + "id": "exp6-scaling-content-md", + "metadata": {}, + "source": [ + "### 6.2 Scaling with Content per Row\n", + "\n", + "Does adding more columns or longer text fields affect costs?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "exp6-scaling-content", + "metadata": {}, + "outputs": [], + "source": [ + "# Compare: minimal columns vs rich content\n", + "n_rows = 20\n", + "\n", + "# Minimal content\n", + "left_minimal, right_minimal = generate_company_data(n_rows, add_description=False)\n", + "_, result_minimal = await measure_merge(\n", + " name=f\"Minimal content ({n_rows} rows)\",\n", + " task=\"Match companies. Google→Alphabet, Facebook→Meta, AWS→Amazon.\",\n", + " left_table=left_minimal,\n", + " right_table=right_minimal,\n", + " merge_on_left=\"company\",\n", + " merge_on_right=\"company_name\",\n", + ")\n", + "\n", + "# Rich content\n", + "left_rich, right_rich = generate_company_data(n_rows, add_description=True)\n", + "_, result_rich = await measure_merge(\n", + " name=f\"Rich content ({n_rows} rows)\",\n", + " task=\"Match companies. Google→Alphabet, Facebook→Meta, AWS→Amazon.\",\n", + " left_table=left_rich,\n", + " right_table=right_rich,\n", + " merge_on_left=\"company\",\n", + " merge_on_right=\"company_name\",\n", + ")\n", + "\n", + "print(f\"\\nContent comparison ({n_rows} rows):\")\n", + "print(f\" Minimal ({len(left_minimal.columns)} cols): ${result_minimal.cost_dollars:.4f}, {result_minimal.duration_seconds:.1f}s\")\n", + "print(f\" Rich ({len(left_rich.columns)} cols): ${result_rich.cost_dollars:.4f}, {result_rich.duration_seconds:.1f}s\")" + ] + }, + { + "cell_type": "markdown", + "id": "summary-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Summary: Cost & Performance Findings\n", + "\n", + "Let's compile all our experimental results into a final comparison." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "summary-table", + "metadata": {}, + "outputs": [], + "source": [ + "# Create summary DataFrame\n", + "summary_data = []\n", + "for r in all_results:\n", + " cost_per_row = r.cost_dollars / r.rows if r.rows > 0 else 0\n", + " summary_data.append({\n", + " \"Experiment\": r.name,\n", + " \"Rows\": r.rows,\n", + " \"Cost ($)\": f\"${r.cost_dollars:.4f}\",\n", + " \"Time (s)\": f\"{r.duration_seconds:.1f}\",\n", + " \"$/Row\": f\"${cost_per_row:.5f}\",\n", + " \"Accuracy\": f\"{r.accuracy_pct:.0f}%\" if r.accuracy_pct else \"N/A\",\n", + " })\n", + "\n", + "summary_df = pd.DataFrame(summary_data)\n", + "print(\"\\n\" + \"=\"*80)\n", + "print(\"COMPLETE EXPERIMENT SUMMARY\")\n", + "print(\"=\"*80)\n", + "print(summary_df.to_string(index=False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "key-findings", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate key findings\n", + "total_cost = sum(r.cost_dollars for r in all_results)\n", + "total_rows = sum(r.rows for r in all_results)\n", + "total_time = sum(r.duration_seconds for r in all_results)\n", + "\n", + "# Find zero-cost experiments\n", + "zero_cost = [r for r in all_results if r.cost_dollars < 0.001]\n", + "low_cost = [r for r in all_results if 0.001 <= r.cost_dollars < 0.01]\n", + "\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(\"KEY FINDINGS\")\n", + "print(\"=\"*60)\n", + "print(f\"\\nTotal rows processed: {total_rows}\")\n", + "print(f\"Total cost: ${total_cost:.4f}\")\n", + "print(f\"Total time: {total_time:.1f}s\")\n", + "print(f\"Average cost per row: ${total_cost/total_rows:.5f}\")\n", + "\n", + "print(f\"\\nExperiments with zero/near-zero cost: {len(zero_cost)}\")\n", + "for r in zero_cost:\n", + " print(f\" - {r.name}\")\n", + "\n", + "print(f\"\\nCost Optimization Strategies:\")\n", + "print(f\" 1. Use merge_on parameters when you know the columns\")\n", + "print(f\" 2. 
Clean data for fuzzy matching (typos are free to resolve)\")\n", + "print(f\" 3. Provide context in task description for semantic matches\")\n", + "print(f\" 4. LLM costs scale with semantic matches, not total rows\")" + ] + }, + { + "cell_type": "markdown", + "id": "conclusion-md", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Conclusion\n", + "\n", + "The `everyrow.merge()` operation uses a **cost-optimized cascade** that makes intelligent merging surprisingly affordable:\n", + "\n", + "1. **Exact and fuzzy matches are free** - typos, case differences, and minor variations don't cost anything\n", + "\n", + "2. **Only semantic matches incur costs** - the LLM only processes rows that truly need reasoning (subsidiaries, acquisitions, regional names)\n", + "\n", + "3. **Providing `merge_on` hints reduces costs** - when you know which columns to match, specify them\n", + "\n", + "4. **Costs scale with complexity, not size** - a 1000-row dataset with clean data costs less than a 100-row dataset requiring semantic reasoning\n", + "\n", + "For most real-world use cases, the majority of matches fall into the free tiers, making intelligent merging practical even for large datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "d1032e1c", + "metadata": {}, + "outputs": [ + { + "ename": "CancelledError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mCancelledError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[64]\u001b[39m\u001b[32m, line 12\u001b[39m\n\u001b[32m 2\u001b[39m investment_vcs = pd.read_csv(\u001b[33m\"\u001b[39m\u001b[33m/Users/peter/Downloads/investments_VC.csv\u001b[39m\u001b[33m\"\u001b[39m).iloc[:\u001b[32m5000\u001b[39m]\n\u001b[32m 3\u001b[39m merged_unicorns = pd.merge(\n\u001b[32m 4\u001b[39m unicorn_companies,\n\u001b[32m 5\u001b[39m investment_vcs,\n\u001b[32m (...)\u001b[39m\u001b[32m 9\u001b[39m suffixes=(\u001b[33m\"\u001b[39m\u001b[33m_unicorn\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m_vc\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 10\u001b[39m )\n\u001b[32m---> \u001b[39m\u001b[32m12\u001b[39m result_exact, stats_exact = \u001b[38;5;28;01mawait\u001b[39;00m measure_merge(\n\u001b[32m 13\u001b[39m name=\u001b[33m\"\u001b[39m\u001b[33mcrunchbase merge\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 14\u001b[39m task=\u001b[33m\"\u001b[39m\u001b[33mMatch companies by (company) name.\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 15\u001b[39m left_table=unicorn_companies,\n\u001b[32m 16\u001b[39m right_table=investment_vcs,\n\u001b[32m 17\u001b[39m )\n\u001b[32m 19\u001b[39m result_exact[[\u001b[33m\"\u001b[39m\u001b[33mcompany\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33msector\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mcompany_name\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mrevenue_billions\u001b[39m\u001b[33m\"\u001b[39m]].head(\u001b[32m5\u001b[39m)\n", + "\u001b[36mCell\u001b[39m\u001b[36m 
\u001b[39m\u001b[32mIn[39]\u001b[39m\u001b[32m, line 32\u001b[39m, in \u001b[36mmeasure_merge\u001b[39m\u001b[34m(name, task, left_table, right_table, merge_on_left, merge_on_right, expected_matches, use_web_search)\u001b[39m\n\u001b[32m 29\u001b[39m start_time = time.time()\n\u001b[32m 31\u001b[39m \u001b[38;5;66;03m# Run the merge\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m32\u001b[39m result = \u001b[38;5;28;01mawait\u001b[39;00m merge(\n\u001b[32m 33\u001b[39m task=task,\n\u001b[32m 34\u001b[39m left_table=left_table,\n\u001b[32m 35\u001b[39m right_table=right_table,\n\u001b[32m 36\u001b[39m merge_on_left=merge_on_left,\n\u001b[32m 37\u001b[39m merge_on_right=merge_on_right,\n\u001b[32m 38\u001b[39m use_web_search=use_web_search,\n\u001b[32m 39\u001b[39m )\n\u001b[32m 41\u001b[39m \u001b[38;5;66;03m# Measure billing after\u001b[39;00m\n\u001b[32m 42\u001b[39m end_time = time.time()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/everyrow-sdk/src/everyrow/ops.py:526\u001b[39m, in \u001b[36mmerge\u001b[39m\u001b[34m(task, session, left_table, right_table, merge_on_left, merge_on_right, use_web_search)\u001b[39m\n\u001b[32m 521\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m session \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 522\u001b[39m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mwith\u001b[39;00m create_session() \u001b[38;5;28;01mas\u001b[39;00m internal_session:\n\u001b[32m 523\u001b[39m merge_task = \u001b[38;5;28;01mawait\u001b[39;00m merge_async(\n\u001b[32m 524\u001b[39m task=task,\n\u001b[32m 525\u001b[39m session=internal_session,\n\u001b[32m--> \u001b[39m\u001b[32m526\u001b[39m left_table=left_table,\n\u001b[32m 527\u001b[39m right_table=right_table,\n\u001b[32m 528\u001b[39m merge_on_left=merge_on_left,\n\u001b[32m 529\u001b[39m merge_on_right=merge_on_right,\n\u001b[32m 530\u001b[39m use_web_search=use_web_search,\n\u001b[32m 531\u001b[39m )\n\u001b[32m 532\u001b[39m 
\u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m merge_task.await_result()\n\u001b[32m 533\u001b[39m merge_task = \u001b[38;5;28;01mawait\u001b[39;00m merge_async(\n\u001b[32m 534\u001b[39m task=task,\n\u001b[32m 535\u001b[39m session=session,\n\u001b[32m (...)\u001b[39m\u001b[32m 540\u001b[39m use_web_search=use_web_search,\n\u001b[32m 541\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/everyrow-sdk/src/everyrow/task.py:80\u001b[39m, in \u001b[36mEveryrowTask.await_result\u001b[39m\u001b[34m(self, client)\u001b[39m\n\u001b[32m 76\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m client \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 77\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m EveryrowError(\n\u001b[32m 78\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mNo client available. Provide a client or use the task within a session context.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 79\u001b[39m )\n\u001b[32m---> \u001b[39m\u001b[32m80\u001b[39m final_status = \u001b[38;5;28;01mawait\u001b[39;00m await_task_completion(\u001b[38;5;28mself\u001b[39m.task_id, client)\n\u001b[32m 82\u001b[39m result_response = \u001b[38;5;28;01mawait\u001b[39;00m get_task_result(\u001b[38;5;28mself\u001b[39m.task_id, client)\n\u001b[32m 83\u001b[39m artifact_id = result_response.artifact_id\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/everyrow-sdk/src/everyrow/task.py:130\u001b[39m, in \u001b[36mawait_task_completion\u001b[39m\u001b[34m(task_id, client)\u001b[39m\n\u001b[32m 124\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m status_response.status \u001b[38;5;129;01min\u001b[39;00m (\n\u001b[32m 125\u001b[39m TaskStatus.COMPLETED,\n\u001b[32m 126\u001b[39m TaskStatus.FAILED,\n\u001b[32m 127\u001b[39m TaskStatus.REVOKED,\n\u001b[32m 128\u001b[39m ):\n\u001b[32m 129\u001b[39m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m130\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m 
asyncio.sleep(\u001b[32m1\u001b[39m)\n\u001b[32m 132\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m status_response.status == TaskStatus.FAILED:\n\u001b[32m 133\u001b[39m error_msg = (\n\u001b[32m 134\u001b[39m status_response.error\n\u001b[32m 135\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(status_response.error, Unset)\n\u001b[32m 136\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mUnknown error\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 137\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.local/share/uv/python/cpython-3.12.6-macos-aarch64-none/lib/python3.12/asyncio/tasks.py:665\u001b[39m, in \u001b[36msleep\u001b[39m\u001b[34m(delay, result)\u001b[39m\n\u001b[32m 661\u001b[39m h = loop.call_later(delay,\n\u001b[32m 662\u001b[39m futures._set_result_unless_cancelled,\n\u001b[32m 663\u001b[39m future, result)\n\u001b[32m 664\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m665\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m future\n\u001b[32m 666\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 667\u001b[39m h.cancel()\n", + "\u001b[31mCancelledError\u001b[39m: " + ] + } + ], + "source": [ + "unicorn_companies = pd.read_csv(\"~/Downloads/unicorn_companies.csv\")\n", + "investment_vcs = pd.read_csv(\"/Users/peter/Downloads/investments_VC.csv\")\n", + "merged_unicorns = pd.merge(\n", + " unicorn_companies,\n", + " investment_vcs,\n", + " left_on=\"Company\",\n", + " right_on=\"name\",\n", + " how=\"inner\",\n", + " suffixes=(\"_unicorn\", \"_vc\")\n", + ")\n", + "\n", + "result_exact, stats_exact = await measure_merge(\n", + " name=\"crunchbase merge\",\n", + " task=\"Match companies by (company) name.\",\n", + " left_table=unicorn_companies.iloc[:5000],\n", + " right_table=investment_vcs.iloc[:5000],\n", + ")\n", + "\n", + "result_exact[[\"company\", \"sector\", 
\"company_name\", \"revenue_billions\"]].head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dcdf664f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fei_numberrecalling_firm_nameproduct_typeproduct_classificationstatusdistribution_patternrecalling_firm_cityrecalling_firm_staterecalling_firm_countrycenter_classification_datereason_for_recallproduct_descriptionevent_idevent_classificationproduct_idcenterrecall_details
03.002602e+09Lamb Weston SalesFood/CosmeticsClass IOngoingDistributed in CA, IA, IL, KS, LA MO, MS, NM, ...KennewickWashingtonUnited States2023-04-21Undeclared Wheat in foodservice item Hashbrown...G5300 Lamb's Supreme Hash Brown Patties, Froze...92014Class I199418CFSANhttps://www.accessdata.fda.gov/scripts/ires/?P...
13.012438e+09Fresh Express IncorpatedFood/CosmeticsClass IOngoingProduct was shipped to the following states: F...WindermereFloridaUnited States2023-04-21The firm was notified by one of their customer...Fresh EXPRESS Chopped Kit Caesar Romaine Lettu...92068Class I199573CFSANhttps://www.accessdata.fda.gov/scripts/ires/?P...
23.012438e+09Fresh Express IncorpatedFood/CosmeticsClass IOngoingProduct was shipped to the following states: F...WindermereFloridaUnited States2023-04-21The firm was notified by one of their customer...Fresh Express Chopped Kit Chipotle Cheddar TOT...92068Class I199574CFSANhttps://www.accessdata.fda.gov/scripts/ires/?P...
33.012438e+09Fresh Express IncorpatedFood/CosmeticsClass IOngoingProduct was shipped to the following states: F...WindermereFloridaUnited States2023-04-21The firm was notified by one of their customer...PREMIUM MAKOTO HONEY GINGER SALAD KIT TOTAL NE...92068Class I199575CFSANhttps://www.accessdata.fda.gov/scripts/ires/?P...
41.000222e+09Blood Bank Computer Systems, IncBiologicsClass IITerminatedGA, DE, TX, MO, PA, CA, FL, KY, IA, MI, IL, an...AuburnWashingtonUnited States2023-04-21Blood Bank Computer Systems has discovered in ...ABO Wheels, Version 1.1.091219Class II197268CBERhttps://www.accessdata.fda.gov/scripts/ires/?P...
\n", + "
" + ], + "text/plain": [ + " fei_number recalling_firm_name product_type \\\n", + "0 3.002602e+09 Lamb Weston Sales Food/Cosmetics \n", + "1 3.012438e+09 Fresh Express Incorpated Food/Cosmetics \n", + "2 3.012438e+09 Fresh Express Incorpated Food/Cosmetics \n", + "3 3.012438e+09 Fresh Express Incorpated Food/Cosmetics \n", + "4 1.000222e+09 Blood Bank Computer Systems, Inc Biologics \n", + "\n", + " product_classification status \\\n", + "0 Class I Ongoing \n", + "1 Class I Ongoing \n", + "2 Class I Ongoing \n", + "3 Class I Ongoing \n", + "4 Class II Terminated \n", + "\n", + " distribution_pattern recalling_firm_city \\\n", + "0 Distributed in CA, IA, IL, KS, LA MO, MS, NM, ... Kennewick \n", + "1 Product was shipped to the following states: F... Windermere \n", + "2 Product was shipped to the following states: F... Windermere \n", + "3 Product was shipped to the following states: F... Windermere \n", + "4 GA, DE, TX, MO, PA, CA, FL, KY, IA, MI, IL, an... Auburn \n", + "\n", + " recalling_firm_state recalling_firm_country center_classification_date \\\n", + "0 Washington United States 2023-04-21 \n", + "1 Florida United States 2023-04-21 \n", + "2 Florida United States 2023-04-21 \n", + "3 Florida United States 2023-04-21 \n", + "4 Washington United States 2023-04-21 \n", + "\n", + " reason_for_recall \\\n", + "0 Undeclared Wheat in foodservice item Hashbrown... \n", + "1 The firm was notified by one of their customer... \n", + "2 The firm was notified by one of their customer... \n", + "3 The firm was notified by one of their customer... \n", + "4 Blood Bank Computer Systems has discovered in ... \n", + "\n", + " product_description event_id \\\n", + "0 G5300 Lamb's Supreme Hash Brown Patties, Froze... 92014 \n", + "1 Fresh EXPRESS Chopped Kit Caesar Romaine Lettu... 92068 \n", + "2 Fresh Express Chopped Kit Chipotle Cheddar TOT... 92068 \n", + "3 PREMIUM MAKOTO HONEY GINGER SALAD KIT TOTAL NE... 
92068 \n", + "4 ABO Wheels, Version 1.1.0 91219 \n", + "\n", + " event_classification product_id center \\\n", + "0 Class I 199418 CFSAN \n", + "1 Class I 199573 CFSAN \n", + "2 Class I 199574 CFSAN \n", + "3 Class I 199575 CFSAN \n", + "4 Class II 197268 CBER \n", + "\n", + " recall_details \n", + "0 https://www.accessdata.fda.gov/scripts/ires/?P... \n", + "1 https://www.accessdata.fda.gov/scripts/ires/?P... \n", + "2 https://www.accessdata.fda.gov/scripts/ires/?P... \n", + "3 https://www.accessdata.fda.gov/scripts/ires/?P... \n", + "4 https://www.accessdata.fda.gov/scripts/ires/?P... " + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"/Users/peter/Downloads/fda_product_recalls.csv\")\n", + "df[[\"recalling_firm_name\", \"product_type\", \"distribution_pattern\", \"\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "d3079b95", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(9949, 17)" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pandas.core.frame import DataFrame\n", + "\n", + "df_2021: DataFrame = df[df['center_classification_date'] >= pd.Timestamp('2021-08-01')] # type: ignore\n", + "\n", + "df_2021.head()\n", + "df_2021.tail()\n", + "df_2021.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "400b9406", + "metadata": {}, + "outputs": [], + "source": "from everyrow.ops import screen\nasync with create_session(name=\"FDA Recall Screening\") as session:\n print(f\"Session URL: {session.get_url()}\")\n await screen(\n session=session,\n task=\"Find recalls of products that I might have used for my child born on 2021-08-01.\",\n input=df_2021,\n )" + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "31d42a63", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "correct_df = 
pd.read_csv(\"/Users/peter/Downloads/merge_websites_correct_output_2246.csv\")\n", + "def get_correct_website_for_name(name: str) -> str:\n", + " return correct_df[correct_df[\"name\"] == name][\"personal_website_url\"].values[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c4af271", + "metadata": {}, + "outputs": [], + "source": "async with create_session(name=\"Website Matching (n=100)\") as session:\n print(f\"Session URL: {session.get_url()}\")\n result = await merge(\n session=session,\n task=\"Match each person to their website(s).\",\n left_table=pd.read_csv(\"/Users/peter/Downloads/merge_websites_input_left_100.csv\"),\n right_table=pd.read_csv(\"/Users/peter/Downloads/merge_websites_input_right_100.csv\"),\n )" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "311607e8", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2db46cae", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "num of matched rows: 100\n", + "num of LLM matches: 95\n", + "num of web search matches: 5\n", + "fraction of correct matches: 0.97\n" + ] + } + ], + "source": [ + "print(\"num of matched rows:\", len(result.data))\n", + "num_of_llm_matches = sum([1 if r[\"personal_website_url\"] == 'This row was matched due to the information in both tables' else 0 for r in result.data.research])\n", + "num_of_web_search_matches = sum([1 if r[\"personal_website_url\"].startswith('This row was matched due to the following information found in the web:') else 0 for r in result.data.research])\n", + "fraction_of_correct_matches = np.mean([1 if url == get_correct_website_for_name(name) else 0 for name, url in zip(result.data.name, result.data.personal_website_url)])\n", + "print(\"num of LLM matches:\", num_of_llm_matches)\n", + "print(\"num of web search matches:\", num_of_web_search_matches)\n", + "print(\"fraction of correct matches:\", 
fraction_of_correct_matches)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6a229a40", + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "string indices must be integers, not 'str'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m [\u001b[32m1\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mr\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mpersonal_website_url\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m == \u001b[33m'\u001b[39m\u001b[33mThis row was matched due to the information in both tables\u001b[39m\u001b[33m'\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m \u001b[32m0\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m r \u001b[38;5;129;01min\u001b[39;00m result.research]\n", + "\u001b[31mTypeError\u001b[39m: string indices must be integers, not 'str'" + ] + } + ], + "source": [ + "[1 if r[\"personal_website_url\"] == 'This row was matched due to the information in both tables' else 0 for r in result.research]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5af2329b", + "metadata": {}, + "outputs": [], + "source": "for n in [200, 400, 800, 1600, 2246]:\n async with create_session(name=f\"Website Matching (n={n})\") as session:\n print(f\"Session URL: {session.get_url()}\")\n result = await merge(\n session=session,\n task=\"Match each person to their website(s).\",\n left_table=pd.read_csv(f\"/Users/peter/Downloads/merge_websites_input_left_{n}.csv\"),\n right_table=pd.read_csv(f\"/Users/peter/Downloads/merge_websites_input_right_{n}.csv\"),\n )\n print(f\"n={n}\")\n print(\"num of matched rows:\", len(result.data))\n num_of_llm_matches = 
sum([1 if r[\"personal_website_url\"] == 'This row was matched due to the information in both tables' else 0 for r in result.data.research])\n num_of_web_search_matches = sum([1 if r[\"personal_website_url\"].startswith('This row was matched due to the following information found in the web:') else 0 for r in result.data.research])\n fraction_of_correct_matches = np.mean([1 if url == get_correct_website_for_name(name) else 0 for name, url in zip(result.data.name, result.data.personal_website_url)])\n print(\"num of LLM matches:\", num_of_llm_matches)\n print(\"num of web search matches:\", num_of_web_search_matches)\n print(\"fraction of correct matches:\", fraction_of_correct_matches)\n print(\"-\"*100)\n print()" + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "4e2ee1c3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "n=800\n", + "num of matched rows: 800\n", + "num of LLM matches: 780\n", + "num of web search matches: 20\n", + "fraction of correct matches: 0.77625\n", + "----------------------------------------------------------------------------------------------------\n", + "\n" + ] + } + ], + "source": [ + "import json\n", + "result = pd.read_csv(\"/Users/peter/Downloads/merge_websites_output_800.csv\")\n", + "result.research = [json.loads(r) for r in result.research]\n", + "print(f\"n=800\")\n", + "print(\"num of matched rows:\", len(result))\n", + "num_of_llm_matches = sum([1 if r[\"personal_website_url\"] == 'This row was matched due to the information in both tables' else 0 for r in result.research])\n", + "num_of_web_search_matches = sum([1 if r[\"personal_website_url\"].startswith('This row was matched due to the following information found in the web:') else 0 for r in result.research])\n", + "fraction_of_correct_matches = np.mean([1 if url == get_correct_website_for_name(name) else 0 for name, url in zip(result.name, result.personal_website_url)])\n", + "print(\"num of LLM 
matches:\", num_of_llm_matches)\n", + "print(\"num of web search matches:\", num_of_web_search_matches)\n", + "print(\"fraction of correct matches:\", fraction_of_correct_matches)\n", + "print(\"-\"*100)\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7109208c", + "metadata": {}, + "outputs": [], + "source": [ + "result = pd.read_csv(\"/Users/peter/Downloads/merge_websites_output_1600.csv\")\n", + "result.research = [json.loads(r) for r in result.research]\n", + "print(f\"n=1600\")\n", + "print(\"num of matched rows:\", len(result))\n", + "num_of_llm_matches = sum([1 if r[\"personal_website_url\"] == 'This row was matched due to the information in both tables' else 0 for r in result.research])\n", + "num_of_web_search_matches = sum([1 if r[\"personal_website_url\"].startswith('This row was matched due to the following information found in the web:') else 0 for r in result.research])\n", + "fraction_of_correct_matches = np.mean([1 if url == get_correct_website_for_name(name) else 0 for name, url in zip(result.name, result.personal_website_url)])\n", + "print(\"num of LLM matches:\", num_of_llm_matches)\n", + "print(\"num of web search matches:\", num_of_web_search_matches)\n", + "print(\"fraction of correct matches:\", fraction_of_correct_matches)\n", + "print(\"-\"*100)\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbb3ad56", + "metadata": {}, + "outputs": [], + "source": [ + "result = pd.read_csv(\"/Users/peter/Downloads/merge_websites_output_2246.csv\")\n", + "result.research = [json.loads(r) for r in result.research]\n", + "print(f\"n=2246\")\n", + "print(\"num of matched rows:\", len(result))\n", + "num_of_llm_matches = sum([1 if r[\"personal_website_url\"] == 'This row was matched due to the information in both tables' else 0 for r in result.research])\n", + "num_of_web_search_matches = sum([1 if r[\"personal_website_url\"].startswith('This row was matched due to the following 
information found in the web:') else 0 for r in result.research])\n", + "fraction_of_correct_matches = np.mean([1 if url == get_correct_website_for_name(name) else 0 for name, url in zip(result.name, result.personal_website_url)])\n", + "print(\"num of LLM matches:\", num_of_llm_matches)\n", + "print(\"num of web search matches:\", num_of_web_search_matches)\n", + "print(\"fraction of correct matches:\", fraction_of_correct_matches)\n", + "print(\"-\"*100)\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b04fdef", + "metadata": {}, + "outputs": [], + "source": "import asyncio\n\nasync def run_merge_and_report(n):\n async with create_session(name=f\"Website Matching (n={n})\") as session:\n print(f\"Session URL: {session.get_url()}\")\n result = await merge(\n session=session,\n task=\"Match each person to their website(s).\",\n left_table=pd.read_csv(f\"/Users/peter/Downloads/merge_websites_input_left_{n}.csv\"),\n right_table=pd.read_csv(f\"/Users/peter/Downloads/merge_websites_input_right_{n}.csv\"),\n )\n print(f\"n={n}\")\n print(\"num of matched rows:\", len(result.data))\n num_of_llm_matches = sum([1 if r[\"personal_website_url\"] == 'This row was matched due to the information in both tables' else 0 for r in result.data.research])\n num_of_web_search_matches = sum([1 if r[\"personal_website_url\"].startswith('This row was matched due to the following information found in the web:') else 0 for r in result.data.research])\n fraction_of_correct_matches = np.mean([1 if url == get_correct_website_for_name(name) else 0 for name, url in zip(result.data.name, result.data.personal_website_url)])\n print(\"num of LLM matches:\", num_of_llm_matches)\n print(\"num of web search matches:\", num_of_web_search_matches)\n print(\"fraction of correct matches:\", fraction_of_correct_matches)\n print(\"-\"*100)\n print()\n\nawait asyncio.gather(*(run_merge_and_report(n) for n in [1600, 2246]))" + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "2939ad03", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file