Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions docs-site/scripts/check-links.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@
# GitHub blob URLs pointing into this repo are checked as local files
REPO_BLOB_PREFIX = "https://github.com/futuresearch/everyrow-sdk/blob/main/"

# Colab URLs pointing into this repo are checked as local files.
# The path after this prefix mirrors the repo layout (e.g.
# docs/case_studies/<slug>/notebook.ipynb), so existence is verified
# against REPO_ROOT instead of fetching from Colab/GitHub.
REPO_COLAB_PREFIX = (
"https://colab.research.google.com/github/"
"futuresearch/everyrow-sdk/blob/main/"
)

# Git LFS media URLs — the correct way to link to LFS-tracked files.
# These are checked as local files instead of fetching from GitHub.
REPO_LFS_PREFIX = (
Expand Down Expand Up @@ -186,6 +192,16 @@ def check_file(
)
continue

# Colab links to this repo: check the notebook exists locally.
if url_without_fragment.startswith(REPO_COLAB_PREFIX):
rel_path = url_without_fragment[len(REPO_COLAB_PREFIX) :]
if not (REPO_ROOT / rel_path).exists():
errors.append(
f" {page_label}: file not found for {href!r}"
f" (expected {rel_path})"
)
continue

if url_without_fragment in SKIPPED_URLS:
continue

Expand Down
2 changes: 2 additions & 0 deletions docs-site/src/app/case-studies/[slug]/page.tsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { notFound } from "next/navigation";
import { DocsLayout } from "@/components/DocsLayout";
import { NotebookActions } from "@/components/NotebookActions";
import { getNavigation } from "@/utils/docs";
import { getNotebookBySlug, getNotebookSlugs } from "@/utils/notebooks";

Expand Down Expand Up @@ -50,6 +51,7 @@ export default async function NotebookPage({ params }: PageProps) {

return (
<DocsLayout navigation={navigation}>
<NotebookActions slug={slug} />
<article
className="notebook-content"
dangerouslySetInnerHTML={{ __html: notebook.html }}
Expand Down
56 changes: 56 additions & 0 deletions docs-site/src/components/NotebookActions.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Base URLs for linking a case-study notebook to its source on GitHub and
// to an executable copy of the same file on Google Colab.
const GITHUB_BASE =
  "https://github.com/futuresearch/everyrow-sdk/blob/main/docs/case_studies";
const COLAB_BASE =
  "https://colab.research.google.com/github/futuresearch/everyrow-sdk/blob/main/docs/case_studies";

interface NotebookActionsProps {
  // Case-study slug, i.e. the notebook's directory name under docs/case_studies.
  slug: string;
}

/**
 * Action bar rendered above a notebook page: a "View source" link to the
 * .ipynb on GitHub and a "Run in Colab" link opening the same file in
 * Google Colab. Both links open in a new tab with rel="noopener noreferrer".
 */
export function NotebookActions({ slug }: NotebookActionsProps) {
  // Both destinations point at the same file; only the base URL differs.
  const notebookFile = `${slug}/notebook.ipynb`;

  return (
    <div className="notebook-actions">
      <a
        className="notebook-action-link"
        href={`${GITHUB_BASE}/${notebookFile}`}
        target="_blank"
        rel="noopener noreferrer"
      >
        {/* GitHub mark (decorative — hidden from assistive tech) */}
        <svg
          aria-hidden="true"
          width="16"
          height="16"
          viewBox="0 0 24 24"
          fill="currentColor"
        >
          <path d="M12 0C5.37 0 0 5.37 0 12c0 5.31 3.435 9.795 8.205 11.385.6.105.825-.255.825-.57 0-.285-.015-1.23-.015-2.235-3.015.555-3.795-.735-4.035-1.41-.135-.345-.72-1.41-1.23-1.695-.42-.225-1.02-.78-.015-.795.945-.015 1.62.87 1.845 1.23 1.08 1.815 2.805 1.305 3.495.99.105-.78.42-1.305.765-1.605-2.67-.3-5.46-1.335-5.46-5.925 0-1.305.465-2.385 1.23-3.225-.12-.3-.54-1.53.12-3.18 0 0 1.005-.315 3.3 1.23.96-.27 1.98-.405 3-.405s2.04.135 3 .405c2.295-1.56 3.3-1.23 3.3-1.23.66 1.65.24 2.88.12 3.18.765.84 1.23 1.905 1.23 3.225 0 4.605-2.805 5.625-5.475 5.925.435.375.81 1.095.81 2.22 0 1.605-.015 2.895-.015 3.3 0 .315.225.69.825.57A12.02 12.02 0 0 0 24 12c0-6.63-5.37-12-12-12z" />
        </svg>
        View source
      </a>
      <a
        className="notebook-action-link notebook-action-colab"
        href={`${COLAB_BASE}/${notebookFile}`}
        target="_blank"
        rel="noopener noreferrer"
      >
        {/* Play-button triangle (decorative — hidden from assistive tech) */}
        <svg
          aria-hidden="true"
          width="16"
          height="16"
          viewBox="0 0 24 24"
          fill="none"
          stroke="currentColor"
          strokeWidth="2"
          strokeLinecap="round"
          strokeLinejoin="round"
        >
          <polygon points="5 3 19 12 5 21 5 3" />
        </svg>
        Run in Colab
      </a>
    </div>
  );
}
35 changes: 35 additions & 0 deletions docs-site/src/styles/notebook.css
Original file line number Diff line number Diff line change
@@ -1,3 +1,38 @@
/* Notebook action buttons (View source, Run in Colab).
 * Rendered by NotebookActions.tsx above the notebook article. */

/* Right-aligned horizontal row of action links. */
.notebook-actions {
display: flex;
gap: 0.75rem;
margin-bottom: 1rem;
justify-content: flex-end;
}

/* Pill-style link: icon + label, muted until hovered. */
.notebook-action-link {
display: inline-flex;
align-items: center;
gap: 0.375rem;
padding: 0.375rem 0.75rem;
font-size: 0.8125rem;
font-weight: 500;
color: var(--muted);
border: 1px solid var(--border);
border-radius: 0.375rem;
text-decoration: none;
/* Animate only the properties that change on hover. */
transition: color 0.15s, border-color 0.15s, background-color 0.15s;
}

/* Default hover: emphasize with foreground color and code background. */
.notebook-action-link:hover {
color: var(--foreground);
border-color: var(--foreground);
background-color: var(--code-bg);
text-decoration: none;
}

/* Colab link hover overrides the default hover with accent colors
 * (declared after .notebook-action-link:hover so it wins the cascade). */
.notebook-action-colab:hover {
color: var(--accent);
border-color: var(--accent);
background-color: var(--accent-light);
}

/*
* Notebook styles - scoped to .notebook-content
* Based on Jupyter's default styling but minimal
Expand Down
43 changes: 35 additions & 8 deletions docs/case_studies/dedupe-crm-company-records/notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
{
"cell_type": "markdown",
"metadata": {},
"source": "# How to use LLMs to deduplicate CRM Data\n\nThis notebook demonstrates how to use the everyrow SDK's `dedupe` operation to deduplicate messy CRM data using AI-powered semantic matching."
"source": [
"# How to use LLMs to deduplicate CRM Data\n",
"\n",
"This notebook demonstrates how to use the everyrow SDK's `dedupe` operation to deduplicate messy CRM data using AI-powered semantic matching."
]
},
{
"cell_type": "markdown",
Expand All @@ -19,7 +23,19 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "from datetime import datetime\nfrom textwrap import dedent\n\nimport pandas as pd\nfrom dotenv import load_dotenv\n\nfrom everyrow.ops import dedupe\n\nload_dotenv()"
"source": [
"# !pip install everyrow\n",
"from datetime import datetime\n",
"from textwrap import dedent\n",
"\n",
"import pandas as pd\n",
"# load API key from environment/.env file or set it directly in the notebook\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"# import os\n",
"# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. $20 free\"\n",
"from everyrow.ops import dedupe\n"
]
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -188,7 +204,7 @@
}
],
"source": [
"data = pd.read_csv(\"../data/case_01_crm_data.csv\", engine=\"pyarrow\")\n",
"data = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/data/case_01_crm_data.csv\", engine=\"pyarrow\")\n",
"\n",
"print(f\"Total records: {len(data)}\")\n",
"data.sort_values(by=\"company_name\").head(15)"
Expand Down Expand Up @@ -219,7 +235,18 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "equivalence_relation = dedent(\"\"\"\n Two entries are duplicates if they include data for the same legal entity.\n\"\"\")\n\nprint(\"Deduplicating CRM data...\\n\")\n\nresult = await dedupe(\n input=data,\n equivalence_relation=equivalence_relation,\n)"
"source": [
"equivalence_relation = dedent(\"\"\"\n",
" Two entries are duplicates if they include data for the same legal entity.\n",
"\"\"\")\n",
"\n",
"print(\"Deduplicating CRM data...\\n\")\n",
"\n",
"result = await dedupe(\n",
" input=data,\n",
" equivalence_relation=equivalence_relation,\n",
")"
]
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -584,6 +611,9 @@
}
],
"metadata": {
"everyrow": {
"description": "Python notebook cleaning 500 CRM records with inconsistent company names, missing contacts, and partial email matches. Uses everyrow's dedupe() with a plain-English equivalence relation to find and group semantic duplicates."
},
"kernelspec": {
"display_name": ".venv",
"language": "python",
Expand All @@ -601,10 +631,7 @@
"pygments_lexer": "ipython3",
"version": "3.12.11"
},
"language_version": "3.12",
"everyrow": {
"description": "Python notebook cleaning 500 CRM records with inconsistent company names, missing contacts, and partial email matches. Uses everyrow's dedupe() with a plain-English equivalence relation to find and group semantic duplicates."
}
"language_version": "3.12"
},
"nbformat": 4,
"nbformat_minor": 4
Expand Down
16 changes: 11 additions & 5 deletions docs/case_studies/llm-powered-merging-at-scale/notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@
"metadata": {},
"outputs": [],
"source": [
"# !pip install everyrow\n",
"# load API key from environment/.env file or set it directly in the notebook\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"# import os\n",
"# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. $20 free\"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from everyrow.ops import merge\n",
Expand Down Expand Up @@ -115,8 +121,8 @@
}
],
"source": [
"left_df = pd.read_csv(\"merge_websites_input_left_2246.csv\")\n",
"right_df = pd.read_csv(\"merge_websites_input_right_2246.csv\")\n",
"left_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-powered-merging-at-scale/merge_websites_input_left_2246.csv\")\n",
"right_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-powered-merging-at-scale/merge_websites_input_right_2246.csv\")\n",
"\n",
"print(f\"Left table: {len(left_df)} rows\")\n",
"left_df.head(3)"
Expand Down Expand Up @@ -240,8 +246,8 @@
"for n in [100, 200, 400, 800, 1600, 2246]:\n",
" result = await merge(\n",
" task=\"Match each person to their website(s).\",\n",
" left_table=pd.read_csv(f\"merge_websites_input_left_{n}.csv\"),\n",
" right_table=pd.read_csv(f\"merge_websites_input_right_{n}.csv\"),\n",
" left_table=pd.read_csv(f\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-powered-merging-at-scale/merge_websites_input_left_{n}.csv\"),\n",
" right_table=pd.read_csv(f\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-powered-merging-at-scale/merge_websites_input_right_{n}.csv\"),\n",
" )\n",
" print(f\"n={n}\")\n",
" print(\"num of matched rows:\", len(result.data))\n",
Expand Down Expand Up @@ -307,7 +313,7 @@
"metadata": {},
"outputs": [],
"source": [
"results_df = pd.read_csv(\"merge_websites_output_800.csv\")"
"results_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-powered-merging-at-scale/merge_websites_output_800.csv\")"
]
},
{
Expand Down
12 changes: 8 additions & 4 deletions docs/case_studies/llm-powered-screening-at-scale/notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,19 @@
"metadata": {},
"outputs": [],
"source": [
"from dotenv import load_dotenv\n",
"# !pip install everyrow\n",
"import pandas as pd\n",
"from everyrow import create_session\n",
"from everyrow.ops import screen\n",
"\n",
"pd.set_option(\"display.max_colwidth\", None)\n",
"\n",
"\n",
"load_dotenv()"
"# load API key from environment/.env file or set it directly in the notebook\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"# import os\n",
"# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. $20 free\"\n"
]
},
{
Expand Down Expand Up @@ -217,7 +221,7 @@
}
],
"source": [
"fda_product_recalls_df = pd.read_csv(\"fda_product_recalls.csv\")\n",
"fda_product_recalls_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-powered-screening-at-scale/fda_product_recalls.csv\")\n",
"\n",
"# Filter to recalls where center_classification_date is after 2021-08-01 to get a dataset with ≈10k rows\n",
"fda_product_recalls_df[\"center_classification_date\"] = pd.to_datetime(fda_product_recalls_df[\"center_classification_date\"], errors=\"coerce\")\n",
Expand Down Expand Up @@ -311,7 +315,7 @@
"metadata": {},
"outputs": [],
"source": [
"results_df = pd.read_csv(\"Screen child product recalls.csv\") # download from https://everyrow.io/sessions/df145a50-2dfd-48c6-97ed-6f82a82bca66"
"results_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-powered-screening-at-scale/Screen%20child%20product%20recalls.csv\") # download from https://everyrow.io/sessions/df145a50-2dfd-48c6-97ed-6f82a82bca66"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
"metadata": {},
"outputs": [],
"source": [
"from dotenv import load_dotenv\n",
"# !pip install everyrow\n",
"import pandas as pd\n",
"from pydantic import BaseModel, Field\n",
"from everyrow import create_session\n",
Expand All @@ -48,7 +48,11 @@
"pd.set_option(\"display.max_colwidth\", None)\n",
"\n",
"\n",
"load_dotenv()"
"# load API key from environment/.env file or set it directly in the notebook\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"# import os\n",
"# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. $20 free\"\n"
]
},
{
Expand Down Expand Up @@ -166,7 +170,7 @@
}
],
"source": [
"input_df = pd.read_csv(\"regulatory_status_results.csv\", usecols=[\"row_id\", \"trade_name\", \"ingredient\", \"applicant\", \"strength\", \"dosage_form\"])\n",
"input_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-web-research-agents-at-scale/regulatory_status_results.csv\", usecols=[\"row_id\", \"trade_name\", \"ingredient\", \"applicant\", \"strength\", \"dosage_form\"])\n",
"print(f\"{len(input_df):,} drug products\")\n",
"print(f\"Columns: {list(input_df.columns)}\")\n",
"input_df.head(5)"
Expand Down Expand Up @@ -356,7 +360,7 @@
}
],
"source": [
"results_df = pd.read_csv(\"regulatory_status_results.csv\")\n",
"results_df = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/case_studies/llm-web-research-agents-at-scale/regulatory_status_results.csv\")\n",
"print(f\"Total rows: {len(results_df):,}\")\n",
"print(f\"Rows with results: {results_df['regulatory_status'].notna().sum():,}\")\n",
"print(f\"Failed rows: {results_df['regulatory_status'].isna().sum()}\")\n",
Expand Down Expand Up @@ -848,4 +852,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,12 @@
"metadata": {},
"outputs": [],
"source": [
"!pip install everyrow\n",
"%env EVERYROW_API_KEY=your_api_key"
"# !pip install everyrow\n",
"# load API key from environment/.env file or set it directly in the notebook\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"# import os\n",
"# os.environ[\"EVERYROW_API_KEY\"] = \"get an API key on everyrow.io. $20 free\"\n"
]
},
{
Expand Down Expand Up @@ -168,7 +172,7 @@
"from everyrow.generated.models import LLMEnum\n",
"\n",
"# Load dataset: 438 S&P 500 companies\n",
"data = pd.read_csv(\"../data/companies.csv\")\n",
"data = pd.read_csv(\"https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/main/docs/data/companies.csv\")\n",
"print(f\"Dataset: {data.shape[0]} companies, {data.shape[1]} columns\")\n",
"data.head()"
]
Expand Down
Loading