# everyrow Pandas Accessor

This notebook demonstrates the `df.everyrow` pandas accessor — a fluent API for running AI-powered operations directly on a DataFrame.

Instead of:
```python
from everyrow.ops import screen
result = await screen(task="...", input=df)
filtered_df = result.data
```

You can write:
```python
filtered_df = await df.everyrow.screen("...")
```

Get an API key at [everyrow.io/api-key](https://everyrow.io/api-key) to run this notebook.

In [None]:
import pandas as pd
from dotenv import load_dotenv
from pydantic import BaseModel, Field

# Importing from everyrow registers the df.everyrow accessor
from everyrow import create_session
from everyrow.task import EffortLevel

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is how the everyrow API key is supplied — confirm.
load_dotenv()

## 1. Screen

Filter rows using natural language criteria.

In [None]:
# Sample job postings: one row per company, with the free-text posting
# that the screening step evaluates.
jobs = pd.DataFrame(
    data=[
        ("Airtable", "Async-first team, 8+ yrs exp, $185-220K base"),
        ("Vercel", "Lead our NYC team. Competitive comp, DOE"),
        ("Notion", "In-office SF. Staff eng, $200K + equity"),
        ("Linear", "Bootcamp grads welcome! $85K, remote-friendly"),
        ("Descript", "Work from anywhere. Principal architect, $250K"),
        ("Retool", "Flexible location. Building infra. Comp TBD"),
    ],
    columns=["company", "post"],
)

# Show the unfiltered input, header first.
print("Input jobs:", jobs.to_string(), sep="\n")

In [None]:
# Basic screen - returns DataFrame directly
qualified_jobs = await jobs.everyrow.screen("""
    Qualifies if ALL THREE are met:
    1. Remote-friendly (allows remote, hybrid, or distributed)
    2. Senior-level (5+ yrs exp OR title includes Senior/Staff/Principal)
    3. Salary disclosed (specific numbers like "$150K", not "competitive" or "DOE")
""")

print("Qualified jobs:")
print(qualified_jobs.to_string())

In [None]:
# Metadata for an operation is exposed on the accessor's `last_result`.
# NOTE(review): presumably this is the result of the screen call run on `jobs` — confirm.
screen_result = jobs.everyrow.last_result
print(f"Artifact ID: {screen_result.artifact_id}")

## 2. Rank

Score and sort rows by qualitative factors.

In [None]:
# Sample candidates with a short free-text background used for ranking.
candidates = pd.DataFrame(
    data=[
        ("Alice", "10 years ML at Google, PhD in NLP"),
        ("Bob", "2 years as PM, MBA from Stanford"),
        ("Carol", "5 years data science, built ML pipelines at Stripe"),
        ("Dave", "Fresh bootcamp grad, passionate about AI"),
    ],
    columns=["name", "background"],
)

print("Candidates:", candidates.to_string(), sep="\n")

In [None]:
# Rank candidates by fit
ranked = await candidates.everyrow.rank(
    "fit_score",
    task="Rank candidates by fit for a senior ML engineer role at a startup",
    field_type="float",
    ascending=False  # Best candidates first
)

print("Ranked candidates:")
print(ranked.to_string())

## 3. Dedupe

Remove duplicates when fuzzy matching isn't enough.

In [None]:
# Company names as they appear in different sources; several rows refer to
# the same real-world company under different spellings.
companies = pd.DataFrame(
    data=[
        ("Apple Inc.", "SEC filings"),
        ("Apple", "LinkedIn"),
        ("Google LLC", "SEC filings"),
        ("Alphabet/Google", "News article"),
        ("Meta Platforms", "SEC filings"),
        ("Facebook (Meta)", "Old database"),
    ],
    columns=["name", "source"],
)

print(f"Input companies ({len(companies)} rows):", companies.to_string(), sep="\n")

In [None]:
# Dedupe by company identity
unique = await companies.everyrow.dedupe(
    "Same company, accounting for legal suffixes, parent/subsidiary, and rebranding"
)

print(f"Unique companies ({len(unique)} rows):")
print(unique.to_string())

## 4. Merge

Join tables when keys don't match exactly.

In [None]:
# Left table: subsidiaries and their main product.
subsidiaries = pd.DataFrame(
    data=[
        ("YouTube", "Video streaming"),
        ("Instagram", "Photo sharing"),
        ("WhatsApp", "Messaging"),
        ("GitHub", "Code hosting"),
    ],
    columns=["name", "product"],
)

# Right table: parent companies. No value in `company` equals any value in
# the subsidiaries' `name` column, so an exact-key join would match nothing.
parents = pd.DataFrame(
    data=[
        ("Alphabet Inc.", 2000),
        ("Meta Platforms", 1200),
        ("Microsoft Corp", 3000),
    ],
    columns=["company", "market_cap_b"],
)

print("Subsidiaries:", subsidiaries.to_string(), sep="\n")
print("\nParent companies:", parents.to_string(), sep="\n")

In [None]:
# Merge subsidiaries to parents
merged = await subsidiaries.everyrow.merge(
    parents,
    task="Match each subsidiary to its parent company",
    left_on="name",
    right_on="company"
)

print("Merged data:")
print(merged.to_string())

In [None]:
# Merge breakdown, read from the accessor's last result: report how many
# matches fall into each category.
breakdown = subsidiaries.everyrow.last_result.breakdown
print(
    f"Exact matches: {len(breakdown.exact)}",
    f"Fuzzy matches: {len(breakdown.fuzzy)}",
    f"LLM matches: {len(breakdown.llm)}",
    f"Web-assisted matches: {len(breakdown.web)}",
    sep="\n",
)

## 5. Agent Map

Run AI research on every row.

In [None]:
# Companies to research; the only input column is the company name.
startups = pd.DataFrame({"company": ["Stripe", "Databricks", "Figma"]})

print("Startups to research:", startups.to_string(), sep="\n")

In [None]:
# Define structured output
# Pydantic model describing the fields requested for each company; it is
# passed as `response_model` to `agent_map` in this cell.
class CompanyInfo(BaseModel):
    # Each Field description states, in plain language, what the value is.
    founded_year: int = Field(description="Year the company was founded")
    hq_city: str = Field(description="Headquarters city")
    # Expressed in billions of USD, per the description text.
    latest_valuation_b: float = Field(description="Latest valuation in billions USD")

# Research each company
enriched = await startups.everyrow.agent_map(
    "Research this company's founding year, HQ location, and latest valuation",
    effort_level=EffortLevel.MEDIUM,
    response_model=CompanyInfo
)

print("Enriched data:")
print(enriched.to_string())

## 6. Single Agent

Run AI analysis on the entire DataFrame.

In [None]:
# Monthly sales figures (Jan-May) analysed as a whole table.
sales = pd.DataFrame(
    data=[
        ("Jan", 100000, 50),
        ("Feb", 120000, 55),
        ("Mar", 95000, 48),
        ("Apr", 140000, 62),
        ("May", 160000, 70),
    ],
    columns=["month", "revenue", "customers"],
)

print("Sales data:", sales.to_string(), sep="\n")

In [None]:
# Analyze the data
analysis = await sales.everyrow.single_agent(
    "Analyze this sales data. Identify trends, anomalies, and provide recommendations."
)

print("Analysis:")
print(analysis.to_string())

## 7. Chaining Operations

Operations return DataFrames, so you can chain them naturally.

In [None]:
# Example input for chaining: vendors tagged with a category, to be
# screened first and then ranked.
vendors = pd.DataFrame(
    data=[
        ("Acme Corp", "Cloud Infrastructure"),
        ("Beta Inc", "Security"),
        ("Gamma Ltd", "Cloud Infrastructure"),
        ("Delta Co", "Analytics"),
    ],
    columns=["company", "category"],
)

# Chain: screen then rank
qualified = await vendors.everyrow.screen(
    "Filter to cloud infrastructure vendors only"
)

ranked_qualified = await qualified.everyrow.rank(
    "reliability_score",
    task="Rank cloud infrastructure vendors by reliability and uptime reputation",
    ascending=False
)

print("Top qualified vendors:")
print(ranked_qualified.to_string())

## 8. Using Sessions

For multiple operations, use an explicit session to group them together.

In [None]:
from everyrow import create_session

async with create_session(name="Vendor Analysis") as session:
    print(f"Session URL: {session.get_url()}")
    
    # All operations share the same session
    screened = await vendors.everyrow.with_session(session).screen(
        "Filter to security vendors"
    )
    
    ranked = await screened.everyrow.with_session(session).rank(
        "trust_score",
        task="Rank by security certifications and compliance",
        ascending=False
    )
    
    print("Top security vendors:")
    print(ranked.to_string())