# US Senate - Financial Disclosure Ingestion

**Data Source:** https://efdsearch.senate.gov/search/

## CLI Commands
```bash
mcli run us-senate run       # Run full ingestion
mcli run us-senate status    # Check ingestion status
mcli run us-senate list      # List downloaded disclosures
```

In [None]:
import asyncio
import json
import logging
import sys
from datetime import datetime
from functools import wraps
from pathlib import Path
from typing import Any, Dict, List, Optional

import click
import aiohttp
from bs4 import BeautifulSoup

# Two levels above the current working directory.
# NOTE(review): assumes the process is started two directories below the
# project root — confirm against how this file is launched.
project_root = Path.cwd().parent.parent

# Guard on the exact entry that gets inserted. The previous check looked for
# project_root itself, so re-running this code kept prepending duplicates and,
# if project_root was already on sys.path, 'src' was never added at all.
src_dir = str(project_root / 'src')
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

from politician_trading.config import ScrapingConfig, WorkflowConfig

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Configure the root logger at INFO level (a no-op if handlers are already set).
logging.basicConfig(level=logging.INFO)

In [None]:
# Root command group; the "run", "status" and "list" subcommands attach to it.
# The docstring doubles as the group's --help text, so it is kept verbatim.
@click.group("us-senate")
def us_senate():
    """US Senate financial disclosure ingestion."""

def click_async(f):
    """Adapt a coroutine function so it can be invoked synchronously.

    The returned callable forwards its arguments to ``f``, drives the
    resulting coroutine to completion with ``asyncio.run`` and returns its
    result. Metadata of ``f`` (name, docstring, ...) is copied onto the
    adapter via ``functools.wraps``.
    """
    def _run_to_completion(*args, **kwargs):
        coroutine = f(*args, **kwargs)
        return asyncio.run(coroutine)

    return wraps(f)(_run_to_completion)

# Default workflow configuration; its scraping section is handed to the scraper.
config = WorkflowConfig.default()
scraping_config = config.scraping

# Directory that receives the downloaded JSON files; created on import if
# it does not exist yet.
OUTPUT_DIR = project_root.joinpath('data', 'raw', 'us_senate')
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
def _parse_disclosure_rows(
    html: str,
    page_url: str,
    max_rows: Optional[int] = 100,
) -> List[Dict[str, Any]]:
    """Extract disclosure records from the table rows of a listing page.

    Args:
        html: Markup of the listing page.
        page_url: URL the markup was fetched from; report links are resolved
            against it.
        max_rows: Maximum number of table rows to inspect, or None for all
            rows. Negative values are treated as 0.

    Returns:
        One dict per inspected row that has at least four cells and a
        non-empty first cell.
    """
    # Function-scope import: urljoin is not among the file's top-level imports.
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, 'html.parser')
    rows = soup.select('tbody tr')
    logger.info("Found %d rows", len(rows))

    # A negative slice bound would silently drop rows from the end instead.
    row_limit = None if max_rows is None else max(max_rows, 0)

    records: List[Dict[str, Any]] = []
    for row in rows[:row_limit]:
        cells = row.find_all('td')
        if len(cells) < 4:
            continue
        name = cells[0].get_text(strip=True)
        if not name:
            continue
        link = row.find('a', href=True)
        records.append({
            'politician_name': name,
            'report_type': cells[1].get_text(strip=True),
            'filing_date': cells[2].get_text(strip=True),
            # urljoin handles absolute, root-relative and relative hrefs;
            # plain concatenation only worked for root-relative ones.
            'source_url': urljoin(page_url, link['href']) if link else None,
            'source': 'us_senate',
        })
    return records


async def scrape_senate_disclosures(
    config: ScrapingConfig,
    max_rows: Optional[int] = 100,
) -> List[Dict[str, Any]]:
    """Scrape Senate EFD database.

    Fetches the PTR listing page once and converts its table rows into
    disclosure dicts.

    Args:
        config: Scraping settings; ``timeout`` and ``user_agent`` are read.
        max_rows: Maximum number of table rows to inspect (default 100, the
            previously hard-coded cap). Pass None to inspect every row.

    Returns:
        A list of dicts with the keys ``politician_name``, ``report_type``,
        ``filing_date``, ``source_url`` and ``source``. The list is empty
        when the response status is not 200 or when fetching/parsing fails;
        such failures are logged rather than raised.
    """
    base_url = "https://efdsearch.senate.gov"
    search_url = f"{base_url}/search/view/ptr/"

    async with aiohttp.ClientSession(
        timeout=aiohttp.ClientTimeout(total=config.timeout),
        headers={"User-Agent": config.user_agent}
    ) as session:
        logger.info("Fetching Senate PTR listing...")
        try:
            async with session.get(search_url) as response:
                if response.status != 200:
                    logger.error("Failed: %s", response.status)
                    return []
                html = await response.text()
                # Resolve links against the final URL in case of redirects.
                page_url = str(response.url)
            return _parse_disclosure_rows(html, page_url, max_rows)
        except Exception as e:
            # Deliberately best-effort: callers treat an empty result as
            # "nothing scraped". Log with traceback so failures stay visible.
            logger.exception("Error: %s", e)
            return []

In [None]:
# Scrapes the listing and writes the records, plus a small metadata header,
# to a JSON file. The docstring below is the command's --help text.
@us_senate.command(name="run")
@click.option('--output', default=None, help='Output file path')
@click_async
async def run_ingestion(output: Optional[str]):
    """Run US Senate financial disclosure ingestion."""
    click.echo("Starting US Senate ingestion...")
    disclosures = await scrape_senate_disclosures(scraping_config)

    if not disclosures:
        click.echo("No disclosures found.", err=True)
        return

    click.echo(f"Scraped {len(disclosures)} records")

    # Sample the clock once so the date in the file name and the
    # 'downloaded_at' metadata cannot disagree across midnight.
    now = datetime.now()
    if output:
        output_file = Path(output)
    else:
        date_stamp = now.strftime("%Y%m%d")
        output_file = OUTPUT_DIR / f'senate_disclosures_{date_stamp}.json'

    # A user-supplied --output may point into a directory that does not
    # exist yet; create it instead of failing on open().
    output_file.parent.mkdir(parents=True, exist_ok=True)

    payload = {
        'metadata': {
            'source': 'us_senate',
            'downloaded_at': now.isoformat(),
            'total_records': len(disclosures),
        },
        'disclosures': disclosures
    }
    # Explicit encoding so the file does not depend on the platform locale.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2)

    click.echo(f"Saved to {output_file}")

In [None]:
# Prints the name, record count and download time of the most recently
# modified output file. The docstring below is the command's --help text.
@us_senate.command(name="status")
def check_status():
    """Check status of US Senate ingestion."""
    candidates = list(OUTPUT_DIR.glob('senate_disclosures_*.json'))
    if not candidates:
        click.echo("No data found. Run 'mcli run us-senate run' first.")
        return

    # "Latest" is decided by file modification time, not by the date in the name.
    newest = max(candidates, key=lambda path: path.stat().st_mtime)
    with open(newest) as handle:
        payload = json.load(handle)

    info = payload.get('metadata', {})
    click.echo(f"Latest: {newest.name}")
    click.echo(f"Records: {info.get('total_records', 'Unknown')}")
    click.echo(f"Downloaded: {info.get('downloaded_at', 'Unknown')}")

In [None]:
# Prints up to --limit records from the most recently modified output file.
# The docstring below is the command's --help text.
@us_senate.command(name="list")
@click.option('--limit', default=10, type=int, help='Number of records')
def list_disclosures(limit: int):
    """List downloaded US Senate disclosures."""
    files = list(OUTPUT_DIR.glob('senate_disclosures_*.json'))
    if not files:
        click.echo("No data found.")
        return

    # "Latest" is decided by file modification time, not by the date in the name.
    latest = max(files, key=lambda p: p.stat().st_mtime)
    with open(latest, encoding='utf-8') as f:
        data = json.load(f)
    disclosures = data.get('disclosures', [])

    # A negative limit would slice from the end and report a negative count.
    limit = max(limit, 0)
    shown = disclosures[:limit]

    click.echo(f"Showing {len(shown)} of {len(disclosures)}:\n")
    for d in shown:
        # .get for every field so one incomplete record cannot abort the listing.
        click.echo(f"  {d.get('politician_name', 'N/A')} - {d.get('report_type', 'N/A')}")
        click.echo(f"    Date: {d.get('filing_date', 'N/A')}")