# US States - Financial Disclosure Ingestion

**States:** California, Texas, New York, Florida, Illinois, Pennsylvania, Massachusetts

## CLI Commands
```bash
mcli run us-states run                  # Run all states
mcli run us-states run --state texas    # Specific state
mcli run us-states status               # Check status
mcli run us-states list-states          # List available states
```

In [None]:
import asyncio
import json
import logging
import sys
from datetime import datetime
from functools import wraps
from pathlib import Path
from typing import Optional

import click

# Make the project's ``src`` directory importable so the ``politician_trading``
# imports below resolve.
# NOTE(review): assumes the working directory sits two levels below the
# repository root -- confirm against how this notebook is launched.
project_root = Path.cwd().parent.parent
_src_path = str(project_root / 'src')
# Guard on the exact entry being inserted; testing ``str(project_root)``
# never matched the ``src`` entry, so each re-run of the cell stacked another
# duplicate onto ``sys.path``.
if _src_path not in sys.path:
    sys.path.insert(0, _src_path)

from politician_trading.config import WorkflowConfig
from politician_trading.scrapers.scrapers_us_states import (
    run_us_states_collection,
    run_texas_collection,
    run_new_york_collection,
    run_florida_collection,
    run_illinois_collection,
    run_pennsylvania_collection,
    run_massachusetts_collection,
)
from politician_trading.scrapers.scrapers_california import run_california_collection

# Configure the root logger at INFO so messages logged by this module are
# emitted when the cells run.
logging.basicConfig(level=logging.INFO)
# Module-scoped logger; used by the ``run`` command to report a failed
# California collection.
logger = logging.getLogger(__name__)

In [None]:
# Root command group: the ``run``, ``list-states`` and ``status`` subcommands
# defined in this file are registered on it. The docstring doubles as the
# group's ``--help`` text, so it is kept short.
@click.group("us-states")
def us_states():
    """US state-level financial disclosure ingestion."""

def click_async(f):
    """Adapt the coroutine function ``f`` into a plain synchronous callable.

    The returned callable forwards its positional and keyword arguments to
    ``f``, drives the resulting coroutine to completion with ``asyncio.run``
    and returns its result. ``functools.wraps`` copies ``f``'s metadata
    (name, docstring, ...) onto the adapter so decorators applied afterwards
    still see the original function's identity.
    """
    def run_to_completion(*args, **kwargs):
        coroutine = f(*args, **kwargs)
        return asyncio.run(coroutine)

    return wraps(f)(run_to_completion)

# Default workflow configuration; its ``scraping`` section is the argument
# handed to every collection coroutine called by the ``run`` command.
config = WorkflowConfig.default()
scraping_config = config.scraping

# Directory holding the JSON dumps written by ``run`` and read back by
# ``status``. It is created eagerly (including missing parents) at import time.
OUTPUT_DIR = project_root.joinpath('data', 'raw', 'us_states')
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Registry mapping a normalized state key to its collection coroutine.
# The ``run`` command lowercases the ``--state`` value and replaces spaces
# with underscores before looking it up here (so "New York" -> 'new_york'),
# then awaits the selected callable with ``scraping_config``.
# Insertion order is user-visible: ``list-states`` and the "Available:" hint
# print the keys in this order.
STATE_SCRAPERS = {
    'california': run_california_collection,
    'texas': run_texas_collection,
    'new_york': run_new_york_collection,
    'florida': run_florida_collection,
    'illinois': run_illinois_collection,
    'pennsylvania': run_pennsylvania_collection,
    'massachusetts': run_massachusetts_collection,
}

In [None]:
def _optional_float(value) -> Optional[float]:
    """Return ``value`` converted to ``float``, or ``None`` when it is ``None``.

    The check is an explicit ``is None`` test so that a legitimate amount of
    ``0`` is preserved instead of being collapsed to ``None``.
    """
    return None if value is None else float(value)


def _disclosure_to_dict(d):
    """Flatten one disclosure object into the dict written to the JSON dump."""
    return {
        'asset_name': d.asset_name,
        'asset_type': d.asset_type,
        'transaction_type': d.transaction_type.value if d.transaction_type else None,
        'transaction_date': d.transaction_date.isoformat() if d.transaction_date else None,
        'amount_min': _optional_float(d.amount_range_min),
        'amount_max': _optional_float(d.amount_range_max),
        'raw_data': d.raw_data,
    }


# ``run`` subcommand.
#   --state   optional state name; it is lowercased and spaces become
#             underscores before being looked up in STATE_SCRAPERS. When
#             omitted, the combined multi-state collection runs, followed by
#             California.
#   --output  optional destination file; defaults to
#             OUTPUT_DIR/us_states_<YYYYMMDD>.json.
# An unknown state is reported on stderr and the command exits with status 1.
# The docstring is click's help text, so the details live in this comment.
@us_states.command(name="run")
@click.option('--state', default=None, help='Specific state to scrape')
@click.option('--output', default=None, help='Output file path')
@click_async
async def run_ingestion(state: Optional[str], output: Optional[str]):
    """Run US states financial disclosure ingestion."""
    all_disclosures = []
    by_state = {}

    if state:
        state = state.lower().replace(' ', '_')
        if state not in STATE_SCRAPERS:
            # Both lines belong to the error report, so both go to stderr,
            # and the process must not exit 0 on invalid input. The exception
            # propagates out of ``asyncio.run`` and is handled by click.
            click.echo(f"Unknown state: {state}", err=True)
            click.echo(f"Available: {', '.join(STATE_SCRAPERS)}", err=True)
            raise click.exceptions.Exit(1)
        click.echo(f"Scraping {state}...")
        disclosures = await STATE_SCRAPERS[state](scraping_config)
        all_disclosures.extend(disclosures)
        by_state[state] = len(disclosures)
    else:
        click.echo("Scraping all US states...")
        disclosures = await run_us_states_collection(scraping_config)
        all_disclosures.extend(disclosures)
        # NOTE(review): keys here come from raw_data['state'] (plus the
        # literal 'California' below), whereas the single-state path uses the
        # normalized lowercase key -- confirm which convention consumers expect.
        for d in disclosures:
            s = d.raw_data.get('state', 'Unknown')
            by_state[s] = by_state.get(s, 0) + 1
        # California is collected by its own scraper; a failure there is
        # logged (with traceback) and must not discard the other results.
        try:
            ca_disclosures = await run_california_collection(scraping_config)
        except Exception as e:
            logger.exception("California failed: %s", e)
        else:
            all_disclosures.extend(ca_disclosures)
            by_state['California'] = len(ca_disclosures)

    click.echo(f"\nTotal: {len(all_disclosures)} disclosures")
    for s, count in by_state.items():
        click.echo(f"  {s}: {count}")

    if output:
        output_file = Path(output)
    else:
        stamp = datetime.now().strftime("%Y%m%d")
        output_file = OUTPUT_DIR / f'us_states_{stamp}.json'
    # A user-supplied --output may point into a directory that does not exist
    # yet; create it rather than failing after all the scraping work is done.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    payload = {
        'metadata': {
            'source': 'us_states',
            'downloaded_at': datetime.now().isoformat(),
            'total_records': len(all_disclosures),
            'by_state': by_state,
        },
        'disclosures': [_disclosure_to_dict(d) for d in all_disclosures],
    }

    # ``default=str`` stringifies any value in ``raw_data`` that json cannot
    # encode natively.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2, default=str)
    click.echo(f"Saved to {output_file}")

In [None]:
# ``list-states`` subcommand: prints a header followed by one bullet per key
# of STATE_SCRAPERS, in the registry's insertion order.
@us_states.command(name="list-states")
def list_states():
    """List available states for scraping."""
    lines = ["Available states:"]
    lines.extend(f"  - {state_key}" for state_key in STATE_SCRAPERS)
    click.echo("\n".join(lines))

In [None]:
# ``status`` subcommand: summarizes the most recently modified
# ``us_states_*.json`` dump found in OUTPUT_DIR. Only that directory and file
# pattern are searched. The docstring is click's help text, so the details
# live in this comment.
@us_states.command(name="status")
def check_status():
    """Check status of US states ingestion."""
    files = list(OUTPUT_DIR.glob('us_states_*.json'))
    if not files:
        click.echo("No data found.")
        return

    # "Latest" is decided by modification time, not by the date in the name.
    latest = max(files, key=lambda p: p.stat().st_mtime)
    try:
        with open(latest, encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and decode errors: a dump
        # that is corrupt or still being written should yield a clean error
        # message and a non-zero exit instead of a raw traceback.
        raise click.ClickException(f"Could not read {latest.name}: {exc}") from exc

    # Tolerate a file whose top level (or 'metadata' entry) is not an object.
    metadata = data.get('metadata', {}) if isinstance(data, dict) else {}
    if not isinstance(metadata, dict):
        metadata = {}

    click.echo(f"Latest: {latest.name}")
    click.echo(f"Records: {metadata.get('total_records', 'Unknown')}")
    click.echo(f"By state: {metadata.get('by_state', {})}")