# US House of Representatives - Financial Disclosure Ingestion

This notebook ingests financial disclosure data from the US House of Representatives.

**Data Source:** https://disclosures-clerk.house.gov/FinancialDisclosure

## CLI Commands
```bash
mcli run us-house run              # Run full ingestion
mcli run us-house run --year 2024  # Specific year
mcli run us-house run --parse-pdfs # Parse PDFs for transactions
mcli run us-house status           # Check ingestion status
```

In [1]:
import asyncio
import io
import json
import logging
import os
import sys
import zipfile
from datetime import datetime, timedelta
from decimal import Decimal
from functools import wraps
from pathlib import Path
from typing import Any, Dict, List, Optional

import click
import aiohttp

# Add project root to path
project_root = Path.cwd().parent.parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root / 'src'))

from politician_trading.config import ScrapingConfig, WorkflowConfig

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [2]:
# Click command group setup
@click.group(name="us-house")
def us_house():
    """US House of Representatives financial disclosure ingestion."""
    pass

def click_async(f):
    """Decorator to run async functions with click."""
    @wraps(f)
    def wrapper(*args, **kwargs):
        return asyncio.run(f(*args, **kwargs))
    return wrapper

In [3]:
# Configuration
config = WorkflowConfig.default()
scraping_config = config.scraping

# Constants
OUTPUT_DIR = project_root / 'data' / 'raw' / 'us_house'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [5]:
import pprint

In [6]:
pprint.pprint(config)

WorkflowConfig(supabase=SupabaseConfig(url='https://uljsqvwkomdrlnofmlad.supabase.co',
                                       key='eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InVsanNxdndrb21kcmxub2ZtbGFkIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NTY4MDIyNDQsImV4cCI6MjA3MjM3ODI0NH0.QCpfcEpxGX_5Wn8ljf_J2KWjJLGdF8zRsV_7OatxmHI',
                                       service_role_key='eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InVsanNxdndrb21kcmxub2ZtbGFkIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImlhdCI6MTc1NjgwMjI0NCwiZXhwIjoyMDcyMzc4MjQ0fQ.4364sQbTJQd4IcxEQG6mPiOUw1iJ2bdKfV6W4oRqHvs'),
               scraping=ScrapingConfig(request_delay=1.0,
                                       max_retries=3,
                                       timeout=30,
                                       user_agent='Mozilla/5.0 (compatible; '
                                                  'MCLI-PoliticianTracker/1.0)',
                                       enable_us_federal

In [None]:
def get_base_url() -> str:
    return "https://disclosures-clerk.house.gov"

In [None]:
def get_zip_url(year: int) -> str:
    return f"{get_base_url()}/public_disc/financial-pdfs/{year}FD.ZIP"

In [8]:
get_zip_url(2025)

'https://disclosures-clerk.house.gov/public_disc/financial-pdfs/2025FD.ZIP'

In [None]:
async def download_house_index(year: int, config: ScrapingConfig) -> List[Dict[str, Any]]:
    """Download and parse the House disclosure index file."""
    base_url = get_base_url()
    zip_url = get_zip_url(year)
    
    disclosures = []
    
    async with aiohttp.ClientSession(
        timeout=aiohttp.ClientTimeout(total=config.timeout * 2),
        headers={"User-Agent": config.user_agent}
    ) as session:
        logger.info(f"Downloading House disclosure index for {year}...")
        
        async with session.get(zip_url) as response:
            if response.status != 200:
                logger.error(f"Failed to download index: {response.status}")
                return []
            
            zip_content = await response.read()
            logger.info(f"Downloaded {len(zip_content):,} bytes")
            
            with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
                txt_filename = f"{year}FD.txt"
                
                if txt_filename not in z.namelist():
                    logger.error(f"Index file {txt_filename} not found")
                    return []
                
                with z.open(txt_filename) as f:
                    content = f.read().decode('utf-8', errors='ignore')
                
                lines = content.strip().split('\n')
                logger.info(f"Found {len(lines)} records")
                
                for line in lines[1:]:  # Skip header
                    fields = line.split('\t')
                    if len(fields) < 9:
                        continue
                    
                    prefix, last_name, first_name, suffix = fields[0:4]
                    filing_type, state_district, file_year = fields[4:7]
                    filing_date_str, doc_id = fields[7:9]
                    
                    doc_id = doc_id.strip()
                    if not doc_id or doc_id == 'DocID':
                        continue
                    
                    name_parts = [p.strip() for p in [prefix, first_name, last_name, suffix] if p.strip()]
                    full_name = ' '.join(name_parts)
                    
                    filing_date = None
                    if filing_date_str:
                        try:
                            filing_date = datetime.strptime(filing_date_str.strip(), "%m/%d/%Y").isoformat()
                        except ValueError:
                            pass
                    
                    pdf_url = f"{base_url}/public_disc/financial-pdfs/{year}/{doc_id}.pdf"
                    
                    disclosures.append({
                        "politician_name": full_name,
                        "first_name": first_name.strip(),
                        "last_name": last_name.strip(),
                        "state_district": state_district.strip(),
                        "filing_type": filing_type.strip(),
                        "filing_date": filing_date,
                        "doc_id": doc_id,
                        "pdf_url": pdf_url,
                        "year": year,
                        "source": "us_house",
                    })
    
    return disclosures

In [None]:
@us_house.command(name="run")
@click.option('--year', default=None, type=int, help='Year to scrape (default: current year)')
@click.option('--parse-pdfs', is_flag=True, help='Parse PDFs for transaction details')
@click.option('--max-pdfs', default=10, type=int, help='Max PDFs to parse (if --parse-pdfs)')
@click.option('--output', default=None, help='Output file path')
@click_async
async def run_ingestion(year: Optional[int], parse_pdfs: bool, max_pdfs: int, output: Optional[str]):
    """Run US House financial disclosure ingestion."""
    if year is None:
        year = datetime.now().year
    
    click.echo(f"Starting US House ingestion for {year}...")
    
    # Download index
    disclosures = await download_house_index(year, scraping_config)
    
    if not disclosures:
        click.echo("No disclosures found.", err=True)
        return
    
    click.echo(f"Downloaded {len(disclosures)} disclosure records")
    
    # Statistics
    filing_types = {}
    for d in disclosures:
        ft = d.get('filing_type', 'Unknown')
        filing_types[ft] = filing_types.get(ft, 0) + 1
    
    click.echo("\nFiling types:")
    for ft, count in sorted(filing_types.items(), key=lambda x: x[1], reverse=True):
        click.echo(f"  {ft}: {count}")
    
    # Save results
    if output:
        output_file = Path(output)
    else:
        output_file = OUTPUT_DIR / f'house_disclosures_{year}.json'
    
    with open(output_file, 'w') as f:
        json.dump({
            'metadata': {
                'source': 'us_house',
                'year': year,
                'downloaded_at': datetime.now().isoformat(),
                'total_records': len(disclosures),
                'filing_types': filing_types,
            },
            'disclosures': disclosures
        }, f, indent=2)
    
    click.echo(f"\nSaved {len(disclosures)} records to {output_file}")

In [None]:
@us_house.command(name="status")
@click.option('--year', default=None, type=int, help='Year to check')
def check_status(year: Optional[int]):
    """Check status of US House ingestion."""
    if year is None:
        year = datetime.now().year
    
    output_file = OUTPUT_DIR / f'house_disclosures_{year}.json'
    
    if output_file.exists():
        with open(output_file) as f:
            data = json.load(f)
        
        metadata = data.get('metadata', {})
        click.echo(f"US House {year} Ingestion Status:")
        click.echo(f"  File: {output_file}")
        click.echo(f"  Records: {metadata.get('total_records', 'Unknown')}")
        click.echo(f"  Downloaded: {metadata.get('downloaded_at', 'Unknown')}")
        
        filing_types = metadata.get('filing_types', {})
        if filing_types:
            click.echo(f"  Filing types: {filing_types}")
    else:
        click.echo(f"No data found for {year}. Run 'mcli run us-house run' first.")

In [None]:
@us_house.command(name="list")
@click.option('--year', default=None, type=int, help='Year to list')
@click.option('--limit', default=10, type=int, help='Number of records to show')
@click.option('--type', 'filing_type', default=None, help='Filter by filing type (P, A, C, etc.)')
def list_disclosures(year: Optional[int], limit: int, filing_type: Optional[str]):
    """List downloaded US House disclosures."""
    if year is None:
        year = datetime.now().year
    
    output_file = OUTPUT_DIR / f'house_disclosures_{year}.json'
    
    if not output_file.exists():
        click.echo(f"No data found for {year}. Run 'mcli run us-house run' first.")
        return
    
    with open(output_file) as f:
        data = json.load(f)
    
    disclosures = data.get('disclosures', [])
    
    if filing_type:
        disclosures = [d for d in disclosures if d.get('filing_type') == filing_type]
        click.echo(f"Filtered to type '{filing_type}': {len(disclosures)} records")
    
    click.echo(f"\nShowing {min(limit, len(disclosures))} of {len(disclosures)} disclosures:\n")
    
    for d in disclosures[:limit]:
        click.echo(f"  {d['politician_name']} ({d['filing_type']})")
        click.echo(f"    Date: {d.get('filing_date', 'N/A')}")
        click.echo(f"    State: {d.get('state_district', 'N/A')}")
        click.echo(f"    PDF: {d.get('pdf_url', 'N/A')}")
        click.echo()

In [None]:
# Test the commands
if __name__ == "__main__":
    # For testing in notebook
    print("Testing US House ingestion...")
    disclosures = asyncio.run(download_house_index(datetime.now().year, scraping_config))
    print(f"Downloaded {len(disclosures)} records")