In [18]:
import asyncio
import io
import json
import logging
import os
import re
import zipfile
from datetime import datetime, timedelta
from decimal import Decimal
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urljoin, urlparse

import aiohttp
from bs4 import BeautifulSoup
from pdf2image import convert_from_bytes
import pytesseract

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [19]:
from politician_trading.models import Politician, TradingDisclosure, TransactionType

In [20]:
from politician_trading.scrapers.scrapers import BaseScraper
from politician_trading.config import ScrapingConfig

In [21]:
async def parse_house_pdf(pdf_url: str, pdf_content: bytes = None, session: aiohttp.ClientSession = None) -> List[Dict[str, Any]]:
    """Parse a House disclosure PDF to extract transaction details
    
    Args:
        pdf_url: URL of the PDF to parse
        pdf_content: Optional pre-downloaded PDF bytes
        session: Optional aiohttp session for downloading
        
    Returns:
        List of transaction dictionaries with extracted data
    """
    transactions = []
    
    try:
        # Download PDF if not provided
        if pdf_content is None:
            if session is None:
                async with aiohttp.ClientSession() as temp_session:
                    async with temp_session.get(pdf_url) as response:
                        if response.status != 200:
                            logger.warning(f"Failed to download PDF: {pdf_url} (status {response.status})")
                            return []
                        pdf_content = await response.read()
            else:
                async with session.get(pdf_url) as response:
                    if response.status != 200:
                        logger.warning(f"Failed to download PDF: {pdf_url} (status {response.status})")
                        return []
                    pdf_content = await response.read()
        
        # Convert PDF pages to images at 600 DPI for better OCR
        logger.info(f"Converting PDF to images ({len(pdf_content)} bytes)")
        pages = convert_from_bytes(pdf_content, dpi=600)
        
        # Extract text from each page
        full_text = ""
        for i, page in enumerate(pages):
            logger.debug(f"OCR processing page {i+1}/{len(pages)}")
            text = pytesseract.image_to_string(page)
            full_text += text + "\n\n"
        
        logger.info(f"Extracted {len(full_text)} characters of text")
        
        # Parse transactions from text
        transactions = extract_transactions_from_text(full_text)
        logger.info(f"Extracted {len(transactions)} transactions from PDF")
        
    except Exception as e:
        logger.error(f"Error parsing PDF {pdf_url}: {e}")
    
    return transactions


def extract_transactions_from_text(text: str) -> List[Dict[str, Any]]:
    """Extract transaction details from OCR'd text
    
    Looks for patterns like:
    - " P " for Purchase, " S " for Sale, " E " for Exchange
    - Ticker symbols in parentheses: (AAPL), (MSFT)
    - Amount ranges: $1,001 - $15,000
    - Dates: MM/DD/YYYY
    
    Args:
        text: OCR'd text from PDF
        
    Returns:
        List of transaction dictionaries
    """
    transactions = []
    
    # Split by double newlines to get paragraphs/sections
    sections = text.split('\n\n')
    
    for section in sections:
        # Remove single newlines within section for easier parsing
        line = section.replace('\r', '').replace('\n', ' ')
        
        # Look for transaction type indicators
        transaction_type = None
        if ' P ' in line or ' Purchase ' in line or 'Purchase' in line:
            transaction_type = 'PURCHASE'
        elif ' S ' in line or ' Sale ' in line or 'Sale' in line:
            transaction_type = 'SALE'
        elif ' E ' in line or ' Exchange ' in line or 'Exchange' in line:
            transaction_type = 'EXCHANGE'
        
        if not transaction_type:
            continue
        
        # Extract ticker symbol from parentheses
        ticker = None
        closing_paren = line.find(')')
        if closing_paren != -1:
            opening_paren = line.rfind('(', 0, closing_paren)
            if opening_paren != -1:
                potential_ticker = line[opening_paren+1:closing_paren].strip()
                # Tickers are usually 1-5 uppercase letters
                if potential_ticker and potential_ticker.isupper() and 1 <= len(potential_ticker) <= 5:
                    ticker = potential_ticker
        
        # Extract asset name (usually before the ticker in parentheses)
        asset_name = None
        if ticker and opening_paren != -1:
            # Look backwards from opening paren for the asset name
            before_ticker = line[:opening_paren].strip()
            # Asset name is typically the last few words before the ticker
            words = before_ticker.split()
            if len(words) >= 2:
                asset_name = ' '.join(words[-5:])  # Take up to last 5 words
        
        # Extract amount range
        amount_min, amount_max, amount_exact = parse_amount_from_text(line)
        
        # Extract transaction date (MM/DD/YYYY)
        transaction_date = None
        date_pattern = r'\b(\d{1,2})/(\d{1,2})/(\d{4})\b'
        date_match = re.search(date_pattern, line)
        if date_match:
            try:
                month, day, year = date_match.groups()
                transaction_date = datetime(int(year), int(month), int(day))
            except:
                pass
        
        # Only add transaction if we found a ticker
        if ticker:
            transaction = {
                'ticker': ticker,
                'asset_name': asset_name or ticker,
                'transaction_type': transaction_type,
                'transaction_date': transaction_date,
                'amount_min': amount_min,
                'amount_max': amount_max,
                'amount_exact': amount_exact,
                'raw_text': line[:200],  # Keep snippet for debugging
            }
            transactions.append(transaction)
    
    return transactions


def parse_amount_from_text(text: str) -> Tuple[Optional[Decimal], Optional[Decimal], Optional[Decimal]]:
    """Parse amount/value from text
    
    Returns:
        Tuple of (min_amount, max_amount, exact_amount)
    """
    if not text:
        return None, None, None
    
    # Clean up text
    text = text.replace(',', '')
    
    # Standard House disclosure ranges
    range_mappings = {
        r'\$1,?001\s*-\s*\$15,?000': (Decimal("1001"), Decimal("15000")),
        r'\$15,?001\s*-\s*\$50,?000': (Decimal("15001"), Decimal("50000")),
        r'\$50,?001\s*-\s*\$100,?000': (Decimal("50001"), Decimal("100000")),
        r'\$100,?001\s*-\s*\$250,?000': (Decimal("100001"), Decimal("250000")),
        r'\$250,?001\s*-\s*\$500,?000': (Decimal("250001"), Decimal("500000")),
        r'\$500,?001\s*-\s*\$1,?000,?000': (Decimal("500001"), Decimal("1000000")),
        r'\$1,?000,?001\s*-\s*\$5,?000,?000': (Decimal("1000001"), Decimal("5000000")),
        r'\$5,?000,?001\s*-\s*\$25,?000,?000': (Decimal("5000001"), Decimal("25000000")),
        r'\$25,?000,?001\s*-\s*\$50,?000,?000': (Decimal("25000001"), Decimal("50000000")),
        r'Over\s+\$50,?000,?000': (Decimal("50000001"), None),
    }
    
    # Check standard ranges
    for pattern, (min_val, max_val) in range_mappings.items():
        if re.search(pattern, text, re.IGNORECASE):
            return min_val, max_val, None
    
    # Look for custom range patterns: $X - $Y or $X-$Y
    range_match = re.search(r'\$(\d+)\s*-\s*\$(\d+)', text)
    if range_match:
        min_val = Decimal(range_match.group(1))
        max_val = Decimal(range_match.group(2))
        return min_val, max_val, None
    
    # Look for exact amounts: $X or $X.XX
    exact_match = re.search(r'\$(\d+(?:\.\d{2})?)', text)
    if exact_match:
        exact_val = Decimal(exact_match.group(1))
        return None, None, exact_val
    
    return None, None, None

In [22]:
disclosures = []
base_url = "https://disclosures-clerk.house.gov"
search_url = f"{base_url}/FinancialDisclosure"

In [23]:
 logger.info("Starting House disclosures scrape from official database")

INFO:__main__:Starting House disclosures scrape from official database


In [24]:
scraper = ScrapingConfig()

In [25]:
scraper

ScrapingConfig(request_delay=1.0, max_retries=3, timeout=30, user_agent='Mozilla/5.0 (compatible; MCLI-PoliticianTracker/1.0)', enable_us_federal=True, enable_us_states=True, enable_eu_parliament=True, enable_eu_national=True, enable_third_party=True, us_congress_sources=['https://disclosures-clerk.house.gov/FinancialDisclosure', 'https://efd.senate.gov', 'https://api.quiverquant.com/beta/live/congresstrading'], eu_sources=['https://www.europarl.europa.eu/meps/en/declarations'])

In [26]:
scraper.timeout

30

In [27]:
class HouseResponse():
    """Response object for House scraping results"""
    def __init__(self):
        self.status_code: Optional[int] = None
        self.disclosures: List[Dict[str, Any]] = []  # Now stores disclosure metadata dicts
        self.html: Optional[str] = None
        self.error: Optional[str] = None
        self.success: bool = False
        self.form_fields: Optional[dict] = None

In [28]:
import json
import os
from pathlib import Path

MEMBERS_CACHE_FILE = "house_members_cache.json"
CACHE_EXPIRY_HOURS = 24

async def get_house_members(api_key: Optional[str] = None, return_full_data: bool = False) -> List[Dict[str, Any]]:
    """Fetch current House and Senate members with caching
    
    Args:
        api_key: Congress API key (from https://api.congress.gov). 
                 If not provided, will try to load from CONGRESS_API_KEY env var.
        return_full_data: If True, return full member data dictionaries.
                         If False, return just member names (default for backwards compatibility)
    
    Returns:
        List of member data (either full dicts or just names, depending on return_full_data)
    """
    # Get API key from parameter or environment
    if api_key is None:
        api_key = os.getenv("CONGRESS_API_KEY")
    
    if not api_key:
        logger.error("Congress API key is required. Get one at https://api.congress.gov")
        logger.error("Set it as CONGRESS_API_KEY environment variable or pass as parameter")
        return []
    
    cache_path = Path(MEMBERS_CACHE_FILE)
    
    # Check if cache exists and is fresh
    if cache_path.exists():
        try:
            with open(cache_path, 'r') as f:
                cache_data = json.load(f)
                cache_time = datetime.fromisoformat(cache_data["timestamp"])
                if datetime.now() - cache_time < timedelta(hours=CACHE_EXPIRY_HOURS):
                    logger.info(f"Using cached members ({len(cache_data['members'])} members)")
                    if return_full_data:
                        return cache_data.get("full_members", cache_data["members"])
                    else:
                        return cache_data["members"]
        except Exception as e:
            logger.debug(f"Cache read failed: {e}")
    
    # Fetch fresh data from Congress API
    all_members = []
    try:
        async with aiohttp.ClientSession() as session:
            # Get all current members (both House and Senate)
            # The API returns both chambers by default
            url = f"https://api.congress.gov/v3/member?currentMember=true&limit=600&api_key={api_key}"
            logger.info(f"Fetching all current Congress members from API...")
            
            async with session.get(url) as response:
                if response.status == 200:
                    data = await response.json()
                    members = data.get("members", [])
                    
                    logger.info(f"Fetched {len(members)} members from Congress API")
                    
                    # Extract full member data
                    full_members = []
                    for m in members:
                        member_info = {
                            "name": m.get("name", ""),
                            "bioguideId": m.get("bioguideId", ""),
                            "party": m.get("partyName", ""),
                            "state": m.get("state", ""),
                            "district": m.get("district"),  # Only for House members
                            "url": m.get("url", ""),
                            "updateDate": m.get("updateDate", ""),
                        }
                        full_members.append(member_info)
                    
                    # Extract just names for backwards compatibility
                    member_names = [m.get("name", "") for m in members if m.get("name")]
                    
                    # Save to cache
                    cache_data = {
                        "timestamp": datetime.now().isoformat(),
                        "members": member_names,
                        "full_members": full_members,
                    }
                    with open(cache_path, 'w') as f:
                        json.dump(cache_data, f, indent=2)
                    
                    logger.info(f"Cached {len(member_names)} members")
                    
                    if return_full_data:
                        return full_members
                    else:
                        return member_names
                        
                elif response.status == 403:
                    error_data = await response.json()
                    logger.error(f"API key rejected: {error_data}")
                    logger.error("Please verify your Congress API key is valid")
                else:
                    error_text = await response.text()
                    logger.error(f"Congress API returned status {response.status}: {error_text}")
                    
    except Exception as e:
        logger.error(f"Error fetching members: {e}")
    
    return []

In [29]:
# Test get_house_members function
print("=" * 60)
print("Testing get_house_members()...")
print("=" * 60)

# Test 1: Get just names (backwards compatible)
members = await get_house_members()
print(f"\n‚úÖ Fetched {len(members)} member names")
if members:
    print(f"First 5 members: {members[:5]}")

# Test 2: Get full member data
print("\n" + "=" * 60)
print("Testing get_house_members(return_full_data=True)...")
print("=" * 60)

full_members = await get_house_members(return_full_data=True)
print(f"\n‚úÖ Fetched {len(full_members)} full member records")
if full_members:
    print(f"\nFirst member details:")
    for key, value in full_members[0].items():
        print(f"  {key}: {value}")
    
    # Show party breakdown
    parties = {}
    for m in full_members:
        party = m.get("party", "Unknown")
        parties[party] = parties.get(party, 0) + 1
    
    print(f"\nParty breakdown:")
    for party, count in sorted(parties.items(), key=lambda x: x[1], reverse=True):
        print(f"  {party}: {count}")
    
    # Show state count
    states = set(m.get("state", "") for m in full_members)
    print(f"\nRepresented states: {len(states)}")
    
    # Show House vs Senate
    house_members = [m for m in full_members if m.get("district") is not None]
    senate_members = [m for m in full_members if m.get("district") is None]
    print(f"\nHouse members: {len(house_members)}")
    print(f"Senate members: {len(senate_members)}")
    print(f"Total: {len(full_members)}")


INFO:__main__:Using cached members (250 members)
INFO:__main__:Using cached members (250 members)


Testing get_house_members()...

‚úÖ Fetched 250 member names
First 5 members: ['Grijalva, Adelita S.', 'Crawford, Eric A. "Rick"', 'Young, Todd', 'Wyden, Ron', 'Warren, Elizabeth']

Testing get_house_members(return_full_data=True)...

‚úÖ Fetched 250 full member records

First member details:
  name: Grijalva, Adelita S.
  bioguideId: G000606
  party: Democratic
  state: Arizona
  district: 7
  url: https://api.congress.gov/v3/member/G000606?format=json
  updateDate: 2025-11-13T17:05:26Z

Party breakdown:
  Republican: 135
  Democratic: 113
  Independent: 2

Represented states: 53

House members: 143
Senate members: 107
Total: 250


In [30]:
import io
import zipfile
from datetime import datetime

async def scrape_house(year: Optional[int] = None, parse_pdfs: bool = False, max_pdfs: Optional[int] = None) -> HouseResponse:
    """Scrape House disclosures using the ZIP index file approach
    
    This approach downloads the annual index file which contains metadata
    for all filings, rather than trying to scrape the search form.
    
    Args:
        year: Year to scrape (defaults to current year)
        parse_pdfs: If True, download and parse PDF files to extract transactions
        max_pdfs: Maximum number of PDFs to parse (for testing/limiting API calls)
    
    Returns:
        HouseResponse with disclosure metadata and optionally parsed transactions
    """
    house_response = HouseResponse()
    
    if year is None:
        year = datetime.now().year
    
    base_url = "https://disclosures-clerk.house.gov"
    # The ZIP file contains an index of all financial disclosure filings
    zip_url = f"{base_url}/public_disc/financial-pdfs/{year}FD.ZIP"
    
    try:
        logger.info(f"Downloading House disclosure index for {year}...")
        
        async with aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=scraper.timeout * 3),  # Longer timeout for PDFs
            headers={"User-Agent": scraper.user_agent},
        ) as session:
            
            # Download the ZIP index file
            async with session.get(zip_url) as response:
                house_response.status_code = response.status
                
                if response.status != 200:
                    logger.error(f"Failed to download index: {response.status}")
                    house_response.error = f"Failed to download index with status {response.status}"
                    return house_response
                
                zip_content = await response.read()
                logger.info(f"Downloaded index file ({len(zip_content)} bytes)")
                
                # Extract the index file from the ZIP
                try:
                    with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
                        # The ZIP contains a TXT file with the index
                        txt_filename = f"{year}FD.txt"
                        
                        if txt_filename not in z.namelist():
                            logger.error(f"Expected file {txt_filename} not found in ZIP")
                            logger.info(f"Available files: {z.namelist()}")
                            house_response.error = f"Index file {txt_filename} not found in ZIP"
                            return house_response
                        
                        # Read the index file
                        with z.open(txt_filename) as f:
                            index_content = f.read().decode('utf-8', errors='ignore')
                            
                        logger.info(f"Extracted index file")
                        
                        # Parse the tab-separated index file
                        lines = index_content.strip().split('\n')
                        logger.info(f"Found {len(lines)} filing records in index")
                        
                        disclosures = []
                        parsed_count = 0
                        
                        # Skip header line (line 0)
                        for i, line in enumerate(lines[1:], start=1):
                            fields = line.split('\t')
                            
                            if len(fields) < 9:
                                continue  # Skip malformed lines
                            
                            # Extract key information
                            # Field indices: [0]=Prefix, [1]=Last, [2]=First, [3]=Suffix, [4]=FilingType, 
                            #                [5]=StateDst, [6]=Year, [7]=FilingDate, [8]=DocID
                            prefix = fields[0].strip()
                            last_name = fields[1].strip()
                            first_name = fields[2].strip()
                            suffix = fields[3].strip()
                            filing_type = fields[4].strip()
                            state_district = fields[5].strip()
                            file_year = fields[6].strip()
                            filing_date_str = fields[7].strip()
                            doc_id = fields[8].strip()  # Important: strip removes \r
                            
                            if not doc_id or doc_id == 'DocID':  # Skip header or empty
                                continue
                            
                            # Build full name with prefix/suffix
                            name_parts = [p for p in [prefix, first_name, last_name, suffix] if p]
                            full_name = ' '.join(name_parts)
                            
                            # Parse filing date
                            filing_date = None
                            if filing_date_str:
                                try:
                                    filing_date = datetime.strptime(filing_date_str, "%m/%d/%Y")
                                except:
                                    try:
                                        filing_date = datetime.strptime(filing_date_str, "%Y-%m-%d")
                                    except:
                                        pass
                            
                            # Build PDF URL - FIXED: Use financial-pdfs instead of ptr-pdfs
                            pdf_url = f"{base_url}/public_disc/financial-pdfs/{year}/{doc_id}.pdf"
                            
                            # Create disclosure metadata
                            disclosure_info = {
                                "politician_name": full_name,
                                "first_name": first_name,
                                "last_name": last_name,
                                "state_district": state_district,
                                "filing_type": filing_type,
                                "filing_date": filing_date,
                                "doc_id": doc_id,
                                "pdf_url": pdf_url,
                                "year": year,
                                "transactions": [],  # Will be populated if parse_pdfs=True
                            }
                            
                            # Optionally parse PDF for transactions
                            if parse_pdfs:
                                # Check if we've hit the limit
                                if max_pdfs and parsed_count >= max_pdfs:
                                    logger.info(f"Reached max_pdfs limit ({max_pdfs}), skipping remaining PDFs")
                                    disclosures.append(disclosure_info)
                                    continue
                                
                                logger.info(f"Parsing PDF {parsed_count + 1}/{max_pdfs or '‚àû'}: {full_name} ({doc_id})")
                                
                                try:
                                    transactions = await parse_house_pdf(pdf_url, session=session)
                                    disclosure_info["transactions"] = transactions
                                    
                                    if transactions:
                                        logger.info(f"  Found {len(transactions)} transactions")
                                    else:
                                        logger.info(f"  No transactions found")
                                    
                                    parsed_count += 1
                                    
                                    # Rate limiting between PDF downloads
                                    await asyncio.sleep(scraper.request_delay)
                                    
                                except Exception as e:
                                    logger.error(f"  Error parsing PDF: {e}")
                                    disclosure_info["parse_error"] = str(e)
                            
                            disclosures.append(disclosure_info)
                        
                        house_response.disclosures = disclosures
                        house_response.success = True
                        
                        if parse_pdfs:
                            logger.info(f"Successfully parsed {parsed_count} PDFs from {len(disclosures)} total disclosures")
                        else:
                            logger.info(f"Successfully retrieved metadata for {len(disclosures)} House disclosures")
                        
                except zipfile.BadZipFile as e:
                    logger.error(f"Invalid ZIP file: {e}")
                    house_response.error = f"Invalid ZIP file: {e}"
                except Exception as e:
                    logger.error(f"Error extracting ZIP: {e}")
                    house_response.error = f"Error extracting ZIP: {e}"
    
    except Exception as e:
        logger.error(f"Error scraping House: {e}", exc_info=True)
        house_response.error = str(e)
        house_response.success = False
    
    return house_response

In [31]:
# Test the new scrape_house function
print("=" * 70)
print("Testing scrape_house() with ZIP index approach")
print("=" * 70)

house_response = await scrape_house()

print(f"\n{'‚úÖ' if house_response.success else '‚ùå'} Success: {house_response.success}")
print(f"Status Code: {house_response.status_code}")
print(f"Error: {house_response.error if house_response.error else 'None'}")
print(f"Disclosures found: {len(house_response.disclosures)}")

if house_response.disclosures:
    print(f"\nüìä First 5 disclosures:")
    for i, disclosure in enumerate(house_response.disclosures[:5]):
        print(f"\n  {i+1}. {disclosure['politician_name']}")
        print(f"     Type: {disclosure['filing_type']}")
        print(f"     Date: {disclosure['filing_date']}")
        print(f"     Doc ID: {disclosure['doc_id']}")
        print(f"     PDF: {disclosure['pdf_url']}")
    
    # Show some statistics
    print(f"\nüìà Statistics:")
    print(f"   Total filings: {len(house_response.disclosures)}")
    
    # Count by filing type
    filing_types = {}
    for d in house_response.disclosures:
        ft = d.get('filing_type', 'Unknown')
        filing_types[ft] = filing_types.get(ft, 0) + 1
    
    print(f"   Filing types:")
    for ft, count in sorted(filing_types.items(), key=lambda x: x[1], reverse=True):
        print(f"     - {ft}: {count}")
    
    # Show recent filings
    dated_disclosures = [d for d in house_response.disclosures if d.get('filing_date')]
    if dated_disclosures:
        recent = sorted(dated_disclosures, key=lambda x: x['filing_date'], reverse=True)[:5]
        print(f"\n   Most recent 5 filings:")
        for d in recent:
            print(f"     - {d['filing_date'].strftime('%Y-%m-%d')}: {d['politician_name']}")


INFO:__main__:Downloading House disclosure index for 2025...
INFO:__main__:Downloaded index file (55104 bytes)
INFO:__main__:Extracted index file
INFO:__main__:Found 1553 filing records in index
INFO:__main__:Successfully retrieved metadata for 1552 House disclosures


Testing scrape_house() with ZIP index approach

‚úÖ Success: True
Status Code: 200
Error: None
Disclosures found: 1552

üìä First 5 disclosures:

  1. Richard Aaron
     Type: D
     Date: 2025-03-24 00:00:00
     Doc ID: 40003749
     PDF: https://disclosures-clerk.house.gov/public_disc/financial-pdfs/2025/40003749.pdf

  2. William P. Abel
     Type: C
     Date: 2025-10-12 00:00:00
     Doc ID: 10072640
     PDF: https://disclosures-clerk.house.gov/public_disc/financial-pdfs/2025/10072640.pdf

  3. Mr. Rock Adel Aboujaoude Jr.
     Type: C
     Date: 2025-11-12 00:00:00
     Doc ID: 10072809
     PDF: https://disclosures-clerk.house.gov/public_disc/financial-pdfs/2025/10072809.pdf

  4. David Abrevaya
     Type: W
     Date: 2025-05-19 00:00:00
     Doc ID: 8005
     PDF: https://disclosures-clerk.house.gov/public_disc/financial-pdfs/2025/8005.pdf

  5. Katherine M. Abughazaleh
     Type: C
     Date: 2025-05-14 00:00:00
     Doc ID: 10065677
     PDF: https://disclosures-clerk.hou

In [32]:
# Debug: Inspect the index file structure
import aiohttp
import zipfile
import io

async def debug_index_file():
    """Debug the index file to understand its structure"""
    year = 2025
    base_url = "https://disclosures-clerk.house.gov"
    zip_url = f"{base_url}/public_disc/financial-pdfs/{year}FD.ZIP"
    
    async with aiohttp.ClientSession() as session:
        print(f"Downloading: {zip_url}")
        async with session.get(zip_url) as response:
            if response.status != 200:
                print(f"‚ùå Failed to download: {response.status}")
                return
            
            zip_content = await response.read()
            print(f"‚úÖ Downloaded {len(zip_content)} bytes\n")
            
            with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
                print(f"Files in ZIP: {z.namelist()}\n")
                
                txt_filename = f"{year}FD.txt"
                if txt_filename in z.namelist():
                    with z.open(txt_filename) as f:
                        content = f.read().decode('utf-8', errors='ignore')
                    
                    lines = content.strip().split('\n')
                    print(f"Total lines: {len(lines)}\n")
                    
                    # Show first 5 lines to understand structure
                    print("="*80)
                    print("FIRST 5 LINES (RAW):")
                    print("="*80)
                    for i, line in enumerate(lines[:5]):
                        print(f"\nLine {i}:")
                        print(repr(line[:200]))  # Show raw with special chars
                    
                    # Parse and show fields
                    print("\n" + "="*80)
                    print("PARSED FIELDS:")
                    print("="*80)
                    for i, line in enumerate(lines[:5]):
                        fields = line.split('\t')
                        print(f"\nLine {i} - {len(fields)} fields:")
                        for j, field in enumerate(fields):
                            print(f"  [{j}]: {repr(field[:50])}")
                    
                    # Try to construct and test a PDF URL
                    print("\n" + "="*80)
                    print("TESTING PDF URLs:")
                    print("="*80)
                    
                    for i, line in enumerate(lines[:3]):
                        fields = line.split('\t')
                        if len(fields) >= 9:
                            last_name = fields[1].strip()
                            first_name = fields[2].strip()
                            doc_id = fields[8].strip()
                            
                            print(f"\nRecord {i}:")
                            print(f"  Name: {first_name} {last_name}")
                            print(f"  Doc ID: {doc_id}")
                            
                            # Try different URL patterns
                            test_urls = [
                                f"{base_url}/public_disc/ptr-pdfs/{year}/{doc_id}.pdf",
                                f"{base_url}/public_disc/financial-pdfs/{year}/{doc_id}.pdf",
                                f"{base_url}/public_disc/ptr-pdfs/{doc_id}.pdf",
                                f"{base_url}/PublicDisclosure/FinancialDisclosure/ViewMemberReportsPDF?FID={doc_id}",
                            ]
                            
                            for url in test_urls:
                                async with session.head(url) as test_response:
                                    status = test_response.status
                                    icon = "‚úÖ" if status == 200 else "‚ùå"
                                    print(f"    {icon} {status} - {url}")
                                    if status == 200:
                                        break

await debug_index_file()

Downloading: https://disclosures-clerk.house.gov/public_disc/financial-pdfs/2025FD.ZIP
‚úÖ Downloaded 55104 bytes

Files in ZIP: ['2025FD.txt', '2025FD.xml']

Total lines: 1553

FIRST 5 LINES (RAW):

Line 0:
'Prefix\tLast\tFirst\tSuffix\tFilingType\tStateDst\tYear\tFilingDate\tDocID\r'

Line 1:
'\tAaron\tRichard\t\tD\tMI04\t2025\t3/24/2025\t40003749\r'

Line 2:
'\tAbel\tWilliam P.\t\tC\tTX31\t2025\t10/12/2025\t10072640\r'

Line 3:
'Mr.\tAboujaoude\tRock Adel\tJr.\tC\tFL03\t2025\t11/12/2025\t10072809\r'

Line 4:
'\tAbrevaya\tDavid\t\tW\tIL09\t2025\t5/19/2025\t8005\r'

PARSED FIELDS:

Line 0 - 9 fields:
  [0]: 'Prefix'
  [1]: 'Last'
  [2]: 'First'
  [3]: 'Suffix'
  [4]: 'FilingType'
  [5]: 'StateDst'
  [6]: 'Year'
  [7]: 'FilingDate'
  [8]: 'DocID\r'

Line 1 - 9 fields:
  [0]: ''
  [1]: 'Aaron'
  [2]: 'Richard'
  [3]: ''
  [4]: 'D'
  [5]: 'MI04'
  [6]: '2025'
  [7]: '3/24/2025'
  [8]: '40003749\r'

Line 2 - 9 fields:
  [0]: ''
  [1]: 'Abel'
  [2]: 'William P.'
  [3]: ''
  [4]: 'C'
  [5]:

In [33]:
# Test PDF parsing with a small sample
print("=" * 70)
print("Testing scrape_house() WITH PDF PARSING (limited to 3 PDFs)")
print("=" * 70)

house_response = await scrape_house(parse_pdfs=True, max_pdfs=3)

print(f"\n{'‚úÖ' if house_response.success else '‚ùå'} Success: {house_response.success}")
print(f"Status Code: {house_response.status_code}")
print(f"Error: {house_response.error if house_response.error else 'None'}")
print(f"Total disclosures: {len(house_response.disclosures)}")

# Find disclosures with parsed transactions
parsed_disclosures = [d for d in house_response.disclosures if d.get('transactions')]
print(f"Disclosures with parsed transactions: {len(parsed_disclosures)}")

if parsed_disclosures:
    print(f"\nüìä Parsed Disclosures with Transactions:")
    for i, disclosure in enumerate(parsed_disclosures[:5]):
        print(f"\n  {i+1}. {disclosure['politician_name']} - {disclosure['filing_date'].strftime('%Y-%m-%d') if disclosure['filing_date'] else 'N/A'}")
        print(f"     Doc ID: {disclosure['doc_id']}")
        print(f"     Transactions: {len(disclosure['transactions'])}")
        
        for j, txn in enumerate(disclosure['transactions'][:3]):  # Show first 3 transactions
            print(f"\n     Transaction {j+1}:")
            print(f"       Ticker: {txn['ticker']}")
            print(f"       Asset: {txn['asset_name']}")
            print(f"       Type: {txn['transaction_type']}")
            print(f"       Date: {txn['transaction_date'].strftime('%Y-%m-%d') if txn['transaction_date'] else 'N/A'}")
            if txn['amount_min'] or txn['amount_max']:
                print(f"       Amount: ${txn['amount_min']:,} - ${txn['amount_max']:,}")
            elif txn['amount_exact']:
                print(f"       Amount: ${txn['amount_exact']:,}")
        
        if len(disclosure['transactions']) > 3:
            print(f"\n     ... and {len(disclosure['transactions']) - 3} more transactions")

# Count total transactions across all parsed disclosures
total_transactions = sum(len(d.get('transactions', [])) for d in house_response.disclosures)
print(f"\nüìà Total transactions extracted: {total_transactions}")

# Show transaction type breakdown
if total_transactions > 0:
    transaction_types = {}
    all_tickers = set()
    
    for d in house_response.disclosures:
        for txn in d.get('transactions', []):
            txn_type = txn.get('transaction_type', 'Unknown')
            transaction_types[txn_type] = transaction_types.get(txn_type, 0) + 1
            if txn.get('ticker'):
                all_tickers.add(txn['ticker'])
    
    print(f"\nüìä Transaction Types:")
    for txn_type, count in sorted(transaction_types.items(), key=lambda x: x[1], reverse=True):
        print(f"   {txn_type}: {count}")
    
    print(f"\nüè¢ Unique tickers found: {len(all_tickers)}")
    print(f"   Sample tickers: {sorted(list(all_tickers))[:10]}")


INFO:__main__:Downloading House disclosure index for 2025...
INFO:__main__:Downloaded index file (55104 bytes)
INFO:__main__:Extracted index file
INFO:__main__:Found 1553 filing records in index
INFO:__main__:Parsing PDF 1/3: Richard Aaron (40003749)
INFO:__main__:Converting PDF to images (47660 bytes)


Testing scrape_house() WITH PDF PARSING (limited to 3 PDFs)


INFO:__main__:Extracted 1051 characters of text
INFO:__main__:Extracted 0 transactions from PDF
INFO:__main__:  No transactions found
INFO:__main__:Parsing PDF 2/3: William P. Abel (10072640)
INFO:__main__:Converting PDF to images (67622 bytes)
INFO:__main__:Extracted 2121 characters of text
INFO:__main__:Extracted 0 transactions from PDF
INFO:__main__:  No transactions found
INFO:__main__:Parsing PDF 3/3: Mr. Rock Adel Aboujaoude Jr. (10072809)
INFO:__main__:Converting PDF to images (65687 bytes)
INFO:__main__:Extracted 1666 characters of text
INFO:__main__:Extracted 0 transactions from PDF
INFO:__main__:  No transactions found
INFO:__main__:Reached max_pdfs limit (3), skipping remaining PDFs
INFO:__main__:Reached max_pdfs limit (3), skipping remaining PDFs
INFO:__main__:Reached max_pdfs limit (3), skipping remaining PDFs
INFO:__main__:Reached max_pdfs limit (3), skipping remaining PDFs
INFO:__main__:Reached max_pdfs limit (3), skipping remaining PDFs
INFO:__main__:Reached max_pdfs li


‚úÖ Success: True
Status Code: 200
Error: None
Total disclosures: 1552
Disclosures with parsed transactions: 0

üìà Total transactions extracted: 0
