In [None]:
"""
Test FEC Extraction Pipeline
Tests the complete flow: Client -> Service -> Repos -> Database
"""
import sys
sys.path.insert(0, '/home/claude/fund_lens_etl')

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from fund_lens_etl import config
from fund_lens_etl.clients.fec_client import FECClient
from fund_lens_etl.repos.raw_filing_repo import RawFilingRepo
from fund_lens_etl.repos.fec_staging_repo import FECContributionStagingRepo
from fund_lens_etl.services.fec_service import FECExtractionService

# Setup database connection
# The user name and password are percent-encoded before being embedded in the
# URL. SQLAlchemy parses this string as a URL, so an unescaped "@", ":", "/"
# or "%" inside a credential would be misread as a URL delimiter/escape and
# produce a wrong host or password. str() keeps the rendering the same as the
# plain f-string interpolation for non-string config values.
from urllib.parse import quote

DATABASE_URL = (
    f"postgresql://{quote(str(config.DB_USER), safe='')}"
    f":{quote(str(config.DB_PASSWORD), safe='')}"
    f"@{config.DB_HOST}:{config.DB_PORT}/{config.DB_NAME}"
)
engine = create_engine(DATABASE_URL)
# Session factory bound to the engine; each cell below opens its own session
SessionLocal = sessionmaker(bind=engine)

print("✅ Imports successful")
print(f"📊 Database: {config.DB_NAME}")

In [None]:
# Build the extraction service from explicitly constructed collaborators
fec_client = FECClient()
raw_filing_repo = RawFilingRepo()
fec_staging_repo = FECContributionStagingRepo()

# Collaborators are handed over by keyword so the wiring is visible at a glance
service_dependencies = dict(
    fec_client=fec_client,
    raw_filing_repo=raw_filing_repo,
    fec_staging_repo=fec_staging_repo,
)
service = FECExtractionService(**service_dependencies)

# Report the concrete class behind each collaborator
print("✅ Service initialized")
print(f"   Client: {type(fec_client).__name__}")
print(f"   Raw Filing Repo: {type(raw_filing_repo).__name__}")
print(f"   Staging Repo: {type(fec_staging_repo).__name__}")

In [None]:
# Pull a SMALL SAMPLE of Maryland contributions through the service for testing
session = SessionLocal()

# Keyword arguments for the sample extraction call
sample_request = dict(
    contributor_state="MD",
    two_year_transaction_period=2024,  # 2023-2024 cycle
    source="fec_api",
    max_results=10,  # Just 10 records for testing
)

try:
    result = service.extract_and_store_contributions(session=session, **sample_request)

    # Summarize the fields the service reported back
    print("✅ Extraction complete!")
    print(f"   Contributions fetched: {result['contributions_fetched']}")
    print(f"   Raw filing ID: {result['raw_filing_id']}")
    print(f"   Contributions stored: {result['contributions_stored']}")
    print(f"   File hash: {result['file_hash']}")

except Exception as exc:
    # Show a short message in the cell output, then let the error propagate
    print(f"❌ Extraction failed: {exc}")
    raise
finally:
    # Release the session whether or not the extraction succeeded
    session.close()

In [None]:
# Repeat the identical extraction; the service is expected to flag it as skipped
session = SessionLocal()

# Same parameters as the previous extraction cell
repeat_request = dict(
    contributor_state="MD",
    two_year_transaction_period=2024,
    source="fec_api",
    max_results=10,
)

try:
    result = service.extract_and_store_contributions(session=session, **repeat_request)

    if not result.get('skipped'):
        # No 'skipped' flag: the data went through the insert path again
        print("⚠️  Warning: Data was re-inserted (idempotency may not be working)")
        print(f"   Contributions fetched: {result['contributions_fetched']}")
        print(f"   Raw filing ID: {result['raw_filing_id']}")
        print(f"   Contributions stored: {result['contributions_stored']}")
    else:
        # Truthy 'skipped' flag: the duplicate was detected
        print("✅ Idempotency works! Duplicate data was skipped.")
        print(f"   Original raw_filing_id: {result['raw_filing_id']}")
        print(f"   Contributions stored: {result['contributions_stored']} (should be 0)")

except Exception as exc:
    # Show a short message in the cell output, then let the error propagate
    print(f"❌ Test failed: {exc}")
    raise
finally:
    # Release the session whether or not the call succeeded
    session.close()

In [None]:
# Read back the rows written by the most recent extraction call
from fund_lens_etl.repos.raw_filing_repo import RawFilingRepo
from fund_lens_etl.repos.fec_staging_repo import FECContributionStagingRepo

session = SessionLocal()
raw_filing_repo = RawFilingRepo()
fec_staging_repo = FECContributionStagingRepo()

try:
    # Look the raw filing up by the hash stored in `result` by an earlier cell
    file_hash = result['file_hash']
    raw_filing = raw_filing_repo.get_by_file_hash(session, file_hash)

    if not raw_filing:
        print("❌ Raw filing not found!")
    else:
        print("✅ Raw Filing Record:")
        print(f"   ID: {raw_filing.id}")
        print(f"   Source: {raw_filing.source}")
        print(f"   File URL: {raw_filing.file_url}")
        print(f"   File Hash: {raw_filing.file_hash}")
        print(f"   Ingested At: {raw_filing.ingested_at}")
        # NOTE(review): if the filing class is a SQLAlchemy declarative model,
        # `.metadata` is the reserved MetaData registry rather than a column
        # value — confirm the attribute name against the model.
        print(f"   Metadata: {raw_filing.metadata}")
        print(f"   Raw Content Records: {len(raw_filing.raw_content)}")
        print()

        # Staging rows linked to this raw filing
        staging_contribs = fec_staging_repo.get_by_raw_filing_id(session, raw_filing.id)
        print(f"✅ Staging Contributions: {len(staging_contribs)} records")
        print()

        # Print one representative row, if any were returned
        if staging_contribs:
            first = staging_contribs[0]
            print("   Sample Contribution:")
            print(f"   - ID: {first.id}")
            print(f"   - Committee: {first.cmte_id}")
            print(f"   - Contributor: {first.name}")
            print(f"   - City, State: {first.city}, {first.state}")
            print(f"   - Amount: {first.transaction_amt}")
            print(f"   - Date: {first.transaction_dt}")
            print(f"   - Sub ID: {first.sub_id}")
            print(f"   - Standardized: {first.standardized}")

except Exception as exc:
    # NOTE(review): unlike the extraction cells, this handler does not re-raise,
    # so a failure here is only printed and later cells keep running.
    print(f"❌ Query failed: {exc}")
    import traceback
    traceback.print_exc()
finally:
    # Release the session regardless of the outcome
    session.close()

In [None]:
# Exercise the repository query helpers against the data loaded above
session = SessionLocal()

try:
    # Rows not yet marked standardized (capped at 10)
    unstandardized = fec_staging_repo.get_unstandardized(session, limit=10)
    print(f"✅ Unstandardized contributions: {len(unstandardized)}")
    if unstandardized:
        print(f"   First record ID: {unstandardized[0].id}")
        print(f"   First contributor: {unstandardized[0].name}")
    print()

    # Date range lookup; bounds are passed as YYYYMMDD strings (FEC format)
    date_range_filter = dict(
        start_date="20240101",
        end_date="20241231",
        standardized=False,
    )
    by_date = fec_staging_repo.get_by_date_range(session, **date_range_filter)
    print(f"✅ Contributions in 2024 date range: {len(by_date)}")
    if by_date:
        print(f"   First date: {by_date[0].transaction_dt}")
        print(f"   First amount: {by_date[0].transaction_amt}")
    print()

    # Raw filings by source, limited to a trailing one-week window
    from datetime import datetime, timedelta
    # NOTE(review): datetime.now() is naive local time; if ingested_at is stored
    # in UTC this window may not line up with freshly inserted rows — confirm.
    end_date = datetime.now()
    start_date = end_date - timedelta(weeks=1)  # Last 7 days

    filings = raw_filing_repo.get_by_source_and_date_range(
        session, source="fec_api", start_date=start_date, end_date=end_date
    )
    print(f"✅ Raw filings from last 7 days: {len(filings)}")
    for filing in filings:
        print(f"   - Filing ID {filing.id}: {len(filing.raw_content)} records, hash: {filing.file_hash[:16]}...")

except Exception as exc:
    # Failure is reported with a traceback but not re-raised
    print(f"❌ Query test failed: {exc}")
    import traceback
    traceback.print_exc()
finally:
    # Release the session regardless of the outcome
    session.close()

In [None]:
# List every raw filing for the source, with no date bounds supplied
session = SessionLocal()

try:
    filings = raw_filing_repo.get_by_source_and_date_range(session, source="fec_api")
    print(f"✅ Total raw filings with source='fec_api': {len(filings)}")
    for filing in filings:
        print(f"   - Filing ID {filing.id}: {len(filing.raw_content)} records")
        print(f"     Ingested: {filing.ingested_at}")
        # NOTE(review): on a SQLAlchemy declarative model `.metadata` is the
        # reserved MetaData registry, not a column — confirm the attribute name.
        print(f"     Metadata: {filing.metadata}")
        print()

except Exception as exc:
    # Failure is reported with a traceback but not re-raised
    print(f"❌ Query failed: {exc}")
    import traceback
    traceback.print_exc()
finally:
    # Release the session regardless of the outcome
    session.close()

In [None]:
# Turn on verbose logging so the FEC client's rate-limiting output is visible
import logging

# Root configuration: INFO level with timestamp, logger name, level and message.
# Note: basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Per-logger overrides: the client logs at DEBUG, the service stays at INFO
for _logger_name, _logger_level in (
    ('fund_lens_etl.clients.fec_client', logging.DEBUG),
    ('fund_lens_etl.services.fec_service', logging.INFO),
):
    logging.getLogger(_logger_name).setLevel(_logger_level)

print("✅ Debug logging enabled")

In [None]:
# Larger extraction intended to exercise the client's rate limiting:
# the request spans several API calls instead of a single page

session = SessionLocal()

# Keyword arguments for the larger extraction call
bulk_request = dict(
    contributor_state="MD",
    two_year_transaction_period=2024,
    source="fec_api",
    max_results=500,  # 5 API calls at 100 per page
)

try:
    print("🚀 Starting larger extraction to test rate limiting...")
    print("   This will fetch 500 contributions (5 pages × 100 per page)")
    print("   Watch for rate limit messages in the logs\n")

    result = service.extract_and_store_contributions(session=session, **bulk_request)

    # Summarize the fields the service reported back
    print("\n✅ Large extraction complete!")
    print(f"   Contributions fetched: {result['contributions_fetched']}")
    print(f"   Raw filing ID: {result['raw_filing_id']}")
    print(f"   Contributions stored: {result['contributions_stored']}")
    print(f"   File hash: {result['file_hash']}")

except Exception as exc:
    # Failure is reported with a traceback but not re-raised
    print(f"\n❌ Extraction failed: {exc}")
    import traceback
    traceback.print_exc()
finally:
    # Release the session regardless of the outcome
    session.close()