In [None]:
# Cell 1: Setup
%load_ext autoreload
%autoreload 2

import pandas as pd
from fund_lens_etl.clients.fec import FECAPIClient
from fund_lens_etl.extractors.fec import (
    FECScheduleAExtractor,
    FECCommitteeExtractor,
    FECCandidateExtractor,
)
from fund_lens_etl.loaders.bronze import (
    BronzeFECScheduleALoader,
    BronzeFECCommitteeLoader,
    BronzeFECCandidateLoader,
)
from fund_lens_etl.database import get_db_session
from fund_lens_etl.config import USState

# Announce the plan for the notebook in one write so the lines stay together.
print(
    "\n".join(
        [
            "✓ All imports successful",
            "\nThis notebook will populate bronze layer with MD 2026 data:",
            "  1. Extract & Load ALL Committees",
            "  2. Extract & Load ALL Candidates",
            "  3. Extract & Load Contributions for a few committees",
        ]
    )
)

In [None]:
# Cell 2: Initialize
# One API client instance is handed to every extractor below.
client = FECAPIClient()

# Built in the same order as before: committee, candidate, schedule A.
committee_extractor, candidate_extractor, schedule_a_extractor = (
    extractor_cls(api_client=client)
    for extractor_cls in (
        FECCommitteeExtractor,
        FECCandidateExtractor,
        FECScheduleAExtractor,
    )
)

# Loaders take no constructor arguments; one per bronze target.
committee_loader, candidate_loader, schedule_a_loader = (
    loader_cls()
    for loader_cls in (
        BronzeFECCommitteeLoader,
        BronzeFECCandidateLoader,
        BronzeFECScheduleALoader,
    )
)

print("✓ All extractors and loaders initialized")

In [None]:
# Cell 3: Extract & Load ALL MD Committees
banner = "=" * 60
print(banner)
print("STEP 1: Extract & Load ALL Maryland Committees (2026)")
print(banner)

# Ask the committee extractor for Maryland, 2026 cycle.
committee_df = committee_extractor.extract(state=USState.MD, cycle=2026)
print(f"\nExtracted {len(committee_df)} committees")

# Hand the frame to the bronze loader inside a DB session and report
# whatever count the loader returns.
with get_db_session() as db:
    committees_loaded = committee_loader.load(db, committee_df)
    print(f"✓ Loaded {committees_loaded} committees to bronze_fec_committee")

# Row count per committee_type value in the extracted frame.
print("\nCommittee type breakdown:")
committee_type_counts = committee_df.groupby("committee_type").size()
print(committee_type_counts)

In [None]:
# Cell 4: Extract & Load ALL MD Candidates
banner = "=" * 60
print(banner)
print("STEP 2: Extract & Load ALL Maryland Candidates (2026)")
print(banner)

# Ask the candidate extractor for Maryland, 2026 cycle.
candidate_df = candidate_extractor.extract(state=USState.MD, cycle=2026)
print(f"\nExtracted {len(candidate_df)} candidates")

# Hand the frame to the bronze loader inside a DB session and report
# whatever count the loader returns.
with get_db_session() as db:
    candidates_loaded = candidate_loader.load(db, candidate_df)
    print(f"✓ Loaded {candidates_loaded} candidates to bronze_fec_candidate")

# Row count per office value in the extracted frame.
print("\nOffice breakdown:")
office_counts = candidate_df.groupby("office").size()
print(office_counts)

In [None]:
# Cell 5: Get MD Candidate Committees
banner = "=" * 60
print(banner)
print("STEP 3: Get Maryland Candidate Committees")
print(banner)

# `committees` is consumed by the Schedule A cells further down.
committees = schedule_a_extractor.get_candidate_committees(
    state=USState.MD,
    election_cycle=2026,
)

print(f"Found {len(committees)} MD candidate committees")
print("\nSample:")
# Preview at most the first five entries, numbered from 1.
for position, entry in enumerate(committees[:5], start=1):
    name = entry['committee_name']
    committee_id = entry['committee_id']
    office = entry['office']
    print(f"  {position}. {name} ({committee_id}) - {office}")

In [None]:
# Cell 6: Extract & Load Schedule A for First 3 Committees
print("="*60)
print("STEP 4: Extract & Load Contributions (First 3 Committees)")
print("="*60)

# Per-committee cap so a test run stays small.
record_limit = 500
# Slice once so the final summary can report how many committees were
# attempted without depending on a leaked loop variable (which is undefined
# when `committees` is empty).
selected_committees = committees[:3]
total_loaded = 0

for position, committee in enumerate(selected_committees, start=1):
    print(f"\n{position}. Processing: {committee['committee_name']}")
    print(f"   Committee ID: {committee['committee_id']}")

    # Pages collected for this committee, plus the real number of rows seen.
    # Counting rows (rather than pages * 100) keeps the progress output and
    # the cap correct when a page holds fewer rows than expected.
    all_contributions = []
    records_extracted = 0

    try:
        for page_df, metadata in schedule_a_extractor.extract_schedule_a_pages(
            committee_id=committee['committee_id'],
            election_cycle=2026,
            starting_page=1
        ):
            all_contributions.append(page_df)
            records_extracted += len(page_df)

            # Show progress every 5 pages
            if metadata['page'] % 5 == 0:
                print(f"   Page {metadata['page']}/{metadata['total_pages']}: "
                      f"{records_extracted} records so far...")

            # Stop paging once the testing cap is reached
            if records_extracted >= record_limit:
                print(f"   Reached {record_limit} record limit for testing")
                break

        if records_extracted:
            # Combine all pages
            contributions_df = pd.concat(all_contributions, ignore_index=True)
            print(f"   ✓ Extracted {len(contributions_df)} contributions")

            # Load to bronze
            with get_db_session() as session:
                loaded = schedule_a_loader.load(session, contributions_df)
                total_loaded += loaded
                print(f"   ✓ Loaded {loaded} contributions to bronze_fec_schedule_a")
        else:
            # Covers both "no pages yielded" and "only empty pages yielded".
            print("   No contributions found")

    except Exception as e:
        # Deliberate best effort: report the failure and move on to the
        # next committee instead of aborting the whole cell.
        print(f"   ✗ Error: {e}")

print(f"\n{'='*60}")
print(f"TOTAL: Loaded {total_loaded} contributions across {len(selected_committees)} committees")

In [None]:
# Cell 6 (REVISED): Extract & Load Schedule A - Skip Empty Committees
print("=" * 60)
print("STEP 4: Extract & Load Contributions (First 3 with data)")
print("=" * 60)

total_loaded = 0
committees_processed = 0
target_committees = 3

for committee in committees:
    # Stop as soon as enough committees with data have been loaded.
    if committees_processed >= target_committees:
        break

    print(f"\n{committees_processed + 1}. Checking: {committee['committee_name']}")
    committee_id = committee['committee_id']
    print(f"   Committee ID: {committee_id}")

    # Frames gathered for this committee and a running total of their rows.
    page_frames = []
    record_total = 0

    try:
        pages = schedule_a_extractor.extract_schedule_a_pages(
            committee_id=committee_id,
            election_cycle=2026,
            starting_page=1,
        )
        for page_df, metadata in pages:
            page_no = metadata['page']

            # First page reporting a zero total means there is nothing to fetch.
            if page_no == 1 and metadata['total_count'] == 0:
                print("   ⊘ No contributions found, skipping...")
                break

            page_frames.append(page_df)
            record_total += len(page_df)

            # Progress on the first page and on every fifth page.
            if page_no % 5 == 0 or page_no == 1:
                print(f"   Page {page_no}/{metadata['total_pages']}: "
                      f"{record_total} records so far...")

            # Testing cap: stop paging once 500 rows have been collected.
            if record_total >= 500:
                print("   Reached 500 record limit")
                break

        # Nothing collected: this committee does not count toward the target.
        if not page_frames:
            print("   ⊘ No contributions, skipping...")
            continue

        contributions_df = pd.concat(page_frames, ignore_index=True)
        print(f"   ✓ Extracted {len(contributions_df)} contributions")

        # Load to bronze and add the loader's returned count to the total.
        with get_db_session() as db:
            loaded_now = schedule_a_loader.load(db, contributions_df)
            total_loaded += loaded_now
            print(f"   ✓ Loaded {loaded_now} contributions to bronze_fec_schedule_a")

        committees_processed += 1

    except Exception as err:
        # Report and fall through to the next committee.
        print(f"   ✗ Error: {err}")

print(f"\n{'='*60}")
print(f"TOTAL: Loaded {total_loaded} contributions from {committees_processed} committees")

In [None]:
# Cell 7: Verify Bronze Layer Population (run this again)
from sqlalchemy import func, select

from fund_lens_etl.models.bronze.fec import (
    BronzeFECCandidate,
    BronzeFECCommittee,
    BronzeFECScheduleA,
)

print("="*60)
print("BRONZE LAYER SUMMARY")
print("="*60)

with get_db_session() as session:
    # Row counts for each bronze table.
    committee_count = session.execute(
        select(func.count()).select_from(BronzeFECCommittee)
    ).scalar()

    candidate_count = session.execute(
        select(func.count()).select_from(BronzeFECCandidate)
    ).scalar()

    contribution_count = session.execute(
        select(func.count()).select_from(BronzeFECScheduleA)
    ).scalar()

    print("\n✓ Bronze Layer Populated:")
    print(f"  Committees:    {committee_count:,}")
    print(f"  Candidates:    {candidate_count:,}")
    print(f"  Contributions: {contribution_count:,}")

    # Show which committees have contributions
    print(f"\n{'='*60}")
    print("Contributions by Committee:")

    # The aggregate is labelled `contribution_count` rather than `count`:
    # SQLAlchemy's Row exposes a sequence-style `count()` method, so
    # `row.count` would return that method instead of the column value.
    contribution_total = func.count(BronzeFECScheduleA.sub_id)
    per_committee = session.execute(
        select(
            BronzeFECScheduleA.committee_id,
            contribution_total.label('contribution_count'),
        )
        .group_by(BronzeFECScheduleA.committee_id)
        .order_by(contribution_total.desc())
    ).all()

    # Fetch every committee name in one query instead of one query per row.
    committee_ids = [committee_id for committee_id, _ in per_committee]
    names_by_id = {}
    if committee_ids:
        name_rows = session.execute(
            select(BronzeFECCommittee.committee_id, BronzeFECCommittee.name)
            .where(BronzeFECCommittee.committee_id.in_(committee_ids))
        ).all()
        names_by_id = {committee_id: name for committee_id, name in name_rows}

    # Positional unpacking avoids any attribute-name clash on the Row object.
    for committee_id, n_contributions in per_committee:
        print(f"  {committee_id}: {n_contributions:,} contributions")
        committee_name = names_by_id.get(committee_id)
        if committee_name:
            print(f"    ({committee_name})")

In [None]:
# Diagnostic: Check if duplicates were from API or database
from sqlalchemy import select

from fund_lens_etl.models.bronze.fec import BronzeFECScheduleA

print("="*60)
print("DIAGNOSTIC: Source of Duplicates")
print("="*60)

# Re-extract one page from ALSOBROOKS without loading; only the first page
# yielded by the extractor is kept.
test_df = None
for page_df, metadata in schedule_a_extractor.extract_schedule_a_pages(
    committee_id="C00840017",  # ALSOBROOKS
    election_cycle=2026,
    starting_page=1
):
    test_df = page_df
    break

if test_df is None or test_df.empty:
    # Nothing came back, so there is no `sub_id` column to inspect.
    print("No records returned for this committee - nothing to diagnose")
else:
    print(f"One page extracted: {len(test_df)} records")

    # Check for duplicates in the raw page
    duplicates_in_page = test_df['sub_id'].duplicated().sum()
    print(f"Duplicates within this single page: {duplicates_in_page}")

    # Check unique sub_ids
    unique_count = test_df['sub_id'].nunique()
    print(f"Unique sub_ids in page: {unique_count}")

    # Sample up to 10 sub_ids. `.tolist()` converts numpy scalars to plain
    # Python values, which DB drivers can bind as query parameters.
    sample_sub_ids = test_df['sub_id'].head(10).tolist()
    sample_size = len(sample_sub_ids)

    # Check which of the sampled sub_ids are already in the database
    with get_db_session() as session:
        result = session.execute(
            select(BronzeFECScheduleA.sub_id)
            .where(BronzeFECScheduleA.sub_id.in_(sample_sub_ids))
        )
        db_sub_ids = [row[0] for row in result]

    # Report against the real sample size; a page may hold fewer than 10 rows.
    print(f"\nChecking first {sample_size} sub_ids from extracted page:")
    print(f"  Already in database: {len(db_sub_ids)}/{sample_size}")

    if len(db_sub_ids) > 0:
        print("\n  ✓ Duplicates were from EXISTING database records (UPSERT working correctly)")
    else:
        print("\n  ✗ No existing records found - duplicates were from API")