In [None]:
"""
Gold Transformation Flow Testing Notebook

Tests the complete gold transformation pipeline:
- Silver → Gold transformations
- Entity resolution and deduplication
- Foreign key relationships
"""

import sys
from sqlalchemy import func, select

# Put the local fund-lens-etl checkout on the import path. It is inserted at
# index 0 so it is searched before every other sys.path entry, and it has to
# run before the fund_lens_etl imports below. Replace the placeholder path
# with the location of your own checkout.
sys.path.insert(0, '/path/to/your/fund-lens-etl')  # UPDATE THIS PATH

from fund_lens_etl.database import get_session
from fund_lens_etl.flows.gold_transformation_flow import gold_transformation_flow
from fund_lens_etl.models.gold import (
    GoldCandidate,
    GoldCommittee,
    GoldContribution,
    GoldContributor,
)
from fund_lens_etl.models.silver import (
    SilverFECCandidate,
    SilverFECCommittee,
    SilverFECContribution,
)

print("✓ Imports successful")

In [None]:
"""
Confirm the Silver layer holds rows for the Gold transformation to read.

Prints a row count for each Silver table and warns when the contributions
table is empty.
"""

with get_session() as session:
    # One COUNT(*) per Silver table, issued in committee / candidate /
    # contribution order.
    silver_committees, silver_candidates, silver_contributions = [
        session.execute(select(func.count()).select_from(model)).scalar()
        for model in (SilverFECCommittee, SilverFECCandidate, SilverFECContribution)
    ]

    banner = "=" * 60
    print(banner)
    print("SILVER LAYER (INPUT)")
    print(banner)
    print(f"Committees:    {silver_committees:,}")
    print(f"Candidates:    {silver_candidates:,}")
    print(f"Contributions: {silver_contributions:,}")
    print(banner)

    # Only the contributions count gates the warning; the other two tables
    # are reported but not checked.
    if silver_contributions == 0:
        print("\n⚠ WARNING: No silver data found!")
        print("Run the silver_transformation_flow first.")

In [None]:
"""
Snapshot the Gold layer row counts before the transformation flow runs.
"""

with get_session() as session:
    # One COUNT(*) per Gold table, in the same order the report prints them.
    gold_contributors, gold_committees, gold_candidates, gold_contributions = [
        session.execute(select(func.count()).select_from(model)).scalar()
        for model in (GoldContributor, GoldCommittee, GoldCandidate, GoldContribution)
    ]

    banner = "=" * 60
    print(banner)
    print("GOLD LAYER (BEFORE TRANSFORMATION)")
    print(banner)
    print(f"Contributors:  {gold_contributors:,}")
    print(f"Committees:    {gold_committees:,}")
    print(f"Candidates:    {gold_candidates:,}")
    print(f"Contributions: {gold_contributions:,}")
    print(banner)

In [None]:
"""
Execute the gold transformation flow and print its headline results.

The flow is called with state="MD" and cycle=2026, so this run is scoped by
those arguments rather than being an unfiltered pass.
NOTE(review): presumably these arguments restrict which Silver rows are
transformed - confirm against gold_transformation_flow.
"""

# MD state / 2026 cycle filters for this run.
flow_result = gold_transformation_flow(state="MD", cycle=2026)

banner = "=" * 60
print("\n" + banner)
print("FLOW EXECUTION COMPLETE")
print(banner)
print(f"Success: {flow_result['success']}")

# The remaining lines all read from the nested validation block.
validation = flow_result['validation_stats']
print(f"Validation Passed: {validation['validation_passed']}")
print(f"Errors: {validation['total_errors']}")
print(f"Warnings: {validation['total_warnings']}")

In [None]:
"""
Re-count the Gold tables after the flow has run.

The counts are stored under the same variable names as the "before" cell, so
the earlier values are overwritten once this cell executes.
"""

with get_session() as session:
    # One COUNT(*) per Gold table, in the same order the report prints them.
    gold_contributors, gold_committees, gold_candidates, gold_contributions = [
        session.execute(select(func.count()).select_from(model)).scalar()
        for model in (GoldContributor, GoldCommittee, GoldCandidate, GoldContribution)
    ]

    banner = "=" * 60
    print(banner)
    print("GOLD LAYER (AFTER TRANSFORMATION)")
    print(banner)
    print(f"Contributors:  {gold_contributors:,}")
    print(f"Committees:    {gold_committees:,}")
    print(f"Candidates:    {gold_candidates:,}")
    print(f"Contributions: {gold_contributions:,}")
    print(banner)

In [None]:
"""
Compare distinct contributor names in Silver with contributor rows in Gold.

NOTE(review): the Silver side counts distinct contributor_name values only,
while the Gold side counts rows. If Gold distinguishes contributors by more
than name, the rate printed below could be negative - confirm.
"""

with get_session() as session:
    # COUNT(DISTINCT contributor_name) over the Silver contributions table.
    distinct_names = func.count(func.distinct(SilverFECContribution.contributor_name))
    silver_unique = session.execute(
        select(distinct_names).select_from(SilverFECContribution)
    ).scalar()

    # Plain row count of Gold contributors.
    gold_count = session.execute(
        select(func.count()).select_from(GoldContributor)
    ).scalar()

    # Guard the division: an empty Silver table yields a rate of 0.
    if silver_unique > 0:
        dedup_rate = (silver_unique - gold_count) / silver_unique
    else:
        dedup_rate = 0

    banner = "=" * 60
    print(banner)
    print("CONTRIBUTOR DEDUPLICATION")
    print(banner)
    print(f"Silver unique names:     {silver_unique:,}")
    print(f"Gold deduplicated:       {gold_count:,}")
    print(f"Deduplication rate:      {dedup_rate:.1%}")
    print(banner)

In [None]:
"""
Print one sample row from each Gold table.

Each query is a bare LIMIT 1 with no ORDER BY, so which row is shown is left
to the database. A section whose table is empty prints only its header.
"""

with get_session() as session:
    banner = "=" * 60

    # --- Contributor -------------------------------------------------------
    contributor = session.execute(
        select(GoldContributor).limit(1)
    ).scalar_one_or_none()

    print(banner)
    print("SAMPLE GOLD CONTRIBUTOR")
    print(banner)
    if contributor:
        print(f"ID:         {contributor.id}")
        print(f"Name:       {contributor.name}")
        print(f"Location:   {contributor.city}, {contributor.state}")
        print(f"Employer:   {contributor.employer}")
        print(f"Confidence: {contributor.match_confidence}")

    # --- Committee ---------------------------------------------------------
    committee = session.execute(
        select(GoldCommittee).limit(1)
    ).scalar_one_or_none()

    print("\n" + banner)
    print("SAMPLE GOLD COMMITTEE")
    print(banner)
    if committee:
        print(f"ID:             {committee.id}")
        print(f"Name:           {committee.name}")
        print(f"Type:           {committee.committee_type}")
        print(f"Party:          {committee.party}")
        print(f"FEC ID:         {committee.fec_committee_id}")

    # --- Candidate ---------------------------------------------------------
    candidate = session.execute(
        select(GoldCandidate).limit(1)
    ).scalar_one_or_none()

    print("\n" + banner)
    print("SAMPLE GOLD CANDIDATE")
    print(banner)
    if candidate:
        print(f"ID:         {candidate.id}")
        print(f"Name:       {candidate.name}")
        print(f"Office:     {candidate.office}")
        print(f"State:      {candidate.state}")
        print(f"Party:      {candidate.party}")
        print(f"FEC ID:     {candidate.fec_candidate_id}")

    # --- Contribution (fact row carrying the three foreign keys) -----------
    contribution = session.execute(
        select(GoldContribution).limit(1)
    ).scalar_one_or_none()

    print("\n" + banner)
    print("SAMPLE GOLD CONTRIBUTION (FACT)")
    print(banner)
    if contribution:
        print(f"ID:                    {contribution.id}")
        print(f"Amount:                ${contribution.amount}")
        print(f"Date:                  {contribution.contribution_date}")
        print(f"Contributor FK:        {contribution.contributor_id}")
        print(f"Committee FK:          {contribution.recipient_committee_id}")
        print(f"Candidate FK:          {contribution.recipient_candidate_id}")
        print(f"Type:                  {contribution.contribution_type}")
        print(f"Source:                {contribution.source_system}")

In [None]:
"""
Verify FK relationships by joining the Gold tables.

Fetches one contribution inner-joined to its contributor and recipient
committee, and outer-joined to its recipient candidate, then prints the
combined record. If no row survives the inner joins, the cell says so
explicitly instead of printing nothing.
"""

with get_session() as session:
    # Contributor and committee are inner joins, so a contribution whose FK
    # does not resolve on either side is excluded. Candidate is an outer join
    # and comes back as None when there is no matching candidate row.
    stmt = (
        select(
            GoldContribution,
            GoldContributor,
            GoldCommittee,
            GoldCandidate,
        )
        .join(GoldContributor, GoldContribution.contributor_id == GoldContributor.id)
        .join(GoldCommittee, GoldContribution.recipient_committee_id == GoldCommittee.id)
        .outerjoin(GoldCandidate, GoldContribution.recipient_candidate_id == GoldCandidate.id)
        .limit(1)
    )

    # .first() returns None when the query produced no rows.
    result = session.execute(stmt).first()

    if result is not None:
        contribution, contributor, committee, candidate = result

        print("=" * 60)
        print("COMPLETE CONTRIBUTION WITH RELATIONSHIPS")
        print("=" * 60)
        print(f"\nContributor: {contributor.name}")
        print(f"  Location: {contributor.city}, {contributor.state}")
        print(f"  Employer: {contributor.employer}")
        print(f"\nContribution: ${contribution.amount}")
        print(f"  Date: {contribution.contribution_date}")
        print(f"  Type: {contribution.contribution_type}")
        print(f"\nRecipient Committee: {committee.name}")
        print(f"  Type: {committee.committee_type}")
        print(f"  Party: {committee.party}")

        if candidate:
            print(f"\nRecipient Candidate: {candidate.name}")
            print(f"  Office: {candidate.office}")
            print(f"  Party: {candidate.party}")
        else:
            print("\nRecipient Candidate: None (non-candidate committee)")

        print("=" * 60)
        print("✓ Foreign key relationships working!")
    else:
        # This is the case the check exists to catch, so report it rather
        # than leaving the cell output empty.
        print("=" * 60)
        print("⚠ No contribution could be joined to both a contributor and a committee.")
        print("Gold contributions may be empty, or their foreign keys may not resolve.")
        print("=" * 60)

In [None]:
"""
Final summary of the Gold layer.

Reads `flow_result`, which is assigned by the cell that runs
gold_transformation_flow, so that cell must have been executed first. The
summary is printed only when every top-level section it reads is present;
otherwise a fallback message names the sections that are missing.
"""

# Every top-level key the summary dereferences. Checking all of them keeps a
# partial result from raising KeyError halfway through the report.
REQUIRED_SECTIONS = (
    'contributor_stats',
    'committee_stats',
    'candidate_stats',
    'contribution_stats',
    'validation_stats',
)

print("=" * 60)
print("GOLD LAYER SUMMARY")
print("=" * 60)

if isinstance(flow_result, dict) and all(key in flow_result for key in REQUIRED_SECTIONS):
    contribution_stats = flow_result['contribution_stats']
    validation_stats = flow_result['validation_stats']

    # Contributions reported as written = newly loaded + updated rows.
    contributions_written = contribution_stats['loaded_count'] + contribution_stats['updated_count']

    print("\nTransformation Results:")
    print(f"  Contributors:  {flow_result['contributor_stats']['total_gold_contributors']:,}")
    print(f"  Committees:    {flow_result['committee_stats']['total_committees']:,}")
    print(f"  Candidates:    {flow_result['candidate_stats']['total_candidates']:,}")
    print(f"  Contributions: {contributions_written:,}")

    print("\nValidation:")
    print(f"  Passed: {validation_stats['validation_passed']}")
    print(f"  Errors: {validation_stats['total_errors']}")
    print(f"  Warnings: {validation_stats['total_warnings']}")

    # The detail lists are optional for display: a missing or empty list
    # simply skips its section.
    error_messages = validation_stats.get('errors') or []
    if error_messages:
        print("\nErrors:")
        for error in error_messages:
            print(f"  - {error}")

    warning_messages = validation_stats.get('warnings') or []
    if warning_messages:
        print("\nWarnings:")
        for warning in warning_messages:
            print(f"  - {warning}")

    print("=" * 60)
    print("✓ Gold transformation flow test complete!")
else:
    print("⚠ Unable to display summary - unexpected result structure")
    if isinstance(flow_result, dict):
        missing_sections = [key for key in REQUIRED_SECTIONS if key not in flow_result]
        print(f"  Missing sections: {', '.join(missing_sections)}")