In [None]:
# Cell 1: Setup
%load_ext autoreload
%autoreload 2

from datetime import date, timedelta

import pandas as pd

from fund_lens_etl.clients.maryland import MarylandCRISClient, MarylandSBEClient
from fund_lens_etl.extractors.maryland import (
    MarylandContributionExtractor,
    MarylandCommitteeExtractor,
    MarylandCandidateExtractor,
)

# Smoke-test the MDCRIS client: build it with default arguments and
# report the base URL it exposes.
print("Testing Maryland CRIS Client...")
cris_client = MarylandCRISClient()
print("✓ CRIS Client initialized")
print("  Base URL: {}".format(cris_client.BASE_URL))

# Same smoke test for the SBE client.
print("\nTesting Maryland SBE Client...")
sbe_client = MarylandSBEClient()
print("✓ SBE Client initialized")
print("  Base URL: {}".format(sbe_client.BASE_URL))

In [None]:
# Cell 2: Test Committee Extractor
banner = "=" * 60
print(banner, "TEST 1: Extract Committees from MDCRIS", banner, sep="\n")

# The committee extractor is driven by the CRIS client built in Cell 1.
committee_extractor = MarylandCommitteeExtractor(client=cris_client)

# Restrict the pull to status "A" (reported below as "active").
df = committee_extractor.extract(status="A")

print("\nExtracted {} active committees".format(len(df)))
if not df.empty:
    print("\nColumns:", df.columns.tolist())

    # Row count per committee type.
    print("\nCommittee types:")
    committees_per_type = df.groupby("committee_type").size()
    print(committees_per_type)

    # First rows, limited to the identifying columns.
    print("\nSample data:")
    preview_columns = ['ccf_id', 'committee_name', 'committee_type', 'committee_status']
    preview_rows = df.head()[preview_columns]
    print(preview_rows.to_string())

In [None]:
# Cell 3: Test Contribution Extractor - Recent Contributions
# `date` and `timedelta` are imported with the other dependencies in the
# setup cell rather than mid-notebook, so later cells that use them do not
# depend on this cell having been run for its import.
print("="*60)
print("TEST 2: Extract Recent Contributions from MDCRIS")
print("="*60)

contribution_extractor = MarylandContributionExtractor(client=cris_client)

# Size of the "recent" window, in days, counted back from today.
LOOKBACK_DAYS = 30

end_date = date.today()
start_date = end_date - timedelta(days=LOOKBACK_DAYS)

print(f"Date range: {start_date} to {end_date}")

df = contribution_extractor.extract(
    start_date=start_date,
    end_date=end_date,
)

print(f"\nExtracted {len(df)} contributions")
if not df.empty:
    print("\nColumns:", df.columns.tolist())
    print("\nContribution types:")
    print(df.groupby("contribution_type").size())
    print("\nSample data:")
    sample_cols = ['receiving_committee', 'contributor_name', 'contribution_amount', 'contribution_date']
    print(df.head()[sample_cols].to_string())

In [None]:
# Cell 4: Test Contribution Extractor - By Filing Year
# Filing year under test; used for both the extract call and the messages.
FILING_YEAR = 2025

print("="*60)
print(f"TEST 3: Extract Contributions by Filing Year ({FILING_YEAR})")
print("="*60)

# Relies on `contribution_extractor` created in Cell 3.
df = contribution_extractor.extract(filing_year=FILING_YEAR)

print(f"\nExtracted {len(df)} contributions for filing year {FILING_YEAR}")
if not df.empty:
    print("\nContributor types:")
    print(df.groupby("contributor_type").size())

    # Cast the amounts to float once and reuse the result for both the grand
    # total and the per-committee totals (previously the cast was repeated
    # for every group inside an apply/lambda).
    # NOTE(review): the cast suggests amounts arrive as strings - confirm
    # against the extractor's output schema.
    amounts = df['contribution_amount'].astype(float)

    print("\nTotal contribution amount: ${:,.2f}".format(amounts.sum()))

    print("\nTop 5 receiving committees:")
    # Group the numeric amounts by the aligned committee column and use the
    # built-in sum instead of a Python-level lambda per group.
    totals_by_committee = amounts.groupby(df['receiving_committee']).sum()
    print(totals_by_committee.sort_values(ascending=False).head())

In [None]:
# Cell 5: Test Candidate Extractor - Current Year
rule = "=" * 60
print(rule, "TEST 4: Extract Candidates from Maryland SBE", rule, sep="\n")

# The candidate extractor is driven by the SBE client built in Cell 1.
candidate_extractor = MarylandCandidateExtractor(client=sbe_client)

# Single-year pull for 2026.
df = candidate_extractor.extract(year=2026)

print("\nExtracted {} candidates for 2026".format(len(df)))
if not df.empty:
    print("\nColumns:", df.columns.tolist())

    # Ten most frequent offices, largest first.
    print("\nOffice breakdown:")
    candidates_per_office = df.groupby("office_name").size()
    print(candidates_per_office.sort_values(ascending=False).head(10))

    # Party and status share the same "heading, then counts" layout.
    for heading, column in (
        ("\nParty breakdown:", "party"),
        ("\nStatus breakdown:", "status"),
    ):
        print(heading)
        print(df.groupby(column).size())

    print("\nSample data:")
    sample_cols = ['candidate_last_name', 'candidate_first_name', 'office_name', 'party', 'status']
    sample_rows = df.head()[sample_cols]
    print(sample_rows.to_string())

In [None]:
# Cell 6: Test Candidate Extractor - Multiple Years
rule = "=" * 60
print(rule, "TEST 5: Extract Candidates for Multiple Years (2024, 2026)", rule, sep="\n")

# Relies on `candidate_extractor` created in Cell 5. The resulting `df` is
# also what Cell 7 inspects for content hashes.
df = candidate_extractor.extract_multiple_years(years=[2024, 2026])

print("\nExtracted {} total candidates".format(len(df)))
if not df.empty:
    # Each entry: heading to print, then the groupby key(s) to count rows by.
    for heading, group_keys in (
        ("\nBy election year:", "election_year"),
        ("\nBy election type:", ["election_year", "election_type"]),
    ):
        print(heading)
        print(df.groupby(group_keys).size())

In [None]:
# Cell 7: Verify Content Hash Uniqueness
print("="*60)
print("TEST 6: Verify Content Hash for Deduplication")
print("="*60)

# Check candidates. `df` here is whatever the previously executed cell left
# behind; in top-to-bottom order that is the multi-year candidate frame from
# Cell 6, so run that cell first.
# The emptiness guard matters: `.iloc[0]` raises IndexError on a frame with
# zero rows.
if not df.empty and 'content_hash' in df.columns:
    print("\nCandidate content_hash uniqueness:")
    print(f"  Total records: {len(df)}")
    print(f"  Unique hashes: {df['content_hash'].nunique()}")
    print(f"  Hash length: {len(df['content_hash'].iloc[0])} characters")
else:
    # Make a skipped check visible instead of passing silently.
    print("\nCandidate content_hash check skipped (no rows or no content_hash column)")

# Also check a contribution sample covering the last 7 days.
# Read the clock once so both ends of the range come from the same day.
today = date.today()
contrib_df = contribution_extractor.extract(
    start_date=today - timedelta(days=7),
    end_date=today,
)
if not contrib_df.empty and 'content_hash' in contrib_df.columns:
    print("\nContribution content_hash uniqueness:")
    print(f"  Total records: {len(contrib_df)}")
    print(f"  Unique hashes: {contrib_df['content_hash'].nunique()}")
    # Only the first 32 characters of the hash are shown.
    print(f"  Sample hash: {contrib_df['content_hash'].iloc[0][:32]}...")
else:
    print("\nContribution content_hash check skipped (no rows or no content_hash column)")

In [None]:
# Cell 8: Summary
# Static recap of the three extractors exercised above, emitted with a single
# print call; every entry becomes one output line (entries starting with "\n"
# add a blank line before them).
divider = "=" * 60
summary_lines = (
    divider,
    "SUMMARY: Maryland Bronze Extractors",
    divider,
    "\n✓ MarylandCommitteeExtractor - Downloads committee data from MDCRIS",
    "  - Uses CCF ID as natural key",
    "  - Supports status filter (A=Active)",
    "\n✓ MarylandContributionExtractor - Downloads contribution data from MDCRIS",
    "  - Generates content hash for deduplication",
    "  - Supports date range and filing year filters",
    "\n✓ MarylandCandidateExtractor - Downloads candidate data from MD SBE",
    "  - Generates content hash for deduplication",
    "  - Handles Primary, General, and Special elections",
    "  - Supports single year and multi-year extraction",
)
print(*summary_lines, sep="\n")