In [4]:
import polars as pl

# File paths
prov_path = r"C:\Users\ChristopherCato\OneDrive - clarity-dx.com\code\bph\mrf-etl\prod_etl\data\input\providers_ga_uhc_20250907_183348.parquet"
rate_path = r"C:\Users\ChristopherCato\OneDrive - clarity-dx.com\code\bph\mrf-etl\prod_etl\data\input\rates_ga_uhc_20250907_183414.parquet"

# Only read the columns needed for the join (replace with actual column names)
prov_col = "provider_group_id"
rate_col = "provider_reference_id"

# Read only the join columns, lazily for memory efficiency
prov_lazy = pl.scan_parquet(prov_path).select([prov_col])
rate_lazy = pl.scan_parquet(rate_path).select([rate_col])

# Add index columns to track original row positions
prov_lazy = prov_lazy.with_row_count("prov_idx")
rate_lazy = rate_lazy.with_row_count("rate_idx")

# Perform a semi-join to get all rows in rate that have a hit in prov
hits_in_rate = rate_lazy.join(
    prov_lazy,
    left_on=rate_col,
    right_on=prov_col,
    how="semi"
)

# Collect the result (this will only materialize the hits, not the full table)
hits_result = hits_in_rate.collect()

# Show all the hits in rate
print(hits_result)

# Optionally, show how many hits there are and what percent of rate this is
rate_unique = pl.scan_parquet(rate_path).select(rate_col).unique().collect()
hits_unique = hits_result.select(rate_col).unique().height
rate_total = rate_unique.height
hits_pct = (hits_unique / rate_total * 100) if rate_total else 0

print(f"\nUnique {rate_col} in rates: {rate_total}")
print(f"Unique {rate_col} with a hit in providers: {hits_unique} ({hits_pct:.2f}%)")

  prov_lazy = prov_lazy.with_row_count("prov_idx")
  rate_lazy = rate_lazy.with_row_count("rate_idx")


shape: (3_713_605, 2)
┌──────────┬───────────────────────┐
│ rate_idx ┆ provider_reference_id │
│ ---      ┆ ---                   │
│ u32      ┆ i64                   │
╞══════════╪═══════════════════════╡
│ 0        ┆ 251                   │
│ 1        ┆ 251                   │
│ 2        ┆ 484                   │
│ 3        ┆ 484                   │
│ 4        ┆ 711                   │
│ …        ┆ …                     │
│ 3713600  ┆ 803                   │
│ 3713601  ┆ 483                   │
│ 3713602  ┆ 483                   │
│ 3713603  ┆ 696                   │
│ 3713604  ┆ 696                   │
└──────────┴───────────────────────┘

Unique provider_reference_id in rates: 637
Unique provider_reference_id with a hit in providers: 637 (100.00%)


In [5]:
# Diagnostic: Check the actual pg_uid generation
import polars as pl
import hashlib

def md5(s: str) -> str:
    return hashlib.md5(s.encode("utf-8")).hexdigest()

# Read a sample from both files
rates_sample = pl.read_parquet("data/input/rates_ga_uhc_20250907_183414.parquet").head(5)
prov_sample = pl.read_parquet("data/input/providers_ga_uhc_20250907_183348.parquet").head(5)

print("=== RATES SAMPLE ===")
print(rates_sample.select(["reporting_entity_name", "version", "provider_reference_id", "provider_group_id"]))

print("\n=== PROVIDER SAMPLE ===")
print(prov_sample.select(["reporting_entity_name", "version", "provider_group_id", "provider_reference_id"]))

# Check if the same provider_reference_id appears in both files
common_refs = rates_sample.join(
    prov_sample, 
    on="provider_reference_id", 
    how="inner"
).select("provider_reference_id").unique()

print(f"\nCommon provider_reference_ids: {common_refs.height}")

if common_refs.height > 0:
    # Show the pg_uid generation for a common provider
    common_ref = common_refs.item(0, 0)
    
    rates_row = rates_sample.filter(pl.col("provider_reference_id") == common_ref).head(1)
    prov_row = prov_sample.filter(pl.col("provider_reference_id") == common_ref).head(1)
    
    if rates_row.height > 0 and prov_row.height > 0:
        # Generate pg_uid using current logic
        rates_key = f"{rates_row.item(0, 'reporting_entity_name')}|{rates_row.item(0, 'version')}|{rates_row.item(0, 'provider_group_id')}|{rates_row.item(0, 'provider_reference_id')}"
        prov_key = f"{prov_row.item(0, 'reporting_entity_name')}|{prov_row.item(0, 'version')}|{prov_row.item(0, 'provider_group_id')}|{prov_row.item(0, 'provider_reference_id')}"
        
        print(f"\nRates key: {rates_key}")
        print(f"Provider key: {prov_key}")
        print(f"Keys match: {rates_key == prov_key}")
        print(f"Rates pg_uid: {md5(rates_key)}")
        print(f"Provider pg_uid: {md5(prov_key)}")

=== RATES SAMPLE ===


ColumnNotFoundError: provider_group_id

Resolved plan until failure:

	---> FAILED HERE RESOLVING 'sink' <---
DF ["provider_reference_id", "negotiated_rate", "negotiated_type", "billing_class", ...]; PROJECT */15 COLUMNS

In [6]:
import pandas as pd
xref_pg_member_npi = pd.read_parquet("core/data/xrefs/xref_pg_member_npi.parquet")
print(xref_pg_member_npi.columns)
print(xref_pg_member_npi.head())

Index(['pg_uid', 'npi'], dtype='object')
                             pg_uid         npi
0  33c5ebf41b7fe9461b8ccf3202cb6604  1780875781
1  11ce3cbdcf491bc5ea76386e84a55b4d  1386400323
2  9d0a0e0ada1fa8f3ed7df5a0eea6957d  1083664288
3  08c561c102c030d0a494b84b94e8c0a7  1154433902
4  45e4780a665bccba6d35bb962e3c7b00  1184604191
