In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Files you already shared
UHC_CSV = Path("GA_UHC.csv")
GA_WC_FLAT = Path("ga_wc_flat.csv")

# --- Load ---
uhc = pd.read_csv(UHC_CSV, low_memory=False)
wc  = pd.read_csv(GA_WC_FLAT, low_memory=False)

In [2]:
import pandas as pd
import numpy as np

# --- build code -> bucket map ---
bucket_to_codes = {
    "E/M": ["99202","99203","99204","99205","99212","99213","99214","99215",
            "99281","99282","99283","99284","99285"],
    "X-ray": ["72040","72050","72100","72110","73030","73060","73090",
              "73110","73130","73502","73560","73562","73564","73610","73630"],
}
code_to_bucket = {c: b for b, lst in bucket_to_codes.items() for c in lst}

# --- keyword regex for work injury / ortho ---
kw = r'(?i)\b(shoulder|knee|hip|wrist|hand|elbow|ankle|foot|clavicle|' \
     r'fracture|orif|dislocation|sprain|strain|' \
     r'arthro(scop|plast)|injection|epidural|facet|' \
     r'cast|splint|brace|orthosis|' \
     r'physical\s*therapy|occupational\s*therapy|\bPT\b|\bOT\b|' \
     r'emg|nerve\s*conduction|ncs|\bmri\b|\bct\b|x-?ray)\b'

# work on the main uhc dataframe directly
uhc["billing_code_norm"] = uhc["billing_code"].astype(str).str.strip().str.upper()
uhc["ortho_bucket"] = uhc["billing_code_norm"].map(code_to_bucket)
desc = uhc["description"].astype(str).fillna("")

uhc["hit_reason"] = np.select(
    [uhc["ortho_bucket"].notna(), desc.str.contains(kw, regex=True, na=False)],
    ["code", "keyword"],
    default=None
)

uhc["ortho_flag"] = np.where(uhc["hit_reason"].notna(), "yes", "")

# Optionally preview the result
print(f"columns of uhc: {uhc.columns}")
print(f"total rows: {len(uhc)}")
print(f"Total rows flagged as ortho: {uhc['ortho_flag'].eq('yes').sum()}")

  [uhc["ortho_bucket"].notna(), desc.str.contains(kw, regex=True, na=False)],


columns of uhc: Index(['Unnamed: 0', 'provider_group_id', 'npi', 'tin_type', 'tin_value',
       'reporting_entity_name_x', 'reporting_entity_type_x',
       'last_updated_on_x', 'version_x', 'provider_reference_id',
       'negotiated_rate', 'negotiated_type', 'billing_class',
       'expiration_date', 'service_codes', 'billing_code', 'billing_code_type',
       'description', 'name', 'negotiation_arrangement',
       'reporting_entity_name_y', 'reporting_entity_type_y',
       'last_updated_on_y', 'version_y', 'provider_name', 'credentials',
       'gender', 'enumeration_date', 'last_updated', 'addresses',
       'primary_specialty', 'secondary_specialties', 'provider_type',
       'metadata', 'city', 'country', 'fax', 'phone', 'purpose', 'state',
       'street', 'type', 'full_zip', 'zip5', 'ZIP', 'CBSA',
       'USPS_ZIP_PREF_CITY', 'USPS_ZIP_PREF_STATE', 'RES_RATIO', 'BUS_RATIO',
       'OTH_RATIO', 'TOT_RATIO', 'CBSA Code', 'Metropolitan Division Code',
       'CSA Code', 'CBSA T

In [3]:
# Cell — Simple overlap + non-overlap lists (normalized)

import pandas as pd
import numpy as np

def norm_code(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.strip().str.upper()
    s = s.str.replace(r"[^A-Z0-9]", "", regex=True)   # drop spaces/dashes/etc
    is_digits = s.str.fullmatch(r"\d+")               # pure digits → pad to 5 for CPT
    s = s.where(~is_digits, s.str.zfill(5))
    return s

uhc_codes = norm_code(uhc["billing_code"]).replace({"", "NAN"}, pd.NA).dropna().unique()
wc_codes  = norm_code(wc["code"]).replace({"", "NAN"}, pd.NA).dropna().unique()

set_uhc = set(uhc_codes)
set_wc  = set(wc_codes)
inter   = set_uhc & set_wc
uhc_only = sorted(list(set_uhc - set_wc))
wc_only  = sorted(list(set_wc - set_uhc))

a, b, i = len(set_uhc), len(set_wc), len(inter)
pct_a = (i / a * 100) if a else 0.0
pct_b = (i / b * 100) if b else 0.0

print(f"UHC unique billing_code: {a:,}")
print(f"GA WC unique code:       {b:,}")
print(f"Overlap: {i:,}  ({pct_a:.1f}% of UHC, {pct_b:.1f}% of GA WC)")

print(f"\nUHC-only codes: {len(uhc_only):,}  (showing first 25)")
print(uhc_only[:25])

print(f"\nGA WC-only codes: {len(wc_only):,}  (showing first 25)")
print(wc_only[:25])


UHC unique billing_code: 6,420
GA WC unique code:       11,181
Overlap: 5,583  (87.0% of UHC, 49.9% of GA WC)

UHC-only codes: 837  (showing first 25)
['0001A', '0001U', '0002A', '0002M', '0002U', '0003A', '0003M', '0003U', '0004A', '0004M', '0005U', '0006M', '0007M', '0007U', '0008U', '0009U', '0010U', '0011A', '0011M', '0011U', '0012A', '0012M', '0012U', '0013A', '0013M']

GA WC-only codes: 5,598  (showing first 25)
['00001', '00002', '00003', '00004', '00005', '00006', '00007', '00008', '00010', '00011', '00012', '00013', '00014', '00016', '00017', '00018', '00019', '00020', '00021', '00022', '00023', '00024', '00025', '00026', '00027']


In [4]:
# Cell — Build full UHC + GA WC dataframes with GA WC rate mapped in and % of GA rate

import pandas as pd

def norm_code(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.strip().str.upper()
    s = s.str.replace(r"[^A-Z0-9]", "", regex=True)
    is_digits = s.str.fullmatch(r"\d+")
    return s.where(~is_digits, s.str.zfill(5))

# --- Prep GA WC: per-code reference rate (median) ---
wc_full = wc.copy()
wc_full["code_norm"] = norm_code(wc_full["code"])
wc_full["price"] = pd.to_numeric(wc_full["price"], errors="coerce")
wc_rate_map = (
    wc_full.dropna(subset=["code_norm","price"])
           .groupby("code_norm")["price"]
           .median()
           .to_dict()
)

# --- Enhance UHC with GA WC rate + % of GA rate ---
uhc_full = uhc.copy()
uhc_full["code_norm"] = norm_code(uhc_full["billing_code"])
uhc_full["negotiated_rate"] = pd.to_numeric(uhc_full["negotiated_rate"], errors="coerce")
uhc_full["ga_wc_rate"] = uhc_full["code_norm"].map(wc_rate_map)
uhc_full["pct_of_ga_wc"] = (uhc_full["negotiated_rate"] / uhc_full["ga_wc_rate"]) * 100
uhc_full["source"] = "UHC"

# --- Enhance GA WC rows similarly (their own % will be around 100 if same as median) ---
wc_full["ga_wc_rate"] = wc_full["code_norm"].map(wc_rate_map)
wc_full["pct_of_ga_wc"] = (wc_full["price"] / wc_full["ga_wc_rate"]) * 100
wc_full["source"] = "GA_WC"

# --- Combine full rows from both sources (matching & non-matching codes included; pct will be NaN where no match) ---
combined_df = pd.concat([uhc_full, wc_full], ignore_index=True, sort=False)

# Optional: keep only rows where a GA WC reference exists
combined_df = combined_df[combined_df["ga_wc_rate"].notna()].copy()
combined_df = combined_df.dropna(subset=["negotiated_rate"])
display(combined_df.head(2))
print(f"Total rows: {len(combined_df)}")

# save
combined_df.to_csv("combined_df.csv", index=False)

Unnamed: 0.1,Unnamed: 0,provider_group_id,npi,tin_type,tin_value,reporting_entity_name_x,reporting_entity_type_x,last_updated_on_x,version_x,provider_reference_id,...,modifier,component,site_of_service,apc_code,ms_drg_code,si,pi,fud,price,price_note
26187,26187.0,168.0,1700900000.0,ein,201354399.0,UnitedHealthcare of Georgia Inc.,Insurer,2025-08-01,1.0.0,168.0,...,,,,,,,,,,
26188,26188.0,168.0,1700900000.0,ein,201354399.0,UnitedHealthcare of Georgia Inc.,Insurer,2025-08-01,1.0.0,168.0,...,,,,,,,,,,


Total rows: 210145


In [5]:
print(f"sample of combined_df:")
display(combined_df.head(2))
print(f"Total rows: {len(combined_df)}")
print(f"columns: {combined_df.columns}")
# save
combined_df.head(25).to_csv("combined_df.csv", index=False)

sample of combined_df:


Unnamed: 0.1,Unnamed: 0,provider_group_id,npi,tin_type,tin_value,reporting_entity_name_x,reporting_entity_type_x,last_updated_on_x,version_x,provider_reference_id,...,modifier,component,site_of_service,apc_code,ms_drg_code,si,pi,fud,price,price_note
26187,26187.0,168.0,1700900000.0,ein,201354399.0,UnitedHealthcare of Georgia Inc.,Insurer,2025-08-01,1.0.0,168.0,...,,,,,,,,,,
26188,26188.0,168.0,1700900000.0,ein,201354399.0,UnitedHealthcare of Georgia Inc.,Insurer,2025-08-01,1.0.0,168.0,...,,,,,,,,,,


Total rows: 210145
columns: Index(['Unnamed: 0', 'provider_group_id', 'npi', 'tin_type', 'tin_value',
       'reporting_entity_name_x', 'reporting_entity_type_x',
       'last_updated_on_x', 'version_x', 'provider_reference_id',
       'negotiated_rate', 'negotiated_type', 'billing_class',
       'expiration_date', 'service_codes', 'billing_code', 'billing_code_type',
       'description', 'name', 'negotiation_arrangement',
       'reporting_entity_name_y', 'reporting_entity_type_y',
       'last_updated_on_y', 'version_y', 'provider_name', 'credentials',
       'gender', 'enumeration_date', 'last_updated', 'addresses',
       'primary_specialty', 'secondary_specialties', 'provider_type',
       'metadata', 'city', 'country', 'fax', 'phone', 'purpose', 'state',
       'street', 'type', 'full_zip', 'zip5', 'ZIP', 'CBSA',
       'USPS_ZIP_PREF_CITY', 'USPS_ZIP_PREF_STATE', 'RES_RATIO', 'BUS_RATIO',
       'OTH_RATIO', 'TOT_RATIO', 'CBSA Code', 'Metropolitan Division Code',
       'CSA Co

In [6]:
combined_df.rename(columns={'zip5': 'service_zip'}, inplace=True)  # or {'provider_zip': 'service_zip'}
print(f"columns: {combined_df.columns}")

columns: Index(['Unnamed: 0', 'provider_group_id', 'npi', 'tin_type', 'tin_value',
       'reporting_entity_name_x', 'reporting_entity_type_x',
       'last_updated_on_x', 'version_x', 'provider_reference_id',
       'negotiated_rate', 'negotiated_type', 'billing_class',
       'expiration_date', 'service_codes', 'billing_code', 'billing_code_type',
       'description', 'name', 'negotiation_arrangement',
       'reporting_entity_name_y', 'reporting_entity_type_y',
       'last_updated_on_y', 'version_y', 'provider_name', 'credentials',
       'gender', 'enumeration_date', 'last_updated', 'addresses',
       'primary_specialty', 'secondary_specialties', 'provider_type',
       'metadata', 'city', 'country', 'fax', 'phone', 'purpose', 'state',
       'street', 'type', 'full_zip', 'service_zip', 'ZIP', 'CBSA',
       'USPS_ZIP_PREF_CITY', 'USPS_ZIP_PREF_STATE', 'RES_RATIO', 'BUS_RATIO',
       'OTH_RATIO', 'TOT_RATIO', 'CBSA Code', 'Metropolitan Division Code',
       'CSA Code', 'CBSA T

join in the merged df to medicare physician data

In [7]:
# Cell — Add Medicare allowed to /mnt/data/combined_df.csv using ZIP→locality (SQLite), then compute % of Medicare

import sqlite3
import pandas as pd
import numpy as np
from pathlib import Path

DB_PATH = r"compensation_rates.db"   # <-- change me
YEAR    = 2025
DEFAULT_ZIP = "30303"                 # fallback if your CSV lacks a ZIP column

def norm_code(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.strip().str.upper().str.replace(r"[^A-Z0-9]", "", regex=True)
    return s.where(~s.str.fullmatch(r"\d+"), s.str.zfill(5))

def fetch_medicare_by_zip_for_codes(zip_code: str, codes: list[str]) -> pd.DataFrame:
    """Return: columns [zip_code, code, medicare_allowed] for one ZIP + many CPT codes."""
    if not codes:
        return pd.DataFrame(columns=["zip_code","code","medicare_allowed"])
    codes = sorted(set(codes))
    rows = []
    with sqlite3.connect(DB_PATH) as conn:
        conn.row_factory = sqlite3.Row
        # chunk codes to avoid SQLite param limits
        for i in range(0, len(codes), 800):
            chunk = codes[i:i+800]
            placeholders = ",".join(["?"] * len(chunk))
            sql = f"""
                SELECT
                    ? AS zip_code,
                    rvu.procedure_code AS code,
                    (
                      (COALESCE(rvu.work_rvu, 0) * COALESCE(gpci.work_gpci, 0) +
                       COALESCE(rvu.practice_expense_rvu, 0) * COALESCE(gpci.pe_gpci, 0) +
                       COALESCE(rvu.malpractice_rvu, 0) * COALESCE(gpci.mp_gpci, 0))
                      * COALESCE(cf.conversion_factor, 0)
                    ) AS medicare_allowed
                FROM medicare_locality_map mloc
                JOIN medicare_locality_meta meta
                  ON mloc.carrier_code = meta.mac_code AND mloc.locality_code = meta.locality_code
                JOIN cms_gpci gpci
                  ON TRIM(meta.fee_schedule_area) = TRIM(gpci.locality_name)
                 AND mloc.locality_code = gpci.locality_code
                JOIN cms_rvu rvu ON rvu.year = gpci.year
                JOIN cms_conversion_factor cf ON gpci.year = cf.year
                WHERE mloc.zip_code = ?
                  AND gpci.year = ?
                  AND rvu.year = ?
                  AND (rvu.modifier IS NULL OR rvu.modifier = '')
                  AND rvu.procedure_code IN ({placeholders})
            """
            params = [zip_code, zip_code, YEAR, YEAR, *chunk]
            rows.extend(conn.execute(sql, params).fetchall())
    out = pd.DataFrame([dict(r) for r in rows])
    if out.empty:
        return out
    out["code"] = norm_code(out["code"])
    out["medicare_allowed"] = pd.to_numeric(out["medicare_allowed"], errors="coerce")
    return out

df = combined_df

# Identify code & rate columns from your combined view
code_col = "code_norm" if "code_norm" in df.columns else ("code" if "code" in df.columns else "billing_code")
df["code_norm"] = norm_code(df[code_col])

rate_col = next((c for c in ["negotiated_rate","price","uhc_rate_median","ga_wc_rate"] if c in df.columns), None)
df["row_rate"] = pd.to_numeric(df[rate_col], errors="coerce") if rate_col else np.nan

# Ensure a ZIP column for Medicare mapping
zip_col = "service_zip" if "service_zip" in df.columns else ("zip" if "zip" in df.columns else None)
if zip_col is None:
    df["service_zip"] = DEFAULT_ZIP
    zip_col = "service_zip"

df[zip_col] = df[zip_col].astype(str).str.extract(r"(\d{5})")[0].fillna(DEFAULT_ZIP)

# --- Pull Medicare in groups (by ZIP) ---
pairs = df[[zip_col, "code_norm"]].dropna().drop_duplicates()
med_parts = []
for z, grp in pairs.groupby(zip_col):
    codes = grp["code_norm"].dropna().unique().tolist()
    med_parts.append(fetch_medicare_by_zip_for_codes(z, codes))
med = pd.concat(med_parts, ignore_index=True) if med_parts else pd.DataFrame(columns=["zip_code","code","medicare_allowed"])

# --- Merge + compute % of Medicare ---
if not med.empty:
    med = med.rename(columns={"zip_code": zip_col, "code":"code_norm"})
    df = df.merge(med, on=[zip_col, "code_norm"], how="left")
    df["pct_of_medicare"] = df["row_rate"] / df["medicare_allowed"]

# Save an enriched copy next to the original
out_path = ("combined_df_with_medicare.csv")
#df.to_csv(out_path, index=False)
out_path
print(f"columns: {df.columns}")

columns: Index(['Unnamed: 0', 'provider_group_id', 'npi', 'tin_type', 'tin_value',
       'reporting_entity_name_x', 'reporting_entity_type_x',
       'last_updated_on_x', 'version_x', 'provider_reference_id',
       'negotiated_rate', 'negotiated_type', 'billing_class',
       'expiration_date', 'service_codes', 'billing_code', 'billing_code_type',
       'description', 'name', 'negotiation_arrangement',
       'reporting_entity_name_y', 'reporting_entity_type_y',
       'last_updated_on_y', 'version_y', 'provider_name', 'credentials',
       'gender', 'enumeration_date', 'last_updated', 'addresses',
       'primary_specialty', 'secondary_specialties', 'provider_type',
       'metadata', 'city', 'country', 'fax', 'phone', 'purpose', 'state',
       'street', 'type', 'full_zip', 'service_zip', 'ZIP', 'CBSA',
       'USPS_ZIP_PREF_CITY', 'USPS_ZIP_PREF_STATE', 'RES_RATIO', 'BUS_RATIO',
       'OTH_RATIO', 'TOT_RATIO', 'CBSA Code', 'Metropolitan Division Code',
       'CSA Code', 'CBSA T

drop columns

In [8]:
df = df.drop(columns=[
    "reporting_entity_name_y",
    "reporting_entity_type_y",
    "ortho_bucket"
])
df.rename(columns={'reporting_entity_name_x': 'reporting_entity_name'}, inplace=True)
df.rename(columns={'reporting_entity_type_x': 'reporting_entity_type'}, inplace=True)  # or {'provider_zip': 'service_zip'}
df.rename(columns={'negotiated_rate': 'uhc_rate'}, inplace=True)
df
df.rename(columns={'ortho_flag': 'ortho_bucket'}, inplace=True)

In [9]:
df.shape

(210145, 90)

In [10]:
df = df.dropna(subset=["uhc_rate"])
df = df.dropna(subset=["ga_wc_rate"])
df = df.dropna(subset=["medicare_allowed"])
print(f"Total rows: {len(df)}")

Total rows: 209768


In [11]:
df.to_parquet("combined_df.parquet")

In [12]:
df = df.drop(columns=[
    "reporting_entity_name_y",
    "reporting_entity_type_y",
    "ortho_bucket"
])
df.rename(columns={'reporting_entity_name_x': 'reporting_entity_name'}, inplace=True)
df.rename(columns={'reporting_entity_type_x': 'reporting_entity_type'}, inplace=True)  # or {'provider_zip': 'service_zip'}
df.rename(columns={'negotiated_rate': 'uhc_rate'}, inplace=True)
df
df.rename(columns={'ortho_flag': 'ortho_bucket'}, inplace=True)

KeyError: "['reporting_entity_name_y', 'reporting_entity_type_y'] not found in axis"

In [None]:
df.to_csv("combined_df.csv", index=False)

In [None]:
import pandas as pd
sample_df_csv = pd.read_parquet("combined_df.parquet")
unique_codes_df = sample_df_csv[["billing_code", "description"]].drop_duplicates().reset_index(drop=True)
unique_codes_df.to_csv("unique_codes.csv", index=False)



join in the CCS categorization file

In [13]:
RBCS_codes = pd.read_csv("2024 RBCS Taxonomy_CSV.csv")

display(RBCS_codes.head(2))


df = pd.read_parquet("combined_df.parquet")
df = df.merge(RBCS_codes, left_on="billing_code", right_on="HCPCS_Cd", how="left")





df = df.drop(columns=[
    "RBCS_Release_Year",
    "RBCS_Reassigned",
    #"Final_CY_Prior_to_Reassignment",
    "HCPCS_CD_ADD_DT",
    "HCPCS_CD_END_DT"
])

df = df[df["ga_wc_rate"] != 0]

print(f"Number of rows in df: {len(df)}")
print(f"columns in df: {df.columns}")
print(f"columns in RBCS_codes: {RBCS_codes.columns}")



Unnamed: 0,HCPCS_Cd,RBCS_ID,RBCS_Cat,RBCS_Cat_Desc,RBCS_Cat_Subcat,RBCS_SubCat_Desc,RBCS_FamNumb,RBCS_Family_Desc,RBCS_Major_Ind,HCPCS_CD_ADD_DT,HCPCS_CD_END_DT,RBCS_Release_Year,RBCS_Reassigned,Final_CY_Prior_to_Reassignment
0,100,AA000N,A,Anesthesia,AA,Anesthesia,0.0,No RBCS Family,N,01Jan1989,31Dec9999,2020.0,0.0,9999.0
1,102,AA000N,A,Anesthesia,AA,Anesthesia,0.0,No RBCS Family,N,01Jan1989,31Dec9999,2020.0,0.0,9999.0


Number of rows in df: 171169
columns in df: Index(['Unnamed: 0', 'provider_group_id', 'npi', 'tin_type', 'tin_value',
       'reporting_entity_name', 'reporting_entity_type', 'last_updated_on_x',
       'version_x', 'provider_reference_id', 'uhc_rate', 'negotiated_type',
       'billing_class', 'expiration_date', 'service_codes', 'billing_code',
       'billing_code_type', 'description', 'name', 'negotiation_arrangement',
       'last_updated_on_y', 'version_y', 'provider_name', 'credentials',
       'gender', 'enumeration_date', 'last_updated', 'addresses',
       'primary_specialty', 'secondary_specialties', 'provider_type',
       'metadata', 'city', 'country', 'fax', 'phone', 'purpose', 'state',
       'street', 'type', 'full_zip', 'service_zip', 'ZIP', 'CBSA',
       'USPS_ZIP_PREF_CITY', 'USPS_ZIP_PREF_STATE', 'RES_RATIO', 'BUS_RATIO',
       'OTH_RATIO', 'TOT_RATIO', 'CBSA Code', 'Metropolitan Division Code',
       'CSA Code', 'CBSA Title', 'Metropolitan/Micropolitan Statistica

In [None]:
display(df.head(5))
print(df.columns)

Unnamed: 0.1,Unnamed: 0,provider_group_id,npi,tin_type,tin_value,reporting_entity_name,reporting_entity_type,last_updated_on_x,version_x,provider_reference_id,...,HCPCS_Cd_y,RBCS_ID_y,RBCS_Cat_y,RBCS_Cat_Desc_y,RBCS_Cat_Subcat_y,RBCS_SubCat_Desc_y,RBCS_FamNumb_y,RBCS_Family_Desc_y,RBCS_Major_Ind_y,Final_CY_Prior_to_Reassignment _y
58,26245.0,168.0,1700900000.0,ein,201354399.0,UnitedHealthcare of Georgia Inc.,Insurer,2025-08-01,1.0.0,168.0,...,10005,PO000O,P,Procedure,PO,Other Organ Systems,0.0,No RBCS Family,O,9999.0
59,26246.0,168.0,1700900000.0,ein,201354399.0,UnitedHealthcare of Georgia Inc.,Insurer,2025-08-01,1.0.0,168.0,...,10005,PO000O,P,Procedure,PO,Other Organ Systems,0.0,No RBCS Family,O,9999.0
60,26247.0,168.0,1700900000.0,ein,201354399.0,UnitedHealthcare of Georgia Inc.,Insurer,2025-08-01,1.0.0,168.0,...,10005,PO000O,P,Procedure,PO,Other Organ Systems,0.0,No RBCS Family,O,9999.0
61,26248.0,168.0,1700900000.0,ein,201354399.0,UnitedHealthcare of Georgia Inc.,Insurer,2025-08-01,1.0.0,168.0,...,10005,PO000O,P,Procedure,PO,Other Organ Systems,0.0,No RBCS Family,O,9999.0
62,26249.0,168.0,1700900000.0,ein,201354399.0,UnitedHealthcare of Georgia Inc.,Insurer,2025-08-01,1.0.0,168.0,...,10005,PO000O,P,Procedure,PO,Other Organ Systems,0.0,No RBCS Family,O,9999.0


Index(['Unnamed: 0', 'provider_group_id', 'npi', 'tin_type', 'tin_value',
       'reporting_entity_name', 'reporting_entity_type', 'last_updated_on_x',
       'version_x', 'provider_reference_id',
       ...
       'HCPCS_Cd_y', 'RBCS_ID_y', 'RBCS_Cat_y', 'RBCS_Cat_Desc_y',
       'RBCS_Cat_Subcat_y', 'RBCS_SubCat_Desc_y', 'RBCS_FamNumb_y',
       'RBCS_Family_Desc_y', 'RBCS_Major_Ind_y',
       'Final_CY_Prior_to_Reassignment _y'],
      dtype='object', length=110)


In [14]:
df.to_parquet("combined_df.parquet")
df.head(5).to_csv("sample_combined_df.csv")

In [None]:
df = pd.read_parquet("combined_df.parquet")
df = df[df[]]