In [None]:
import pandas as pd

# Extract individual providers (Entity Type Code "1") from the NPI file.
# The file is read in chunks of 100,000 rows so it never has to fit in memory.
chunks = pd.read_csv("npidata_pfile.csv", dtype=str, chunksize=100000)

# Source column -> output column name (renamed to match the db schema).
# Using a mapping keeps each rename next to the column it applies to, instead
# of relying on two parallel lists staying in the same order.
provider_columns = {
    "NPI": "npi",
    "Provider First Name": "first_name",
    "Provider Last Name (Legal Name)": "last_name",
    "Provider Middle Name": "middle_name",
    "Provider Credential Text": "credential",
    "Provider First Line Business Mailing Address": "address_1",
    "Provider Second Line Business Mailing Address": "address_2",
    "Provider Business Mailing Address City Name": "city",
    "Provider Business Mailing Address State Name": "state",
    "Provider Business Mailing Address Postal Code": "zip",
    "Provider Business Practice Location Telephone Number": "phone_number",
}

# Header only on the first chunk
first_chunk = True

# Opening the output once in "w" mode truncates anything left over from a
# previous run, so re-running this cell cannot append duplicate rows or a
# second header. newline="" lets the CSV writer control line endings itself.
with open("providers.csv", "w", newline="", encoding="utf-8") as out_file:
    for chunk in chunks:
        # Filter for people
        df_people = chunk[chunk["Entity Type Code"] == "1"]

        # Select only relevant columns and rename them
        providers = df_people[list(provider_columns)].rename(columns=provider_columns)

        # Write each chunk as it is processed, without holding everything in
        # memory. This must happen inside the loop; otherwise only the last
        # chunk would reach the file.
        providers.to_csv(out_file, index=False, header=first_chunk)
        first_chunk = False

In [3]:
import pandas as pd

# Read the taxonomy file, keeping every column as text.
df_tax = pd.read_csv("nucc_taxonomy_251.csv", dtype=str)

# Source column -> database-style column name. Only these columns are kept,
# in this order.
taxonomy_columns = {
    "Code": "code",
    "Grouping": "grouping",
    "Classification": "classification",
    "Specialization": "specialization",
    "Definition": "definition",
    "Display Name": "display_name",
}

# Select the relevant columns and rename them in one step.
taxonomy = df_tax[list(taxonomy_columns)].rename(columns=taxonomy_columns)

# Save the cleaned table (overwrites any existing file).
taxonomy.to_csv("taxonomy.csv", index=False)


In [1]:
import pandas as pd

# Build the provider -> taxonomy link table from the NPI file.
# Read the file in chunks (100,000 rows per chunk)
chunks = pd.read_csv("npidata_pfile.csv", dtype=str, chunksize=100000)

# True until the first block has been written, so the CSV header is emitted
# exactly once.
first_chunk = True

# Opening the output once in "w" mode truncates anything left over from a
# previous run, so re-running this cell cannot append duplicate rows or a
# second header line. newline="" lets the CSV writer control line endings.
with open("provider_taxonomies.csv", "w", newline="", encoding="utf-8") as out_file:
    for chunk in chunks:
        # Filter for people
        df_people = chunk[chunk["Entity Type Code"] == "1"]

        # Go through each possible taxonomy entry (1 to 15)
        for i in range(1, 16):
            tax_col = f"Healthcare Provider Taxonomy Code_{i}"
            prim_col = f"Healthcare Provider Primary Taxonomy Switch_{i}"

            # Skip slots that are not present in this file.
            if tax_col not in df_people.columns or prim_col not in df_people.columns:
                continue

            # Keep only rows that actually have a taxonomy code in this slot.
            has_code = df_people[tax_col].notna()

            # Build the output frame directly from the masked columns rather
            # than assigning into a filtered slice, which can raise
            # SettingWithCopyWarning.
            temp = pd.DataFrame({
                "provider_npi": df_people.loc[has_code, "NPI"],
                "taxonomy_code": df_people.loc[has_code, tax_col],
                # 'Y' becomes True; everything else (including missing) is False
                "primary": df_people.loc[has_code, prim_col] == "Y",
            })

            # Write this block to the output file
            temp.to_csv(out_file, index=False, header=first_chunk)
            first_chunk = False  # Only write the header for the first block
