## HCP Filtering
- Get Datasets from the results of Dataset Builder
- Filter Medical Events table to return only rows with |HE| as a Modifier
- Upload final dataset to MapView
- Schedule refresh

### Import Libraries & Connect to Snowflake

In [1]:
### Import Libraries
import os
from komodo.client import Client
from komodo.definitions.models.cohorts.cohort_create import CohortCreate
from komodo.snowflake import get_snowflake_connection
import pandas as pd
from datetime import datetime
from komodo.dataset import upload_dataset_to_maplab
import pprint
from komodo.analytics import AnalyticDefinitionVersion, AnalyticDefinition, AnalyticDialect, InputType, create_analytic_definition, create_dataflow_from_analytic_definition_version, run_dataflow, check_dataflow_run_status

# Timestamp captured once at startup; a later cell formats it into the uploaded dataset's name.
now = datetime.now()
# NOTE(review): absolute, workspace-specific path. Relative paths used afterwards
# (e.g. ".env") resolve against this directory — confirm it exists where this runs.
os.chdir("/home/dragon/workspaces/current/workspace/src/cookbook/")
client = Client()

### Connect to Snowflake
print("--- Connecting to Snowflake ---")

# os.getenv returns None when KOMODO_ACCOUNT_ID is unset; the value is passed on unchecked.
account_id = os.getenv("KOMODO_ACCOUNT_ID")

conn = get_snowflake_connection(account_id)
curs = conn.cursor()
# Switch the session role before any query is issued.
curs.execute("USE ROLE CUSTOMER_ROLE")
print("--- Success connecting to Snowflake ---")

  warn_incompatible_dep(


--- Connecting to Snowflake ---


DEBUG:komodo_connector.connection_creators.snowflake.connect:REST API object was created: f82bd78b-1a87-4b6c-a121-dd977d140a9d.snowflakecomputing.com:443


--- Success connecting to Snowflake ---


### Set Variables

In [2]:
### Cohort ID
# NOTE(review): cohort_id is not referenced by any other cell visible in this notebook.

cohort_id = "fltr_def_NRKYCATAYYCAFIKS"  # This is the Entyvia Cohort ID which you can get from either the UI or the Komodo Extensions tab in a Workspace

### Dataset IDs from Dataset Builder or through Cookbook 3-retrieve-cohort-data
# Each ID is resolved to a fully qualified Snowflake table name in the next section.

rx_2024 = "18a767b3-f04d-49d3-936a-1484f06ecf01"
rx_2025 = "79d88eff-5173-4e4a-b592-08a88eddf90c"
providers_2024 = "fda09832-d237-4966-bd72-ad33fed772bd"
providers_2025 = "5daa29e2-4e05-4fe3-a08e-04ce4b056919"


### Get Snowflake Table Locations & Load into Pandas DataFrames

In [3]:
def get_snowflake_location(dataset_id: str) -> str:
    """Return the fully qualified table name for a data-catalog dataset.

    The dataset is looked up through the notebook-level ``client`` and the
    name is read from the first entry of its ``manifestations``.

    Args:
        dataset_id: ID of the dataset to look up in the data catalog.

    Returns:
        The ``fully_qualified_name`` of the dataset's first manifestation.
    """
    first_manifestation = client.data_catalog.get_dataset(dataset_id).manifestations[0]
    return first_manifestation.fully_qualified_name

# Resolve each dataset ID to the fully qualified name of its Snowflake table.
rx_2024_table = get_snowflake_location(rx_2024)
rx_2025_table = get_snowflake_location(rx_2025)
providers_2024_table = get_snowflake_location(providers_2024)
providers_2025_table = get_snowflake_location(providers_2025)


# print the fully qualified name of each dataset table, labelled by the
# dataset it was actually resolved from
print(f"Rx 2024 table: {rx_2024_table}")
print(f"Rx 2025 table: {rx_2025_table}")
print(f"Providers 2024 table: {providers_2024_table}")
print(f"Providers 2025 table: {providers_2025_table}")


Providers table: RESULTS.DATAFLOWS.DFB_18A767B3_F04D_49D3_936A_1484F06ECF01
Pharmacy Events table: RESULTS.DATAFLOWS.DFB_79D88EFF_5173_4E4A_B592_08A88EDDF90C
Patient Geography table: RESULTS.DATAFLOWS.DFB_FDA09832_D237_4966_BD72_AD33FED772BD
Patient Demographics table: RESULTS.DATAFLOWS.DFB_5DAA29E2_4E05_4FE3_A08E_04CE4B056919


In [4]:
# Pull each Snowflake table into its own pandas DataFrame, in the same order
# as the table names were resolved (one full-table read per table).
rx_2024_data, rx_2025_data, providers_2024_data, providers_2025_data = (
    pd.read_sql(f"select * from {table_name}", conn)
    for table_name in (
        rx_2024_table,
        rx_2025_table,
        providers_2024_table,
        providers_2025_table,
    )
)


  rx_2024_data = pd.read_sql(f"select * from {rx_2024_table}", conn)


### Print Previews

In [None]:
# preview the first 5 rows of the 2024 Rx data; the banner names the frame
# that is actually displayed below
print(" --- Rx 2024 Data ---")
rx_2024_data.head()

 --- Providers Data ---


Unnamed: 0,FIRST_NAME,HCO_PRIMARY_NPI,LAST_NAME,NPI,ORGANIZATION_NAME,PRIMARY_SPECIALTY,PROVIDER_ADDRESS,PROVIDER_CITY,PROVIDER_PHONE_NUMBER,PROVIDER_STATE,PROVIDER_TYPE,PROVIDER_ZIP,SECONDARY_SPECIALTY
0,DWIGHT,1477531580,MACERO,1215312046,,Physician Assistant,825 EASTLAKE AVE E,SEATTLE,2065205000,WA,INDIVIDUAL,98109,
1,ANDREW,1437292927,REZVANI,1609983485,,Internal Medicine,300 PASTEUR DR,STANFORD,6507234000,CA,INDIVIDUAL,94305,Medical Oncology
2,BIJAL,1780653618,SHAH,1508023714,,Internal Medicine,12902 MAGNOLIA DR,TAMPA,8137458212,FL,INDIVIDUAL,33612,Hematology & Oncology
3,JANICE,1164493847,SAGNIS,1497369391,,Nurse Practitioner,1959 NE PACIFIC STREET,SEATTLE,2065438736,WA,INDIVIDUAL,98195,Acute Care
4,DAVID,1912381203,BLATZ,1427661321,,Physician Assistant,309 E 2ND ST,POMONA,9096236116,CA,INDIVIDUAL,91766,


### Additional Analysis / Transformations

In [None]:
# Show every column when previewing wide frames.
pd.set_option('display.max_columns', None)

# Interpolate the table *name* (a string), not the DataFrame loaded from it:
# formatting a DataFrame into the SQL text yields its printed repr, not a table.
# NOTE: LIMIT without ORDER BY returns an arbitrary 1000-row sample.
rx_2024_query = f"""
SELECT * 
FROM {rx_2024_table}
LIMIT 1000
"""

# pd.read_sql executes the query itself, so a separate curs.execute() call
# would only run the same statement a second time.
rx_2024_final = pd.read_sql(rx_2024_query, conn)
print("--- Success querying data ---")

rx_2024_final.head(5)

  final_dataset = pd.read_sql(sql_query, conn)


Unnamed: 0,BILLING_NPI,BILLING_NPI_CONFIDENCE,BILL_TYPE_CODE,DIAGNOSIS_CODES,EVENT_SOURCE,KH_PLAN_ID,MEDICAL_EVENT_ID,MODIFIERS,NDC11,PATIENT_ID,...,PROCEDURE_CODE,PROCEDURE_CODE_TYPE,REFERRING_NPI,RENDERING_NPI,REVENUE_CODE,SERVICE_DATE,SERVICE_TO_DATE,UNITS,UNIT_TYPE,VISIT_TYPE
0,1780653618,A - KNOWN,137.0,|C9002|Z01810|I10|Z79899|,INSTITUTIONAL,6411.0,ed87c5fd0983a75201e0d6c45cbe48fb32fd030e0de0a6...,|XE|,,2KQ2VZ1R,...,93005,CPT,,1679869507,730.0,2024-11-20,2024-11-20,1.0,,OUTPATIENT
1,1568693554,A - KNOWN,,|Z86718|E6601|Z6836|,PROFESSIONAL,6411.0,878b39e0b5a3645f8e91a0acfb56d3ffb97e30d9789c96...,|QW|,,2KQ2VZ1R,...,85610,CPT,,1609846633,,2023-02-15,2023-02-15,1.0,,OUTPATIENT
2,1821156035,A - KNOWN,,|D892|N16|K648|K621|,PROFESSIONAL,6411.0,b33c1820ad36ddfd37d18ce85f75deeaf294944c7523f5...,|26|,,2KQ2VZ1R,...,88313,CPT,1003858275.0,1699084145,,2016-12-23,2016-12-23,4.0,,INPATIENT
3,1821156035,A - KNOWN,,|D892|N16|K648|K621|,PROFESSIONAL,6411.0,cb0360c4187201ce556d96c7232e3bc88a1d2f831bab06...,|26|,,2KQ2VZ1R,...,88346,CPT,1003858275.0,1699084145,,2016-12-23,2016-12-23,1.0,,INPATIENT
4,1306833595,A - KNOWN,,|D472|,PROFESSIONAL,6411.0,4e27d4d112c7c6cb051333f6118c547be619ca7149ca4c...,|26|,,2KQ2VZ1R,...,84165,CPT,1679869507.0,1134374002,,2024-12-02,2024-12-02,1.0,,OUTPATIENT


In [None]:
# Show every column when previewing wide frames.
pd.set_option('display.max_columns', None)

# Interpolate the table *name* (a string), not the DataFrame loaded from it:
# formatting a DataFrame into the SQL text yields its printed repr, not a table.
# NOTE: LIMIT without ORDER BY returns an arbitrary 1000-row sample.
providers_2024_query = f"""
SELECT * 
FROM {providers_2024_table}
LIMIT 1000
"""

# pd.read_sql executes the query itself, so a separate curs.execute() call
# would only run the same statement a second time.
providers_2024_final = pd.read_sql(providers_2024_query, conn)
print("--- Success querying data ---")

providers_2024_final.head(5)

In [None]:
# Show every column when previewing wide frames.
pd.set_option('display.max_columns', None)

# Interpolate the table *name* (a string), not the DataFrame loaded from it:
# formatting a DataFrame into the SQL text yields its printed repr, not a table.
# NOTE: LIMIT without ORDER BY returns an arbitrary 1000-row sample.
rx_2025_query = f"""
SELECT * 
FROM {rx_2025_table}
LIMIT 1000
"""

# pd.read_sql executes the query itself, so a separate curs.execute() call
# would only run the same statement a second time.
rx_2025_final = pd.read_sql(rx_2025_query, conn)
print("--- Success querying data ---")

rx_2025_final.head(5)

In [None]:
# Show every column when previewing wide frames.
pd.set_option('display.max_columns', None)

# Interpolate the table *name* (a string), not the DataFrame loaded from it:
# formatting a DataFrame into the SQL text yields its printed repr, not a table.
# NOTE: LIMIT without ORDER BY returns an arbitrary 1000-row sample.
providers_2025_query = f"""
SELECT * 
FROM {providers_2025_table}
LIMIT 1000
"""

# pd.read_sql executes the query itself, so a separate curs.execute() call
# would only run the same statement a second time.
providers_2025_final = pd.read_sql(providers_2025_query, conn)
print("--- Success querying data ---")

providers_2025_final.head(5)

In [None]:
# Show every column when previewing wide frames.
pd.set_option('display.max_columns', None)

# Prescriber NPIs that appear in the 2025 Rx table but not in the 2024 one.
# The set difference runs in Snowflake, so it must reference the table names
# (strings); the pandas frames rx_2025_final / rx_2024_final cannot be queried
# with SQL and are only 1000-row samples anyway.
unique_query = f"""
SELECT PRESCRIBER_NPI
FROM {rx_2025_table}
EXCEPT
SELECT PRESCRIBER_NPI
FROM {rx_2024_table};
"""

# pd.read_sql executes the query itself, so a separate curs.execute() call
# would only run the same statement a second time.
uniques = pd.read_sql(unique_query, conn)
print("--- Success querying data ---")

# Inner join keeps only providers whose NPI is one of the new prescribers.
# A right join would instead return every row of providers_2025_final, with a
# null PRESCRIBER_NPI for providers that are not new.
# NOTE(review): providers_2025_final is a 1000-row sample, so new prescribers
# outside that sample are dropped here — confirm this is intended.
# NOTE(review): assumes PRESCRIBER_NPI and NPI have compatible dtypes — confirm.
unique_providers = uniques.merge(
    providers_2025_final,
    left_on='PRESCRIBER_NPI',
    right_on='NPI',
    how='inner',
)

print("--- Success joining unique NPIs with providers_2025 ---")
# Last expression of the cell, so the notebook renders it as a rich table.
unique_providers.head(25)

### Save Analysis Dataset(s) to MapLab

In [None]:
# set_key persists the dataset ID into .env; the import is kept local to this cell
from dotenv import load_dotenv, set_key

# set the name of the dataset to be uploaded to the Komodo platform
# add the current date and time to the end of the dataset name to make it more distinct
final_dataset_datetime = now.strftime("%Y%m%d_%H%M%S")
final_dataset_dataset_name = "ENTYVIA_UNIQUE_HCPS" + final_dataset_datetime

# call the upload_dataset_to_maplab function with the joined unique-HCP frame;
# `final_dataset` is never defined in this notebook, `unique_providers` is the
# result of the analysis cell above
dataset_upload_dataset = upload_dataset_to_maplab(unique_providers, final_dataset_dataset_name)

# save the ID of the dataset
dataset_id = dataset_upload_dataset.id

# print the dataset ID (a bare expression in the middle of a cell is not displayed)
print(f"Dataset ID: {dataset_id}")

# store the dataset ID as an environment variable that can be used in subsequent cookbook files
# (".env" is resolved relative to the current working directory)
set_key(".env", "dataset_id", dataset_id)

DEBUG:komodo_connector.connection_creators.snowflake.connect:REST API object was created: f82bd78b-1a87-4b6c-a121-dd977d140a9d.snowflakecomputing.com:443


(True, 'dataset_id', '71f59df3-5812-42d7-99a4-07337a7bffa4')

### Set Refresh Schedule

In [None]:
### --- This cell should be run as is without changes --- ###

# NOTE(review): `analytic_definition` is not defined in any cell visible in this
# notebook; on a fresh kernel this call raises NameError unless an
# AnalyticDefinition instance is built first — confirm where it should come from.
# call the create_analytic_definition function with the AnalyticDefinition instance
create_analytic_definition_response = create_analytic_definition(analytic_definition = analytic_definition)

# save the ID of the analytic definition
create_analytic_definition_response_id = create_analytic_definition_response.get('id')

# print the analytic definition ID
print(f"Analytic definition: {create_analytic_definition_response_id}")

# save the ID of the latest analytic definition version
# (read from the nested 'latestVersion' entry of the response)
create_analytic_definition_response_latest_id = create_analytic_definition_response.get('latestVersion').get('id')

# print the latest analytic definition version ID
print(f"Analytic definition version: {create_analytic_definition_response_latest_id}")

Analytic definition: ea2be1a1-1514-4b78-9bfc-46eb0c477ae3
Analytic definition version: 1bac2980-3c2b-43f0-b706-33229f22348d


In [None]:
# call the create_dataflow_from_analytic_definition_version function
# NOTE(review): the "medical_events_table" argument is a hard-coded dataset ID and
# is not the `dataset_id` produced by the upload cell — confirm which is intended.
create_dataflow_response = create_dataflow_from_analytic_definition_version(
    analytic_definition_version_id = create_analytic_definition_response.get("latestVersion").get("id"),
    arguments = {
        "medical_events_table": "7322e8b3-6af8-4c5d-a92f-f114be9b3210", # this is a dataset ID if using datasets
    },
    refresh_schedule = "0 0 7 * *"  # Cron expression: minute 0, hour 0, day-of-month 7, i.e. monthly at midnight on the 7th (presumably UTC). NOTE(review): an earlier comment said "the 15th", which would be "0 0 15 * *" — confirm the intended day.
)

In [None]:
### --- This cell should be run as is without changes --- ###

# save the ID of the dataflow
dataflow_id = create_dataflow_response["id"]

# print the dataflow ID
print(f"Dataflow: {dataflow_id}")
 
# call the run_dataflow function to trigger a run of the dataflow
run_dataflow(dataflow_id)

# call the check_dataflow_run_status function; as the last expression of the
# cell, its return value is what the notebook displays
check_dataflow_run_status(dataflow_id)


Dataflow: 26b93ac2-6140-4d47-a024-9c3348789db7


{'status': 'SCHEDULED'}