## HCP Filtering
- Get Datasets from the results of Dataset Builder
- Filter Medical Events table to return only rows with |HE| as a Modifier
- Upload final dataset to MapView
- Schedule refresh

### Import Libraries & Connect to Snowflake

In [2]:
### Import Libraries
import os
from komodo.client import Client
from komodo.definitions.models.cohorts.cohort_create import CohortCreate
from komodo.snowflake import get_snowflake_connection
import pandas as pd
from datetime import datetime
from komodo.dataset import upload_dataset_to_maplab
import pprint
from komodo.analytics import AnalyticDefinitionVersion, AnalyticDefinition, AnalyticDialect, InputType, create_analytic_definition, create_dataflow_from_analytic_definition_version, run_dataflow, check_dataflow_run_status

now = datetime.now()
os.chdir("/home/dragon/workspaces/current/workspace/src/cookbook/")
client = Client()

### Connect to Snowflake
print("--- Connecting to Snowflake ---")

account_id = os.getenv("KOMODO_ACCOUNT_ID")

conn = get_snowflake_connection(account_id)
curs = conn.cursor()
curs.execute("USE ROLE CUSTOMER_ROLE")
print("--- Success connecting to Snowflake ---")

  warn_incompatible_dep(


--- Connecting to Snowflake ---


DEBUG:komodo_connector.connection_creators.snowflake.connect:REST API object was created: f82bd78b-1a87-4b6c-a121-dd977d140a9d.snowflakecomputing.com:443


--- Success connecting to Snowflake ---


### Set Variables

In [3]:
### Cohort ID

cohort_id = "fltr_def_BMEBEGSBOFPMQLJT"  # This is the Abecma Cohort ID which you can get from either the UI or the Komodo Extensions tab in a Workspace

### Dataset IDs from Dataset Builder or through Cookbook 3-retrieve-cohort-data

providers_dataset = "ca2b2a4a-dcb3-42e1-8432-366646254178" # This is the Providers table from the Cohort above
pharmacy_events_dataset = "2ff0e4cb-e282-456c-9106-2414e3ebf646" # This is the Pharmacy Events table from the Cohort above
patient_geography_dataset = "c4838ba0-7e9b-422f-be1f-128d05b3b3d0" # This is the Patient Georgraphy table from the Cohort above
patient_demographics_dataset = "48aaa687-2487-41d1-80bc-a12309da71cd" # This is the Patient Demographics table from the Cohort above
medical_events_dataset = "7322e8b3-6af8-4c5d-a92f-f114be9b3210" # This is the Medical Events table from the Cohort above
insurance_plans_dataset ="660c6778-ba5e-4295-92a9-274eabbb6feb" # This is the Insurnace Plance table from the Cohort above
closed_patients_dataset = "82509b84-9d9f-4a51-8e09-fedc8428c1cd" # This is the Closed Patients table from the Cohort above

### Get SnowflakeLocation & Turn into Pandas Dataframe

In [4]:
def get_snowflake_location(dataset_id: str) -> str:
    get_dataset_response = client.data_catalog.get_dataset(dataset_id)
    return get_dataset_response.manifestations[0].fully_qualified_name

providers_table = get_snowflake_location(providers_dataset)
pharmacy_events_table = get_snowflake_location(pharmacy_events_dataset)
patient_geography_table = get_snowflake_location(patient_geography_dataset)
patient_demographics_table = get_snowflake_location(patient_demographics_dataset)
medical_events_table = get_snowflake_location(medical_events_dataset)
insurance_plans_table = get_snowflake_location(insurance_plans_dataset)
closed_patients_table = get_snowflake_location(closed_patients_dataset)

# print the fully qualified name of the dataset table
print(f"Providers table: {providers_table}")
print(f"Pharmacy Events table: {pharmacy_events_table}")
print(f"Patient Geography table: {patient_geography_table}")
print(f"Patient Demographics table: {patient_demographics_table}")
print(f"Medical Events table: {medical_events_table}")
print(f"Insurance Plans table: {insurance_plans_table}")
print(f"Closed Patients table: {closed_patients_table}")

Providers table: RESULTS.DATAFLOWS.DFB_CA2B2A4A_DCB3_42E1_8432_366646254178
Pharmacy Events table: RESULTS.DATAFLOWS.DFB_2FF0E4CB_E282_456C_9106_2414E3EBF646
Patient Geography table: RESULTS.DATAFLOWS.DFB_C4838BA0_7E9B_422F_BE1F_128D05B3B3D0
Patient Demographics table: RESULTS.DATAFLOWS.DFB_48AAA687_2487_41D1_80BC_A12309DA71CD
Medical Events table: RESULTS.DATAFLOWS.DFB_7322E8B3_6AF8_4C5D_A92F_F114BE9B3210
Insurance Plans table: RESULTS.DATAFLOWS.DFB_660C6778_BA5E_4295_92A9_274EABBB6FEB
Closed Patients table: RESULTS.DATAFLOWS.DFB_82509B84_9D9F_4A51_8E09_FEDC8428C1CD


In [5]:
# turn SF table into pandas DF
providers_data = pd.read_sql(f"select * from {providers_table}", conn)
pharmacy_events_data = pd.read_sql(f"select * from {pharmacy_events_table}", conn)
patient_geography_data = pd.read_sql(f"select * from {patient_geography_table}", conn)
patient_demographics_data = pd.read_sql(f"select * from {patient_demographics_table}", conn)
medical_events_data = pd.read_sql(f"select * from {medical_events_table}", conn)
insurance_plans_data = pd.read_sql(f"select * from {insurance_plans_table}", conn)
closed_patients_data = pd.read_sql(f"select * from {closed_patients_table}", conn)

  providers_data = pd.read_sql(f"select * from {providers_table}", conn)
  pharmacy_events_data = pd.read_sql(f"select * from {pharmacy_events_table}", conn)
  patient_geography_data = pd.read_sql(f"select * from {patient_geography_table}", conn)
  patient_demographics_data = pd.read_sql(f"select * from {patient_demographics_table}", conn)
  medical_events_data = pd.read_sql(f"select * from {medical_events_table}", conn)
  insurance_plans_data = pd.read_sql(f"select * from {insurance_plans_table}", conn)
  closed_patients_data = pd.read_sql(f"select * from {closed_patients_table}", conn)


### Print Previews

In [6]:
# preview the first 5 rows
print(" --- Providers Data ---")
providers_data.head()

 --- Providers Data ---


Unnamed: 0,FIRST_NAME,HCO_PRIMARY_NPI,LAST_NAME,NPI,ORGANIZATION_NAME,PRIMARY_SPECIALTY,PROVIDER_ADDRESS,PROVIDER_CITY,PROVIDER_PHONE_NUMBER,PROVIDER_STATE,PROVIDER_TYPE,PROVIDER_ZIP,SECONDARY_SPECIALTY
0,DWIGHT,1477531580,MACERO,1215312046,,Physician Assistant,825 EASTLAKE AVE E,SEATTLE,2065205000,WA,INDIVIDUAL,98109,
1,ANDREW,1437292927,REZVANI,1609983485,,Internal Medicine,300 PASTEUR DR,STANFORD,6507234000,CA,INDIVIDUAL,94305,Medical Oncology
2,BIJAL,1780653618,SHAH,1508023714,,Internal Medicine,12902 MAGNOLIA DR,TAMPA,8137458212,FL,INDIVIDUAL,33612,Hematology & Oncology
3,JANICE,1164493847,SAGNIS,1497369391,,Nurse Practitioner,1959 NE PACIFIC STREET,SEATTLE,2065438736,WA,INDIVIDUAL,98195,Acute Care
4,DAVID,1912381203,BLATZ,1427661321,,Physician Assistant,309 E 2ND ST,POMONA,9096236116,CA,INDIVIDUAL,91766,


### Additional Analysis / Transformations

In [7]:
sql_query = f"""
SELECT * FROM {medical_events_table} 
    WHERE UPPER(MODIFIERS) IS NOT NULL
;
"""

# Execute the query and fetch results
final_dataset = pd.read_sql(sql_query, conn)

# Print the results
final_dataset.head()

  final_dataset = pd.read_sql(sql_query, conn)


Unnamed: 0,BILLING_NPI,BILLING_NPI_CONFIDENCE,BILL_TYPE_CODE,DIAGNOSIS_CODES,EVENT_SOURCE,KH_PLAN_ID,MEDICAL_EVENT_ID,MODIFIERS,NDC11,PATIENT_ID,...,PROCEDURE_CODE,PROCEDURE_CODE_TYPE,REFERRING_NPI,RENDERING_NPI,REVENUE_CODE,SERVICE_DATE,SERVICE_TO_DATE,UNITS,UNIT_TYPE,VISIT_TYPE
0,1780653618,A - KNOWN,137.0,|C9002|Z01810|I10|Z79899|,INSTITUTIONAL,6411.0,ed87c5fd0983a75201e0d6c45cbe48fb32fd030e0de0a6...,|XE|,,2KQ2VZ1R,...,93005,CPT,,1679869507,730.0,2024-11-20,2024-11-20,1.0,,OUTPATIENT
1,1568693554,A - KNOWN,,|Z86718|E6601|Z6836|,PROFESSIONAL,6411.0,878b39e0b5a3645f8e91a0acfb56d3ffb97e30d9789c96...,|QW|,,2KQ2VZ1R,...,85610,CPT,,1609846633,,2023-02-15,2023-02-15,1.0,,OUTPATIENT
2,1821156035,A - KNOWN,,|D892|N16|K648|K621|,PROFESSIONAL,6411.0,b33c1820ad36ddfd37d18ce85f75deeaf294944c7523f5...,|26|,,2KQ2VZ1R,...,88313,CPT,1003858275.0,1699084145,,2016-12-23,2016-12-23,4.0,,INPATIENT
3,1821156035,A - KNOWN,,|D892|N16|K648|K621|,PROFESSIONAL,6411.0,cb0360c4187201ce556d96c7232e3bc88a1d2f831bab06...,|26|,,2KQ2VZ1R,...,88346,CPT,1003858275.0,1699084145,,2016-12-23,2016-12-23,1.0,,INPATIENT
4,1306833595,A - KNOWN,,|D472|,PROFESSIONAL,6411.0,4e27d4d112c7c6cb051333f6118c547be619ca7149ca4c...,|26|,,2KQ2VZ1R,...,84165,CPT,1679869507.0,1134374002,,2024-12-02,2024-12-02,1.0,,OUTPATIENT


In [8]:
query2 = f"""
SELECT * FROM {medical_events_table}
    JOIN {providers_table} ON {medical_events_table}.RENDERING_NPI = {providers_table}.NPI
;
"""

joined = pd.read_sql(query2, conn)

joined.head()

  joined = pd.read_sql(query2, conn)


Unnamed: 0,BILLING_NPI,BILLING_NPI_CONFIDENCE,BILL_TYPE_CODE,DIAGNOSIS_CODES,EVENT_SOURCE,KH_PLAN_ID,MEDICAL_EVENT_ID,MODIFIERS,NDC11,PATIENT_ID,...,NPI,ORGANIZATION_NAME,PRIMARY_SPECIALTY,PROVIDER_ADDRESS,PROVIDER_CITY,PROVIDER_PHONE_NUMBER,PROVIDER_STATE,PROVIDER_TYPE,PROVIDER_ZIP,SECONDARY_SPECIALTY
0,1871543215,A - KNOWN,133,|C9000|Z92850|,INSTITUTIONAL,6000.0,9e505310413bf19c7a9daf79c1ebc3fe9241ec4e316458...,,,497RVPKP,...,1972868644,,Internal Medicine,300 PASTEUR DR,STANFORD,6507234000,CA,INDIVIDUAL,94305,Hematology
1,1871543215,A - KNOWN,132,|C9000|Z5111|Z79899|,INSTITUTIONAL,6000.0,4e18068999dca24cc12fbec0d8d2c3d6afc0fe57d238c0...,,,497RVPKP,...,1972868644,,Internal Medicine,300 PASTEUR DR,STANFORD,6507234000,CA,INDIVIDUAL,94305,Hematology
2,1871543215,A - KNOWN,133,|C9000|Z92850|,INSTITUTIONAL,6000.0,4c7183efdcab6fde80c96588fb5da523d2806c3ceb1943...,,,497RVPKP,...,1972868644,,Internal Medicine,300 PASTEUR DR,STANFORD,6507234000,CA,INDIVIDUAL,94305,Hematology
3,1871543215,A - KNOWN,131,|C9000|Z006|,INSTITUTIONAL,6000.0,c8194eb314014d70de28a130597258fab3dbfd774c96b6...,,,497RVPKP,...,1972868644,,Internal Medicine,300 PASTEUR DR,STANFORD,6507234000,CA,INDIVIDUAL,94305,Hematology
4,1871543215,A - KNOWN,133,|C9000|Z92850|Z9289|Z79899|,INSTITUTIONAL,6000.0,da7dee6474afa94c6193299189d1bf370ab296c35ca3ea...,,,497RVPKP,...,1972868644,,Internal Medicine,300 PASTEUR DR,STANFORD,6507234000,CA,INDIVIDUAL,94305,Hematology


### Save Analysis Dataset(s) to MapLab

In [9]:
# set the name of the dataset to be uploaded to the Komodo platform
# add the current date and time to the end of the dataset name to make it more distinct
final_dataset_datetime = now.strftime("%Y%m%d_%H%M%S")
final_dataset_dataset_name = "ABECMA_FINAL_DATASET" + final_dataset_datetime

# call the upload_dataset_to_maplab function
dataset_upload_dataset = upload_dataset_to_maplab(final_dataset, final_dataset_dataset_name)

# save the ID of the dataset
dataset_id = dataset_upload_dataset.id

# print the dataset ID
dataset_id

# store the dataset ID as an environment variable that can be used in subsequent cookbook files
from dotenv import load_dotenv, set_key

set_key(".env", "dataset_id", dataset_id)

DEBUG:komodo_connector.connection_creators.snowflake.connect:REST API object was created: f82bd78b-1a87-4b6c-a121-dd977d140a9d.snowflakecomputing.com:443


(True, 'dataset_id', '71f59df3-5812-42d7-99a4-07337a7bffa4')

In [10]:
# set the name of the dataset to be uploaded to the Komodo platform
# add the current date and time to the end of the dataset name to make it more distinct
joined_dataset_datetime = now.strftime("%Y%m%d_%H%M%S")
joined_dataset_dataset_name = "ABECMA_JOINED_DATASET" + joined_dataset_datetime

# call the upload_dataset_to_maplab function
dataset_upload_datasets = upload_dataset_to_maplab(joined, joined_dataset_dataset_name)

# save the ID of the dataset
dataset_id2 = dataset_upload_datasets.id

# print the dataset ID
print(dataset_id2)

# store the dataset ID as an environment variable that can be used in subsequent cookbook files
from dotenv import load_dotenv, set_key

set_key(".env", "dataset_id2", dataset_id2)

DEBUG:komodo_connector.connection_creators.snowflake.connect:REST API object was created: f82bd78b-1a87-4b6c-a121-dd977d140a9d.snowflakecomputing.com:443


41868ded-d503-47d3-898a-b14b88f30b41


(True, 'dataset_id2', '41868ded-d503-47d3-898a-b14b88f30b41')

### Set Refresh Schedule

In [11]:
 # create the Analytic Definition Version 
analytic_definition_version = (
    AnalyticDefinitionVersion.Builder()
    .with_version("1.0.0")
    .with_description("Filters Medical Events Table by Modifiers")
    .with_type(AnalyticDialect.PYTHON)  # Switch PYTHON for SQL if needed
    .with_input("medical_events_table", "DATA_COLLECTION", True)  # Required input. This example is a Snowflake location. Replace "STRING" with "DATA_COLLECTION" if using a Dataset
    .with_output("Abecma_Dataset_From_AD")  # Change name to whatever you need
    .with_file_path("py_analytic.py")  # this references the Python file
    .with_requirements_path("requirements.txt")  
    .build()
)

# create the Analytic Definition
analytic_definition = (
    AnalyticDefinition.Builder()
    .with_name("medical-events-modifiers-filtering") # Change name as needed
    .with_description("Python Analytic that filters Medical Events to |HE| Modifiers") # Change description as needed
    .with_version(analytic_definition_version)  
    .build()
)

In [12]:
### --- This table should be run as is without changes --- ###

# call the create_analytic_definition function with the AnalyticDefinition instance
create_analytic_definition_response = create_analytic_definition(analytic_definition = analytic_definition)

# save the ID of the analytic definition
create_analytic_definition_response_id = create_analytic_definition_response.get('id')

# print the analytic definition ID
print(f"Analytic definition: {create_analytic_definition_response_id}")

# save the ID of the latest analytic definition version
create_analytic_definition_response_latest_id = create_analytic_definition_response.get('latestVersion').get('id')

# print the latest analytic definition version ID
print(f"Analytic definition version: {create_analytic_definition_response_latest_id}")

Analytic definition: ea2be1a1-1514-4b78-9bfc-46eb0c477ae3
Analytic definition version: 1bac2980-3c2b-43f0-b706-33229f22348d


In [13]:
# call the create_dataflow_from_analytic_definition_version function
create_dataflow_response = create_dataflow_from_analytic_definition_version(
    analytic_definition_version_id = create_analytic_definition_response.get("latestVersion").get("id"),
    arguments = {
        "medical_events_table": "7322e8b3-6af8-4c5d-a92f-f114be9b3210", #this is a dataset ID if using datasets
    },
    refresh_schedule = "0 0 7 * *"  # The refresh schedule uses a cron job. This one runs monthly at midnight UTC on the 15th
)

In [14]:
### --- This table should be run as is without changes --- ###

# save the ID of the dataflow
dataflow_id = create_dataflow_response["id"]

# print the dataflow ID
print(f"Dataflow: {dataflow_id}")
 
# call the run_dataflow function
run_dataflow(dataflow_id)

# call the check_dataflow_run_status function
check_dataflow_run_status(dataflow_id)


Dataflow: 26b93ac2-6140-4d47-a024-9c3348789db7


{'status': 'SCHEDULED'}