## HCP Filtering
- Get data from a Cohort created from the Cohort Builder UI
- Filter Medical Events table to return only rows with a Modifier
- Upload final dataset to MapView
- Schedule refresh
- Export dataset to S3

### Import Libraries & Connect to Snowflake

In [None]:
# import libraries and create the Komodo client
from komodo.client import Client
from komodo.definitions.models.cohorts.cohort_create import CohortCreate
from dotenv import load_dotenv, set_key
import os
from dotenv import load_dotenv
import pandas as pd
from komodo.snowflake import get_snowflake_connection
from snowflake.connector.pandas_tools import pd_writer
from datetime import datetime
from komodo.dataset import upload_dataset_to_maplab
from time import sleep
 

# Start-up timestamp, captured once before anything else runs.
now = datetime.now()

# Read settings from the local .env file into the environment, then build the client.
load_dotenv()
client = Client()


### Connect to Snowflake
print("--- Connecting to Snowflake ---")

# Account ID comes from the environment (None when KOMODO_ACCOUNT_ID is unset).
account_id = os.environ.get("KOMODO_ACCOUNT_ID")
conn = get_snowflake_connection(account_id)

# Open a cursor and switch the session to the customer role.
curs = conn.cursor()
curs.execute("USE ROLE CUSTOMER_ROLE")

print("--- Success connecting to Snowflake ---")

### Set Cohort Definition & Get Data

In [None]:
# ID of the filter definition the cohort is built from; it is passed as
# "filter_definition_id" in the cohort payload.
cohort_definition_id = "fltr_def_BMEBEGSBOFPMQLJT"  # replace this cohort definition ID

In [None]:
# Assemble the request body for the new cohort
cohort_payload = {
    "name": "Phompholyx Providers Cohort",
    "definition": {
        "cohort_definition": {
            "filters": [
                {
                    "name": "filter_dfn",
                    "filter_definition": {"filter_definition_id": cohort_definition_id},
                    # optional; presumably [start, end] date pairs - confirm against the API docs
                    "time_filter": {"ranges": [["2024-01-01", "2024-01-31"]]},
                }
            ],
            "entities": ["patient"],
            "source_filter": {"version": "release", "include_rejected_claims": False},
        },
        # The outputs requested below will create tables in the MapLab UI as well
        "output_format": {
            "count_entities": False,
            "entities_to_count": [],
            # one snowflake-table output per requested output type
            "outputs": [
                {"output_format": "snowflake-table", "output_type": output_type}
                for output_type in (
                    "plaid-medical-events",
                    "plaid-pharmacy-events",
                    "plaid-patient-demographics",
                    "plaid-patient-geography",
                    "plaid-plans",
                    "plaid-providers",
                )
            ],
        },
    },
    "definition_schema_version": "1.0.0",
}

# Turn the plain dict into a CohortCreate instance
cohort_create = CohortCreate.from_dict(cohort_payload)

In [None]:
### All of the available output types (every one uses the "snowflake-table" output format)
            # "outputs": [
            #     {
            #         "output_format": "snowflake-table",
            #         "output_type": "plaid-medical-events",
            #     },
            #     {
            #         "output_format": "snowflake-table",
            #         "output_type": "plaid-pharmacy-events",
            #     },
            #     {
            #         "output_format": "snowflake-table",
            #         "output_type": "plaid-patient-demographics",
            #     },
            #     {
            #         "output_format": "snowflake-table",
            #         "output_type": "plaid-patient-enrollment",
            #     },
            #     {
            #         "output_format": "snowflake-table",
            #         "output_type": "plaid-patient-geography",
            #     },
            #     {
            #         "output_format": "snowflake-table",
            #         "output_type": "plaid-plans",
            #     },
            #     {
            #         "output_format": "snowflake-table",
            #         "output_type": "plaid-providers",
            #     },
            #     {
            #         "output_format": "snowflake-table",
            #         "output_type": "plaid-patient-closed",
            #     },
            #     {
            #         "output_format": "snowflake-table",
            #         "output_type": "plaid-patient-insurance",
            #     },
            #     {
            #         "output_format": "snowflake-table",
            #         "output_type": "plaid-mortality",
            #     },
            #     {
            #         "output_format": "snowflake-table",
            #         "output_type": "plaid-patient-race-ethnicity",
            #     },
            # ],

In [None]:
# call the create_cohort operation
cohort_create_response = client.definitions.create_cohort(cohort_create)

# save the ID of the cohort
cohort_id = cohort_create_response.id

# print the cohort ID explicitly: a bare expression is only echoed by the
# notebook when it is the last statement of the cell, which this one is not
print(cohort_id)

# persist the cohort ID to the .env file so it can be used across cookbook files
set_key('.env', 'cohort_id', cohort_id)

In [None]:
# cohort_id = "cht_XBWBXEDTIATCKZOK"  # replace this cohort ID if inputting manually

In [None]:
# Poll the cohort run every 5 seconds until it finishes or reports an error.
while True:
    cohort_response = client.definitions.get_cohort(cohort_id)
    cohort_run = cohort_response.cohort_run
    print(f"Cohort status is {cohort_run.status.value}")
    # status exposes .value (enum-like), so compare the value rather than the
    # member itself; comparing the member to a plain str may never be true
    if (
        cohort_run.finished_at is not None
        or cohort_run.error_message is not None
        or cohort_run.status.value == "FINISHED"
    ):
        break
    sleep(5)

# Fail loudly here instead of letting later cells trip over a run without outputs.
if cohort_run.error_message is not None:
    raise RuntimeError(f"Cohort run for {cohort_id} failed: {cohort_run.error_message}")

In [None]:
def execute_snowflake_query(query, conn):
    """Run ``query`` on ``conn`` and hand the result back as a DataFrame.

    Parameters:
    - query: SQL text to execute
    - conn: open database connection passed straight to ``pd.read_sql``

    Returns the DataFrame on success. If reading raises any ``Exception``,
    the error is printed as "Query failed: ..." and ``None`` is returned
    instead of propagating.
    """
    try:
        frame = pd.read_sql(query, conn)
    except Exception as exc:
        print(f"Query failed: {exc}")
        frame = None
    return frame

# Dictionary to store all datasets, keyed by table type
datasets = {}

for item in cohort_response.cohort_run.output.outputs:
    # Look up the table backing this cohort output
    cohort_output_dataset = item.dataset_id
    dataset = client.data_catalog.get_dataset(dataset_id=cohort_output_dataset)
    dataset_table = dataset.manifestations[0].fully_qualified_name

    # Table type = the last two underscore-separated parts of the table name
    table_parts = dataset_table.split('_')
    table_type = '_'.join(table_parts[-2:])

    # Create a query to fetch the data
    query = f"SELECT * FROM {dataset_table}"

    # execute_snowflake_query reports failures itself and returns None,
    # so no additional try/except is needed around the call
    df = execute_snowflake_query(query, conn)
    if df is None:
        print(f"No data returned for {table_type}")
        continue

    datasets[table_type] = df
    print(f"Successfully created dataset for {table_type}")

# Print available datasets with their row counts
print("\nAvailable datasets:")
for loaded_type, loaded_df in datasets.items():
    print(f"- {loaded_type}: {len(loaded_df)} rows")

### Preview Data

In [None]:
# Dictionary to store all dataset references and preview DataFrames
dataset_refs = {}
dataset_previews = {}

# First, collect the fully qualified table name of every cohort output
for item in cohort_response.cohort_run.output.outputs:
    cohort_output_dataset = item.dataset_id
    dataset = client.data_catalog.get_dataset(dataset_id=cohort_output_dataset)
    dataset_table = dataset.manifestations[0].fully_qualified_name
    print(f"Dataset table: {dataset_table}")

    # Table type = the last two underscore-separated parts of the table name
    table_parts = dataset_table.split('_')
    table_type = '_'.join(table_parts[-2:])

    # Store the reference
    dataset_refs[table_type] = dataset_table

# Now create preview DataFrames with limited rows
for table_type, table_name in dataset_refs.items():
    # Create a query that limits the number of rows
    preview_query = f"SELECT * FROM {table_name} LIMIT 100"  # Adjust the limit as needed

    # Only the read itself can fail; keep the try body to that one call
    try:
        preview_df = pd.read_sql(preview_query, conn)
    except Exception as e:
        print(f"Error creating preview for {table_type}: {e}")
    else:
        # Store the preview DataFrame
        dataset_previews[table_type] = preview_df
        print(f"Successfully created preview for {table_type} ({len(preview_df)} rows)")

# Function to display pandas head previews
def display_pandas_previews(previews, rows=5):
    """
    Print a short report for every preview DataFrame.

    For each entry this prints a banner with the dataset name, the shape,
    the comma-separated column names and the first ``rows`` rows.

    Parameters:
    - previews: Dictionary mapping dataset names to pandas DataFrames
    - rows: Number of rows to show in each preview
    """
    banner = "=" * 80
    for label, frame in previews.items():
        print(f"\n{banner}\n{label} PREVIEW:\n{banner}")
        n_rows, n_cols = frame.shape
        print(f"\nShape: {n_rows} rows × {n_cols} columns")
        column_list = ", ".join(frame.columns)
        print(f"Columns: {column_list}")
        print(f"\nFirst {rows} rows:")
        print(frame.head(rows))

# Display the previews (uses the default of 5 rows per dataset)
display_pandas_previews(dataset_previews)

### Analysis

In [None]:
# Locate the medical-events table produced by the current cohort run instead of
# hard-coding the table of one specific run.
medical_events_table = None
for item in cohort_response.cohort_run.output.outputs:
    dataset = client.data_catalog.get_dataset(dataset_id=item.dataset_id)
    candidate_table = dataset.manifestations[0].fully_qualified_name
    # assumes the table name ends in PLAID_MEDICAL_EVENTS
    # (e.g. COHORTS.PROD.COHORT_RUN_CHT_<ID>_0_PLAID_MEDICAL_EVENTS);
    # a trailing quote is stripped in case the identifier is quoted
    if candidate_table.upper().rstrip('"').endswith("PLAID_MEDICAL_EVENTS"):
        medical_events_table = candidate_table
        break

if medical_events_table is None:
    raise LookupError("No PLAID_MEDICAL_EVENTS table found in the cohort run outputs")

# Keep only the medical events that have a modifier
sql_query = f"""
SELECT * FROM {medical_events_table}
    WHERE UPPER(MODIFIERS) IS NOT NULL
;
"""

# Execute the query and fetch results
final_dataset = pd.read_sql(sql_query, conn)

# Show the first rows (echoed because it is the last expression of the cell)
final_dataset.head()

### Save Final Dataset(s) to MapLab

In [None]:
# set the name of the dataset to be uploaded to the Komodo platform
# stamp the name with the time of this upload (not the time captured when the
# notebook started) so that re-running the cell produces a distinct name
final_dataset_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")
final_dataset_dataset_name = "ABECMA_FINAL_DATASET_FROM_COHORT_RUN_" + final_dataset_datetime

# call the upload_dataset_to_maplab function
dataset_upload_dataset = upload_dataset_to_maplab(final_dataset, final_dataset_dataset_name)

# save the ID of the dataset
dataset_id = dataset_upload_dataset.id

# print the dataset ID explicitly: a bare expression is only echoed by the
# notebook when it is the last statement of the cell, which this one is not
print(dataset_id)

# persist the dataset ID to the .env file so it can be used in subsequent cookbook files
from dotenv import load_dotenv, set_key

set_key(".env", "dataset_id", dataset_id)