# Create a Cohort then Export the Data to S3

## Import Libraries and Set Cohort ID

In [32]:
import komodo
from komodo.client import Client

# Initialize Komodo Client
client = Client()
cohort_definition_id = "fltr_def_NRKYCATAYYCAFIKS"  # replace this cohort definition ID

## Run Your Cohort

In [7]:
# import the CohortCreate model from the Definitions API/business domain
from komodo.definitions.models.cohorts.cohort_create import CohortCreate

# create the JSON payload
cohort_payload = {
    "name": "Cohort",
    "definition": {
        "cohort_definition": {
            "filters": [{
                "name": "filter_dfn", 
                "filter_definition": {"filter_definition_id": cohort_definition_id},
                "time_filter": {  # optional
                    "ranges": [
                        ["2024-01-01", "2024-01-31"]
                    ]
                }
            }],
            "entities": ["patient"],
            "source_filter": {
                "version": "release", 
                "include_rejected_claims": False
            },
        },
        "output_format": {
            "count_entities": False,
            "entities_to_count": [],
            "outputs": [{
                "output_format": "snowflake-table",
                "output_type": "plaid-providers",  # based on your subscription
            }],
        },
    },
    "definition_schema_version": "1.0.0",
}

# create the CohortCreate instance with the JSON payload
cohort_create = CohortCreate.from_dict(cohort_payload)

In [35]:
# create the Komodo client
from komodo.client import Client
client = Client()

# call the create_cohort operation
cohort_create_response = client.definitions.create_cohort(cohort_create)

# save the ID of the cohort
cohort_id = cohort_create_response.id

# print the cohort ID
cohort_id

# store the cohort ID as an environment variable that can be used across cookbook files
from dotenv import load_dotenv, set_key
set_key('.env', 'cohort_id', cohort_id)

from time import sleep
 
cohort_response = client.definitions.get_cohort(cohort_id)
 
while cohort_response.cohort_run.finished_at is None and cohort_response.cohort_run.error_message is None:
    cohort_response = client.definitions.get_cohort(cohort_id)
    print(f"Cohort status is {cohort_response.cohort_run.status.value}")
    if cohort_response.cohort_run.status == "FINISHED":
        break
    sleep(5)

Cohort status is QUEUED
Cohort status is RUNNING
Cohort status is RUNNING
Cohort status is RUNNING
Cohort status is RUNNING
Cohort status is RUNNING
Cohort status is RUNNING
Cohort status is FINISHED


In [36]:
try:
    get_cohort_response = client.definitions.get_cohort(cohort_id)
except Exception as e:
    print("Exception when calling DefinitionsApi->get_cohort: %s\n" % e)


In [37]:
try:
    aggregation_create_input = komodo.AggregationCreateInput(
        name="Cohort",
        description="Aggregation for demo cohort",
        cohort_id=cohort_id,
        group_by=["primary_specialty"],  # Example group by field
        measures=[{"measure": "internal_medicine"}],  # Example measure
    )
    create_aggregation_response = client.definitions.create_aggregation(aggregation_create_input)
    aggregation_id = create_aggregation_response.id
    print(f"Aggregation created with ID: {aggregation_id}")
except Exception as e:
    print(f"Exception when creating aggregation: {e}")
    exit()

Exception when creating aggregation: 2 validation errors for AggregationCreateInput
aggregate
  Field required [type=missing, input_value={'name': 'Cohort', 'descr...: 'internal_medicine'}]}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
cohort_definition
  Field required [type=missing, input_value={'name': 'Cohort', 'descr...: 'internal_medicine'}]}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing


In [19]:
# create the Komodo client
from komodo.client import Client

client = Client()

# call the get_cohort operation
cohort_response = client.definitions.get_cohort(cohort_id=cohort_id)

# retrieve the dataset ID of the cohort output
cohort_snowflake_dataset_id = cohort_response.cohort_run.output.outputs[0].dataset_id

# call the get_dataset operation
cohort_snowflake_dataset2 = client.data_catalog.get_dataset(dataset_id=cohort_snowflake_dataset_id)

# retrieve the fully qualified name of the dataset table
cohort_snowflake_dataset_table2 = cohort_snowflake_dataset2.manifestations[0].fully_qualified_name

# print the fully qualified name of the dataset table
print(f"Dataset table: {cohort_snowflake_dataset_table}")

Dataset table: COHORTS.PROD.COHORT_RUN_CHT_OLOEBCQUBIOFMBTP_0_PLAID_PROVIDERS


## Convert Snowflake to Pandas 

In [22]:
# import pandas and the get_snowflake_connection function
import pandas as pd
from komodo.snowflake import get_snowflake_connection

# call the get_snowflake_connection and declare a Snowflake cursor
sf_connection = get_snowflake_connection()
sf_cursor = sf_connection.cursor()

# set the role of the cursor to CUSTOMER_ROLE to read and write tables
sf_cursor.execute("USE ROLE CUSTOMER_ROLE")

# retrieve the first 1,000 rows of the cohort data
cohort_data2 = pd.read_sql(f"select * from {cohort_snowflake_dataset_table2} LIMIT 1000", sf_connection)

# preview the first 5 rows of the current Snowflake table/Pandas DataFrame
cohort_data2.head()

DEBUG:komodo_connector.connection_creators.snowflake.connect:REST API object was created: f82bd78b-1a87-4b6c-a121-dd977d140a9d.snowflakecomputing.com:443
  cohort_data2 = pd.read_sql(f"select * from {cohort_snowflake_dataset_table2} LIMIT 1000", sf_connection)


Unnamed: 0,NPI,PROVIDER_TYPE,FIRST_NAME,LAST_NAME,ORGANIZATION_NAME,HCO_PRIMARY_NPI,PRIMARY_SPECIALTY,SECONDARY_SPECIALTY,PROVIDER_PHONE_NUMBER,PROVIDER_ADDRESS,PROVIDER_CITY,PROVIDER_STATE,PROVIDER_ZIP
0,1932236387,INDIVIDUAL,LORI,ZIMMERMAN,,1114969169.0,Pediatrics,Pediatric Gastroenterology,6173556058,"300 LONGWOOD AVENUE, HUNN G",BOSTON,MA,2115
1,1932262037,INDIVIDUAL,RYAN,MILLER,,1174660120.0,Pediatrics,Pediatric Endocrinology,4106018331,2411 W BELVEDERE AVE STE 205,BALTIMORE,MD,21215
2,1932217148,ORGANIZATION,,,FLORIDA CANCER SPECIALISTS P L,,Internal Medicine,Hematology & Oncology,2392756400,3840 BROADWAY,FORT MYERS,FL,33901
3,1932230273,INDIVIDUAL,HUNG,NGUYEN,,1174533343.0,Family Medicine,,8178015704,925 WRIGHT ST,ARLINGTON,TX,76012
4,1932280138,INDIVIDUAL,JAYAN,NAIR,,1356446736.0,Internal Medicine,Medical Oncology,9419571000,1970 GOLF ST,SARASOTA,FL,34236


In [23]:
# import the current date and time
from datetime import datetime

now = datetime.now()

from komodo.dataset import upload_dataset_to_maplab

# set the name of the dataset to be uploaded to the Komodo platform
# add the current date and time to the end of the dataset name to make it more distinct
cohort_upload_datetime = now.strftime("%Y%m%d_%H%M%S")
cohort_upload_dataset_name = "COHORT_DATASET" 

# call the upload_dataset_to_maplab function
cohort_upload_dataset = upload_dataset_to_maplab(cohort_data, cohort_upload_dataset_name)

# save the ID of the dataset
dataset_id = cohort_upload_dataset.id

# print the dataset ID
dataset_id

# store the dataset ID as an environment variable that can be used in subsequent cookbook files
from dotenv import load_dotenv, set_key

set_key(".env", "dataset_id", dataset_id)

DEBUG:komodo_connector.connection_creators.snowflake.connect:REST API object was created: f82bd78b-1a87-4b6c-a121-dd977d140a9d.snowflakecomputing.com:443
[32m2025-04-18 09:00:02.827[0m | [31m[1mERROR   [0m | [36mkomodo.extensions.dataset[0m:[36mupload_dataset_to_maplab[0m:[36m126[0m - [31m[1merror writing dataset to maplab: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'Date': 'Fri, 18 Apr 2025 09:00:02 GMT', 'Content-Type': 'application/json', 'Content-Length': '148', 'Connection': 'keep-alive', 'x-request-id': 'b1b18a18e9104e9c994cd6a14ceb24ff', 'Access-Control-Allow-Origin': '*'})
HTTP response body: colliding_datasets=['61d7b813-b196-4b0a-b1c3-29c7d5462adc'] message='One or more of the manifestations are already in use by another dataset.'
[0m


AttributeError: 'dict' object has no attribute 'id'

In [24]:
# use this cell if you want to use the dataset ID that is stored in an environment variable

import os
from dotenv import load_dotenv
load_dotenv()

try:
    dataset_id = os.environ["dataset_id"]  # retrieve the dataset_id from the "3-retrieve-cohort-data.ipynb" cookbook file
except KeyError:
    print("Please set the variable `dataset_id` with value of your dataset id.")  # throw an error if no dataset ID can be retrieved

## Export Your Dataset

In [25]:
import pprint

# create the Komodo client
from komodo.client import Client
client = Client()

# retrieve list of Share requests
share_ids = client.connections.list_shares()
pprint.pprint(share_ids.shares)

[ShareMetadata(account_id='f82bd78b-1a87-4b6c-a121-dd977d140a9d', connection_string='arn:aws:iam::851851261022:role/PDI-s3-export', created_by='3009a19d-57af-47c1-a141-eec85b519c9f', created_time='2025-03-19 15:35:27.612861+00:00', customer_region='us-west-2', customer_s3_export_path='s3://kh-studio-test-pdi-s3-export/sandbox-maplab-enterprise', database_name=None, listing_name=None, output_file_format=<OutputFileFormatEnum.CSV: 'CSV'>, region='us-west-2', share_id='9bb1938d-3718-4e3c-9345-be89c73337de', share_name='sandboxmaplabenterprise_s3_external', share_type=<DataFormatEnum.S3_EXTERNAL: 'S3_EXTERNAL'>, updated_by='3009a19d-57af-47c1-a141-eec85b519c9f', updated_time='2025-03-19 15:35:27.612861+00:00', user_id='3009a19d-57af-47c1-a141-eec85b519c9f')]


In [26]:
share_id = '9bb1938d-3718-4e3c-9345-be89c73337de' # str | 

try:
    get_share_response = client.connections.get_share(share_id)
except Exception as e:
    print("Exception when calling ConnectionsApi->get_share: %s\n" % e)


In [27]:
# retrieve the required IAM role policies
s3_details = client.connections.get_shares_aws_details()

s3_iam_policy = s3_details.iam_policy_json
print(s3_iam_policy)

s3_iam_trust_relationship = s3_details.iam_trust_relationship_json
print(s3_iam_trust_relationship)

In [39]:
from komodo.data_deliveries.models.dataset_shares.create_dataset_share_request import CreateDatasetShareRequest

dataset_export_request = CreateDatasetShareRequest(dataset_ids=[dataset_id])
dataset_export_response = client.data_deliveries.create_dataset_share(share_id=share_id, create_dataset_share_request=dataset_export_request)
dataset_export_id = dataset_export_response.dataset_shares[0].dataset_share_id

dataset_export_id

ServiceException: (502)
Reason: Bad Gateway
HTTP response headers: HTTPHeaderDict({'Date': 'Fri, 18 Apr 2025 11:26:39 GMT', 'Content-Type': 'application/json', 'Content-Length': '112', 'Connection': 'keep-alive', 'x-request-id': 'c67eb6deec064a9098f748149edf4153', 'Access-Control-Allow-Origin': '*'})
HTTP response body: {"message":"Error submitting the scan for dataset: 61d7b813-b196-4b0a-b1c3-29c7d5462adc: Internal Server Error"}
