In [1]:
#!pip install pyathena

In [7]:
from pyathena import connect
import pandas as pd
import yaml

S3_RESULTS_PATH='s3://ecmo-athena-query-grr/results/'
REGION='ap-southeast-2'

athena = connect(s3_staging_dir=S3_RESULTS_PATH, region_name=REGION, schema_name='mimic3-sampleset')

In [3]:
# TODO ICD9 codes needed for each condition below
icd9_mappings = yaml.safe_load('''
    chroniccard_mhyn: 
        - 10000
    chronicpul_mhyn:
        - 10000
    asthma_mhyn:
        - 10000
    renal_mhyn:
        - 10000
    modliver_mhyn:
        - 10000
    mildliv_mhyn:
        - 10000
    chronicneu_mhyn: 
        - 10000
    malignantneo_mhyn:
        - 10000
    chronhaemo_mhyn:
        - 10000 
    aidshiv_mhyn:
        - 10000
    obesity_mhyn:   
        - 10000
    diabetiscomp_mhyn: 
        - 10000
    diabetes_mhyn:
        - 10000
    rheumatology_mhyr:
        - 10000
    dementia_mhyn:
        - 10000
    malnutrition_mhyn:
        - 10000
    smoker:
        - 3051
    former_smoker:
        - V1582
''')

icd9_mappings

{'chroniccard_mhyn': [10000],
 'chronicpul_mhyn': [10000],
 'asthma_mhyn': [10000],
 'renal_mhyn': [10000],
 'modliver_mhyn': [10000],
 'mildliv_mhyn': [10000],
 'chronicneu_mhyn': [10000],
 'malignantneo_mhyn': [10000],
 'chronhaemo_mhyn': [10000],
 'aidshiv_mhyn': [10000],
 'obesity_mhyn': [10000],
 'diabetiscomp_mhyn': [10000],
 'diabetes_mhyn': [10000],
 'rheumatology_mhyr': [10000],
 'dementia_mhyn': [10000],
 'malnutrition_mhyn': [10000],
 'smoker': [3051],
 'former_smoker': ['V1582']}

In [4]:
icd9_expressions = {column: ','.join([f"'{v}'" for v in values]) for column, values in icd9_mappings.items()}
icd9_expression = ',\n  '.join([f'bool_or(icd9_code IN ({expression})) AS {column}' for column, expression in icd9_expressions.items()])
mapped_icd9_expression = ', '.join([', '.join([f"'{v}'" for v in values]) for values in icd9_mappings.values()])
visible_column_names = ', '.join([c for c in icd9_mappings if c not in ['smoker', 'former_smoker']])

icd9_mapping_sql = f'''
icd9_mapping AS (
  SELECT 
  subject_id,
  {icd9_expression},
  bool_or(icd9_code NOT IN ({mapped_icd9_expression})) AS other_mhyn,
  filter(array_agg(icd9_code), x -> x NOT IN ({mapped_icd9_expression})) AS otherrisktext
  FROM diagnoses_icd
  GROUP BY subject_id
),
comorbidities AS (
  SELECT 
  subject_id, {visible_column_names}, other_mhyn,
  CASE
    WHEN smoker = true THEN 1
    WHEN former_smoker = true THEN 3
    ELSE 2
  END AS smoking_mhynfrom,
  otherrisktext      
  FROM icd9_mapping
)'''

'''icd9_unmapped AS (
    SELECT 
    subject_id,
    array_agg(icd9_code) AS otherrisktext,
    FROM diagnoses_icd
    WHERE icd9_code NOT IN ({mapped_icd9_expression})
    GROUP BY subject_id
)'''


comorbidities_sql = f'''WITH {icd9_mapping_sql}
SELECT * from comorbidities'''

print(comorbidities_sql)

WITH 
icd9_mapping AS (
  SELECT 
  subject_id,
  bool_or(icd9_code IN ('10000')) AS chroniccard_mhyn,
  bool_or(icd9_code IN ('10000')) AS chronicpul_mhyn,
  bool_or(icd9_code IN ('10000')) AS asthma_mhyn,
  bool_or(icd9_code IN ('10000')) AS renal_mhyn,
  bool_or(icd9_code IN ('10000')) AS modliver_mhyn,
  bool_or(icd9_code IN ('10000')) AS mildliv_mhyn,
  bool_or(icd9_code IN ('10000')) AS chronicneu_mhyn,
  bool_or(icd9_code IN ('10000')) AS malignantneo_mhyn,
  bool_or(icd9_code IN ('10000')) AS chronhaemo_mhyn,
  bool_or(icd9_code IN ('10000')) AS aidshiv_mhyn,
  bool_or(icd9_code IN ('10000')) AS obesity_mhyn,
  bool_or(icd9_code IN ('10000')) AS diabetiscomp_mhyn,
  bool_or(icd9_code IN ('10000')) AS diabetes_mhyn,
  bool_or(icd9_code IN ('10000')) AS rheumatology_mhyr,
  bool_or(icd9_code IN ('10000')) AS dementia_mhyn,
  bool_or(icd9_code IN ('10000')) AS malnutrition_mhyn,
  bool_or(icd9_code IN ('3051')) AS smoker,
  bool_or(icd9_code IN ('V1582')) AS former_smoker,
  bool_

In [8]:
df = pd.read_sql(comorbidities_sql, athena)

In [9]:
df

Unnamed: 0,subject_id,chroniccard_mhyn,chronicpul_mhyn,asthma_mhyn,renal_mhyn,modliver_mhyn,mildliv_mhyn,chronicneu_mhyn,malignantneo_mhyn,chronhaemo_mhyn,aidshiv_mhyn,obesity_mhyn,diabetiscomp_mhyn,diabetes_mhyn,rheumatology_mhyr,dementia_mhyn,malnutrition_mhyn,other_mhyn,smoking_mhynfrom,otherrisktext
0,10111,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2,"[4280, 51881, 42781, 496, 42731, 4241, 5849, 2..."
1,43748,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2,"[42843, 5849, 78551, 6826, 4280, 2875, 41400, ..."
2,10090,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2,"[9693, 2851, 29633, 9680, 9694, E9503, E9504, ..."
3,43779,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2,"[85221, 5849, 1970, 1539, E8859, 42652, 28521,..."
4,10067,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2,"[86404, 8504, 9584, 4275, 2851, E8232, 86389, ..."
5,10035,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2,"[43310, 2749, 6000, 412]"
6,10040,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2,"[40291, 7907, 4928, 51881, 4241, 4280, 41401, ..."
7,40204,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2,"[5990, 486, 42822, 40391, 5856, 58881, 0414, 7..."
8,42458,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2,"[4588, 5849, 4270, 27652, 2948, 72400]"
9,10114,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,2,"[4280, 4240, 4111, 2859, 4928, 29281, E9394, 2..."
