# Mimic III to Ecmo Card Mapping
## Setup
### Install  libs 
Only run pip install one when first installing notebook.

In [1]:
#!pip install pyathena
#! pip install PyAthena[SQLAlchemy]

### Imports  and settings

In [2]:
from io import StringIO
import contextlib
import yaml
from urllib.parse import quote_plus
import numpy as np
import pandas as pd
import boto3
from pyathena import connect
from sqlalchemy import create_engine
from sqlalchemy.types import String

import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

pd.set_option("display.max_rows", None)
S3_BUCKET = 'ecmo-athena-query-grr' # bucket for all created tabel data
REGION='ap-southeast-2' # AWS region for all data
WRITE_SCHEMA = "echmocard" # write schema is the schema where all intermediary and final mapped tabels are created
READ_SCHEMA = "mimic3-sampleset" # read schema is where mimic3 raw data is

## Athena
We will use Athena to do all of the mapping work by creating tables of mapped mimic 3 data.

### Athena connection
First lets get the Athena connection to the mimic 3 schema (the read schema)

In [3]:
def athena_connection(schema):
    return connect(s3_staging_dir=f's3://{S3_BUCKET}/athena/stage/', region_name=REGION, schema_name=schema)

connection = athena_connection(READ_SCHEMA)

### Create table as select
For large table inserts, we dont want to load data into dataframes locally and then write them back into Athena, so we we need a function that will overwrite tables in Athena dirctly from a select. This is a 3 step process. First drop the existing table from the Athena catalog, remove its data from S3, and then re-create the table from the select statement.

In [5]:
def athena_delete(table):
    path = f'athena/table/{WRITE_SCHEMA}/{table}'
    connection = athena_connection(WRITE_SCHEMA)

    # drop the table from catalog
    sql = f'DROP TABLE {table}'
    print(sql)
    connection.cursor().execute(sql)
    
    # remove s3 data if it exists
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(S3_BUCKET)
    for obj in bucket.objects.filter(Prefix=path):
        print(f'DELETE s3://{S3_BUCKET}/{obj.key}')
        s3.Object(bucket.name, obj.key).delete()
        
def athena_ctas(table, query):    
    athena_delete(table)
        
    # create table as select
    path = f'athena/table/{WRITE_SCHEMA}/{table}'
    sql = f"CREATE TABLE {table}\nWITH (external_location='s3://{S3_BUCKET}/{path}/') AS {query}"
    print(sql)
    

    connection = athena_connection(WRITE_SCHEMA)
    connection.cursor().execute(sql)
    
def sample(connection, table, limit=5):
    return pd.read_sql(f'select * from "{WRITE_SCHEMA}".{table} limit {limit}', connection)

## ICD9 Mapping
For many of the queries, we will need to have ICD9 codes mapped to ecmo attributes. This basic mnapping is done by reading a configuration file that lists the ecmo field name, all the ICD9 code that map to it.

### Read icd9 mapping file
Read the mappings yaml file [icd9 mappings file](./icd9_mappings.yaml)

In [5]:
with open('icd9_mappings.yaml', 'r') as stream:    
    icd9_mappings = yaml.safe_load(stream)

### Expand mapping codes
ICD9 codes are of the format <3 digit parent code><1 digit subcode><1 digit sub-subcode>. As opposed to specifying all the codes that map to an ecmocard field in the yaml config, specify the parent code and the logic below will expand the list of mappings to all possible sub codes.

In [6]:
expanded_codes = []
mapping = []
for key,mappings in icd9_mappings.items():    
    for value in mappings:
        value = str(value)
        split = value.find(' ')
        code = value[0:split]
        description = value[split+1:]
        codes = [code]
        while len(codes[-1]) < 5:
            expand = []
            for code in codes:
                expand = expand + [code + str(x) for x in range(0, 9)] 
            codes = codes + expand        
        mapping = mapping + [key]*len(codes)
        expanded_codes = expanded_codes + codes
        
mappings = pd.DataFrame({'icd9_code': expanded_codes, 'mapping': mapping})
mappings = mappings.set_index('icd9_code')#.drop_duplicates()

icd9_diagnosis = pd.read_sql('select * from d_icd_diagnoses', connection)
icd9_diagnosis = icd9_diagnosis.set_index('icd9_code')
encode = lambda x:  x.replace("'", "") .replace(',', ' ') # strings causing issues on insert
icd9_diagnosis['short_title'] = icd9_diagnosis['short_title'].apply(encode)
icd9_diagnosis['long_title'] = icd9_diagnosis['long_title'].apply(encode)
mapped = icd9_diagnosis.join(mappings)
mapped = mapped[mapped['mapping'].notna()].\
    drop_duplicates().\
    reset_index().\
    set_index('mapping').\
    drop(columns=['row_id']).\
    sort_values('mapping')

mapped.head(10)

Unnamed: 0_level_0,icd9_code,short_title,long_title
mapping,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
abdopain_ceoccur_v2,78907,Abdmnal pain generalized,Abdominal pain generalized
abdopain_ceoccur_v2,78906,Abdmnal pain epigastric,Abdominal pain epigastric
abdopain_ceoccur_v2,78905,Abdmnal pain periumbilic,Abdominal pain periumbilic
abdopain_ceoccur_v2,78904,Abdmnal pain lt lwr quad,Abdominal pain left lower quadrant
abdopain_ceoccur_v2,78903,Abdmnal pain rt lwr quad,Abdominal pain right lower quadrant
abdopain_ceoccur_v2,78902,Abdmnal pain lft up quad,Abdominal pain left upper quadrant
abdopain_ceoccur_v2,78901,Abdmnal pain rt upr quad,Abdominal pain right upper quadrant
abdopain_ceoccur_v2,78900,Abdmnal pain unspcf site,Abdominal pain unspecified site
aidshiv_mhyn,42,Human immuno virus dis,Human immunodeficiency virus [HIV] disease
asthma_mhyn,49382,Cough variant asthma,Cough variant asthma


### Create ICD9 Mapping Table
After expanding the mappings, create an icd9 to ecmocard mapping table in Athena. This will create parquet files in S3 and update the glue data catalog.

In [7]:
athena_delete('icd9_map')

write_engine = create_engine(
        f'awsathena+rest://:@athena.{REGION}.amazonaws.com:443/{WRITE_SCHEMA}?' +
        f's3_staging_dir=s3://{S3_BUCKET}/athena/table/&' +
        f's3_dir=s3://{S3_BUCKET}/athena/table/&compression=snappy'
)

mapped.reset_index().to_sql(
        'icd9_map',
        write_engine,  
        index=False, 
        if_exists='replace', 
        method='multi',
        schema=WRITE_SCHEMA,
        dtype={
            "mapping": String(30),
            "icd9_code": String(10),
            'short_title': String(100),
            'long_title': String(200)
        }
    ) 

DROP TABLE icd9_map
DELETE s3://ecmo-athena-query-grr/athena/table/echmocard/icd9_map/20200511_062750_00004_k7ece_87155e5a-80cd-4d9a-8afc-4390571ce204


## Create intermediary tables
This section will perfrom initial mappings form mimic3 tables to intermediarry tables that can be later combined to produce the various ecmo card sections.

### Diagnosis table
Lets us the above function to join the mimic iii diagnosis table for all subject to the icd9 mapping table to get the diagnosis mapped. We will generate the query uign python and the mapping columns names.

In [8]:
mapped_columns = mapped.index.unique()
bool_sql = ',\n  '.join([f"bool_or(mapping = '{m}') AS {m}" for m in mapped_columns])

athena_ctas('diagnoses', f'''
    SELECT 
        hadm_id,
        subject_id,
        {bool_sql},
        array_except(zip_with(array_agg(d.icd9_code), array_agg(mapping), (c,m) -> if(m is NULL, c)), ARRAY[null]) as unmapped,
        array_agg(d.icd9_code) AS all_codes,
        array_agg(mapping) AS all_mappings
    FROM "{READ_SCHEMA}".diagnoses_icd AS d
    LEFT JOIN icd9_map USING (icd9_code)
    GROUP BY (hadm_id, subject_id)
    ORDER BY subject_id
''')

DROP TABLE diagnoses
DELETE s3://ecmo-athena-query-grr/athena/table/echmocard/diagnoses/20200511_062829_00003_6ifev_1b1a709c-187e-4aae-90e0-7828d883ae42
CREATE TABLE diagnoses
WITH (external_location='s3://ecmo-athena-query-grr/athena/table/echmocard/diagnoses/') AS 
    SELECT 
        hadm_id,
        subject_id,
        bool_or(mapping = 'abdopain_ceoccur_v2') AS abdopain_ceoccur_v2,
  bool_or(mapping = 'aidshiv_mhyn') AS aidshiv_mhyn,
  bool_or(mapping = 'asthma_mhyn') AS asthma_mhyn,
  bool_or(mapping = 'bleed_ceoccur_v2') AS bleed_ceoccur_v2,
  bool_or(mapping = 'chestpain_ceoccur_v2') AS chestpain_ceoccur_v2,
  bool_or(mapping = 'chronhaemo_mhyn') AS chronhaemo_mhyn,
  bool_or(mapping = 'chroniccard_mhyn') AS chroniccard_mhyn,
  bool_or(mapping = 'chronicneu_mhyn') AS chronicneu_mhyn,
  bool_or(mapping = 'chronicpul_mhyn') AS chronicpul_mhyn,
  bool_or(mapping = 'confusion_ceoccur_v2') AS confusion_ceoccur_v2,
  bool_or(mapping = 'conjunct_ceoccur_v2') AS conjunct_ceoccur_v2,
  

In [9]:
left = []
visible_columns = (
    'aidshiv_mhyn,malignantneo_mhyn,diabetes_mhyn,diabetiscomp_mhyn,'+
    'malnutrition_mhyn,obesity_mhyn,chronhaemo_mhyn,chronicneu_mhyn,'+
    'dementia_mhyn,asthma_mhyn,modliver_mhyn,renal_mhyn'
).split(',')
for item in mapped_columns:
    if item not in visible_columns:
        left.append(item)
print(','.join(left))

abdopain_ceoccur_v2,bleed_ceoccur_v2,chestpain_ceoccur_v2,chroniccard_mhyn,chronicpul_mhyn,confusion_ceoccur_v2,conjunct_ceoccur_v2,cough_ceoccur_v2,coughhb_ceoccur_v2,coughsput_ceoccur_v2,diarrhoea_ceoccur_v2,earpain_ceoccur_v2,fatigue_ceoccur_v2,fever_ceoccur_v2,former_smoker,headache_ceoccur_v2,jointpain_ceoccur_v2,lymp_ceoccur_v2,myalgia_ceoccur_v2,oxy_vsyn,postpart_rptestcd,pregout_rptestcd_live,pregout_rptestcd_still,pregyn_rptestcd,rash_ceoccur_v2,rheumatology_mhyr,runnynose_ceoccur_v2,seizures_cecoccur_v2,shortbreath_ceoccur_v2,skinulcers_ceoccur_v2,smoker,sorethroat_ceoccur_v2,vomit_ceoccur_v2,wheeze_ceoccur_v2


In [6]:
sample(connection, 'diagnoses')

Unnamed: 0,hadm_id,subject_id,abdopain_ceoccur_v2,aidshiv_mhyn,asthma_mhyn,bleed_ceoccur_v2,chestpain_ceoccur_v2,chronhaemo_mhyn,chroniccard_mhyn,chronicneu_mhyn,...,seizures_cecoccur_v2,shortbreath_ceoccur_v2,skinulcers_ceoccur_v2,smoker,sorethroat_ceoccur_v2,vomit_ceoccur_v2,wheeze_ceoccur_v2,unmapped,all_codes,all_mappings
0,142345,10006,False,False,False,False,False,False,True,False,...,False,False,False,True,False,False,False,"[V5867, 28529, 5672, 7850, 99591, E9342, 2749,...","[4241, V5867, 28529, 5672, 7850, 99591, 4240, ...","[chroniccard_mhyn, null, null, null, null, nul..."
1,105331,10011,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,"[07030, 07054, 30401, 2760, 570]","[07030, 07054, 30401, 2760, 2875, 570]","[null, null, null, null, chronhaemo_mhyn, null]"
2,165520,10013,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,"[2724, 4582, 78551, 486, 41071, 0389]","[2724, 20280, 4240, 4582, 42731, 78551, 486, 4...","[null, malignantneo_mhyn, chroniccard_mhyn, nu..."
3,199207,10017,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,"[2724, 81201, 99812, E8859, 8028, 4019, V1259,...","[2724, 81201, 99812, E8859, 8028, 4019, 25000,...","[null, null, null, null, null, null, diabetes_..."
4,177759,10019,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,"[5845, 78559, 5770, 5781, 2848, 4019, 49390, 7...","[5845, 78559, 5770, 51881, 5781, 2848, 5711, 5...","[null, null, null, chronicpul_mhyn, null, null..."


## ECMO Mapping
Using the mapped tables, we can now map the ecmo card fields for each section of the ecmo card.

### Demographics
#### Logic
* Join the  columns from patients and admissions to get demographics, and diagnosis for pregnency flags
* Ethnicity is mapped from admission table
* Age is calculated by subtracting todays date from DOB. Any age over 90 is set to 90, as the data is random for higher ages
* Pregnancy is determined by ICD9 diagnosis mappings to pregyn_rptestcd
* Postpartum is determined by ICD9 diagnosis mappings to postpart_rptestcd
* If any ICD9 code for birth or multiple birth without any stillbirth, pregout_rptestcd is set to live birth = 1
* Otherwise, If any ICD9 code for stillborn or multiple birth with 1 or more stillborn pregout_rptestcd is set to stillborn = 2
* Otherwise, set to null, for no birth

#### ICD9 Mappings 
The following are the mappings for the relavent icd9 diagnosis codes:

In [11]:

#mapped.loc[mappings.index.isin(['pregyn_rptestcd', 'postpart_rptestcd', 'pregout_rptestcd_live', 'pregout_rptestcd_still'])]
mapped.loc[['pregyn_rptestcd', 'postpart_rptestcd', 'pregout_rptestcd_live', 'pregout_rptestcd_still']]

Unnamed: 0_level_0,icd9_code,short_title,long_title
mapping,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
pregyn_rptestcd,V221,Supervis oth normal preg,Supervision of other normal pregnancy
pregyn_rptestcd,V222,Preg state incidental,Pregnant state incidental
pregyn_rptestcd,V230,Preg w hx of infertility,Supervision of high-risk pregnancy with histor...
pregyn_rptestcd,V231,Preg w hx-trophoblas dis,Supervision of high-risk pregnancy with histor...
pregyn_rptestcd,V232,Preg w hx of abortion,Supervision of high-risk pregnancy with histor...
pregyn_rptestcd,V233,Grand multiparity,Supervision of high-risk pregnancy with grand ...
pregyn_rptestcd,V2341,Preg w hx pre-term labor,Pregnancy with history of pre-term labor
pregyn_rptestcd,V2342,Preg w hx ectopic preg,Pregnancy with history of ectopic pregnancy
pregyn_rptestcd,V235,Preg w poor reproduct hx,Supervision of high-risk pregnancy with other ...
pregyn_rptestcd,V2387,Preg w incon fetl viabil,Pregnancy with inconclusive fetal viability


In [12]:
athena_ctas('demographics', f'''
WITH 
admissions AS (
    SELECT
        subject_id,
        hadm_id,
        ethnicity,
        CASE
            WHEN ethnicity='BLACK/AFRICAN AMERICAN' THEN 2
            WHEN ethnicity='ASIAN' THEN 3
            WHEN ethnicity='HISPANIC OR LATINO' THEN 6
            WHEN ethnicity='HISPANIC/LATINO - PUERTO RICAN' THEN 6
            WHEN ethnicity='WHITE' THEN 7
            WHEN ethnicity='AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE' THEN 8
            WHEN ethnicity='UNKNOWN/NOT SPECIFIED' THEN 10
            ELSE 9 
        END AS ethnic
    FROM "{READ_SCHEMA}".admissions
),

patients AS (
    SELECT
        subject_id,
        CASE
            WHEN gender = 'F' THEN '2'
            WHEN gender = 'M' THEN '1'
            ELSE '3'
        END AS sexfrom, 
        date_diff('year', date_parse(dob, '%Y-%m-%d %H:%i:%s'), current_date) AS age
    FROM "{READ_SCHEMA}".patients
)

SELECT 
    admissions.subject_id,
    admissions.hadm_id,
    ethnic,
    if (ethnic = 9, ethnicity, NULL) AS other_ethnic,
    
    IF(age<90, age, 90) AS age_estimateyears,
    FLOOR(IF(age<90, age, 90) / 10) AS age_estimate10,
    2 AS age_estimateyearsu,
    sexfrom,
    
    if(pregyn_rptestcd, 1, 0) AS pregyn_rptestcd,
    if(postpart_rptestcd, 1, 0) AS postpart_rptestcd,
    if(pregout_rptestcd_live, 1, if (pregout_rptestcd_still, 2, null)) AS pregout_rptestcd
FROM admissions
JOIN patients ON (admissions.subject_id = patients.subject_id)
LEFT JOIN diagnoses ON (admissions.hadm_id = diagnoses.hadm_id)
ORDER BY subject_id
''')

DROP TABLE demographics
DELETE s3://ecmo-athena-query-grr/athena/table/echmocard/demographics/20200511_063420_00003_hskjf_9797ec86-284a-4322-9d4f-ce73b345fbf0
CREATE TABLE demographics
WITH (external_location='s3://ecmo-athena-query-grr/athena/table/echmocard/demographics/') AS 
WITH 
admissions AS (
    SELECT
        subject_id,
        hadm_id,
        ethnicity,
        CASE
            WHEN ethnicity='BLACK/AFRICAN AMERICAN' THEN 2
            WHEN ethnicity='ASIAN' THEN 3
            WHEN ethnicity='HISPANIC OR LATINO' THEN 6
            WHEN ethnicity='HISPANIC/LATINO - PUERTO RICAN' THEN 6
            WHEN ethnicity='WHITE' THEN 7
            WHEN ethnicity='AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE' THEN 8
            WHEN ethnicity='UNKNOWN/NOT SPECIFIED' THEN 10
            ELSE 9 
        END AS ethnic
    FROM "mimic3-sampleset".admissions
),

patients AS (
    SELECT
        subject_id,
        CASE
            WHEN gender = 'F' THEN '2'
            WHEN ge

In [13]:
sample(connection, 'demographics')

Unnamed: 0,subject_id,hadm_id,ethnic,other_ethnic,age_estimateyears,age_estimate10,age_estimateyearsu,sexfrom,pregyn_rptestcd,postpart_rptestcd,pregout_rptestcd
0,10006,142345,2,,-73,-7,2,2,0,0,
1,10011,105331,10,,-70,-7,2,2,0,0,
2,10013,165520,10,,-18,-1,2,2,0,0,
3,10017,199207,7,,-55,-5,2,2,0,0,
4,10019,177759,7,,-94,-9,2,1,0,0,


### Comorbidities 

#### ICD9 Mappings 
The following are the mappings for the relavent icd9 diagnosis codes:

In [14]:
diagnoses_columns = ('aidshiv_mhyn,malignantneo_mhyn,diabetes_mhyn,diabetiscomp_mhyn,'+
    'malnutrition_mhyn,obesity_mhyn,chronhaemo_mhyn,chronicneu_mhyn,'+
    'dementia_mhyn,asthma_mhyn,modliver_mhyn,renal_mhyn'
                    ).split(',')
mapped.loc[diagnoses_columns]

Unnamed: 0_level_0,icd9_code,short_title,long_title
mapping,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aidshiv_mhyn,42,Human immuno virus dis,Human immunodeficiency virus [HIV] disease
malignantneo_mhyn,20025,Burkitts tumor inguin,Burkitts tumor or lymphoma lymph nodes of ing...
malignantneo_mhyn,1915,Mal neo cereb ventricle,Malignant neoplasm of ventricles
malignantneo_mhyn,1914,Mal neo occipital lobe,Malignant neoplasm of occipital lobe
malignantneo_mhyn,1465,Mal neo epiglottis junct,Malignant neoplasm of junctional region of oro...
malignantneo_mhyn,1913,Mal neo parietal lobe,Malignant neoplasm of parietal lobe
malignantneo_mhyn,1912,Mal neo temporal lobe,Malignant neoplasm of temporal lobe
malignantneo_mhyn,1911,Malig neo frontal lobe,Malignant neoplasm of frontal lobe
malignantneo_mhyn,1910,Malign neopl cerebrum,Malignant neoplasm of cerebrum except lobes a...
malignantneo_mhyn,1908,Malign neopl eye NEC,Malignant neoplasm of other specified sites of...


#### Query Logic 
* Each column is taken from diagnosis mapping table. If there is an ICD9 Code present during their admission, than the features is flagged as True
* Any ICD9 codes that are in listed in the diagnosis for the admission, but are not mapped are assigned to otherrisktext.

In [15]:
subsql = ',\n  '.join([f"if({m} is null, 3, if({m}, 1, 2)) AS {m}" for m in diagnoses_columns])

athena_ctas('comorbidities', f'''
SELECT 
  subject_id, 
  hadm_id,
  {subsql}, 
  if(cardinality(unmapped) > 0, 1, 0) AS other_mhyn,
  CASE
    WHEN smoker THEN 1
    WHEN former_smoker THEN 3
    ELSE 2
  END AS smoking_mhyn,
  unmapped AS otherrisktext      
FROM diagnoses
ORDER BY subject_id
''')

DROP TABLE comorbidities
DELETE s3://ecmo-athena-query-grr/athena/table/echmocard/comorbidities/20200511_065243_00003_73dhr_ee77ab8e-d9b2-4d9c-b0af-ebd248b1ed5d
CREATE TABLE comorbidities
WITH (external_location='s3://ecmo-athena-query-grr/athena/table/echmocard/comorbidities/') AS 
SELECT 
  subject_id, 
  hadm_id,
  if(aidshiv_mhyn is null, 3, if(aidshiv_mhyn, 1, 2)) AS aidshiv_mhyn,
  if(malignantneo_mhyn is null, 3, if(malignantneo_mhyn, 1, 2)) AS malignantneo_mhyn,
  if(diabetes_mhyn is null, 3, if(diabetes_mhyn, 1, 2)) AS diabetes_mhyn,
  if(diabetiscomp_mhyn is null, 3, if(diabetiscomp_mhyn, 1, 2)) AS diabetiscomp_mhyn,
  if(malnutrition_mhyn is null, 3, if(malnutrition_mhyn, 1, 2)) AS malnutrition_mhyn,
  if(obesity_mhyn is null, 3, if(obesity_mhyn, 1, 2)) AS obesity_mhyn,
  if(chronhaemo_mhyn is null, 3, if(chronhaemo_mhyn, 1, 2)) AS chronhaemo_mhyn,
  if(chronicneu_mhyn is null, 3, if(chronicneu_mhyn, 1, 2)) AS chronicneu_mhyn,
  if(dementia_mhyn is null, 3, if(dementia_mhyn,

In [16]:
sample(connection, 'comorbidities')

Unnamed: 0,subject_id,hadm_id,aidshiv_mhyn,malignantneo_mhyn,diabetes_mhyn,diabetiscomp_mhyn,malnutrition_mhyn,obesity_mhyn,chronhaemo_mhyn,chronicneu_mhyn,dementia_mhyn,asthma_mhyn,modliver_mhyn,renal_mhyn,other_mhyn,smoking_mhyn,otherrisktext
0,10006,142345,2,2,1,2,2,2,2,2,2,2,2,2,1,1,"[V5867, 28529, 5672, 7850, 99591, E9342, 2749,..."
1,10011,105331,2,2,2,2,2,2,1,2,2,2,2,2,1,2,"[07030, 07054, 30401, 2760, 570]"
2,10013,165520,2,1,2,2,2,2,2,2,2,2,2,2,1,2,"[2724, 4582, 78551, 486, 41071, 0389]"
3,10017,199207,2,2,1,2,2,2,1,2,2,2,2,2,1,2,"[2724, 81201, 99812, E8859, 8028, 4019, V1259,..."
4,10019,177759,2,2,2,2,2,2,2,2,2,2,1,2,1,2,"[5845, 78559, 5770, 5781, 2848, 4019, 49390, 7..."


### Onset and admissions
#### Logic
* Hospital admission date and time used (not ICU admission time)
* Admission location indicated other hospital transfer

In [17]:
athena_ctas('onset_and_admissions', f'''
SELECT
    subject_id,
    hadm_id,
    date_format(date_parse(admittime, '%Y-%m-%d %H:%i:%s'), '%Y-%m-%d') AS hostdat,
    date_format(date_parse(admittime, '%Y-%m-%d %H:%i:%s'), '%H:%i:%s') AS hosttim,
    CASE 
        WHEN admission_location = 'TRANSFER FROM OTHER HEALT' THEN 2 
        WHEN admission_location = '** INFO NOT AVAILABLE **' THEN 4
        ELSE 3
    END AS hooccur
FROM
    "{READ_SCHEMA}".admissions
ORDER BY subject_id
''')

DROP TABLE onset_and_admissions
DELETE s3://ecmo-athena-query-grr/athena/table/echmocard/onset_and_admissions/20200511_065307_00001_2vf7i_efccd040-d6c6-446d-969d-52172bbd24c9
CREATE TABLE onset_and_admissions
WITH (external_location='s3://ecmo-athena-query-grr/athena/table/echmocard/onset_and_admissions/') AS 
SELECT
    subject_id,
    hadm_id,
    date_format(date_parse(admittime, '%Y-%m-%d %H:%i:%s'), '%Y-%m-%d') AS hostdat,
    date_format(date_parse(admittime, '%Y-%m-%d %H:%i:%s'), '%H:%i:%s') AS hosttim,
    CASE 
        WHEN admission_location = 'TRANSFER FROM OTHER HEALT' THEN 2 
        WHEN admission_location = '** INFO NOT AVAILABLE **' THEN 4
        ELSE 3
    END AS hooccur
FROM
    "mimic3-sampleset".admissions
ORDER BY subject_id



In [18]:
sample(connection, 'onset_and_admissions')

Unnamed: 0,subject_id,hadm_id,hostdat,hosttim,hooccur
0,10006,142345,2164-10-23,21:09:00,3
1,10011,105331,2126-08-14,22:32:00,3
2,10013,165520,2125-10-04,23:36:00,3
3,10017,199207,2149-05-26,17:19:00,3
4,10019,177759,2163-05-14,20:43:00,3


### Admission signs and symptoms

#### ICD9 Mappings 
The following are the mappings for the relavent icd9 diagnosis codes:

In [19]:
diagnoses_columns = (
    'sorethroat_ceoccur_v2,smoker,seizures_cecoccur_v2,conjunct_ceoccur_v2,earpain_ceoccur_v2,'+
    'rheumatology_mhyr,chroniccard_mhyn,bleed_ceoccur_v2,runnynose_ceoccur_v2,chronicpul_mhyn,'+
    'skinulcers_ceoccur_v2,jointpain_ceoccur_v2,myalgia_ceoccur_v2,confusion_ceoccur_v2,'+
    'fever_ceoccur_v2,fatigue_ceoccur_v2,rash_ceoccur_v2,headache_ceoccur_v2,lymp_ceoccur_v2,'+
    'shortbreath_ceoccur_v2,wheeze_ceoccur_v2,cough_ceoccur_v2,coughhb_ceoccur_v2,coughsput_ceoccur_v2,'+
    'chestpain_ceoccur_v2,vomit_ceoccur_v2,diarrhoea_ceoccur_v2,abdopain_ceoccur_v2,oxy_vsyn'
).split(',')
mapped.loc[diagnoses_columns]

Unnamed: 0_level_0,icd9_code,short_title,long_title
mapping,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sorethroat_ceoccur_v2,462,Acute pharyngitis,Acute pharyngitis
sorethroat_ceoccur_v2,7841,Throat pain,Throat pain
sorethroat_ceoccur_v2,0341,Scarlet fever,Scarlet fever
sorethroat_ceoccur_v2,0340,Strep sore throat,Streptococcal sore throat
smoker,30430,Cannabis depend-unspec,Cannabis dependence unspecified
smoker,30522,Cannabis abuse-episodic,Cannabis abuse episodic
smoker,30431,Cannabis depend-contin,Cannabis dependence continuous
smoker,30432,Cannabis depend-episodic,Cannabis dependence episodic
smoker,30523,Cannabis abuse-in remiss,Cannabis abuse in remission
smoker,30433,Cannabis depend-remiss,Cannabis dependence in remission


#### Logic
* Each column is taken from diagnosis mapping table. If there is an ICD9 Code present during their admission, than the features is flagged as True

In [20]:
category_sql = ',\n  '.join([f"if({m} is null, 3, if({m}, 1, 2)) AS {m}" for m in diagnoses_columns])
athena_ctas('admission_signs_and_symptoms', f'''
SELECT 
  subject_id, 
  hadm_id,
  {category_sql}   
FROM diagnoses
ORDER BY subject_id
''')

DROP TABLE admission_signs_and_symptoms
DELETE s3://ecmo-athena-query-grr/athena/table/echmocard/admission_signs_and_symptoms/20200511_065504_00003_74jqt_6cd73e17-6d0f-444f-b524-2e88b0780839
CREATE TABLE admission_signs_and_symptoms
WITH (external_location='s3://ecmo-athena-query-grr/athena/table/echmocard/admission_signs_and_symptoms/') AS 
SELECT 
  subject_id, 
  hadm_id,
  if(sorethroat_ceoccur_v2 is null, 3, if(sorethroat_ceoccur_v2, 1, 2)) AS sorethroat_ceoccur_v2,
  if(smoker is null, 3, if(smoker, 1, 2)) AS smoker,
  if(seizures_cecoccur_v2 is null, 3, if(seizures_cecoccur_v2, 1, 2)) AS seizures_cecoccur_v2,
  if(conjunct_ceoccur_v2 is null, 3, if(conjunct_ceoccur_v2, 1, 2)) AS conjunct_ceoccur_v2,
  if(earpain_ceoccur_v2 is null, 3, if(earpain_ceoccur_v2, 1, 2)) AS earpain_ceoccur_v2,
  if(rheumatology_mhyr is null, 3, if(rheumatology_mhyr, 1, 2)) AS rheumatology_mhyr,
  if(chroniccard_mhyn is null, 3, if(chroniccard_mhyn, 1, 2)) AS chroniccard_mhyn,
  if(bleed_ceoccur_v2 is n

In [21]:
sample(connection, 'admission_signs_and_symptoms')

Unnamed: 0,subject_id,hadm_id,sorethroat_ceoccur_v2,smoker,seizures_cecoccur_v2,conjunct_ceoccur_v2,earpain_ceoccur_v2,rheumatology_mhyr,chroniccard_mhyn,bleed_ceoccur_v2,...,shortbreath_ceoccur_v2,wheeze_ceoccur_v2,cough_ceoccur_v2,coughhb_ceoccur_v2,coughsput_ceoccur_v2,chestpain_ceoccur_v2,vomit_ceoccur_v2,diarrhoea_ceoccur_v2,abdopain_ceoccur_v2,oxy_vsyn
0,10006,142345,2,1,2,2,2,2,1,2,...,2,2,2,2,2,2,2,2,2,2
1,10011,105331,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
2,10013,165520,2,2,2,2,2,2,1,2,...,2,2,2,2,2,2,2,2,2,2
3,10017,199207,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
4,10019,177759,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


### Outcomes 
Now we will create an outcomes table that we can use as labels in a machine learning model.

#### Logic
* If the admissions hospital_expire_flag is set, then the outcome is death = 4. 
* Otherwise, If there is a discharge time, discharged alive = 1 
* Otherwise they are assumed to be hospitalized  = 2

In [22]:
athena_ctas('outcomes', f'''
SELECT 
    subject_id,
    hadm_id,
    CASE
        WHEN hospital_expire_flag = '1' THEN 4
        WHEN dischtime <> null THEN 1
        ELSE 2
    END AS dsterm
FROM admissions
ORDER BY subject_id
''')

DROP TABLE outcomes
DELETE s3://ecmo-athena-query-grr/athena/table/echmocard/outcomes/20200511_065531_00003_q3ik4_b99a32b4-2387-4993-b3a0-aded5aecad7c
CREATE TABLE outcomes
WITH (external_location='s3://ecmo-athena-query-grr/athena/table/echmocard/outcomes/') AS 
SELECT 
    subject_id,
    hadm_id,
    CASE
        WHEN hospital_expire_flag = '1' THEN 4
        WHEN dischtime <> null THEN 1
        ELSE 2
    END AS dsterm
FROM admissions
ORDER BY subject_id



In [23]:
sample(connection, 'outcomes')

Unnamed: 0,subject_id,hadm_id,dsterm
0,10006,142345,2
1,10011,105331,4
2,10013,165520,4
3,10017,199207,2
4,10019,177759,4


### Treatments 

#### Logic
| ECMO column         | ECMO Form Name | Mimic iii table | Mimic iii column | Transform Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
|---------------------|----------------|-----------------|------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| oxygen_cmoccur      | treatment      | PROCEDURES_ICD  | icd9_code        | Oxygen Therapies/Procedures  939.5 - Hyperbaric oxygenation 004.9 - Supersaturated oxygen therapy 396.5 - Extracorporeal membrane oxygenation [ECMO] 949.6 - Other oxygen enrichment 011.6 - Intracranial oxygen monitoring?? 896.7 - Monitoring of cardiac output by oxygen consumption technique?? Other dependence on machines, supplemental oxygen                                                                                                                                                                                                                                                                                                                                                                                                                        |
| noninvasive_proccur | treatment      | PROCEDURES_ICD  | icd9_code        | Non-invasive ventilation  939.0 - Non-invasive mechanical ventilation                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
| invasive_proccur    | treatment      | PROCEDURES_ICD  | icd9_code        | Invasive ventilation (any)   967 - Other continuous invasive mechanical ventilation 967.0 - Continuous invasive mechanical ventilation of unspecified duration 967.1 - Continuous invasive mechanical ventilation for 96 consecutive hours or more 967.2 - Continuous invasive mechanical ventilation for less than 96 consecutive hours                                                                                                                                                                                                                                                                                                                                                                                                                                      |
| pronevent_prtrt     | treatment      | PROCEDURES_ICD  | icd9_code        | NO ICD9_CODE that maps to Prone ventilation.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
| inhalednit_cmtrt    | treatment      | PROCEDURES_ICD  | icd9_code        | 0012 - Administration of inhaled nitric oxide                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
| tracheo_prtrt       | treatment      | PROCEDURES_ICD  | icd9_code        | All Following have been mapped. Please review.  965.5 - Tracheostomy toilette icd9pcs  [312.9](https://icd.codes/icd9pcs/r/3129?ref=Tracheostomy) Other permanent tracheostomy icd9pcs  312.1 Mediastinal tracheostomy icd9pcs  317.4 Revision of tracheostomy icd9pcs  312 Permanent tracheostomy icd9pcs  972.3 Replacement of tracheostomy tube icd9pcs  311 Temporary tracheostomy icd9pcs  973.7 Removal of tracheostomy tube icd9pcs  314.2 Laryngoscopy and other tracheoscopy icd9pcs  314.1 Tracheoscopy through artificial stoma icd9pcs                                                                                                                                                                                                                            |
| extracorp_prtrt     | treatment      | PROCEDURES_ICD  | icd9_code        | All Following have been mapped. Please review.  985.9 Extracorporeal shockwave lithotripsy of other sites icd9pcs  509.2 Extracorporeal hepatic assistance icd9pcs  132 Extracapsular extraction of lens by linear extraction technique icd9pcs  985.1 Extracorporeal shockwave lithotripsy [ESWL] of the kidney, ureter and/or bladder icd9pcs  985.2 Extracorporeal shockwave lithotripsy [ESWL] of the gallbladder and/or bile duct icd9pcs  135 Other extracapsular extraction of lens icd9pcs  985 Extracorporeal shockwave lithotripsy [ESWL] icd9pcs  396.5 Extracorporeal membrane oxygenation [ECMO] icd9pcs  135.9 Other extracapsular extraction of lens icd9pcs  396.1 Extracorporeal circulation auxiliary to open heart surgery icd9pcs                         |
| rrt_prtrt           | treatment      | PROCEDURES_ICD  | icd9_code        | 394.3 Removal of arteriovenous shunt for renal dialysis icd9pcs  392.7 Arteriovenostomy for renal dialysis icd9pcs  394.2 Revision of arteriovenous shunt for renal dialysis icd9pcs  395.5 Reimplantation of aberrant renal vessel  556.1 Renal autotransplantation icd9pcs  392.4 Aorta-renal bypass icd9pcs  884.5 Arteriography of renal arteries icd9pcs  389.5 Venous catheterization for renal dialysis icd9pcs  394.3 Removal of arteriovenous shunt for renal dialysis icd9pcs  395.5 Reimplantation of aberrant renal vessel icd9pcs  002.5 Intravascular imaging of renal vessels icd9pcs  392.7 Arteriovenostomy for renal dialysis icd9pcs  553.2 Open ablation of renal lesion or tissue icd9pcs  553.3 Percutaneous ablation of renal lesion or tissue icd9pcs |
| inotrop_cmtrt       | treatment      | PROCEDURES_ICD  | icd9_code        | Following code included in the mapping  0017 - Infusion of vasopressor agent                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |

#### Notes
Procedure number in ICD-PCS matching keyword 'oxygen' are all captured. they may not all be categorised as therapies.


In [16]:
athena_delete('treatments')
athena_ctas('treatments', f'''

SELECT icustayz.subject_id, icustayz.hadm_id, icustayz.icu_hostdat, icustayz.hoedat, icustayz.hodur, 
oxygen_cmoccur, noninvasive_proccur, invasive_proccur, inhalednit_cmtrt, 
tracheo_prtrt, extracorp_prtrt, rrt_prtrt, inotrop_cmtrt
FROM

(SELECT
subject_id, hadm_id,
min(date_parse(intime,'%Y-%m-%d %H:%i:%S')) as icu_hostdat,
max(date_parse(outtime,'%Y-%m-%d %H:%i:%S')) as hoedat,
sum(cast(los as decimal)) as hodur
FROM
"mimic3-sampleset".icustays
group by subject_id, hadm_id) icustayz RIGHT OUTER JOIN


(SELECT subject_id, hadm_id,
CASE WHEN oxygen_cmoccur = true THEN 'yes' ELSE 'no' END AS oxygen_cmoccur,
CASE WHEN noninvasive_proccur = true THEN 'yes' ELSE 'no' END AS noninvasive_proccur,
CASE WHEN invasive_proccur = true THEN 'yes' ELSE 'no' END AS invasive_proccur,
CASE WHEN inhalednit_cmtrt = true THEN 'yes' ELSE 'no' END AS inhalednit_cmtrt,
CASE WHEN tracheo_prtrt = true THEN 'yes' ELSE 'no' END AS tracheo_prtrt,
CASE WHEN extracorp_prtrt = true THEN 'yes' ELSE 'no' END AS extracorp_prtrt,
CASE WHEN rrt_prtrt = true THEN 'yes' ELSE 'no' END AS rrt_prtrt,
CASE WHEN inotrop_cmtrt = true THEN 'yes' ELSE 'no' END AS inotrop_cmtrt
from(
SELECT
subject_id, hadm_id,
bool_or(icd9_code LIKE '967%') AS invasive_proccur,
bool_or(icd9_code IN ('9390')) AS noninvasive_proccur,
bool_or(icd9_code IN ('9395', '0049', '3965','9496','0116', '8967')) AS oxygen_cmoccur,
bool_or(icd9_code IN ('0012')) AS inhalednit_cmtrt,
bool_or(icd9_code IN ('9655','3129','3121','3174','312','3174','312', '9723','311','9737','3142','3141')) AS tracheo_prtrt,
bool_or(icd9_code IN ('9859','5092','132','9851','9852','135','985', '3965','1359','3961')) AS extracorp_prtrt,
bool_or(icd9_code IN ('3943','3927','3942','3955','5561','3942','3955', '5561','3924','8845','3895','3943','3955','0025','3927','5532','5533')) AS rrt_prtrt,
bool_or(icd9_code IN ('0017')) AS inotrop_cmtrt
FROM
"mimic3-sampleset".procedures_icd
group by subject_id, hadm_id
)) proc on proc.subject_id = icustayz.subject_id and proc.hadm_id=icustayz.hadm_id



''')

DROP TABLE treatments
DROP TABLE treatments
CREATE TABLE treatments
WITH (external_location='s3://ecmo-athena-query-grr/athena/table/echmocard/treatments/') AS 

SELECT icustayz.subject_id, icustayz.hadm_id, icustayz.icu_hostdat, icustayz.hoedat, icustayz.hodur, 
oxygen_cmoccur, noninvasive_proccur, invasive_proccur, inhalednit_cmtrt, 
tracheo_prtrt, extracorp_prtrt, rrt_prtrt, inotrop_cmtrt
FROM

(SELECT
subject_id, hadm_id,
min(date_parse(intime,'%Y-%m-%d %H:%i:%S')) as icu_hostdat,
max(date_parse(outtime,'%Y-%m-%d %H:%i:%S')) as hoedat,
sum(cast(los as decimal)) as hodur
FROM
"mimic3-sampleset".icustays
group by subject_id, hadm_id) icustayz RIGHT OUTER JOIN


(SELECT subject_id, hadm_id,
CASE WHEN oxygen_cmoccur = true THEN 'yes' ELSE 'no' END AS oxygen_cmoccur,
CASE WHEN noninvasive_proccur = true THEN 'yes' ELSE 'no' END AS noninvasive_proccur,
CASE WHEN invasive_proccur = true THEN 'yes' ELSE 'no' END AS invasive_proccur,
CASE WHEN inhalednit_cmtrt = true THEN 'yes' ELSE 'no' EN

In [17]:
sample(connection, 'treatments')

Unnamed: 0,subject_id,hadm_id,icu_hostdat,hoedat,hodur,oxygen_cmoccur,noninvasive_proccur,invasive_proccur,inhalednit_cmtrt,tracheo_prtrt,extracorp_prtrt,rrt_prtrt,inotrop_cmtrt
0,10132,197611,2123-08-23 20:03:32,2123-08-24 15:11:37,1.0,no,no,no,no,no,no,no,no
1,10044,124073,2152-10-03 02:02:49,2152-10-07 16:52:38,5.0,no,no,no,no,no,no,no,no
2,44083,131048,2112-05-23 12:32:06,2112-05-25 14:59:50,2.0,no,no,no,no,no,no,no,no
3,10090,176805,2124-01-12 14:27:16,2124-01-13 17:50:10,1.0,no,no,yes,no,no,no,no,no
4,10094,168074,2180-02-29 18:54:39,2180-03-04 21:20:36,4.0,no,no,no,no,no,no,no,yes


## Combine with Redcap
Now that we have some mimic iii fields translated to ecmocard fields, lets combine them with redcap data to augment our dataset.

### Create features and labels
We will use the comorbidities as features and the outcomes field as a label to create an XGBoost model to predict mortality.

#### Get RedCap features

In [None]:
features = 'malignantneo_mhyn,diabetes_mhyn,diabetiscomp_mhyn,obesity_mhyn,chronhaemo_mhyn,chronicneu_mhyn,'+\
           'dementia_mhyn,chroniccard_mhyn,aidshiv_mhyn,chronicpul_mhyn,asthma_mhyn,modliver_mhyn,renal_mhyn,'+\
           'other_mhyn'
transforms = ',\n  '.join([f"if({m} = '1', 1, 0) AS {m}" for m in features.split(',')])

redcap_features = pd.read_sql(f"""
SELECT
    if(dsterm = '4', 1, 0) AS dsterm,
    {transforms},
    if(smoking_mhyn = '1', 1, 0) AS smoker,
    if(smoking_mhyn = '3', 1, 0) AS former_smoker
FROM redcap.etl_redcap
""", connection)
redcap_features.head(5)

#### Get the mimic iii features

In [None]:
transforms = ',\n  '.join([f"if({m} = 1, 1, 0) AS {m}" for m in features.split(',')])
mimic_features = pd.read_sql(f'''
SELECT
    if(dsterm = 4, 1, 0) AS dsterm,
    {transforms},
    if(smoking_mhyn = 1, 1, 0) AS smoker,
    if(smoking_mhyn = 3, 1, 0) AS former_smoker
FROM echmocard.comorbidities AS c
JOIN echmocard.outcomes AS o ON (c.subject_id = o.subject_id)
''', connection)

mimic_features.head(10)

#### Combine and convert columns to integers

In [None]:
combined = pd.concat([mimic_features, redcap_features])
combined['dsterm'] = (combined['dsterm'] == 4).astype(int)
for feature in features.split(','):
    combined[feature] = (combined[feature] == True).astype(int)
combined.head(10)

## Create XGBoost model

#### Split into test, train and validation

In [None]:
rand_split = np.random.rand(len(combined))
train_data = combined[rand_split < 0.8]
validation_data = combined[(rand_split >= 0.8) & (rand_split < 0.9)]
test_data = combined[rand_split >= 0.9]

#### Write data to S3

In [None]:
INPUT_DATA_PATH = 'ml/comorbidities_xgboost/data'

# function that write files directly to s3
@contextlib.contextmanager
def s3_file(file):
    buffer = StringIO()
    yield buffer
    print(f"writing s3://{S3_BUCKET}/{INPUT_DATA_PATH}/{file}")
    s3 = boto3.resource('s3')
    s3.Object(S3_BUCKET, INPUT_DATA_PATH+'/'+file).put(Body=buffer.getvalue())
    
with s3_file('data.csv') as file: 
    train_data.to_csv(file, index=False, header=True)
    
with s3_file('train.csv') as file: 
    train_data.to_csv(file, index=False, header=False)    

with s3_file('val.csv') as file: 
    validation_data.to_csv(file, index=False, header=False)

with s3_file('test.csv') as file: 
    test_data.to_csv(file, index=False, header=False)



### Fit the model
#### Get training instance
Now we will get the container for XGBoost in our region, as well as a sagemaker session and execution role for training

In [None]:
container = get_image_uri(boto3.Session().region_name, 'xgboost', '0.90-1')
session = sagemaker.Session()
role = get_execution_role()

#### Create an estimator

In [None]:
xgb = sagemaker.estimator.Estimator(
    container,
    role, 
    train_instance_count= 1, 
    train_instance_type= 'ml.m4.xlarge',
    output_path= f's3://{S3_BUCKET}/ml/comorbidities_xgboost/model',
    sagemaker_session=session
)

#### Set the known hyperparameters

In [None]:
xgb.set_hyperparameters(
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    num_round=100,
    eval_metric='auc'
)

#### Hyperparameter tuning
Now we can create an HPO job to find the best hyperparmarameters by training multiple models.

In [None]:
hyperparameter_ranges = {'eta': ContinuousParameter(0, 1),
                        'min_child_weight': ContinuousParameter(1, 10),
                        'alpha': ContinuousParameter(0, 2),
                        'max_depth': IntegerParameter(1, 10)}

hpo_tuner = HyperparameterTuner(
    xgb,
    'validation:auc',
    hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=1,
    strategy='Bayesian'
)

s3_input_train = sagemaker.s3_input(s3_data=f's3://{S3_BUCKET}/{INPUT_DATA_PATH}/train.csv', content_type='csv')
s3_input_val = sagemaker.s3_input(s3_data=f's3://{S3_BUCKET}/{INPUT_DATA_PATH}/val.csv', content_type='csv')

In [None]:
hpo_tuner.fit({'train': s3_input_train, 'validation': s3_input_val}, include_cls_metadata=False)

In [None]:
boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=hpo_tuner.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']