## Cohort building

In this notebook we prototype the cohort building process before parts if the code are turned into more reproducible functions that can become part of the proto pacage.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

establish connection to DB and define helper function for running queries

In [5]:
import pandas as pd
from proto.etl.config import SSHInfoEicu, DBInfoEicu
from proto.etl.utils import connect_to_db_via_ssh, run_eicu_query, get_column_completeness, load_schema_for_modelling

conn = connect_to_db_via_ssh(SSHInfoEicu, DBInfoEicu)
cursor = conn.cursor()
query_schema = 'set search_path to eicu_crd;'

### Work out how many patient each hospital has, keep patients from top 5

In [3]:
query = """
    select hospitalid, count(patientunitstayid) as n
    from patient 
    group by hospitalid 
    order by n desc
"""

df_hospitals = run_eicu_query(query, conn)
df_hospitals.head()

Unnamed: 0,hospitalid,n
0,73,7059
1,167,6092
2,264,5237
3,420,4679
4,176,4328


In [4]:
# 30 hospitals (out of the 208) have half of the patients
df_hospitals.head(30).n.sum()

# for the basic cohort building we'll keep the top 5
top_hospitals = df_hospitals.hospitalid[:5].values

### Create a new view with the patients from the top 5 hospitals

- a separate sql script was created for this now, in `setup/eicu_sql_scripts/top5hospitals/patient_top5hospitals.sql`
- but you can execute it from here too

In [5]:
# keep patients from top 5 in a new view called patient_top5hospitals so we can reuse them
query = """
    create view patient_top5hospitals as
    with top_hospitals as (
      select hospitalid, count(patientunitstayid) as n
      from patient
      group by hospitalid
      order by n desc
      limit 5
    )
    select * 
    from patient
    join (
      select hospitalid as top5hospitalid from top_hospitals
    ) as tt
    on tt.top5hospitalid = patient.hospitalid
"""
cursor.execute(query_schema + query)

DuplicateTable: relation "patient_top5hospitals" already exists


### Extract schema for all tables

In [10]:
query = """
select table_name from information_schema.tables
where table_schema = 'eicu_crd'
"""
df_tables = run_eicu_query(query, conn)
df_tables

Unnamed: 0,table_name
0,admissiondrug
1,admissiondx
2,allergy
3,apacheapsvar
4,apachepatientresult
5,apachepredvar
6,careplancareprovider
7,careplaneol
8,careplangeneral
9,careplangoal


In [11]:
df_schemas = None
eicu_tables = df_tables.table_name.values[:31]
for table in eicu_tables:
    query = """
    select *
    from information_schema.columns
    where table_schema = 'eicu_crd'
    and table_name =  '%s'
    """ % table
    df_schema = run_eicu_query(query, conn)
    if df_schemas is None:
        df_schemas = df_schema
    else:
        df_schemas = pd.concat([df_schemas, df_schema], axis=0)
df_schemas = df_schemas.reset_index().drop('index', axis=1)
df_schemas.to_csv('../setup/eicu_schema.csv')

In [13]:
df_schemas.head(25)

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,eicu,eicu_crd,admissiondrug,admissiondrugid,1,,NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
1,eicu,eicu_crd,admissiondrug,patientunitstayid,2,,NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
2,eicu,eicu_crd,admissiondrug,drugoffset,3,,NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
3,eicu,eicu_crd,admissiondrug,drugenteredoffset,4,,NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
4,eicu,eicu_crd,admissiondrug,drugnotetype,5,,YES,character varying,255.0,1020.0,...,NO,,,,,,NO,NEVER,,YES
5,eicu,eicu_crd,admissiondrug,specialtytype,6,,YES,character varying,255.0,1020.0,...,NO,,,,,,NO,NEVER,,YES
6,eicu,eicu_crd,admissiondrug,usertype,7,,NO,character varying,255.0,1020.0,...,NO,,,,,,NO,NEVER,,YES
7,eicu,eicu_crd,admissiondrug,rxincluded,8,,YES,character varying,5.0,20.0,...,NO,,,,,,NO,NEVER,,YES
8,eicu,eicu_crd,admissiondrug,writtenineicu,9,,YES,character varying,5.0,20.0,...,NO,,,,,,NO,NEVER,,YES
9,eicu,eicu_crd,admissiondrug,drugname,10,,NO,character varying,255.0,1020.0,...,NO,,,,,,NO,NEVER,,YES


### Define  important tables and columns

In [14]:
tables_to_keep = [
    'admissiondx', 
    'apacheapsvar',
    'apachepatientresult', 
    'apachepredvar', 
    'diagnosis', 
    'hospital',
    'lab', 
    'medication', 
    'nurseassessment', 
    'nursecare', 
    'nursecharting',
    'pasthistory', 
    'patient', 
    'treatment',
    'vitalaperiodic',
    'vitalperiodic'
]

In [15]:
df_schema_final = df_schemas[df_schemas.table_name.isin(tables_to_keep)]
df_schema_final = df_schema_final.set_index('table_name')
df_schema_final = df_schema_final[
    [
        'column_name',
        'ordinal_position',
        'data_type',
        'udt_name'       
    ]
]
df_schema_final.to_csv('../setup/modelling_tables_schema.csv')

### Modify the `modelling_tables_schema.csv` manually

I went through all the columns to:
- identify the time domain column
- mark certain cols for deletion which were either non-relevant or not populated (relying on the `get_column_completeness`)
- using this updated version, we now can create a naming object that holds all relevant table info for cohort building.
- note this final updated version (`modelling_schema.csv`) was moved to the etl folder

In [44]:
cols = [
    'noninvasivesystolic',
    'noninvasivediastolic',
    'noninvasivemean',
    'paop',
    'cardiacoutput',
    'cardiacinput',
    'svr',
    'svri',
    'pvr',
    'pvri'
]
get_column_completeness('vitalaperiodic', cols, conn)

noninvasivesystolic     88.053315
noninvasivediastolic    88.063824
noninvasivemean         88.569557
paop                     0.142077
cardiacoutput            5.161081
cardiacinput             1.038888
svr                      6.775992
svri                     1.030649
pvr                      0.076195
pvri                     0.075729
dtype: float64

In [42]:
cols = [
    'temperature',
    'sao2',
    'heartrate',
    'respiration',
    'cvp',
    'etco2',
    'systemicsystolic',
    'systemicdiastolic',
    'systemicmean',
    'pasystolic',
    'padiastolic',
    'pamean',
    'st1',
    'st2',
    'st3',
    'icp'
]

get_column_completeness('vitalperiodic', cols, conn)

temperature           9.001937
sao2                 90.616198
heartrate            99.528301
respiration          87.611368
cvp                  13.061665
etco2                 3.015807
systemicsystolic     18.977738
systemicdiastolic    18.976979
systemicmean         19.131762
pasystolic            2.809772
padiastolic           2.809429
pamean                2.829539
st1                  38.592952
st2                  40.873118
st3                  37.635932
icp                   1.793958
dtype: float64

#### Have a look at the final `modelling_schema.csv`

In [20]:
df_schema_final = load_schema_for_modelling()
df_schema_final.head(20)

Unnamed: 0_level_0,column_name,ordinal_position,data_type,udt_name,time_var,numeric,to_keep,data_table
table_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
admissiondx,admissiondxid,1,integer,int4,0,1,1,1
admissiondx,patientunitstayid,2,integer,int4,0,1,1,1
admissiondx,admitdxenteredoffset,3,integer,int4,1,1,1,1
admissiondx,admitdxpath,4,character varying,varchar,0,0,1,1
admissiondx,admitdxname,5,character varying,varchar,0,0,0,1
admissiondx,admitdxtext,6,character varying,varchar,0,0,0,1
apacheapsvar,apacheapsvarid,1,integer,int4,0,1,1,1
apacheapsvar,patientunitstayid,2,integer,int4,0,1,1,1
apacheapsvar,intubated,3,smallint,int2,0,1,1,1
apacheapsvar,vent,4,smallint,int2,0,1,1,1
