In [None]:
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
%matplotlib inline


# Connection settings for the local MIMIC-III PostgreSQL database.
sqluser = 'postgres'
dbname = 'mimic'
schema_name = 'mimiciii'

#sqluser = 'mimicuser'
#dbname = 'mimic'

# Password keyword only necessary if user-specified password required to connect to database.
# The credential is read from the standard libpq PGPASSWORD environment variable so it does
# not have to be stored in the notebook; the previous literal is kept purely as a fallback
# so that existing local setups keep working unchanged.
sqlpassword = os.environ.get('PGPASSWORD', 'postgre')
con = psycopg2.connect(dbname=dbname, user=sqluser, password=sqlpassword)
cur = con.cursor()

# One row per hospital admission: demographics, admission/discharge dates,
# age at admission (in years, rounded) and length of stay (in days).
query = (
    "SELECT"
    " a.subject_id, a.hadm_id, a.ethnicity, a.insurance, p.gender, cast(a.admittime as date), cast(a.dischtime as date)"
    # age = days between date of birth and admission, converted to years
    ", ROUND( (cast(a.admittime as date) - cast(p.dob as date)) / 365.242)"
    " AS age_on_admiss"
    # length of stay = days between admission and discharge
    ", cast(a.dischtime as date) - cast(a.admittime as date)"
    " AS length_of_stay"
    " FROM admissions a"
    " INNER JOIN patients p"
    " ON p.subject_id = a.subject_id"
    " GROUP BY"
    " a.subject_id, a.hadm_id, a.ethnicity, a.insurance, p.gender, a.admittime, p.dob, a.dischtime"
    " ORDER BY a.subject_id;"
)

# Point this connection at the MIMIC schema before running any query
cur.execute('SET search_path to {}'.format(schema_name))
df = pd.read_sql_query(sql=query, con=con)

df.dtypes

In [None]:
df.info()

How many rows correspond to neonates?

In [None]:
len(df[df['age_on_admiss'] < 1])

How many categories of ethnicity are there?

In [None]:
df['ethnicity'].unique()

How many categories of gender are there?

In [None]:
df['gender'].unique()

How many types of insurance are represented?

In [None]:
df['insurance'].unique()

## Readmissions Query and filtering data frame for adult patients

In [None]:
# Early-readmission query.
# The CTE computes, for every admission, the number of days between its admission date and
# the discharge date of the same patient's previous admission (lag() over the patient's
# admissions ordered by admittime).  A patient's first admission has no previous row, so its
# readmit_time is NULL and it is removed by the WHERE clause; only admissions that started
# at most 30 days after the preceding discharge are returned.
query_rm = """WITH readmits AS
(
            SELECT subject_id, hadm_id, admittime, dischtime, diagnosis, insurance, admission_type,
            cast(admittime as date) - lag(cast(dischtime as date))
                                            OVER (PARTITION BY subject_id ORDER BY admittime)
                                            AS readmit_time
            FROM admissions
)
SELECT subject_id, hadm_id, readmit_time
FROM readmits
WHERE readmit_time <= 30
ORDER BY subject_id, admittime;"""
# NOTE(review): presumably the search_path set on this connection in the first cell is
# still in effect, which is why the statement below is disabled - confirm.
#cur.execute('SET search_path to ' + schema_name)
df_rm = pd.read_sql_query(query_rm, con)
df_rm.dtypes

In [None]:
# Attach the readmission gap (where one exists) to every admission row
df2 = pd.merge(df, df_rm, how="outer", on=["subject_id", "hadm_id"])
# A non-null gap means the admission itself is an early readmission;
# the flag must be derived before the missing gaps are replaced by 0
df2['is_readmit'] = df2['readmit_time'].notnull().values
df2['readmit_time'] = df2['readmit_time'].fillna(0).values

In [None]:
# Replacing age of >89 y.o. patients with 90.
# NOTE(review): the threshold used is 100, not 89 - presumably because the computed age of
# these patients is far above 100 in this dataset; confirm against the MIMIC documentation.
admission_age = df2['age_on_admiss']
df2['age_on_admiss'] = admission_age.mask(admission_age > 100, 90).values

# Only keeping rows corresponding to "adults" (non-neonates), i.e. a rounded age above 0
is_adult = df2['age_on_admiss'] > 0
df2 = df2.loc[is_adult].copy().reset_index(drop=True)

In [None]:
df2.head()

In [None]:
len(df2)

## Creating readmission vs. nonreadmission classes

In [None]:
# One group per patient (subject_id); inside each group the stays are put in
# chronological order of admission

df2_grouped = df2.groupby('subject_id').apply(lambda stays: stays.sort_values('admittime'))

In [None]:
df2_grouped.head()

In [None]:
# Checking grouping using a patient with known readmission

df2_grouped.loc[99982]

In [None]:
# Creating a new column indicating whether each stay was followed by an early readmission

df2_follow = df2_grouped.copy()

# Index level 0 of the grouped frame is the subject_id and the rows of each patient are in
# admission order.  A stay is "followed by a readmit" exactly when the NEXT row of the same
# patient is flagged is_readmit, so shifting the flag up by one row inside every patient
# group gives the answer directly.  The last stay of each patient has no successor; its
# shifted value is missing and is filled with False.
# (Vectorised replacement for the former row-by-row .iloc loop; same result, no Python loop.)
next_stay_is_readmit = df2_follow.groupby(level=0, sort=False)['is_readmit'].shift(-1)
df2_follow['followed_by_readmit'] = next_stay_is_readmit.fillna(False).astype(bool)

In [None]:
# Checking assignment using same patient with known readmission

df2_follow.loc[99982]

In [None]:
# Re-setting index

df2_follow.index = df2_follow.index.droplevel(0)
df2_follow.head()

## Pulling in CHARTEVENTS and LABEVENTS data

### Vital signs data

In [None]:
%%time
# Pulling in vital signs data
# Code adapted from https://github.com/MIT-LCP/mimic-code/blob/master/concepts/pivot/pivoted-vital.sql
#
# The "ce" CTE turns the selected chartevents item ids into one column per vital sign
# (heart rate, systolic BP, respiratory rate, temperature, SpO2).  Each CASE keeps a value
# only when it lies inside the plausibility range written in the query; Fahrenheit
# temperatures are converted to Celsius.  Rows flagged as errors are excluded.
# The outer SELECT averages the values per (hadm_id, charttime), so the result has one row
# per admission and chart time.

query_vitals = """ with ce as
(
  select ce.hadm_id
    , ce.charttime
    , (case when itemid in (211,220045) and valuenum > 0 and valuenum < 300 then valuenum else null end) as HeartRate
    , (case when itemid in (51,442,455,6701,220179,220050) and valuenum > 0 and valuenum < 400 then valuenum else null end) as SysBP
    , (case when itemid in (615,618,220210,224690) and valuenum > 0 and valuenum < 70 then valuenum else null end) as RespRate
    , (case when itemid in (223761,678) and valuenum > 70 and valuenum < 120 then (valuenum-32)/1.8 -- converted to degC in valuenum call
               when itemid in (223762,676) and valuenum > 10 and valuenum < 50  then valuenum else null end) as TempC
    , (case when itemid in (646,220277) and valuenum > 0 and valuenum <= 100 then valuenum else null end) as SpO2
  from chartevents ce
  -- exclude rows marked as error
  where ce.error IS DISTINCT FROM 1
  and ce.itemid in
  (
  -- HEART RATE
  211, --"Heart Rate"
  220045, --"Heart Rate"

  -- Systolic

  51, --	Arterial BP [Systolic]
  442, --	Manual BP [Systolic]
  455, --	NBP [Systolic]
  6701, --	Arterial BP #2 [Systolic]
  220179, --	Non Invasive Blood Pressure systolic
  220050, --	Arterial Blood Pressure systolic

  -- RESPIRATORY RATE
  618,--	Respiratory Rate
  615,--	Resp Rate (Total)
  220210,--	Respiratory Rate
  224690, --	Respiratory Rate (Total)


  -- SPO2, peripheral
  646, 220277,


  -- TEMPERATURE
  223762, -- "Temperature Celsius"
  676,	-- "Temperature C"
  223761, -- "Temperature Fahrenheit"
  678 --	"Temperature F"

  )
)
select
    ce.hadm_id
  , ce.charttime
  , avg(HeartRate) as HeartRate
  , avg(SysBP) as SysBP
  , avg(RespRate) as RespRate
  , avg(TempC) as TempC
  , avg(SpO2) as SpO2
from ce
group by ce.hadm_id, ce.charttime
order by ce.hadm_id, ce.charttime;"""

# The aggregation cells further down refer to these result columns by their lower-case names
df_vitals = pd.read_sql_query(query_vitals, con)

print(df_vitals.head(5))

In [None]:
df_vitals.info()

### Calculating min., median, and max. of all vital signs for each hospital stay

In [None]:
pd.isnull(df_vitals).any()

No rows where hadm_id is null, so no need to drop rows at this point.

In [None]:
# Aggregating and calculating statistics for each hospital stay

# The columns are selected with a list: indexing a GroupBy with a bare tuple of names
# (groupby(...)['a', 'b']) is deprecated and rejected by current pandas releases.
vital_cols = ['heartrate', 'sysbp', 'resprate', 'tempc', 'spo2']
vitals_stats = df_vitals.groupby('hadm_id')[vital_cols].agg(['min', 'median', 'max'])
vitals_stats.head()

In [None]:
# Resetting index to make hadm_id a column again for merging consistency

vitals_statsdf = vitals_stats.reset_index()
vitals_statsdf.head()

In [None]:
# Collapse the two-level (measurement, statistic) header into single names such as
# "heartrate_min"; a trailing underscore left by an empty second level is stripped

flat_vital_names = []
for level_values in vitals_statsdf.columns.values:
    flat_vital_names.append('_'.join(level_values).rstrip('_'))
vitals_statsdf.columns = flat_vital_names

In [None]:
vitals_statsdf.head()

### Lab data

In [None]:
%%time
# Pulling in lab data
# Code adapted from https://github.com/MIT-LCP/mimic-code/blob/master/concepts/pivot/pivoted-lab.sql, added RDW values
#
# The inner query ("pvt") maps the selected labevents item ids to a lab label (several item
# ids can share one label) and nulls out values above the per-lab upper limit; values that
# are NULL, zero or negative are removed by its WHERE clause.
# The outer query pivots the labels into one column per lab and averages the values per
# (hadm_id, charttime).

query_lab = """
SELECT
    pvt.hadm_id, pvt.charttime
  , avg(CASE WHEN label = 'BILIRUBIN' THEN valuenum ELSE null END) as BILIRUBIN
  , avg(CASE WHEN label = 'CREATININE' THEN valuenum ELSE null END) as CREATININE
  , avg(CASE WHEN label = 'HEMATOCRIT' THEN valuenum ELSE null END) as HEMATOCRIT
  , avg(CASE WHEN label = 'HEMOGLOBIN' THEN valuenum ELSE null END) as HEMOGLOBIN
  , avg(CASE WHEN label = 'LACTATE' THEN valuenum ELSE null END) as LACTATE
  , avg(CASE WHEN label = 'PLATELET' THEN valuenum ELSE null END) as PLATELET
  , avg(CASE WHEN label = 'PTT' THEN valuenum ELSE null END) as PTT
  , avg(CASE WHEN label = 'INR' THEN valuenum ELSE null END) as INR
  , avg(CASE WHEN label = 'WBC' THEN valuenum ELSE null end) as WBC
  , avg(CASE WHEN label = 'RDW' THEN valuenum ELSE null end) as RDW
FROM
( -- begin query that extracts the data
  SELECT le.hadm_id, le.charttime
  -- here we assign labels to ITEMIDs
  -- this also fuses together multiple ITEMIDs containing the same data
  , CASE
        WHEN itemid = 50885 THEN 'BILIRUBIN'
        WHEN itemid = 50912 THEN 'CREATININE'
        WHEN itemid = 50810 THEN 'HEMATOCRIT'
        WHEN itemid = 51221 THEN 'HEMATOCRIT'
        WHEN itemid = 50811 THEN 'HEMOGLOBIN'
        WHEN itemid = 51222 THEN 'HEMOGLOBIN'
        WHEN itemid = 50813 THEN 'LACTATE'
        WHEN itemid = 51265 THEN 'PLATELET'
        WHEN itemid = 51275 THEN 'PTT'
        WHEN itemid = 51237 THEN 'INR'
        WHEN itemid = 51300 THEN 'WBC'
        WHEN itemid = 51301 THEN 'WBC'
        WHEN itemid = 51277 THEN 'RDW'
      ELSE null
    END AS label
  , -- add in some sanity checks on the values
  -- the where clause below requires all valuenum to be > 0, so these are only upper limit checks
    CASE
      WHEN itemid = 50885 and valuenum >   150 THEN null -- mg/dL 'BILIRUBIN'
      WHEN itemid = 50912 and valuenum >   150 THEN null -- mg/dL 'CREATININE'
      WHEN itemid = 50810 and valuenum >   100 THEN null -- % 'HEMATOCRIT'
      WHEN itemid = 51221 and valuenum >   100 THEN null -- % 'HEMATOCRIT'
      WHEN itemid = 50811 and valuenum >    50 THEN null -- g/dL 'HEMOGLOBIN'
      WHEN itemid = 51222 and valuenum >    50 THEN null -- g/dL 'HEMOGLOBIN'
      WHEN itemid = 50813 and valuenum >    50 THEN null -- mmol/L 'LACTATE'
      WHEN itemid = 51265 and valuenum > 10000 THEN null -- K/uL 'PLATELET'
      WHEN itemid = 51275 and valuenum >   150 THEN null -- sec 'PTT'
      WHEN itemid = 51237 and valuenum >    50 THEN null -- 'INR'
      WHEN itemid = 51300 and valuenum >  1000 THEN null -- K/uL 'WBC'
      WHEN itemid = 51301 and valuenum >  1000 THEN null -- K/uL 'WBC'
      WHEN itemid = 51277 and valuenum >   100 THEN null -- % 'RDW'
    ELSE le.valuenum
    END AS valuenum
  FROM labevents le
  WHERE le.ITEMID in
  (
    -- comment is: LABEL | CATEGORY | FLUID | NUMBER OF ROWS IN LABEVENTS
    50885, -- BILIRUBIN, TOTAL | CHEMISTRY | BLOOD | 238277
    50912, -- CREATININE | CHEMISTRY | BLOOD | 797476
    51221, -- HEMATOCRIT | HEMATOLOGY | BLOOD | 881846
    50810, -- HEMATOCRIT, CALCULATED | BLOOD GAS | BLOOD | 89715
    51222, -- HEMOGLOBIN | HEMATOLOGY | BLOOD | 752523
    50811, -- HEMOGLOBIN | BLOOD GAS | BLOOD | 89712
    50813, -- LACTATE | BLOOD GAS | BLOOD | 187124
    51265, -- PLATELET COUNT | HEMATOLOGY | BLOOD | 778444
    51275, -- PTT | HEMATOLOGY | BLOOD | 474937
    51237, -- INR(PT) | HEMATOLOGY | BLOOD | 471183
    51301, -- WHITE BLOOD CELLS | HEMATOLOGY | BLOOD | 753301
    51300, -- WBC COUNT | HEMATOLOGY | BLOOD | 2371
    51277  -- RDW | HEMATOLOGY | BLOOD | 746817
  )
  AND valuenum IS NOT NULL AND valuenum > 0 -- lab values cannot be 0 and cannot be negative
) pvt
GROUP BY pvt.hadm_id, pvt.charttime
ORDER BY pvt.hadm_id, pvt.charttime;"""

# The aggregation cells further down refer to these result columns by their lower-case names
df_lab = pd.read_sql_query(query_lab, con)

print(df_lab.head(5))

In [None]:
df_lab.info()

In [None]:
# Keep only lab rows that are tied to a hospital admission (hadm_id present)

has_admission = df_lab['hadm_id'].notnull()
df2_lab = df_lab[has_admission]
df2_lab.info()

### Calculating min., median, and max. of all lab values for each hospital stay

In [None]:
# Aggregating and calculating statistics for each hospital stay

# The columns are selected with a list: indexing a GroupBy with a bare tuple of names
# (groupby(...)['a', 'b']) is deprecated and rejected by current pandas releases.
lab_cols = ['bilirubin', 'creatinine', 'hematocrit', 'hemoglobin', 'lactate', 'platelet',
            'ptt', 'inr', 'wbc', 'rdw']
lab_stats = df2_lab.groupby('hadm_id')[lab_cols].agg(['min', 'median', 'max'])
lab_stats.head()

In [None]:
# Resetting index to make hadm_id a column again for merging consistency

lab_statsdf = lab_stats.reset_index()
lab_statsdf.head()

In [None]:
# Collapse the two-level (lab, statistic) header into single names such as
# "creatinine_min"; a trailing underscore left by an empty second level is stripped

flat_lab_names = []
for level_values in lab_statsdf.columns.values:
    flat_lab_names.append('_'.join(level_values).rstrip('_'))
lab_statsdf.columns = flat_lab_names
lab_statsdf.head()

### What is the average number of each type of measurement per stay?

In [None]:
def labStats(feature, df=None):
    """Print how often `feature` was measured per hospital stay and return a preview.

    Parameters
    ----------
    feature : str
        Name of the lab column to summarise.
    df : pandas.DataFrame, optional
        Frame holding an ``hadm_id`` column and the ``feature`` column.  Defaults to the
        notebook-level ``df2_lab`` so existing ``labStats(feature)`` calls are unchanged.

    Returns
    -------
    pandas.DataFrame
        First five rows of the per-stay measurement counts, indexed by ``hadm_id`` with a
        single column named after ``feature``.
    """
    if df is None:
        df = df2_lab
    # count() skips missing values, so this is the number of actual measurements per stay;
    # groupby sorts the hadm_id keys, matching the former set_index/sort_index sequence
    labStat_grouped = df[['hadm_id', feature]].groupby('hadm_id').count()
    # Address the single count column by name instead of by position: positional [0] on a
    # label-indexed Series is deprecated in current pandas
    per_stay = labStat_grouped[feature]
    print('Ave. number of', feature, 'measurements per hospital stay:', per_stay.mean())
    print('Std. dev of', feature, 'measurements per hospital stay:', per_stay.std())
    print('Min. number of', feature, 'measurements per hospital stay:', per_stay.min())
    print('Max. number of', feature, 'measurements per hospital stay:', per_stay.max())
    return labStat_grouped.head()

for i in df_lab.columns[2:]:
    print(labStats(i))

The number of measurements per hospital stay varies widely for every type of lab test.

## Distributions for lab features

In [None]:
def labDistr(feature):
    """Plot the distribution of one lab column of ``df_lab`` on its own figure.

    Parameters
    ----------
    feature : str
        Name of the column in ``df_lab`` to plot; missing values are dropped first.

    Returns
    -------
    The object returned by ``sns.distplot`` for the drawn distribution.
    """
    feature_data = df_lab[feature].dropna()
    plt.figure()
    # Draw first, show afterwards: calling plt.show() before plotting displayed an empty
    # figure and left the distribution on an implicitly created one
    ax = sns.distplot(feature_data)
    plt.show()
    return ax

for i in df_lab.columns[2:]:
    print(labDistr(i))

## Pulling in prescription data

In [None]:
%%time
# Creating one dataframe per drug (we can consolidate after feature selection?)

def query_prescrip(drug_desc):
    """Return the prescriptions matching `drug_desc` as a DataFrame.

    `drug_desc` is an SQL condition (including the closing semicolon) that is appended
    after the trailing AND of the shared query below.  The shared part joins prescriptions
    to admissions and keeps rows whose startdate is before the discharge time.
    """
    prescrip_sql = """ 
        SELECT p.subject_id, p.hadm_id, p.startdate, p.enddate,
        p.drug, p.drug_name_poe, p.drug_name_generic, a.admittime, a.dischtime
        FROM prescriptions p
        INNER JOIN admissions a
        ON p.hadm_id = a.hadm_id
        WHERE startdate < dischtime
        AND """ + drug_desc
    return pd.read_sql_query(prescrip_sql, con)

# Vancomycin and neo-synephrine require special queries
df_vanco = query_prescrip("""(lower(drug) LIKE '%vancomycin%' 
OR lower(drug) LIKE '%vancocin%' 
OR lower(drug_name_poe) LIKE '%vancomycin%'
OR lower(drug_name_generic) LIKE '%vancomycin%');""")

df_neosyn = query_prescrip("""(lower(drug) LIKE 'neo%ephrine%'
      OR lower(drug) LIKE 'phenylephrine%');""")

# Other drugs use same query format
def gen_drug_desc(drug):
    """Build the drug-specific condition appended to the shared prescriptions query.

    Parameters
    ----------
    drug : str
        Drug name (or fragment) to search for.

    Returns
    -------
    str
        A parenthesised SQL condition, terminated by ``;``, that matches `drug` as a
        substring of the lower-cased ``drug``, ``drug_name_poe`` or ``drug_name_generic``
        column.
    """
    # The columns are compared in lower case, so the search term must be lower case too,
    # otherwise a capitalised name could never match.  Single quotes are doubled so the
    # term cannot terminate the SQL string literal early.
    needle = drug.lower().replace("'", "''")
    pattern = "'%" + needle + "%'"
    searched_columns = ['drug', 'drug_name_poe', 'drug_name_generic']
    clauses = ['lower(' + column + ') LIKE ' + pattern for column in searched_columns]
    return '(' + ' OR '.join(clauses) + ');'

df_neostig = query_prescrip(gen_drug_desc('neostigmine'))
df_glycopyr = query_prescrip(gen_drug_desc('glycopyrrolate'))
df_ceftriax = query_prescrip(gen_drug_desc('ceftriaxone'))
df_atropsulf = query_prescrip(gen_drug_desc('atropine sulfate'))
df_chlorhex = query_prescrip(gen_drug_desc('chlorhexidine'))
df_nitroglyc = query_prescrip(gen_drug_desc('nitroglycerin'))
df_nitropruss = query_prescrip(gen_drug_desc('nitroprusside'))
df_lansopraz = query_prescrip(gen_drug_desc('lansoprazole'))
    

#df_vanco = query_prescrip(vanco_desc)
#query_prescrip(neosyn_desc)


In [None]:
# Report the number of prescription rows retrieved for every drug

prescrip_frames = [
    ('vancomycin', df_vanco),
    ('neosynephrine', df_neosyn),
    ('neostigmine', df_neostig),
    ('glycopyrrolate', df_glycopyr),
    ('ceftriaxone', df_ceftriax),
    ('atropine sulfate', df_atropsulf),
    ('chlorhexidine', df_chlorhex),
    ('nitroglycerin', df_nitroglyc),
    ('nitroprusside', df_nitropruss),
    ('lansoprazole', df_lansopraz),
]
for drug_label, drug_frame in prescrip_frames:
    print(drug_label + ':', drug_frame.shape[0], 'rows')

In [None]:
df_vanco.head()

In [None]:
# Turning each prescription into a True/False feature per admission

drug_list = [
    'vancomycin', 'neosynephrine', 'neostigmine', 'glycopyrrolate', 'ceftriaxone',
    'atropine sulfate', 'chlorhexidine', 'nitroglycerin', 'nitroprusside', 'lansoprazole',
]
df_druglist = [
    df_vanco, df_neosyn, df_neostig, df_glycopyr, df_ceftriax,
    df_atropsulf, df_chlorhex, df_nitroglyc, df_nitropruss, df_lansopraz,
]

# Work on a copy of df2: one new column per drug, True when the admission's hadm_id
# appears in that drug's prescription rows
df2_prescrip = df2.copy()
for drug_name, drug_frame in zip(drug_list, df_druglist):
    df2_prescrip[drug_name] = df2_prescrip['hadm_id'].isin(drug_frame['hadm_id'])
df2_prescrip.head()

## Pulling in procedure data

In [None]:
%%time
# Creating one dataframe per procedure (we can consolidate after feature selection?)

def query_proc(proc_desc):
    """Return the coded procedures matching `proc_desc` as a DataFrame.

    `proc_desc` is an SQL condition (including the closing semicolon) that becomes the
    WHERE clause of the shared query below, which joins procedures_icd to its ICD-9
    lookup table d_icd_procedures.
    """
    proc_sql = """ 
        SELECT pr.subject_id, pr.hadm_id, d.icd9_code, d.short_title, d.long_title
        FROM procedures_icd pr
        INNER JOIN d_icd_procedures d
        ON pr.icd9_code = d.icd9_code
        WHERE """ + proc_desc
    return pd.read_sql_query(proc_sql, con)

df_parentinf = query_proc("short_title = 'Parent infus nutrit sub';")
df_serumtransf = query_proc("short_title = 'Serum transfusion NEC';")
df_skinsut = query_proc("short_title = 'Skin closure NEC';")
df_vasopress = query_proc("short_title = 'Infusion of vasopressor';")
df_mammCAB = query_proc("short_title = '1 int mam-cor art bypass';")
df_aortoCBT = query_proc("short_title = 'Aortocor bypas-2 cor art' OR short_title = 'Aortocor bypas-3 cor art';")
df_platetransf = query_proc("short_title = 'Platelet transfusion';")
df_intercath = query_proc("short_title = 'Insert intercostal cath';")

In [None]:
# Report the number of procedure rows retrieved for every procedure

proc_frames = [
    ('parenteral infusion', df_parentinf),
    ('serum transfusion', df_serumtransf),
    ('skin suture', df_skinsut),
    ('infusion of vasopressor', df_vasopress),
    ('single internal mammary-coronary artery bypass', df_mammCAB),
    ('aortocoronary bypass T', df_aortoCBT),
    ('platelet transfusion', df_platetransf),
    ('insertion of intercostal catheter', df_intercath),
]
for proc_label, proc_frame in proc_frames:
    print(proc_label + ':', proc_frame.shape[0], 'rows')


In [None]:
df_parentinf.head()

In [None]:
143045 in df_aortoCBT['hadm_id'].unique()

In [None]:
# Turning each procedure into a True/False feature per admission

proc_list = [
    'parent_inf', 'serum_transf', 'skin_sut', 'inf_vasopress',
    'single_int_mammCAB', 'aorto_CBT', 'platelet_transf', 'ins_inter_cath',
]
df_proclist = [
    df_parentinf, df_serumtransf, df_skinsut, df_vasopress,
    df_mammCAB, df_aortoCBT, df_platetransf, df_intercath,
]

# Work on a copy of df2: one new column per procedure, True when the admission's hadm_id
# appears in that procedure's rows
df2_proc = df2.copy()
for proc_name, proc_frame in zip(proc_list, df_proclist):
    df2_proc[proc_name] = df2_proc['hadm_id'].isin(proc_frame['hadm_id'])
df2_proc.head()

## Pulling in diagnosis category ("department") data

In [None]:
q_merge = """SELECT d.SUBJECT_ID, d.HADM_ID, d.ICD9_CODE, lookup.SHORT_TITLE, lookup.LONG_TITLE 
                from DIAGNOSES_ICD d
                left join d_icd_diagnoses lookup 
                on d.ICD9_CODE = lookup.ICD9_CODE;"""

df_department = pd.read_sql_query(q_merge, con)

In [None]:
len(df_department)

In [None]:
df_department.head()

In [None]:
df_department['icd9_code'].value_counts()

In [None]:
# Reading the ICD 9 Codes. First three digit are important. Last two if available are supplementary
df_department['ICD_PC'] = df_department['icd9_code'].astype(str).str[:3]

In [None]:
df_department.head()

In [None]:
#converting to numeric
df_department['int_icd9']=pd.to_numeric(df_department.ICD_PC, errors='coerce')

In [None]:
df_department.head()

In [None]:
# Map every diagnosis to its ICD-9 chapter ("Department") using the numeric three-character
# prefix; codes starting with E or V form the last category.
icd_prefix = df_department['int_icd9']
icd_text = df_department['icd9_code']

# Inclusive (first, last) prefix of every chapter after the first one
chapter_bounds = [
    (140, 239), (240, 279), (280, 289), (290, 319), (320, 389), (390, 459),
    (460, 519), (520, 579), (580, 629), (630, 679), (680, 709), (710, 739),
    (740, 759), (760, 779), (780, 799), (800, 999),
]

# The first chapter has an exclusive lower bound of 0
conditions = [(icd_prefix > 0) & (icd_prefix <= 139)]
for first_code, last_code in chapter_bounds:
    conditions.append((icd_prefix >= first_code) & (icd_prefix <= last_code))
conditions.append(icd_text.str.startswith('E', na=False) | icd_text.str.startswith('V', na=False))

# Chapter names, in the same order as the conditions above
choices = [
    'infectious and parasitic diseases',
    'neoplasms',
    'endocrine, nutritional and metabolic diseases, and immunity disorders',
    'diseases of the blood and blood-forming organs',
    'mental disorders',
    'diseases of the nervous system and sense organs',
    'diseases of the circulatory system',
    'diseases of the respiratory system',
    'diseases of the digestive system',
    'diseases of the genitourinary system',
    'complications of pregnancy, childbirth, and the puerperium',
    'diseases of the skin and subcutaneous tissue',
    'diseases of the musculoskeletal system and connective tissue',
    'congenital anomalies',
    'certain conditions originating in the perinatal period',
    'symptoms, signs, and ill-defined conditions',
    'injury and poisoning',
    'external causes of injury and supplemental classification',
]
# Rows matching no condition get the placeholder string 'null'
df_department['Department'] = np.select(conditions, choices, default='null')
#print(df_merge)

In [None]:
df_department.head(15)

In [None]:
#df_merge = df_merge.drop(['ICD_PC'],axis=1)

In [None]:
df_department.Department.value_counts()

In [None]:
df_department2 = df_department[df_department['Department']!='null']

In [None]:
#ICD codes are null
df_department2.head()

In [None]:
df_department_freq = df_department2.groupby('hadm_id')['Department'].nunique()

In [None]:
df_department_freq = df_department_freq.reset_index();

In [None]:
df_department_freq = df_department_freq.rename(index=str, columns = {"Department": "No_of_Depart"});

In [None]:
df_department3 = pd.merge(df_department2,df_department_freq, on = 'hadm_id', how = 'inner')

In [None]:
len(df_department3)

In [None]:
# Most frequent diagnosis category ("Department") per hospital stay:
# value_counts() orders the categories by descending frequency, so index[0] is the top one
df_top1_depart = df_department3.groupby(['hadm_id'])['Department'].agg(lambda x:x.value_counts().index[0])

In [None]:
df_top1_depart = df_top1_depart.reset_index()

In [None]:
df_top1_depart = df_top1_depart.rename(index=str, columns = {'Department': 'Depart1' } );

In [None]:
df_department4 = df_department3[['hadm_id','No_of_Depart']].drop_duplicates()
df_top1_merge = pd.merge(df_top1_depart[['hadm_id','Depart1']],df_department4 , on ='hadm_id', how = 'left')

In [None]:
df_top1_merge.head()

In [None]:
len(df_top1_merge)

In [None]:
#Some of the patients have only one diagnoses.
df_top2_depart = df_department3.groupby(['hadm_id'])['Department'].value_counts().nsmallest()
df_top2_depart

In [None]:
df_department3.Department.nunique()

In [None]:
df_circ = df_department3[df_department3['Department']=='diseases of the circulatory system']
df_geni = df_department3[df_department3['Department']=='diseases of the genitourinary system']
df_resp = df_department3[df_department3['Department']=='diseases of the respiratory system']

In [None]:
df_endo = df_department3[df_department3['Department']=='endocrine, nutritional and metabolic diseases, and immunity disorders']
df_musko = df_department3[df_department3['Department']=='diseases of the musculoskeletal system and connective tissue']
df_blood = df_department3[df_department3['Department']=='diseases of the blood and blood-forming organs']
df_diges = df_department3[df_department3['Department']=='diseases of the digestive system']
df_nerv = df_department3[df_department3['Department']=='diseases of the nervous system and sense organs']

In [None]:
df_neo = df_department3[df_department3['Department']=='neoplasms']
df_sym = df_department3[df_department3['Department']=='symptoms, signs, and ill-defined conditions']
df_ext = df_department3[df_department3['Department']=='external causes of injury and supplemental classification']
df_ment = df_department3[df_department3['Department']=='mental disorders']
df_injury = df_department3[df_department3['Department']=='injury and poisoning']
df_skin = df_department3[df_department3['Department']=='diseases of the skin and subcutaneous tissue']
df_infect = df_department3[df_department3['Department']=='infectious and parasitic diseases']
df_congen = df_department3[df_department3['Department']=='congenital anomalies']


In [None]:
df_pren = df_department3[df_department3['Department']=='certain conditions originating in the perinatal period']
df_preg = df_department3[df_department3['Department']=='complications of pregnancy, childbirth, and the puerperium']


In [None]:
depart_list = [
    'diseases of the circulatory system',
    'diseases of the genitourinary system',
    'diseases of the respiratory system',
    'endocrine, nutritional and metabolic diseases, and immunity disorders',
    'diseases of the musculoskeletal system and connective tissue',
    'diseases of the blood and blood-forming organs',
    'diseases of the digestive system',
    'diseases of the nervous system and sense organs',
    'neoplasms',
    'symptoms, signs, and ill-defined conditions',
    'external causes of injury and supplemental classification',
    'mental disorders',
    'injury and poisoning',
    'diseases of the skin and subcutaneous tissue',
    'infectious and parasitic diseases',
    'congenital anomalies',
]
# Diagnosis rows of each category, in the same order as depart_list
df_departlist = [
    df_circ, df_geni, df_resp, df_endo, df_musko, df_blood, df_diges, df_nerv,
    df_neo, df_sym, df_ext, df_ment, df_injury, df_skin, df_infect, df_congen,
]


# Create new copy of df2 with one column per diagnosis category, values True/False:
# True when the admission's hadm_id has at least one diagnosis in that category
df_depart = df2.copy()
for depart_name, depart_rows in zip(depart_list, df_departlist):
    df_depart[depart_name] = df_depart['hadm_id'].isin(depart_rows['hadm_id'])


df_depart.head()

In [None]:
df_depart.columns

In [None]:
df_depart_final = pd.merge(df_depart,df_top1_merge , on ='hadm_id', how = 'left')

In [None]:
df_depart_final.head()

In [None]:
df_depart_final.columns

## Merging dataframes

In [None]:
#Merging Admission, Vitals, Lab and Diagnoses(Department) Dataframes

In [None]:
merge_df1 = pd.merge(df2_follow, lab_statsdf, on ='hadm_id', how ='inner')

In [None]:
len(merge_df1)

In [None]:
merge_df2 = pd.merge(merge_df1, vitals_statsdf, on = 'hadm_id', how = 'inner')

In [None]:
len(merge_df2)

In [None]:
# Bring in the per-category diagnosis flags plus the top category and the category count;
# depart_list holds the flag column names in the order they were created
depart_feature_cols = ['hadm_id'] + list(depart_list) + ['Depart1', 'No_of_Depart']
merge_df3 = merge_df2.merge(df_depart_final[depart_feature_cols], how='left', on='hadm_id')

In [None]:
merge_df3.head()

In [None]:
merge_df3.dtypes;

In [None]:
len(merge_df3)

In [None]:
# Combine the procedure flags with the prescription flags (one column per entry of drug_list)
prescrip_feature_cols = ['hadm_id'] + list(drug_list)
merge_df4 = df2_proc.merge(df2_prescrip[prescrip_feature_cols], how='inner', on='hadm_id')

In [None]:
# Add the procedure and prescription flags of merge_df4 to the features already in merge_df3
treatment_feature_cols = ['hadm_id'] + list(proc_list) + list(drug_list)
merge_df5 = merge_df3.merge(merge_df4[treatment_feature_cols], how='inner', on='hadm_id')

In [None]:
merge_df5.head()

In [None]:
print(len(merge_df5.columns))
#print(merge_df5.columns.values)

## Selecting the readmissions/nonreadmissions instances

In [None]:
df_readmit = merge_df5[merge_df5['followed_by_readmit'] == True]
num_readmit = len(df_readmit)
print('There were', num_readmit, 'stays which were followed by early readmissions.')

(There were 3188 such stays when we selected readmits before merging with all the other feature dataframes.)

In [None]:
df_nonreadmit = merge_df5[merge_df5['followed_by_readmit'] == False]#.reset_index(drop=True)
num_nonreadmit = len(df_nonreadmit)
print('There were', num_nonreadmit, 'stays which were NOT followed by early readmissions.')

(There were 47678 such stays when we selected nonreadmits before merging with all the other feature dataframes.)

In [None]:
# Random selection (without replacement) of as many stays not followed by a readmission
# as there are stays followed by one
import random
# Ensuring reproducibility
random.seed(10)

# Positional row numbers into df_nonreadmit; random.sample raises ValueError if
# num_readmit is larger than num_nonreadmit
nonreadmit_samp = random.sample(range(num_nonreadmit), num_readmit)
# Showing first 10
nonreadmit_samp[:10]

In [None]:
df_nonreadmit_samp = df_nonreadmit.iloc[nonreadmit_samp,:]
df_nonreadmit_samp.head()

# Concatenating the readmission and non-readmission-sample dataframes

In [None]:
df_data = pd.concat([df_readmit, df_nonreadmit_samp]).reset_index(drop=True)
print(len(df_data), 'rows')

In [None]:
df_data.head()

In [None]:
df_data.columns.values

### Visualize distribution of vital signs data for readmits vs. nonreadmits

In [None]:
# Selecting only vital signs and followed_by_readmit columns

df_data_vitals = df_data[np.concatenate((vitals_statsdf.columns.values[1:], ['followed_by_readmit']))]

In [None]:
df_data_vitals.head()

In [None]:
# One split violin per vital-sign statistic, comparing the two outcome classes

# Every column except the trailing followed_by_readmit flag is a vital-sign statistic
vitals_colnames = df_data_vitals.columns.values[:-1]
df_dv_melt = pd.melt(
    df_data_vitals,
    id_vars='followed_by_readmit',
    value_vars=vitals_colnames,
    var_name='Vital Signs',
    value_name='Measured',
)

with sns.plotting_context('poster'):
    plt.figure(figsize=(10, 5))
    sns.violinplot(data=df_dv_melt, x='Vital Signs', y='Measured',
                   hue='followed_by_readmit', split=True)
    plt.xticks(rotation=60)
    plt.savefig('vitals_violin.png', bbox_inches='tight')

In [None]:
# One figure per vital sign, each showing its three statistic columns

vitals_name_unit = [
    'Heart Rate (bpm)',
    'Systolic Blood Pressure (mmHg)',
    'Respiratory Rate (insp/min)',
    'Temperature (degC)',
    'Blood Oxygen Saturation (%)',
]
# Each vital sign owns three consecutive columns of vitals_colnames
df_vitals_list = [
    df_data_vitals.melt(id_vars='followed_by_readmit',
                        value_vars=vitals_colnames[3 * position:3 * position + 3],
                        var_name='Measure', value_name=unit_label)
    for position, unit_label in enumerate(vitals_name_unit)
]
df_hr_melt, df_sbp_melt, df_rr_melt, df_t_melt, df_sat_melt = df_vitals_list

with sns.plotting_context('poster'):
    for i, melted_vital in enumerate(df_vitals_list):
        plt.figure(figsize=(8,6))
        sns.violinplot(data=melted_vital, x='Measure', y=vitals_name_unit[i],
                       hue='followed_by_readmit', split=True)
        plt.xticks(rotation=45)
        # the first three plots carry the legend top-left, the remaining ones bottom-right
        plt.legend(loc='upper left' if i < 3 else 'lower right')
        plt.savefig('vitals' + str(i) + '_violin.png', bbox_inches='tight')

### Visualize distribution of lab data for readmits vs. nonreadmits

In [None]:
# Selecting only lab values and followed_by_readmit columns

df_data_lab = df_data[np.concatenate((lab_statsdf.columns.values[1:], ['followed_by_readmit']))]

In [None]:
df_data_lab.head()

In [None]:
# One split violin per lab statistic, comparing the two outcome classes

# Every column except the trailing followed_by_readmit flag is a lab statistic
lab_colnames = df_data_lab.columns.values[:-1]
df_lab_melt = pd.melt(
    df_data_lab,
    id_vars='followed_by_readmit',
    value_vars=lab_colnames,
    var_name='Lab Values',
    value_name='Measured',
)

with sns.plotting_context('poster'):
    plt.figure(figsize=(20, 5))
    sns.violinplot(data=df_lab_melt, x='Lab Values', y='Measured',
                   hue='followed_by_readmit', split=True)
    plt.xticks(rotation=60)
    plt.savefig('lab_violin.png', bbox_inches='tight')

In [None]:
# Creating individual violinplot for each lab value

# Axis labels (name and unit) for the ten lab tests
lab_name_unit = ['Bilirubin (mg/dL)', 'Creatinine (mg/dL)', 'Hematocrit (%)', 'Hemoglobin (g/dL)', 'Lactate (mmol/L)',
                 'Platelet (K/uL)', 'PTT (sec)', 'INR', 'WBC (K/uL)', 'RDW (%)']
# NOTE(review): everything below is a bare triple-quoted string, not executed code, so no
# per-lab figures are produced by this cell.  The text is an unfinished copy of the vitals
# plotting loop: apart from the first melt it still refers to the vitals frames, labels and
# file names.
'''df_bili_melt = df_data_lab.melt(id_vars = 'followed_by_readmit', value_vars=lab_colnames[:3],
                                 var_name='Measure', value_name=lab_name_unit[0])
df_vitals_list = [df_hr_melt, df_sbp_melt, df_rr_melt, df_t_melt, df_sat_melt]
with sns.plotting_context('poster'):
    for i in range(len(df_vitals_list)):
        plt.figure(figsize=(8,6))
        sns.violinplot(x='Measure', y=vitals_name_unit[i], hue='followed_by_readmit', data=df_vitals_list[i], split=True)
        plt.xticks(rotation=45)
        if i < 3:
            plt.legend(loc='upper left')
        else:
            plt.legend(loc='lower right')
        plt.savefig('vitals' + str(i) + '_violin.png', bbox_inches='tight')'''

In [None]:
#merge_df = df2.merge(df_vitals, on = "hadm_id", how = "inner")

In [None]:
# merge_agg_df is used by later cells (null check, correlation heatmap, age summary), but
# its definition had been commented out, so those cells raise NameError on a fresh kernel.
# Rebuild it in one step from frames defined earlier in the notebook: join the admissions
# with the charted vitals and average every numeric column per hospital stay.
# NOTE(review): the join repeats each admission once per chart time and may be slow or
# memory-hungry on the full dataset.
merge_agg_df = (
    df2.merge(df_vitals, on="hadm_id", how="inner")
    .groupby('hadm_id')
    .mean(numeric_only=True)
)


In [None]:
# Number of missing values per column (the bare attribute without call parentheses only
# displayed the bound method instead of computing anything)
merge_agg_df.isnull().sum()

In [None]:
#merge_agg_df.isnull().sum()

In [None]:
#merge_agg_df.shape

In [None]:
#merge_agg_df.describe()

In [None]:
#merge_agg_df.isnull().sum()

In [None]:
#merge_agg_df.describe()

In [None]:
#(merge_agg_df.isnull().sum()/merge_agg_df.count())* 100

In [None]:
corr = merge_agg_df.corr()
sns.heatmap(corr)


In [None]:
merge_agg_df.age_on_admiss.describe()

In [None]:
q10 = 'select distinct(icd9_code) from procedures_icd'
df10 = pd.read_sql_query(q10, con)

In [None]:
df10;