In [1]:
import pandas as pd
import numpy as np
import math
import time
import numpy as np
# If pandas is not installed, please uncomment the following line:
#!pip install pandas
#!pip install sklearn
#!pip install pathos


Import data

In [2]:
#full

def read_mimic_csv(path):
    """Load a CSV file into one DataFrame, reading it in 100,000-row chunks.

    The path and the elapsed wall-clock time (seconds, rounded to one
    decimal place) are printed before the frame is returned.

    Parameters
    ----------
    path : str
        Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Every chunk concatenated under a fresh 0..n-1 index.
    """
    began = time.time()
    chunk_reader = pd.read_csv(path, chunksize=100000, iterator=True, low_memory=False)
    frame = pd.concat(chunk_reader, ignore_index=True)
    elapsed = round(time.time() - began, 1)
    print(path, ":", elapsed, 'seconds')
    return frame



In [3]:
path = './payload/full/' # change as needed
# Read the three MIMIC tables in a fixed order: patients, diagnoses, admissions.
patients, diagnoses, admissions = [
    read_mimic_csv(path + table_file)
    for table_file in ('PATIENTS.csv', 'DIAGNOSES_ICD.csv', 'ADMISSIONS.csv')
]


./payload/full/PATIENTS.csv : 0.1 seconds
./payload/full/DIAGNOSES_ICD.csv : 1.0 seconds
./payload/full/ADMISSIONS.csv : 0.7 seconds


#### ICD9 Feature Engineering

In [4]:
# Read the CCS reference text; the context manager closes the file handle
# as soon as the contents are in memory.
with open("ccs.txt", "r") as ccs_file:
    ccs = ccs_file.read()

# Create dictionary. Key is ICD9 code for a diagnosis. Value is general description of diagnosis.
# Everything before the first family name ('Tuberculosis') is preamble and is discarded.
first_family_at = ccs.find('Tuberculosis')
if first_family_at < 0:
    # Without this guard a missing marker would silently reduce `ccs` to its last character.
    raise ValueError("ccs.txt does not contain the expected 'Tuberculosis' marker")
ccs = ccs[first_family_at:]
icd9 = {}

def update_icd9(cur_value, section):
    """Register every code listed in ``section`` under the family ``cur_value``.

    ``section`` is the text that follows a family name: ICD9 codes separated
    by spaces and/or line breaks. Each code becomes a key of the module-level
    ``icd9`` dictionary, mapped to ``cur_value``.

    The previous hand-written scanner compared ``section[:4]`` with ``'\\n\\n'``
    and ``section[:2]`` with ``'\\n'``; a slice that wide never equals the
    shorter literal, so line breaks were only handled by accident. As a
    result, a code placed directly after a line break was skipped, an empty
    string could be stored as a key, and a trailing multi-line remainder
    could be stored as one key. Splitting on whitespace handles all of these
    layouts uniformly.

    Parameters
    ----------
    cur_value : str
        Name of the diagnosis family.
    section : str
        Whitespace-separated ICD9 codes belonging to that family.

    Returns
    -------
    None
        The function only updates ``icd9`` in place.
    """
    # str.split() with no argument treats any run of spaces/newlines as one
    # separator and never yields empty tokens.
    for cur_key in section.split():
        icd9[cur_key] = cur_value
            
# Families are separated by blank lines. Within a family, the first line is
# the family name and the remaining text lists the codes that belong to it.
for section in ccs.split(sep='\n\n'):
    first_break = section.find('\n')
    cur_value, section = section[:first_break], section[first_break + 1:]
    # Store each code as a key of `icd9`, with the family name as its value.
    update_icd9(cur_value, section)

# Replace each raw ICD9 code with its family name; codes missing from the
# dictionary become -1.
diagnoses['ICD9_CODE'] = diagnoses['ICD9_CODE'].apply(lambda code: icd9.get(code, -1))

#### Create LOS feature

In [5]:
# commented out for mortality classification:
# admissions = admissions[pd.isnull(admissions['DEATHTIME'])]
df = admissions[['SUBJECT_ID',
                 'HADM_ID',
                 'ADMISSION_TYPE',
                 'ADMITTIME']].copy()

# Parse each timestamp column once and reuse the result below.
admit_time = pd.to_datetime(admissions['ADMITTIME'])
discharge_time = pd.to_datetime(admissions['DISCHTIME'])

# Length of stay in whole hours (floored), stored as float so that missing
# timestamps become NaN. The earlier `.astype('timedelta64[h]')` form is
# rejected by pandas >= 2.0, which only accepts s/ms/us/ns resolutions;
# flooring the total seconds reproduces the old floor-to-hour result.
df['LOS'] = (discharge_time - admit_time).dt.total_seconds() // 3600
df['ADMITTIME'] = admit_time

In [9]:
# Keep only admissions whose length of stay is zero or positive
# (rows with a negative or missing LOS are dropped).
df = df.loc[df['LOS'] >= 0]

In [10]:
# One indicator column per diagnosis family, summed per admission so each
# row holds the count of diagnoses from that family for one HADM_ID.
diagnoses = (
    pd.get_dummies(diagnoses[['HADM_ID', 'ICD9_CODE']], drop_first=False)
    .groupby('HADM_ID')
    .sum()
)
# Left join: admissions without any diagnosis rows are kept.
df = df.merge(diagnoses, on='HADM_ID', how='left')

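Note to self: LOS is negative when a patient dies before arriving at the hospital. I intend to keep these rows for mortality classification, even though their LOS values are not meaningful. (Caveat: the `df['LOS'] >= 0` filter in the cell above removes these rows, so that filter must be skipped if they are to be kept.)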

#### Extracting age feature

In [11]:
# For mortality classification, I'm keeping DOD_HOSP so I can create a boolean response for death
# NB: DOD includes ALL deaths (before & after), while DOD_HOSP only includes deaths occurring inside the hospital. 
# Attach patient-level columns; drop DOD_HOSP too if not classifying mortality.
patient_columns = patients.drop(columns=['DOD', 'DOD_SSN', 'ROW_ID', 'EXPIRE_FLAG'])
df = df.merge(patient_columns, on='SUBJECT_ID', how='left')

median_dob_shift = 300 - 91.4 # For old patients (median age of 91.4), dob was shifted to be 300 yrs prior to first visit


def _age_in_years(elapsed):
    """Convert an admit-minus-birth time difference into an age in years.

    Ages of 300 years or more are reduced by ``median_dob_shift``.
    """
    years = elapsed.days / 365
    if years < 300:
        return years
    return years - median_dob_shift


# Subtract plain `date` objects rather than timestamps to get the elapsed time per row.
admit_dates = pd.to_datetime(df['ADMITTIME']).dt.date
birth_dates = pd.to_datetime(df['DOB']).dt.date
df['AGE'] = admit_dates - birth_dates
df['AGE'] = [_age_in_years(elapsed) for elapsed in df['AGE']]

"DOB is the date of birth of the given patient. Patients who are older than 89 years old at any time in the database have had their date of birth shifted to obscure their age and comply with HIPAA. The shift process was as follows: the patient’s age at their first admission was determined. The date of birth was then set to exactly 300 years before their first admission"

#### Extracting whether-they-died feature

In [12]:
# True when an in-hospital date of death is recorded, False otherwise.
df['DIED'] = df['DOD_HOSP'].notnull()

#### Trig transform for admit time

In [13]:
# Encode the admission hour as a point on the unit circle with a 24-hour
# period, so that 23:00 and 00:00 end up next to each other. The hour must be
# scaled to radians first: applying cos/sin to the raw hour (0-23) treats it
# as radians and yields values with no daily periodicity.
admit_hour_angle = 2 * np.pi * pd.to_datetime(df['ADMITTIME']).dt.hour / 24
df['ADMITHOUR_trig_x'] = np.cos(admit_hour_angle)
df['ADMITHOUR_trig_y'] = np.sin(admit_hour_angle)

In [14]:
# The death date and birth date have been turned into DIED and AGE; remove the raw columns.
df = df.drop(columns=['DOD_HOSP', 'DOB'])

##### Input Events

Create dummy variables and drop those with less information

In [15]:
# Report the frame's shape on either side of the dummy encoding.
shape_before = df.shape
print('Shape before adding dummy variables:', shape_before)
# drop_first=True omits the first level of each encoded column.
df = pd.get_dummies(data=df, drop_first=True)
print('Shape after adding dummy variables:', df.shape)

# It turns out ADMITHOUR after trig transform is highly predictive of whether you die

Shape before adding dummy variables: (58878, 292)
Shape after adding dummy variables: (58878, 294)


In [17]:
# Remove the SUBJECT_ID column from the frame.
df = df.drop('SUBJECT_ID', axis=1)

In [19]:
# Load first_day_chartevents.csv and discard the 'Unnamed: 0' column.
first_chartevents = read_mimic_csv('first_day_chartevents.csv').drop(columns=["Unnamed: 0"])
# Left join on HADM_ID keeps every row of df, including those with no chart-event row.
df = df.merge(first_chartevents, on='HADM_ID', how='left')

first_day_chartevents.csv : 0.2 seconds


16.464577436447144

In [21]:
# Load first_serv_pres_micro.csv into a DataFrame.
first_serv_pres_micro = read_mimic_csv(path='first_serv_pres_micro.csv')

first_serv_pres_micro.csv : 0.4 seconds


In [24]:
# Discard the 'Unnamed: 0' column.
first_serv_pres_micro = first_serv_pres_micro.drop('Unnamed: 0', axis=1)


In [36]:
# Copy the frame's index into an explicit HADM_ID column so it can be used as a merge key.
# NOTE(review): as read by read_mimic_csv (ignore_index=True) the index is a
# plain 0..n-1 range, so this stores row positions unless an earlier, unseen
# step set the index to the real HADM_ID values — confirm before relying on the merge.
hadm = first_serv_pres_micro.index.to_numpy()
first_serv_pres_micro = first_serv_pres_micro.assign(HADM_ID=hadm)

In [38]:
# Left join the first_serv_pres_micro columns onto df by HADM_ID; every row of df is kept.
df = df.merge(first_serv_pres_micro, on="HADM_ID", how='left')


In [41]:
# Write the assembled feature table to disk and show how long the write took (seconds).
write_began = time.time()

df.to_csv('pat_adm_diag_chart.csv')

time.time() - write_began

20.11062216758728

In [62]:
# Write `bert_df` to bert.csv.
# NOTE(review): `bert_df` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel unless another cell creates it — confirm
# where it comes from or remove the cell.
bert_df.to_csv('bert.csv')