In [2]:
import os

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [3]:
# Read in MIMIC files
raw_adm = pd.read_csv("/home/dc925/project/data/graphmimic/ADMISSIONS.csv.gz")
raw_patients = pd.read_csv('/home/dc925/project/data/graphmimic/PATIENTS.csv.gz')
dicd = pd.read_csv('/home/dc925/project/data/graphmimic/DIAGNOSES_ICD.csv.gz')
picd = pd.read_csv('/home/dc925/project/data/graphmimic/PROCEDURES_ICD.csv.gz')
raw_notes = pd.read_csv("/home/dc925/project/data/graphmimic/NOTEEVENTS.csv")

# Only keep the admission IDs that have notes
adm = raw_adm.copy()
adm_in_notes = raw_notes.HADM_ID.unique()
adm = adm[adm.HADM_ID.isin(adm_in_notes)]
# Subset notes to nursing and physician notes
notes = raw_notes.copy()
categories_keep = ['Nursing', 'Physician ', 'Nursing/other']
notes = notes[notes.CATEGORY.isin(categories_keep)]

# Merge notes with patient data
notes = notes.merge(
    adm[['HADM_ID', 'DISCHTIME', 'HOSPITAL_EXPIRE_FLAG']],
    on='HADM_ID', how='left'
)

# Time manipulation to only keep notes more than 24 hrs before discharge
notes.DISCHTIME = pd.to_datetime(notes.DISCHTIME)
notes.CHARTTIME = pd.to_datetime(notes.CHARTTIME)
notes.CHARTDATE = pd.to_datetime(notes.CHARTDATE) + pd.DateOffset(hours=23)

notes.CHARTTIME = notes.CHARTTIME.fillna(notes.CHARTDATE)

notes = notes[notes.CHARTTIME < notes.DISCHTIME - pd.DateOffset(hours=24)]

  interactivity=interactivity, compiler=compiler, result=result)


In [47]:
# Subset relevant columns and create balanced dataset
keep_cols = ['HADM_ID', 'SUBJECT_ID', 'TEXT', 'HOSPITAL_EXPIRE_FLAG']

pos_notes = notes.loc[notes.HOSPITAL_EXPIRE_FLAG == 1, keep_cols]

neg_notes = (
    notes
    .loc[:, keep_cols]
    .query("HOSPITAL_EXPIRE_FLAG == 0")
    .groupby("HADM_ID")
    .apply(lambda df: df.sample(n=4) if df.shape[0] >= 4 else df)
    .reset_index(drop=True)
)
sampled_notes = pd.concat([pos_notes, neg_notes]).drop_duplicates()
sampled_notes.HOSPITAL_EXPIRE_FLAG.value_counts()

0.0    135531
1.0    124702
Name: HOSPITAL_EXPIRE_FLAG, dtype: int64

In [48]:
# Get ICD-9 codes per visit and merge with notes
gp = dicd.groupby('HADM_ID')
icd_codes = gp['ICD9_CODE'].apply(list)
icd_codes = icd_codes.reset_index()
sampled_notes = sampled_notes.merge(
    icd_codes,
    on='HADM_ID', how='left'
)

# Function to convert ICD9 codes to broader version (take the first 3 characters; first 4 characters for E-codes)
def simplify_icd(code_list):
    new_list = []
    for code in code_list:
        code = str(code)
        if code=='nan':
            continue
        if code[0]=='E':
            new_code = code[:4]
        else:
            new_code = code[:3]
        new_list.append(new_code)
    return new_list

sampled_notes['BROAD_ICD9_CODE'] = sampled_notes['ICD9_CODE'].apply(lambda x: simplify_icd(x))

In [49]:
sampled_notes

Unnamed: 0,HADM_ID,SUBJECT_ID,TEXT,HOSPITAL_EXPIRE_FLAG,ICD9_CODE,BROAD_ICD9_CODE
0,146431.0,31916,"Respiratory failure, acute (not ARDS/[**Doctor...",1.0,"[5070, 51881, 55220, 1970, 1987, 1983, 5849, 5...","[507, 518, 552, 197, 198, 198, 584, 511, V66, ..."
1,116532.0,29487,Chief Complaint:\n 24 Hour Events:\n EKG - A...,1.0,"[03843, 5185, 78552, 42823, 486, 2762, 5990, 5...","[038, 518, 785, 428, 486, 276, 599, 584, 707, ..."
2,116532.0,29487,No significant events overnight\n Renal fail...,1.0,"[03843, 5185, 78552, 42823, 486, 2762, 5990, 5...","[038, 518, 785, 428, 486, 276, 599, 584, 707, ..."
3,111458.0,31820,"Sepsis, Severe (with organ dysfunction)\n As...",1.0,"[1970, 5849, 1578, 5119, 5990, 0388, 5582, 286...","[197, 584, 157, 511, 599, 038, 558, 286, 518, ..."
4,116532.0,29487,"Chief Complaint: urosepsis, erspiratory failur...",1.0,"[03843, 5185, 78552, 42823, 486, 2762, 5990, 5...","[038, 518, 785, 428, 486, 276, 599, 584, 707, ..."
...,...,...,...,...,...,...
260228,199995.0,19412,NEURO: PT ALERT ORIENTED FOLLOWING COMMANDS. G...,0.0,"[4210, 7464, 42971, 30401, 4412, 44284, V1259,...","[421, 746, 429, 304, 441, 442, V12, 041, 305, ..."
260229,199995.0,19412,Resp: [**Name (NI) 97**] pt intubated postop ...,0.0,"[4210, 7464, 42971, 30401, 4412, 44284, V1259,...","[421, 746, 429, 304, 441, 442, V12, 041, 305, ..."
260230,199998.0,27200,Resp Care\n\nPt received from OR and placed on...,0.0,"[41401, 9971, 9975, 42731, 78820, 4111, V4582,...","[414, 997, 997, 427, 788, 411, V45, E878, 429,..."
260231,199998.0,27200,Neuro: tmax 100.4; MAE; A&O x3; FC; very plea...,0.0,"[41401, 9971, 9975, 42731, 78820, 4111, V4582,...","[414, 997, 997, 427, 788, 411, V45, E878, 429,..."


In [50]:
# Count unique broad ICD9 codes

from collections import Counter
broad_codes = sampled_notes['BROAD_ICD9_CODE']
bc = list(broad_codes)
all_codes = [item for sublist in bc for item in sublist]
c = Counter(all_codes)
len(c)

1034

In [51]:
# Clean text
def isolate(text, chars):
    for c in chars:
        text = text.replace(c, f" {c} ")
    return text
def replace(text, chars, new=""):
    for c in chars:
        text = text.replace(c, new)
    return text
def clean_text(text):
    text = replace(text, "[**")
    text = replace(text, "**]")
    text = isolate(text, "~!@#$%^&*()_+-={}:\";',./<>?\\|`'")
    text = text.lower()
    
    return text

In [52]:
sampled_notes['TEXT'] = sampled_notes.TEXT.apply(clean_text)

In [53]:
sampled_notes

Unnamed: 0,HADM_ID,SUBJECT_ID,TEXT,HOSPITAL_EXPIRE_FLAG,ICD9_CODE,BROAD_ICD9_CODE
0,146431.0,31916,"respiratory failure , acute ( not ards / doc...",1.0,"[5070, 51881, 55220, 1970, 1987, 1983, 5849, 5...","[507, 518, 552, 197, 198, 198, 584, 511, V66, ..."
1,116532.0,29487,chief complaint : \n 24 hour events : \n ekg...,1.0,"[03843, 5185, 78552, 42823, 486, 2762, 5990, 5...","[038, 518, 785, 428, 486, 276, 599, 584, 707, ..."
2,116532.0,29487,no significant events overnight\n renal fail...,1.0,"[03843, 5185, 78552, 42823, 486, 2762, 5990, 5...","[038, 518, 785, 428, 486, 276, 599, 584, 707, ..."
3,111458.0,31820,"sepsis , severe ( with organ dysfunction ) \...",1.0,"[1970, 5849, 1578, 5119, 5990, 0388, 5582, 286...","[197, 584, 157, 511, 599, 038, 558, 286, 518, ..."
4,116532.0,29487,"chief complaint : urosepsis , erspiratory fa...",1.0,"[03843, 5185, 78552, 42823, 486, 2762, 5990, 5...","[038, 518, 785, 428, 486, 276, 599, 584, 707, ..."
...,...,...,...,...,...,...
260228,199995.0,19412,neuro : pt alert oriented following commands ...,0.0,"[4210, 7464, 42971, 30401, 4412, 44284, V1259,...","[421, 746, 429, 304, 441, 442, V12, 041, 305, ..."
260229,199995.0,19412,resp : name ( ni ) 97 pt intubated postop ...,0.0,"[4210, 7464, 42971, 30401, 4412, 44284, V1259,...","[421, 746, 429, 304, 441, 442, V12, 041, 305, ..."
260230,199998.0,27200,resp care\n\npt received from or and placed on...,0.0,"[41401, 9971, 9975, 42731, 78820, 4111, V4582,...","[414, 997, 997, 427, 788, 411, V45, E878, 429,..."
260231,199998.0,27200,neuro : tmax 100 . 4 ; mae ; a & o x3 ; f...,0.0,"[41401, 9971, 9975, 42731, 78820, 4111, V4582,...","[414, 997, 997, 427, 788, 411, V45, E878, 429,..."


In [54]:
# # this is for counting num tokens to see distribution of lengths
# from transformers import AutoTokenizer, BertTokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# sample = sampled_notes.sample(10000)
# sample['tokens'] = sample['TEXT'].apply(lambda x: tokenizer(x)['input_ids'])
# sample['length'] = sample['tokens'].apply(lambda x: len(x))
# sample['length'].plot.hist(bins=50)

In [55]:
final_sampled_notes = sampled_notes[['HADM_ID', 'SUBJECT_ID', 'TEXT', 'BROAD_ICD9_CODE', 'HOSPITAL_EXPIRE_FLAG']]

In [56]:
final_sampled_notes

Unnamed: 0,HADM_ID,SUBJECT_ID,TEXT,BROAD_ICD9_CODE,HOSPITAL_EXPIRE_FLAG
0,146431.0,31916,"respiratory failure , acute ( not ards / doc...","[507, 518, 552, 197, 198, 198, 584, 511, V66, ...",1.0
1,116532.0,29487,chief complaint : \n 24 hour events : \n ekg...,"[038, 518, 785, 428, 486, 276, 599, 584, 707, ...",1.0
2,116532.0,29487,no significant events overnight\n renal fail...,"[038, 518, 785, 428, 486, 276, 599, 584, 707, ...",1.0
3,111458.0,31820,"sepsis , severe ( with organ dysfunction ) \...","[197, 584, 157, 511, 599, 038, 558, 286, 518, ...",1.0
4,116532.0,29487,"chief complaint : urosepsis , erspiratory fa...","[038, 518, 785, 428, 486, 276, 599, 584, 707, ...",1.0
...,...,...,...,...,...
260228,199995.0,19412,neuro : pt alert oriented following commands ...,"[421, 746, 429, 304, 441, 442, V12, 041, 305, ...",0.0
260229,199995.0,19412,resp : name ( ni ) 97 pt intubated postop ...,"[421, 746, 429, 304, 441, 442, V12, 041, 305, ...",0.0
260230,199998.0,27200,resp care\n\npt received from or and placed on...,"[414, 997, 997, 427, 788, 411, V45, E878, 429,...",0.0
260231,199998.0,27200,neuro : tmax 100 . 4 ; mae ; a & o x3 ; f...,"[414, 997, 997, 427, 788, 411, V45, E878, 429,...",0.0


In [57]:
subjects = final_sampled_notes[['SUBJECT_ID', "HOSPITAL_EXPIRE_FLAG"]].drop_duplicates()

train_subj, rest_subj = train_test_split(
    subjects, 
    test_size=0.25, 
    random_state=0,
    stratify=subjects.HOSPITAL_EXPIRE_FLAG
)

valid_subj, test_subj = train_test_split(
    rest_subj.SUBJECT_ID.values,
    test_size=0.6,
    random_state=1,
    stratify=rest_subj.HOSPITAL_EXPIRE_FLAG
)

train_subj = train_subj.SUBJECT_ID.values

In [58]:
train_notes = final_sampled_notes[final_sampled_notes.SUBJECT_ID.isin(train_subj)].reset_index(drop=True)
valid_notes = final_sampled_notes[final_sampled_notes.SUBJECT_ID.isin(valid_subj)].reset_index(drop=True)
test_notes = final_sampled_notes[final_sampled_notes.SUBJECT_ID.isin(test_subj)].reset_index(drop=True)

In [59]:
train_notes

Unnamed: 0,HADM_ID,SUBJECT_ID,TEXT,BROAD_ICD9_CODE,HOSPITAL_EXPIRE_FLAG
0,116532.0,29487,chief complaint : \n 24 hour events : \n ekg...,"[038, 518, 785, 428, 486, 276, 599, 584, 707, ...",1.0
1,116532.0,29487,no significant events overnight\n renal fail...,"[038, 518, 785, 428, 486, 276, 599, 584, 707, ...",1.0
2,111458.0,31820,"sepsis , severe ( with organ dysfunction ) \...","[197, 584, 157, 511, 599, 038, 558, 286, 518, ...",1.0
3,116532.0,29487,"chief complaint : urosepsis , erspiratory fa...","[038, 518, 785, 428, 486, 276, 599, 584, 707, ...",1.0
4,116532.0,29487,chief complaint : \n 24 hour events : \n ...,"[038, 518, 785, 428, 486, 276, 599, 584, 707, ...",1.0
...,...,...,...,...,...
204845,199995.0,19412,neuro : pt alert oriented following commands ...,"[421, 746, 429, 304, 441, 442, V12, 041, 305, ...",0.0
204846,199995.0,19412,resp : name ( ni ) 97 pt intubated postop ...,"[421, 746, 429, 304, 441, 442, V12, 041, 305, ...",0.0
204847,199998.0,27200,resp care\n\npt received from or and placed on...,"[414, 997, 997, 427, 788, 411, V45, E878, 429,...",0.0
204848,199998.0,27200,neuro : tmax 100 . 4 ; mae ; a & o x3 ; f...,"[414, 997, 997, 427, 788, 411, V45, E878, 429,...",0.0


In [60]:
train_notes.to_csv("/home/dc925/project/data/graphmimic/train.csv", index=False, sep='\t')
valid_notes.to_csv("/home/dc925/project/data/graphmimic/valid.csv", index=False, sep='\t')
test_notes.to_csv("/home/dc925/project/data/graphmimic/test.csv", index=False, sep='\t')

In [61]:
# Create subset for experiments

train_sample = train_notes.sample(frac=0.1)
valid_sample = valid_notes.sample(frac=0.1)
test_sample = test_notes.sample(frac=0.1)

In [62]:
train_sample.to_csv("/home/dc925/project/data/graphmimic/sample/train.csv", index=False, sep='\t')
valid_sample.to_csv("/home/dc925/project/data/graphmimic/sample/valid.csv", index=False, sep='\t')
test_sample.to_csv("/home/dc925/project/data/graphmimic/sample/test.csv", index=False, sep='\t')

In [63]:
train_sample

Unnamed: 0,HADM_ID,SUBJECT_ID,TEXT,BROAD_ICD9_CODE,HOSPITAL_EXPIRE_FLAG
178646,174194.0,13101,pt . being maintained on 5 cpap - 15 ips - 60 ...,"[482, 482, 428, 707, 584, 518, 201, 038, 292]",0.0
180784,176307.0,26186,update\no : resp status : cpap 15ps 5 peep ....,"[433, 507, 518, 790, 041, 342, 784, 244]",0.0
47149,128652.0,31,"resp care note\nintubated , mech vented in si...","[345, 201, 515, 486, 401, 362, E933, 437]",1.0
49475,108620.0,6112,respiratory care note\n\npatient received from...,"[444, 434, 785, 427, 728, 518, 584, 492, 428, ...",1.0
179309,174872.0,31795,neonatology attending\nfull term infant referr...,"[V30, 747, 745]",0.0
...,...,...,...,...,...
136213,132835.0,3322,neonatology attending\n\ndol 56 pma 32 4 / 7 w...,"[748, 779, 765, 770, 779, 747, 530, 362, 592, ...",0.0
150093,146252.0,23835,tsicu nursing admit note\npt admitted from ed ...,"[825, 814, 825, E816, 873]",0.0
59661,102438.0,9380,resp care : pt remains trached with 8 . 0 por...,"[038, 518, 530, 519, 510, 511, 518, 584, 403, ...",1.0
172301,168222.0,18666,neonatology attending admit note : \n\n34 5 - ...,"[V31, 770, 765, 774, 765, V29, V05]",0.0


In [21]:
# Take a look at how the most common codes for expired vs. non-expired patients differ

In [64]:
# Create a dictionary of broad ICD codes to descriptions
broad_codes_list = pd.read_csv('/home/dc925/project/data/graphmimic/UMLS/broad_codes.txt', sep='|', header=None)
broad_codes_list.columns = ["CUI", "ICD_BROAD", "DESC"]
broad_dict = broad_codes_list.set_index("ICD_BROAD").to_dict()['DESC']
broad_dict['E849'] = "Place of occurrence" # this wasn't in UMLS for some reason

In [72]:
# Check that all codes that we got from MIMIC are covered in the dictionary
all_codes_in_mimic = list(c.keys())
for code in all_codes_in_mimic:
    if code not in broad_dict:
        print(code)

In [75]:
# Separate into positive and negative examples and create a counter for the codes in each df
all_pos_notes = final_sampled_notes[final_sampled_notes['HOSPITAL_EXPIRE_FLAG']==1.0]
all_neg_notes = final_sampled_notes[final_sampled_notes['HOSPITAL_EXPIRE_FLAG']==0.0]

pos_codes = list(all_pos_notes['BROAD_ICD9_CODE'])
neg_codes = list(all_neg_notes['BROAD_ICD9_CODE'])
all_pos_codes = [item for sublist in pos_codes for item in sublist]
all_neg_codes = [item for sublist in neg_codes for item in sublist]
pos_c = Counter(all_pos_codes)
neg_c = Counter(all_neg_codes)


In [89]:
pos_top = pos_c.most_common(20)

In [90]:
for code in pos_top:
    print(code[0], broad_dict[code[0]], code[1])

518 Other diseases of lung 106410
276 Disorders of fluid, electrolyte, and acid-base balance 91201
427 Cardiac dysrhythmias 87431
428 Heart failure 74649
584 Acute kidney failure 73108
038 Septicemia 66964
995 Certain adverse effects not elsewhere classified 53791
785 Symptoms involving cardiovascular system 50867
285 Other and unspecified anemias 40214
250 Diabetes mellitus 37904
707 Chronic ulcer of skin 36011
401 Essential hypertension 35124
599 Other disorders of urethra and urinary tract 28840
482 Other bacterial pneumonia 28738
998 Other complications of procedures, NEC 28103
414 Other forms of chronic ischemic heart disease 27330
V45 Other postprocedural states 26875
486 Pneumonia, organism NOS 25853
287 Purpura and other hemorrhagic conditions 24726
997 Complications affecting specified body systems, not elsewhere classified 24562


In [91]:
neg_top = neg_c.most_common(20)

In [92]:
for code in neg_top:
    print(code[0], broad_dict[code[0]], code[1])

401 Essential hypertension 46985
428 Heart failure 44849
427 Cardiac dysrhythmias 44558
276 Disorders of fluid, electrolyte, and acid-base balance 39213
414 Other forms of chronic ischemic heart disease 36798
250 Diabetes mellitus 36073
518 Other diseases of lung 30699
272 Disorders of lipoid metabolism 30492
285 Other and unspecified anemias 27836
765 Disorders relating to short gestation and unspecified low birthweight 22623
584 Acute kidney failure 22384
V45 Other postprocedural states 20347
V29 Observation and evaluation of newborns for suspected condition not found 18340
V30 Single liveborn 17830
599 Other disorders of urethra and urinary tract 17346
V05 Need for prophylactic vaccination and inoculation against single diseases 16979
530 Diseases of esophagus 16392
998 Other complications of procedures, NEC 14804
V58 Encounter for other and unspecified procedures and aftercare 14127
997 Complications affecting specified body systems, not elsewhere classified 14083
