In [1]:
import re
from warnings import simplefilter

import numpy as np
import pandas as pd
# Silence pandas' DtypeWarning so the read_csv cells below do not flood the output.
simplefilter(action="ignore", category=pd.errors.DtypeWarning)
# Never elide columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# 2. cohort selection via inclusion criteria

Restricting the data to the analysis cohort first is an effective way to speed up the later preprocessing steps.

Accordingly, we filter the cohort before the refining process.

If you load the tables with `pandas.read_csv`, reducing the size of the cohort saves a large amount of time.

In [2]:
# ICU stays table; later cells use its hadm_id and age_at_intime columns to build the cohort flags.
icustays = pd.read_csv('processed_data/communal/icustays_wage,ethn,gender.csv')

In [8]:
# Suspected-infection lookup table: antibiotic *names* are matched against
# prescriptions, antibiotic *ids* against inputevents.
tmp_df = pd.read_csv('processed_data/sepsis/suspected_infection.csv')
is_antibiotic = tmp_df['category'] == 'antibiotics'
antibiotic_names = tmp_df.loc[is_antibiotic & (tmp_df['includedin'] == 'prescriptions'), 'name'].tolist()
antibiotic_ids = tmp_df.loc[is_antibiotic & (tmp_df['includedin'] == 'inputevents'), 'id'].tolist()

# Infection-related ICD code lists, one list per ICD version.
tmp_df = pd.read_csv('processed_data/sepsis/icd_diagnoses_lists.csv')
icd_9_codes, icd_10_codes = (
    tmp_df.loc[tmp_df['icd_version'] == version, 'icd_code'].tolist()
    for version in (9, 10)
)

# Item ids used to detect blood cultures; matched below against
# procedure.itemid and microbiology.spec_itemid.
blood_culture_ids = [70011, 70012, 225401, 225437]

In [9]:
# Item dictionary; its label, category and itemid columns are used below to cross-check the antibiotic item ids.
d_items = pd.read_csv('processed_data/communal/d_items.csv')

In [13]:
# Build one "any of these names" pattern from the antibiotic name list.
# - re.escape: regex metacharacters inside a name (parentheses, '+', '.', ...)
#   must be matched literally, not interpreted as regex syntax.
# - lower(): the labels are lower-cased before matching, so the names must be too.
# - blank / NaN names are dropped: NaN would break the join, and an empty
#   alternative ("a||b") would match every single row.
antibiotic_terms = [
    re.escape(str(name).strip().lower())
    for name in antibiotic_names
    if pd.notna(name) and str(name).strip()
]
tmp = '|'.join(antibiotic_terms)

label_lower = d_items.label.astype('str').str.lower()
if antibiotic_terms:
    name_match = label_lower.str.contains(tmp, regex=True)
else:
    # An empty pattern matches everything; with no names nothing should match.
    name_match = pd.Series(False, index=d_items.index)

# tmp_cond is reused by the subset check in the following cell.
tmp_cond = name_match & (d_items.category == 'Antibiotics')

# Scratch export of the first two columns of the matching items for manual inspection.
d_items.loc[tmp_cond, d_items.columns[0:2]].to_csv('tmp_2.csv', index = False)

In [18]:
# Sanity check: the item ids found by name must form a proper subset of the curated antibiotic id list.
set(antibiotic_ids) > set(d_items.loc[tmp_cond, 'itemid'])

True

In [4]:
# Raw source tables, loaded in full; the cells below scan them for the cohort criteria.
RAW_TABLE_PATHS = {
    'prescriptions': 'hosp/prescriptions.csv',
    'inputevents': 'icu/inputevents.csv',
    'microbiology': 'hosp/microbiologyevents.csv',
    'procedure': 'icu/procedureevents.csv',
    'diagnosis': 'hosp/diagnoses_icd.csv',
}

# Loaded one by one (same order as before) so a failure on a late file keeps
# the already-loaded frames available in the kernel.
prescriptions = pd.read_csv(RAW_TABLE_PATHS['prescriptions'])
inputevents = pd.read_csv(RAW_TABLE_PATHS['inputevents'])
microbiology = pd.read_csv(RAW_TABLE_PATHS['microbiology'])
procedure = pd.read_csv(RAW_TABLE_PATHS['procedure'])
diagnosis = pd.read_csv(RAW_TABLE_PATHS['diagnosis'])

In [47]:
# Print the schema and memory footprint of every raw table, in load order.
for raw_table in (prescriptions, inputevents, microbiology, procedure, diagnosis):
    raw_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16219412 entries, 0 to 16219411
Data columns (total 20 columns):
 #   Column             Dtype  
---  ------             -----  
 0   subject_id         int64  
 1   hadm_id            int64  
 2   pharmacy_id        int64  
 3   poe_id             object 
 4   poe_seq            float64
 5   starttime          object 
 6   stoptime           object 
 7   drug_type          object 
 8   drug               object 
 9   formulary_drug_cd  object 
 10  gsn                object 
 11  ndc                float64
 12  prod_strength      object 
 13  form_rx            object 
 14  dose_val_rx        object 
 15  dose_unit_rx       object 
 16  form_val_disp      object 
 17  form_unit_disp     object 
 18  doses_per_24_hrs   float64
 19  route              object 
dtypes: float64(3), int64(3), object(14)
memory usage: 2.4+ GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9442345 entries, 0 to 9442344
Data columns (total 25 columns):
 #   

In [48]:
# Build one "any of these names" pattern from the antibiotic name list.
# - re.escape: regex metacharacters inside a name (parentheses, '+', '.', ...)
#   must be matched literally, not interpreted as regex syntax.
# - lower(): the drug column is lower-cased before matching, so the names must be too.
# - blank / NaN names are dropped: NaN would break the join, and an empty
#   alternative ("a||b") would flag every prescription as an antibiotic.
antibiotic_terms = [
    re.escape(str(name).strip().lower())
    for name in antibiotic_names
    if pd.notna(name) and str(name).strip()
]
tmp = '|'.join(antibiotic_terms)

drug_lower = prescriptions.drug.astype('str').str.lower()
if antibiotic_terms:
    tmp_cond = drug_lower.str.contains(tmp, regex=True)
else:
    # An empty pattern matches everything; with no names nothing should match.
    tmp_cond = pd.Series(False, index=prescriptions.index)

# Filter the (very large) prescriptions frame once and reuse the result,
# instead of re-materialising the same subset for every statistic.
antibiotic_prescriptions = prescriptions.loc[tmp_cond]
print(antibiotic_prescriptions.drug.value_counts())
print(antibiotic_prescriptions.shape[0])

# Admissions (hadm_id) with at least one antibiotic prescription.
antibiotic_hadm_ids = antibiotic_prescriptions.hadm_id.unique()
print(len(antibiotic_hadm_ids))
AB_id_from_pres = antibiotic_hadm_ids.tolist()

Vancomycin                      190313
CefePIME                         54551
Ciprofloxacin HCl                42450
Piperacillin-Tazobactam          41277
MetRONIDAZOLE (FLagyl)           38202
                                 ...  
Nitrofurantoin Macrocrystals         1
Vancomycin 250mg                     1
Erythromycin Tpoical Gel             1
Penicillin                           1
Oxacillin Desensitization            1
Name: drug, Length: 306, dtype: int64
748284
172809


In [49]:
# Input events whose item id is one of the curated antibiotic ids.
tmp_cond = inputevents['itemid'].isin(antibiotic_ids)
antibiotic_inputs = inputevents[tmp_cond]

print(antibiotic_inputs['itemid'].value_counts())
print(len(antibiotic_inputs))

# Admissions (hadm_id) with at least one antibiotic input event.
antibiotic_input_hadm = antibiotic_inputs['hadm_id'].unique()
print(len(antibiotic_input_hadm))
AB_id_from_input = antibiotic_input_hadm.tolist()

225798    128898
225851     65059
225893     63551
225884     59686
225850     40952
225883     40513
225855     18563
225859     18163
225888     12055
225892     11405
225842     10440
225837     10283
225853     10157
225843      8448
225881      7211
225860      6279
225879      5876
225845      4968
225885      4964
225869      4953
225899      4727
225847      3087
225863      2477
225875      2476
225890      1881
225905      1817
225902      1745
225876      1546
225865      1376
225866       970
225838       830
225873       662
225898       423
229064       369
225840       351
225848       338
225862       332
229587       286
228003       188
227691       181
225844       141
225903        96
225871        95
229061        34
225896        25
225886        23
225877        15
225895        14
225868        13
Name: itemid, dtype: int64
558942
43164


In [50]:
# Procedure events whose item id is one of the blood-culture ids.
tmp_cond = procedure['itemid'].isin(blood_culture_ids)
culture_procedures = procedure[tmp_cond]

print(culture_procedures['itemid'].value_counts())
print(len(culture_procedures))

# Admissions (hadm_id) with at least one blood-culture procedure event.
culture_proc_hadm = culture_procedures['hadm_id'].unique()
print(len(culture_proc_hadm))
BC_id_from_proc = culture_proc_hadm.tolist()

225401    23668
225437      318
Name: itemid, dtype: int64
23986
11844


In [51]:
# Microbiology events whose specimen item id is one of the blood-culture ids.
tmp_cond = microbiology['spec_itemid'].isin(blood_culture_ids)
culture_specimens = microbiology[tmp_cond]

print(culture_specimens['spec_itemid'].value_counts())
print(len(culture_specimens))

# Admissions (hadm_id) with at least one blood-culture specimen.
# len(unique()) is kept on purpose: unlike nunique() it also counts a missing
# hadm_id as a value, should this table contain any (TODO confirm).
culture_micro_hadm = culture_specimens['hadm_id'].unique()
print(len(culture_micro_hadm))
BC_id_from_micro = culture_micro_hadm.tolist()

70012    679894
70011     10372
Name: spec_itemid, dtype: int64
690266
75497


In [52]:
# Match each diagnosis only against the prefix list of its OWN ICD version.
# The ICD-9 and ICD-10 lists used to be merged into a single prefix tuple, so
# an ICD-9 prefix could flag an unrelated ICD-10 code (and vice versa) whenever
# the two code spaces share leading characters.
# NOTE(review): assumes `diagnosis` carries an `icd_version` column holding 9 / 10,
# like the lookup table the code lists were built from - confirm against the raw file.
# The prefix tuples are built once here instead of once per row.
prefixes_by_version = {
    9: tuple(str(code) for code in icd_9_codes),
    10: tuple(str(code) for code in icd_10_codes),
}

# Positional indices of the infection-related diagnosis rows. Rows with an
# unknown / missing version get an empty prefix tuple and therefore never match.
tmp = [
    idx
    for idx, (code, version) in enumerate(
        zip(diagnosis.icd_code.astype('str'), diagnosis.icd_version)
    )
    if code.startswith(prefixes_by_version.get(version, ()))
]

# Select the matching rows once and reuse them for every statistic.
infection_diagnoses = diagnosis.iloc[tmp]
print(infection_diagnoses.icd_code.value_counts())
print(infection_diagnoses.shape[0])

# Admissions (hadm_id) with at least one infection-related diagnosis.
infection_hadm_ids = infection_diagnoses.hadm_id.unique()
print(len(infection_hadm_ids))
IRD_id_from_diag = infection_hadm_ids.tolist()

5990     20452
486      11196
N390     11180
J189      6272
07054     5651
         ...  
B389         1
K0511        1
53120        1
M0019        1
K8036        1
Name: icd_code, Length: 1821, dtype: int64
292957
163951


Cohort selection rules are:
1. The patient must have been admitted to the ICU.
2. The age at ICU admission (intime) must be 18 or older.
3. The admission must show evidence of infection; the code below requires all three of: antibiotics prescribed or administered, a blood culture test performed, and an infection-related diagnosis.

In [54]:
# Inclusion criteria, evaluated once per ICU stay.
age_cond = icustays['age_at_intime'].ge(18)
AB_cond = icustays['hadm_id'].isin(AB_id_from_input + AB_id_from_pres)    # antibiotics (either source)
BC_cond = icustays['hadm_id'].isin(BC_id_from_micro + BC_id_from_proc)    # blood culture (either source)
IRD_cond = icustays['hadm_id'].isin(IRD_id_from_diag)                     # infection-related diagnosis

# Store every criterion as a 0/1 column, keeping the original column order.
criterion_flags = (
    ('age_cond', age_cond),
    ('AB_cond', AB_cond),
    ('BC_cond', BC_cond),
    ('IRD_cond', IRD_cond),
)
for flag_name, flag_mask in criterion_flags:
    icustays[flag_name] = np.where(flag_mask, 1, 0)

# A stay is flagged as suspected infection only when ALL four criteria hold.
icustays['suspected_infection'] = np.where(age_cond & AB_cond & BC_cond & IRD_cond, 1, 0)
icustays

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los,age_at_intime,gender,race,age_cond,AB_cond,BC_cond,IRD_cond,suspected_infection
0,10000032,29079034,39553978,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2180-07-23 14:00:00,2180-07-23 23:50:47,0.410266,52.560502,F,WHITE,1,0,0,1,0
1,10000980,26913865,39765666,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2189-06-27 08:42:00,2189-06-27 20:38:27,0.497535,76.488664,F,BLACK/AFRICAN AMERICAN,1,1,0,0,0
2,10001217,24597018,37067082,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-20 19:18:02,2157-11-21 22:08:00,1.118032,55.887135,F,WHITE,1,1,1,1,1
3,10001217,27703517,34592300,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-12-19 15:42:24,2157-12-20 14:27:41,0.948113,55.966177,F,WHITE,1,1,0,1,0
4,10001725,25563031,31205490,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2110-04-11 15:52:22,2110-04-12 23:59:56,1.338588,46.275785,F,WHITE,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76938,19999442,26785317,32336619,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2148-11-19 14:23:43,2148-11-26 13:12:15,6.950370,43.886575,M,WHITE,1,1,0,1,0
76939,19999625,25304202,31070865,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2139-10-10 19:18:00,2139-10-11 18:21:28,0.960741,82.774806,M,WHITE,1,1,0,1,0
76940,19999828,25744818,36075953,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2149-01-08 18:12:00,2149-01-10 13:11:02,1.790995,48.023995,F,WHITE,1,1,1,1,1
76941,19999840,21033226,38978960,Trauma SICU (TSICU),Surgical Intensive Care Unit (SICU),2164-09-12 09:26:28,2164-09-17 16:35:15,5.297766,58.699708,M,WHITE,1,1,1,0,0


In [55]:
# Size of the selected cohort at patient, admission and ICU-stay level.
tmp_cond = icustays['suspected_infection'] == 1
suspected_stays = icustays[tmp_cond]
for id_column in ('subject_id', 'hadm_id', 'stay_id'):
    print(len(suspected_stays[id_column].unique()))

18365
21925
26628


In [57]:
# Persist the ICU stays together with the criterion flags and the combined suspected_infection flag.
icustays.to_csv('processed_data/sepsis/icustays_wsusinf.csv', index=False)

# 3. Filtering the data relevant to the selected cohort

In [1]:
import pandas as pd
from warnings import simplefilter
# Silence pandas' DtypeWarning so the read_csv calls below do not flood the output.
simplefilter(action="ignore", category=pd.errors.DtypeWarning)
# Never elide columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
# Reload the flagged ICU stays and collect the identifiers of the selected cohort
# at patient (subject_id), admission (hadm_id) and ICU-stay (stay_id) level.
icustays = pd.read_csv('processed_data/sepsis/icustays_wsusinf.csv')
tmp_cond = icustays['suspected_infection'] == 1
cohort_stays = icustays[tmp_cond]

subjectid = cohort_stays['subject_id'].unique().tolist()
hadmid = cohort_stays['hadm_id'].unique().tolist()
stayid = cohort_stays['stay_id'].unique().tolist()

In [3]:
# Raw tables that the next cell restricts to the selected cohort.
dirs = ['hosp/omr.csv', 'hosp/labevents.csv', 'icu/chartevents.csv', 'icu/inputevents.csv', 'icu/outputevents.csv', 'icu/procedureevents.csv']

In [5]:
# One rule per raw table: (file-name marker, key column, cohort ids, output path).
# Rules are checked in order and the first marker contained in the file name wins.
# omr is filtered per patient, labevents per admission, the ICU tables per stay.
filter_rules = (
    ('omr.csv', 'subject_id', subjectid, 'processed_data/sepsis/omr_SCR.csv'),
    ('labevents.csv', 'hadm_id', hadmid, 'processed_data/sepsis/LE_SCR.csv'),
    ('chartevents.csv', 'stay_id', stayid, 'processed_data/sepsis/CE_SCR.csv'),
    ('inputevents.csv', 'stay_id', stayid, 'processed_data/sepsis/IE_SCR.csv'),
    ('outputevents.csv', 'stay_id', stayid, 'processed_data/sepsis/OE_SCR.csv'),
    ('procedureevents.csv', 'stay_id', stayid, 'processed_data/sepsis/PE_SCR.csv'),
)

for f_name in dirs:
    df = pd.read_csv(f_name)
    for marker, key_column, cohort_ids, out_path in filter_rules:
        if marker in f_name:
            # Keep only the rows that belong to the selected cohort.
            tmp_cond = df[key_column].isin(cohort_ids)
            df.loc[tmp_cond].to_csv(out_path, index=False)
            break
    # Printed for every file, including one that matched no rule.
    print(f'{f_name} done!')

hosp/omr.csv done!
hosp/labevents.csv done!
icu/chartevents.csv done!
icu/inputevents.csv done!
icu/outputevents.csv done!
icu/procedureevents.csv done!
