PURPOSE:
------------------------------
This notebook creates a list of the icustay IDs of patients who develop sepsis at some point 
during their ICU stay and records the charttime of sepsis onset. It uses the Sepsis-3 criteria.

STEPS:
-------------------------------
IMPORT DATA FROM CSV FILES  
FLAG PRESUMED INFECTION  
PREPROCESSING  
REFORMAT in 4h time slots  
COMPUTE SOFA at each time step  
FLAG SEPSIS  

Note: this process generates the same features as the final MDP dataset, most of which are not used to compute SOFA.

# Import all data

In [257]:
import pandas as pd
# Directory holding the exported CSV files. Every input below is read from
# here, so this is the only line to change when running on another machine.
DATA_DIR = '/Users/faaiz/exportdir'


def _read_piped(filename):
    """Read the pipe-separated export file *filename* from DATA_DIR."""
    return pd.read_csv(DATA_DIR + '/' + filename, sep='|')


# Antibiotics, cultures, microbiology and demographics.
abx = _read_piped('abx.csv')
culture = _read_piped('culture.csv')
microbio = _read_piped('microbio.csv')
demog = _read_piped('demog.csv')

# Ten "ce" chunks; the reformat step picks one per stay from the icustay_id range.
ce010 = _read_piped('ce010000.csv')
ce1020 = _read_piped('ce1000020000.csv')
ce2030 = _read_piped('ce2000030000.csv')
ce3040 = _read_piped('ce3000040000.csv')
ce4050 = _read_piped('ce4000050000.csv')
ce5060 = _read_piped('ce5000060000.csv')
ce6070 = _read_piped('ce6000070000.csv')
ce7080 = _read_piped('ce7000080000.csv')
ce8090 = _read_piped('ce8000090000.csv')
ce90100 = _read_piped('ce90000100000.csv')

# Labs come from two files; labs_ce names its time column 'charttime', so it is
# renamed to 'timestp' before the two are stacked into labU.
lab_ce = _read_piped('labs_ce.csv').rename(columns={'charttime': 'timestp'})
lab_le = _read_piped('labs_le.csv')
labU = pd.concat([lab_ce, lab_le], ignore_index=True)

# Mechanical ventilation, fluid inputs, vasopressors and urine output.
MV = _read_piped('mechvent.csv')
inputpreadm = _read_piped('preadm_fluid.csv')
inputMV = _read_piped('fluid_mv.csv')
inputCV = _read_piped('fluid_cv.csv')
vasoMV = _read_piped('vaso_mv.csv')
vasoCV = _read_piped('vaso_cv.csv')
UOpreadm = _read_piped('preadm_uo.csv')
UO = _read_piped('uo.csv')

# Reference tables are read with the default (comma) separator; the two
# item-id tables have no header row.
reflabs = pd.read_csv(DATA_DIR + '/Reflabs.csv', header=None)
refvitals = pd.read_csv(DATA_DIR + '/Refvitals.csv', header=None)
sample_and_hold = pd.read_csv(DATA_DIR + '/sample_and_hold.csv')

# Initial Data Manipulations

In [258]:
# Where charttime is missing, fall back to the chartdate of the same row.
microbio['charttime'] = microbio['charttime'].fillna(microbio['chartdate'])
# chartdate has served its purpose; drop it before stacking with the cultures.
microbio = microbio.drop(columns='chartdate')
# bacterio = microbiology rows followed by culture rows, re-indexed from 0.
bacterio = pd.concat([microbio, culture], ignore_index=True)

In [259]:
# Missing mortality flags and comorbidity scores are replaced by 0.
for _column in ('morta_90', 'morta_hosp', 'elixhauser'):
    demog[_column] = demog[_column].fillna(0)

In [260]:
# Normalised rate: rate * tev / amount, computed row by row.
# NOTE(review): rows with amount == 0 produce inf/NaN here -- confirm that
# this is handled (or cannot occur) downstream.
inputMV['normrate'] = inputMV['rate'] * inputMV['tev'] / inputMV['amount']

In [261]:
def find_icustay_id_from_demog(hadm_id, time, subject_id = None):
    """Look up in ``demog`` the ICU stay that an event most plausibly belongs to.

    Candidate stays are those of ``subject_id`` when it is given, otherwise
    those of ``hadm_id``.  The first candidate whose [intime - 48h,
    outtime + 48h] window contains ``time`` is returned; if there is exactly
    one candidate it is returned regardless of the time.  Failing that, the
    stay is returned if exactly one candidate has the given ``hadm_id``.

    Returns the ``icustay_id`` value, or None when no stay can be chosen.
    """
    if subject_id is None:
        candidates = demog.loc[demog['hadm_id'] == hadm_id]
    else:
        candidates = demog.loc[demog['subject_id'] == subject_id]

    margin = 48*3600  # tolerance around the stay, in seconds
    single_candidate = len(candidates) == 1

    for _, stay in candidates.iterrows():
        within_window = stay['intime'] - margin <= time <= stay['outtime'] + margin
        if within_window or single_candidate:
            return stay['icustay_id']

    # No time match: accept the admission if it maps to exactly one stay.
    same_admission = candidates.loc[candidates['hadm_id'] == hadm_id]
    if len(same_admission) != 1:
        return None
    return same_admission['icustay_id'].values[0]

In [262]:
# TODO: to be removed!
# Debug check: total number of bacterio rows, then the number of rows that
# still have no icustay_id.
len(bacterio)
int(bacterio['icustay_id'].isna().sum())

2003

In [263]:
# Fill in missing ICUSTAY IDs in bacterio.
# Only the rows that lack an icustay_id are visited; each is matched to a stay
# by subject, admission and charttime, and left untouched when no stay is found.
for index, row in bacterio.loc[bacterio['icustay_id'].isna()].iterrows():
    icustayid = find_icustay_id_from_demog(row['hadm_id'], row['charttime'], row['subject_id'])
    if icustayid is not None:
        bacterio.at[index, 'icustay_id'] = icustayid

In [264]:
# Fill in missing ICUSTAY IDs in abx.
# Only the rows that lack an icustay_id are visited; each is matched to a stay
# by admission and antibiotic start time (no subject_id is passed here), and
# left untouched when no stay is found.
for index, row in abx.loc[abx['icustay_id'].isna()].iterrows():
    icustayid = find_icustay_id_from_demog(row['hadm_id'], row['startdate'])
    if icustayid is not None:
        abx.at[index, 'icustay_id'] = icustayid

# Find presumed onset of infection according to sepsis3 guidelines

In [265]:
from sklearn.metrics.pairwise import euclidean_distances

# One row per ICU stay with a presumed infection.  Rows are collected in a list
# and turned into a DataFrame once at the end (DataFrame.append no longer
# exists in pandas >= 2.0).
onset_rows = []

for icustayid in range(1, 100001):
    stay_id = icustayid + 200000  # ids in the tables carry a 200000 offset
    ab = abx.loc[abx['icustay_id'] == stay_id, 'startdate'].to_numpy(dtype=float)
    stay_bact = bacterio.loc[bacterio['icustay_id'] == stay_id]
    bact = stay_bact['charttime'].to_numpy(dtype=float)
    subj_bact = stay_bact['subject_id'].to_numpy()

    # Events without a usable time cannot take part in the matching.
    ab = ab[~pd.isna(ab)]
    bact = bact[~pd.isna(bact)]

    if len(ab) > 0 and len(bact) > 0:
        # Gap in hours between every antibiotic (rows) and every culture
        # (columns).  Computed as an exact absolute difference: the expanded
        # squared-distance formula is imprecise for timestamps of this size.
        D = abs(ab.reshape(-1, 1) - bact.reshape(1, -1)) / 3600
        for ab1, gaps in zip(ab, D):
            I = gaps.argmin()   # closest culture to this antibiotic
            M = gaps[I]
            bact1 = bact[I]

            if M <= 24 and ab1 <= bact1:
                # antibiotic first, culture within 24h: onset = antibiotic time
                onsettime = ab1
            elif M <= 72 and ab1 >= bact1:
                # culture first, antibiotic within 72h: onset = culture time
                onsettime = bact1
            else:
                continue

            # Note: icustay_id is stored WITHOUT the 200000 offset.
            onset_rows.append({'subject_id': subj_bact[0], 'icustay_id': icustayid, 'onsettime': onsettime})
            break  # only the first qualifying antibiotic counts for a stay

onset = pd.DataFrame(onset_rows, columns=['subject_id', 'icustay_id', 'onsettime'])
        

In [266]:
# Number of stays for which a (positive) onset time was recorded.
int((onset['onsettime'] > 0).sum())

65

# Replacing item_ids with column numbers from reference tables

In [267]:
def find_the_col_number(df, num):
    """Return the 1-based number of the first row of ``df`` that contains ``num``.

    Rows are scanned in order; the result is the row's index label plus one,
    as an int.  Returns None when no row contains ``num``.
    """
    for row_label, row_values in zip(df.index, df.values):
        if num in row_values:
            return int(row_label + 1)
    return None
        
def replace_itemids_with_col_num(df, reftable):
    """Replace the ``itemid`` column of ``df`` by a ``colnum`` column, in place.

    ``colnum`` is the 1-based number (index label + 1) of the first row of
    ``reftable`` that contains the item id, stored as float; item ids found in
    no row get NaN.  The ``itemid`` column is dropped afterwards.  Returns None.

    The reference table is scanned once to build an itemid -> colnum lookup,
    instead of being re-scanned for every row of ``df``.
    """
    colnum_by_itemid = {}
    for row_label, row_values in zip(reftable.index, reftable.to_numpy()):
        for itemid in row_values:
            if pd.notna(itemid):  # NaN only pads the shorter reference rows
                # setdefault keeps the FIRST row that lists an item id
                colnum_by_itemid.setdefault(itemid, int(row_label + 1))

    df['colnum'] = df['itemid'].map(colnum_by_itemid).astype(float)
    df.drop(columns=['itemid'], inplace=True)

In [268]:
# Labs are mapped through the lab reference table ...
replace_itemids_with_col_num(labU, reflabs)
# ... and every ce chunk through the vitals reference table.
for _ce_chunk in (ce010, ce1020, ce2030, ce3040, ce4050,
                  ce5060, ce6070, ce7080, ce8090, ce90100):
    replace_itemids_with_col_num(_ce_chunk, refvitals)

# Initial reformat with chartevents, labs and mechvent

This gives an array with one row per unique charttime and one column per item.
## IMPORTANT 
Here I use a window of -48h -> +24h around the onset time because that is what the Sepsis-3 cohort definition requires.  
A different time window is needed for the MDP (-24h -> +48h).

In [317]:
winb4 = 49   # lower limit for inclusion of data (48h before time flag)
winaft = 25  # upper limit (24h after)

# ce chunks in icustay_id order: chunk k is used for id offsets
# k*10000 .. k*10000 + 9999, and the last chunk also takes offset 100000.
ce_chunks = [ce010, ce1020, ce2030, ce3040, ce4050,
             ce5060, ce6070, ce7080, ce8090, ce90100]


def _store_measurements(record, timedata, offset):
    """Copy every (colnum, valuenum) pair of ``timedata`` into ``record``.

    Each value is stored under the numeric key ``offset + colnum``.  Rows whose
    colnum is NaN (item id absent from the reference table) are skipped; if a
    colnum occurs more than once, the last value wins.
    """
    if len(timedata) == 0:
        return
    for col, value in zip(timedata['colnum'], timedata['valuenum']):
        if pd.notna(col):
            record[offset + col] = value


# Rows are collected as dicts and converted to DataFrames once at the end:
# DataFrame.append no longer exists in pandas >= 2.0, and growing a frame cell
# by cell with .at is very slow.
reformat_rows, qstime_rows = [], []

for icustayid in range(1, 100001):
    qst = onset.loc[onset['icustay_id'] == icustayid, 'onsettime']
    if len(qst) == 0 or not qst.values[0] > 0:
        continue  # no presumed infection for this stay

    stay_id = icustayid + 200000  # onset stores ids without the 200000 offset
    d1 = demog.loc[demog['icustay_id'] == stay_id, ['age', 'dischtime']]
    # 6574 ~= 18 * 365.25, i.e. presumably an adults-only filter on an age in days.
    if len(d1) == 0 or not d1['age'].values[0] > 6574:
        continue

    # time period of interest around the onset time (4h of extra margin each side)
    window_start = qst.values[0] - (winb4 + 4) * 3600
    window_end = qst.values[0] + (winaft + 4) * 3600

    # CHARTEVENTS
    chunk = ce_chunks[min(icustayid // 10000, len(ce_chunks) - 1)]
    temp = chunk.loc[chunk['icustay_id'] == stay_id]
    temp = temp.loc[temp['charttime'].between(window_start, window_end)]
    if len(temp) > 0 and 'colnum' not in temp.columns:
        # The item ids of this chunk were never replaced by column numbers.
        raise KeyError(
            "ce chunk for icustay_id %d has no 'colnum' column; run "
            "replace_itemids_with_col_num on it first" % stay_id
        )

    # LABEVENTS
    temp2 = labU.loc[labU['icustay_id'] == stay_id]
    temp2 = temp2.loc[temp2['timestp'].between(window_start, window_end)]

    # MECH VENT and extubated
    temp3 = MV.loc[MV['icustay_id'] == stay_id]
    temp3 = temp3.loc[temp3['charttime'].between(window_start, window_end)]

    # unique timestamps from all 3 sources, sorted in ascending order
    unique_timestp = pd.concat([temp['charttime'], temp2['timestp'].rename('charttime'), temp3['charttime']]).unique()
    unique_timestp.sort()
    if len(unique_timestp) == 0:
        continue

    for i, timestp in enumerate(unique_timestp):
        # The identifying fields are set for EVERY row, including timestamps
        # that only carry lab or ventilation data.
        record = {'timestep': i, 'icustay_id': icustayid, 'charttime': timestp}

        # All chartevents values at this time go to columns 3 + colnum and
        # all lab values to columns 31 + colnum (not just the first of each).
        _store_measurements(record, temp.loc[temp['charttime'] == timestp], 3)
        _store_measurements(record, temp2.loc[temp2['timestp'] == timestp], 31)

        # MV
        value = temp3.loc[temp3['charttime'] == timestp, ['mechvent', 'extubated']]
        if len(value) > 0:
            record['mechvent'] = value['mechvent'].values[0]
            record['extubated'] = value['extubated'].values[0]

        reformat_rows.append(record)

    qstime_rows.append({
        'icustay_id': icustayid,
        'firsttimestp': unique_timestp[0],
        'lasttimestp': unique_timestp[-1],
        'dischtime': d1['dischtime'].values[0],
    })

reformat = pd.DataFrame(reformat_rows)
qstime = pd.DataFrame(qstime_rows)
irow = len(reformat_rows)  # number of rows written, as the running counter used to hold
            
            

58

In [304]:
# Debug: show the first five rows of the mechanical-ventilation table.
print(MV.head(5))

   icustay_id     charttime  mechvent  extubated  selfextubated
0      261764  7.013387e+09         0          0              0
1      298685  6.189581e+09         1          0              0
2      271544  7.211196e+09         1          0              0
3      237528  7.016155e+09         0          0              0
4      286072  7.229492e+09         1          0              0


In [310]:
# Applies the itemid -> colnum mapping to the ce7080 chunk on its own.
# NOTE(review): the execution counter of this cell (310) is lower than that of
# the reformat cell (317), so this was presumably run by hand before re-running
# the reformat -- confirm whether the cell is still needed.  Calling it on a
# frame that was already mapped raises a KeyError, because the 'itemid' column
# has been dropped by then.
replace_itemids_with_col_num(ce7080,refvitals)

In [302]:
# Debug: inspect the colnum values of a selection of ce1020 rows.
# NOTE(review): the boolean mask is built from labU but applied to ce1020, so
# rows are picked by index alignment with labU rather than by ce1020's own
# icustay_id -- presumably one of the two frame names is a typo; confirm which
# frame was meant before relying on this output.
ce1020.loc[labU['icustay_id'] == 201006.0]['colnum']

0      12.0
1       5.0
2       9.0
3      10.0
4       7.0
       ... 
510     5.0
511     5.0
512     7.0
513    10.0
514     9.0
Name: colnum, Length: 515, dtype: float64