PURPOSE:
------------------------------
This notebook builds a list of the icustay_ids of patients who develop sepsis at some point
during their ICU stay and records the charttime of sepsis onset. Sepsis is identified using the Sepsis-3 criteria.

STEPS:
-------------------------------
IMPORT DATA FROM CSV FILES  
FLAG PRESUMED INFECTION  
PREPROCESSING  
REFORMAT in 4h time slots  
COMPUTE SOFA at each time step  
FLAG SEPSIS  

Note: the process generates the same features as the final MDP dataset, most of which are not used to compute SOFA.

# Import all data

In [142]:
import pandas as pd
# Every input file is read from this single directory; change it here to
# point the notebook at a different export location.
EXPORT_DIR = '/Users/faaiz/exportdir'


def _read_export(filename, **read_csv_kwargs):
    """Read ``filename`` from EXPORT_DIR with ``pd.read_csv``.

    Any keyword arguments (for example ``sep`` or ``header``) are passed
    through to ``pd.read_csv`` unchanged.
    """
    return pd.read_csv(EXPORT_DIR + '/' + filename, **read_csv_kwargs)


# Pipe-delimited extracts.
abx = _read_export('abx.csv', sep='|')
culture = _read_export('culture.csv', sep='|')
microbio = _read_export('microbio.csv', sep='|')
demog = _read_export('demog.csv', sep='|')

# Ten "ce" files; presumably chartevents split by icustay_id range (confirm).
ce010 = _read_export('ce010000.csv', sep='|')
ce1020 = _read_export('ce1000020000.csv', sep='|')
ce2030 = _read_export('ce2000030000.csv', sep='|')
ce3040 = _read_export('ce3000040000.csv', sep='|')
ce4050 = _read_export('ce4000050000.csv', sep='|')
ce5060 = _read_export('ce5000060000.csv', sep='|')
ce6070 = _read_export('ce6000070000.csv', sep='|')
ce7080 = _read_export('ce7000080000.csv', sep='|')
ce8090 = _read_export('ce8000090000.csv', sep='|')
ce90100 = _read_export('ce90000100000.csv', sep='|')

# labs_ce names its time column 'charttime'; rename it to 'timestp' before
# stacking it with labs_le into one table.
lab_ce = _read_export('labs_ce.csv', sep='|').rename(columns={'charttime': 'timestp'})
lab_le = _read_export('labs_le.csv', sep='|')
labU = pd.concat([lab_ce, lab_le], ignore_index=True)

MV = _read_export('mechvent.csv', sep='|')
inputpreadm = _read_export('preadm_fluid.csv', sep='|')
inputMV = _read_export('fluid_mv.csv', sep='|')
inputCV = _read_export('fluid_cv.csv', sep='|')
vasoMV = _read_export('vaso_mv.csv', sep='|')
vasoCV = _read_export('vaso_cv.csv', sep='|')
UOpreadm = _read_export('preadm_uo.csv', sep='|')
UO = _read_export('uo.csv', sep='|')

# Reference tables are comma-separated; the two Ref* files have no header row.
reflabs = _read_export('Reflabs.csv', header=None)
refvitals = _read_export('Refvitals.csv', header=None)
sample_and_hold = _read_export('sample_and_hold.csv')

# Initial Data Manipulations

In [143]:
# Rows of microbio without a charttime take their chartdate instead; the
# chartdate column is then dropped and microbio is stacked on top of culture.
no_charttime = microbio['charttime'].isnull()
microbio.loc[no_charttime, 'charttime'] = microbio.loc[no_charttime, 'chartdate']
microbio = microbio.drop(columns=['chartdate'])
bacterio_parts = [microbio, culture]
bacterio = pd.concat(bacterio_parts, ignore_index=True)

In [144]:
# Missing values in these three demog columns are replaced with 0.
for zero_filled_column in ('morta_90', 'morta_hosp', 'elixhauser'):
    demog[zero_filled_column] = demog[zero_filled_column].fillna(0)

In [145]:
inputMV.head()
# normrate = rate * tev / amount, evaluated left to right.
rate_times_tev = inputMV['rate'] * inputMV['tev']
inputMV['normrate'] = rate_times_tev / inputMV['amount']

In [149]:
def find_icustay_id_from_demog(hadm_id, time, subject_id=None, demog_df=None):
    """Look up the icustay_id that an event most plausibly belongs to.

    Candidate stays are the rows of the demographics table that match
    ``subject_id`` when it is given, otherwise the rows that match
    ``hadm_id``. The result is chosen in this order:

    1. if there is exactly one candidate, that candidate;
    2. the first candidate whose ``[intime - 48*3600, outtime + 48*3600]``
       interval contains ``time``;
    3. if exactly one candidate has the given ``hadm_id``, that candidate.

    Args:
        hadm_id: hospital admission id of the event.
        time: event time, on the same scale as ``intime``/``outtime``
            (the 48*3600 margin suggests seconds -- confirm against the export).
        subject_id: optional patient id; when not None it is used instead of
            ``hadm_id`` to select the candidates.
        demog_df: optional table with columns ``subject_id``, ``hadm_id``,
            ``icustay_id``, ``intime`` and ``outtime``. Defaults to the
            module-level ``demog`` table.

    Returns:
        The matching ``icustay_id``, or None when no candidate qualifies.
    """
    if demog_df is None:
        demog_df = demog

    if subject_id is not None:
        candidates = demog_df.loc[demog_df['subject_id'] == subject_id]
    else:
        candidates = demog_df.loc[demog_df['hadm_id'] == hadm_id]

    if candidates.empty:
        return None
    if len(candidates) == 1:
        # A single candidate is accepted regardless of the time window.
        return candidates['icustay_id'].iloc[0]

    margin = 48 * 3600
    in_window = (
        (time >= candidates['intime'] - margin)
        & (time <= candidates['outtime'] + margin)
    )
    if in_window.any():
        # Keep the first match in table order.
        return candidates.loc[in_window, 'icustay_id'].iloc[0]

    # No stay brackets the event: fall back to the admission id if that
    # pins down exactly one stay.
    same_admission = candidates.loc[candidates['hadm_id'] == hadm_id]
    if len(same_admission) == 1:
        return same_admission['icustay_id'].iloc[0]
    return None

In [152]:
# TODO: to be removed!
# Debug counts: all bacterio rows, then the rows that have no icustay_id.
len(bacterio)
rows_without_stay = bacterio['icustay_id'].isnull()
len(bacterio.loc[rows_without_stay])

30

In [151]:
bacterio.head()
# Fill in missing ICUSTAY IDs in bacterio: only rows without an icustay_id
# are visited, and a row is left untouched when no stay can be found.
bacterio_without_stay = bacterio[bacterio['icustay_id'].isna()]
for idx, record in bacterio_without_stay.iterrows():
    matched_stay = find_icustay_id_from_demog(
        record['hadm_id'], record['charttime'], record['subject_id']
    )
    if matched_stay is None:
        continue
    bacterio.at[idx, 'icustay_id'] = matched_stay

In [154]:
# Fill in missing ICUSTAY IDs in abx: the lookup uses the admission id and
# the antibiotic start date; rows with no matching stay are left untouched.
abx_without_stay = abx[abx['icustay_id'].isna()]
for idx, record in abx_without_stay.iterrows():
    matched_stay = find_icustay_id_from_demog(record['hadm_id'], record['startdate'])
    if matched_stay is None:
        continue
    abx.at[idx, 'icustay_id'] = matched_stay

# Find presumed onset of infection according to sepsis3 guidelines

In [212]:
from sklearn.metrics.pairwise import euclidean_distances

# Presumed onset of infection for each ICU stay.
#
# For each stay with both antibiotic start dates and culture times, the
# antibiotics are visited in table order and each is paired with its closest
# culture. The first antibiotic satisfying one of these rules sets the onset:
#   * antibiotic at or before the culture, gap <= 24  -> onset = antibiotic time
#   * antibiotic at or after the culture,  gap <= 72  -> onset = culture time
# Gaps are time differences divided by 3600 (hours if timestamps are in
# seconds -- confirm against the export).
#
# Rows are collected in a list and turned into a DataFrame once, because
# DataFrame.append no longer exists in pandas >= 2.0 and copied the whole
# frame on every call.
onset_columns = ['subject_id', 'icustay_id', 'onsettime']
onset_records = []

# Group each table once instead of re-filtering it for every candidate id.
abx_start_by_stay = {
    stay: starts.to_numpy()
    for stay, starts in abx.groupby('icustay_id')['startdate']
}
bacterio_by_stay = dict(iter(bacterio.groupby('icustay_id')))

for icustayid in range(1, 100001):
    stay_key = icustayid + 200000
    if stay_key not in abx_start_by_stay or stay_key not in bacterio_by_stay:
        continue

    ab = abx_start_by_stay[stay_key]
    stay_cultures = bacterio_by_stay[stay_key]
    bact = stay_cultures['charttime'].to_numpy()
    stay_subject = stay_cultures['subject_id'].to_numpy()[0]

    # gap_hours[i, j] = |ab[i] - bact[j]| / 3600, computed as an exact
    # difference rather than through a squared-distance expansion.
    gap_hours = abs(ab.reshape(-1, 1) - bact.reshape(1, -1)) / 3600

    for ab_time, gaps in zip(ab, gap_hours):
        nearest = gaps.argmin()
        nearest_gap = gaps[nearest]
        culture_time = bact[nearest]

        if nearest_gap <= 24 and ab_time <= culture_time:
            onset_time = ab_time
        elif nearest_gap <= 72 and ab_time >= culture_time:
            onset_time = culture_time
        else:
            continue

        # NOTE: 'icustay_id' stores the id WITHOUT the 200000 offset.
        onset_records.append({
            'subject_id': stay_subject,
            'icustay_id': icustayid,
            'onsettime': onset_time,
        })
        break

onset = pd.DataFrame(onset_records, columns=onset_columns)
        

In [213]:
# Number of stays with a positive onset time.
has_onset = onset['onsettime'] > 0
int(has_onset.sum())

65

# Replacing item_ids with column numbers from reference tables

In [228]:
def find_the_row_number(df, num):
    """Return ``index + 1`` of the first row of ``df`` that contains ``num``.

    Rows are scanned in order. With the default integer index of a table
    read with ``header=None`` this is the 1-based row number.

    Returns:
        The row label plus one, or None when no row contains ``num``.
    """
    for index, row in df.iterrows():
        if num in row.to_numpy():
            return index + 1
    return None


def _build_row_number_lookup(reftable):
    """Map each non-missing value in ``reftable`` to ``index + 1`` of its first row."""
    lookup = {}
    for index, row in reftable.iterrows():
        for value in row.to_numpy():
            if pd.isna(value):
                # Skip missing cells; a missing value is never a valid item id.
                continue
            # setdefault keeps the first row in which a value appears.
            lookup.setdefault(value, index + 1)
    return lookup


def replace_itemids_with_row_num(df, reftable):
    """Replace the ``itemid`` column of ``df`` with a ``rownum`` column, in place.

    ``rownum`` holds, for each row, ``index + 1`` of the first ``reftable``
    row that contains the row's ``itemid`` (NaN when the id is not found).
    The ``itemid`` column is then dropped.

    The reference table is turned into a lookup once, so the cost is linear
    in the size of ``df`` instead of one full reference scan per row.

    Args:
        df: table with an ``itemid`` column; modified in place.
        reftable: reference table whose rows list the item ids of one feature.

    Returns:
        None.
    """
    lookup = _build_row_number_lookup(reftable)
    missing = float('nan')
    row_numbers = df['itemid'].map(lambda item: lookup.get(item, missing))
    # Stored as float so ids without a match can be represented as NaN.
    df['rownum'] = row_numbers.astype(float)
    df.drop(columns=['itemid'], inplace=True)

In [226]:
# Translate the lab item ids in labU into reflabs row numbers ('rownum') and
# drop the raw 'itemid' column, via the shared helper.
replace_itemids_with_row_num(labU, reflabs)

Unnamed: 0,icustay_id,timestp,valuenum,rownum
0,201006,4.330384e+09,2.40,16.0
1,201006,4.330384e+09,17.00,5.0
2,201006,4.330384e+09,7.50,8.0
3,201006,4.330384e+09,22.00,10.0
4,201006,4.330384e+09,101.00,3.0
...,...,...,...,...
56058,286020,6.959992e+09,1.60,27.0
56059,286020,6.959992e+09,199.00,23.0
56060,286020,6.959992e+09,16.90,25.0
56061,286020,6.959992e+09,4.04,21.0


In [229]:
# Convert item ids to refvitals row numbers in every "ce" table.
# ce7080 is included so that all ten loaded tables are converted.
for chart_events in (
    ce010,
    ce1020,
    ce2030,
    ce3040,
    ce4050,
    ce5060,
    ce6070,
    ce7080,
    ce8090,
    ce90100,
):
    replace_itemids_with_row_num(chart_events, refvitals)