# Concat and Refinement Procedure
In this notebook, we will concat the data and do some additional refinement process.
The procedures consists of following things.
- before concat
    1. GCS data refinement: outlier deletion, concatenation
    2. time error deletion : for icu module data, exceeding IT OT data deletion and hosp data no deletion 
    3. rarely observed feature identification: delete the features whose missing rate larger than 75%
    4. main variable not observed cohort deletion 
- in concat
    1. concat into admission
    2. hosp urine data have to be concated for only when there is no urine data for total cohort
    3. SOFA calculation, shock diagnosis
- after concat
    1. weight, height consistency error
    2. shock diagnosis time statistics

In [5]:
import pandas as pd
from warnings import simplefilter
from tqdm import tqdm
import numpy as np
import datetime
simplefilter(action="ignore", category=pd.errors.DtypeWarning)
pd.set_option('display.max_columns', None)

import multiprocessing as mp
from functools import partial
from sepsis_preprocessing import data_concatenation
import pickle

## 1. before concat

### 1.1 GCS refinement

In [2]:
dirs = [str('processed_data/sepsis/%s_R.csv' %i) for i in ['omr', 'LE', 'CE', 'IE', 'OE', 'PE']]
df = pd.read_csv(dirs[2])
df.charttime = pd.to_datetime(df.charttime)

In [3]:
# outlier deletion
tmp_itemid = [220739,223900,223901]
drop_idx = []
for ii in tmp_itemid:
    tmp_cond = df.itemid == ii
    part_df = df.loc[tmp_cond]
    
    if ii == 220739:
        lb = 1
        ub = 4
    elif ii == 223900:
        lb = 1
        ub = 5
    else:
        lb = 1
        ub = 6
    
    tmp_cond = (part_df.valuenum > ub) | (part_df.valuenum < lb)
    
    drop_idx += tmp_cond.index[tmp_cond].tolist()
        
df.loc[drop_idx]

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,storetime,itemid,valuenum,valueuom


In [4]:
# GCS concat
tmp_data = []
for stay in tqdm(df.stay_id.unique()):
    tmp_cond = (df.stay_id == stay) & (df.itemid.isin(tmp_itemid))
    part_df = df.loc[tmp_cond]

    if part_df.shape[0] == 0:
        continue
    ct_list = part_df.charttime.unique()
    for ct in ct_list:
        tmp_cond = (part_df.charttime == ct)
        tmp_list = part_df.iloc[0, 0:3].tolist()+[ct]
        tmp_list += [part_df.loc[tmp_cond&(part_df.itemid==tmp_itemid[i]), 'valuenum'].to_numpy()[0] if part_df.loc[tmp_cond&(part_df.itemid==tmp_itemid[i]), 'valuenum'].shape[0] == 1 else np.nan for i in range(3)]
        tmp_data.append(tmp_list)
     
tmp_data = pd.DataFrame(tmp_data, columns=['subject_id', 'hadm_id', 'stay_id', 'charttime', 'eye', 'verbal', 'motor'])

for stay in tqdm(tmp_data.stay_id.unique()):
    tmp_cond = (tmp_data.stay_id == stay)
    part_df = tmp_data.loc[tmp_cond]

    tmp_data.loc[tmp_cond] = part_df.ffill().bfill()

tmp_data['itemid'] = sum(tmp_itemid)
tmp_data['valuenum'] = tmp_data.loc[:, ('eye', 'verbal', 'motor')].sum(axis=1)
tmp_data['valueuom'] = np.nan

df = pd.concat([df, tmp_data.loc[:, list(set(tmp_data.columns) - set(tmp_data.columns[4:7]))]], axis=0).reset_index(drop = True)

100%|██████████| 26622/26622 [56:02<00:00,  7.92it/s]  
100%|██████████| 26449/26449 [02:07<00:00, 207.50it/s]


In [5]:
# GCS refined version of CE saving
tmp_cond = df.itemid.isin(tmp_itemid)
df = df.drop(tmp_cond[tmp_cond].index)
df = df.sort_values(['stay_id', 'charttime', 'itemid'])
df.loc[:, list(set(df.columns) - set(['storetime']))].to_csv('processed_data/sepsis/CE_R.csv', index=False)

In [14]:
df.loc[:, ('subject_id', 'hadm_id', 'stay_id', 'charttime', 'itemid', 'valuenum', 'valueuom')].to_csv('processed_data/sepsis/CE_R.csv', index=False)

### 1.2 time error handling

time error handling criteria
1. lab values: exceeding IT, OT will be deleted
2. vaso, fluid, ventilators: exceeding IT and OT at the same time will be deleted. 

=> therefore, LE and OMR will not be the interests of this sections.

In [2]:
dirs = [str('processed_data/sepsis/%s_R.csv' %i) for i in ['omr', 'LE', 'CE', 'IE', 'OE', 'PE']]
icustays = pd.read_csv('processed_data/sepsis/icustays_wsusinf.csv')
icustays.intime = pd.to_datetime(icustays.intime)
icustays.outtime = pd.to_datetime(icustays.outtime)

In [8]:
# 0 hours
tmp_data = []
for data_dir in dirs[2:]:
    drop_idx = []
    df = pd.read_csv(data_dir)
    try:
        df.charttime = pd.to_datetime(df.charttime)
        cond_ct = 1
    except:
        df.starttime = pd.to_datetime(df.starttime)
        df.endtime = pd.to_datetime(df.endtime)
        cond_ct = 0
        
    td_list = []
    for stay in tqdm(df.stay_id.unique()):
        tmp_cond = icustays.stay_id == stay
        it, ot = icustays.loc[tmp_cond, ('intime', 'outtime')].reset_index(drop=True).loc[0]

        tmp_cond = df.stay_id == stay
        part_df = df.loc[tmp_cond]

        if cond_ct == 1:
            tmp_cond = (part_df.charttime > ot) | (part_df.charttime < it)
            drop_idx += tmp_cond[tmp_cond].index.tolist()
            td_list += ((part_df.loc[(part_df.charttime > ot), 'charttime'] - ot)/datetime.timedelta(hours=1)).tolist()
            td_list += ((it - part_df.loc[(part_df.charttime < it), 'charttime'])/datetime.timedelta(hours=1)).tolist()

        else:
            tmp_cond = (part_df.starttime > ot) | (part_df.endtime < it)
            drop_idx += tmp_cond[tmp_cond].index.tolist()
            td_list += ((part_df.loc[(part_df.starttime > ot), 'starttime'] - ot)/datetime.timedelta(hours=1)).tolist()
            td_list += ((it - part_df.loc[(part_df.endtime < it), 'endtime'])/datetime.timedelta(hours=1)).tolist()

    print(icustays.loc[icustays.stay_id == df.loc[drop_idx, 'stay_id'][drop_idx[0]], ('intime', 'outtime')].reset_index(drop=True).loc[0])
    print(df.loc[drop_idx[0]].reset_index(drop=True).loc[0])
    tmp_data.append([data_dir[-8:-4], df.shape[0], len(drop_idx), len(df.stay_id.unique()), len(df.loc[drop_idx].stay_id.unique()), df.loc[drop_idx].itemid.unique(), pd.Series(td_list).describe()])
    df = df.drop(drop_idx).reset_index(drop=True)
    df.to_csv(data_dir[:-5]+'TEH_v1.csv', index=False)

pd.DataFrame(tmp_data, columns=['df', 'b4_drop', 'a_drop', 'b4_stay', 'drop_stays', 'drop_iis', 'drop_td']).to_csv('tmp.csv')

100%|██████████| 26622/26622 [06:20<00:00, 70.01it/s]


intime    2174-09-29 12:09:00
outtime   2174-10-01 03:26:10
Name: 0, dtype: datetime64[ns]
12466550


100%|██████████| 26446/26446 [01:52<00:00, 235.58it/s]


intime    2129-04-06 00:25:00
outtime   2129-04-08 21:02:55
Name: 0, dtype: datetime64[ns]
10007928


100%|██████████| 25494/25494 [01:26<00:00, 293.16it/s]


intime    2188-06-05 23:38:19
outtime   2188-06-08 00:32:17
Name: 0, dtype: datetime64[ns]
16235911


100%|██████████| 12893/12893 [00:23<00:00, 540.00it/s]


intime    2191-06-23 22:57:47
outtime   2191-06-24 09:22:22
Name: 0, dtype: datetime64[ns]
10501162


In [11]:
# 1 hour 
tmp_data = []
for data_dir in dirs[2:]:
    drop_idx = []
    df = pd.read_csv(data_dir)
    try:
        df.charttime = pd.to_datetime(df.charttime)
        cond_ct = 1
    except:
        df.starttime = pd.to_datetime(df.starttime)
        df.endtime = pd.to_datetime(df.endtime)
        cond_ct = 0
        
    td_list = []
    for stay in tqdm(df.stay_id.unique()):
        tmp_cond = icustays.stay_id == stay
        it, ot = icustays.loc[tmp_cond, ('intime', 'outtime')].reset_index(drop=True).loc[0]

        tmp_cond = df.stay_id == stay
        part_df = df.loc[tmp_cond]

        if cond_ct == 1:
            tmp_cond = (part_df.charttime > (ot+datetime.timedelta(days=1))) | (part_df.charttime < (it-datetime.timedelta(days=1)))
            drop_idx += tmp_cond[tmp_cond].index.tolist()
            td_list += ((part_df.loc[(part_df.charttime > (ot+datetime.timedelta(days=1))), 'charttime'] - ot)/datetime.timedelta(hours=1)).tolist()
            td_list += ((it - part_df.loc[(part_df.charttime < (it-datetime.timedelta(days=1))), 'charttime'])/datetime.timedelta(hours=1)).tolist()

        else:
            tmp_cond = (part_df.starttime > (ot+datetime.timedelta(days=1))) | (part_df.endtime < (it-datetime.timedelta(days=1)))
            drop_idx += tmp_cond[tmp_cond].index.tolist()
            td_list += ((part_df.loc[(part_df.starttime > (ot+datetime.timedelta(days=1))), 'starttime'] - ot)/datetime.timedelta(hours=1)).tolist()
            td_list += ((it - part_df.loc[(part_df.endtime < (it-datetime.timedelta(days=1))), 'endtime'])/datetime.timedelta(hours=1)).tolist()

    print(icustays.loc[icustays.stay_id == df.loc[drop_idx, 'stay_id'][drop_idx[0]], ('intime', 'outtime')].reset_index(drop=True).loc[0])
    print(df.loc[drop_idx[0]].reset_index(drop=True))
    tmp_data.append([data_dir[-8:-4], df.shape[0], len(drop_idx), len(df.stay_id.unique()), len(df.loc[drop_idx].stay_id.unique()), df.loc[drop_idx].itemid.unique(), pd.Series(td_list).describe()])
    df = df.drop(drop_idx).reset_index(drop=True)
    df.to_csv(data_dir[:-5]+'TEH_v2.csv', index=False)

pd.DataFrame(tmp_data, columns=['df', 'b4_drop', 'a_drop', 'b4_stay', 'drop_stays', 'drop_iis', 'drop_td']).to_csv('tmp_1.csv')

100%|██████████| 26622/26622 [06:23<00:00, 69.48it/s]


intime    2174-09-29 12:09:00
outtime   2174-10-01 03:26:10
Name: 0, dtype: datetime64[ns]
0               12466550
1               23998182
2               30000153
3    2174-10-04 19:14:00
4                 220179
5                  119.0
6                   mmHg
Name: 277, dtype: object


100%|██████████| 26446/26446 [01:54<00:00, 231.47it/s]


intime    2142-01-17 09:13:46
outtime   2142-01-25 01:26:14
Name: 0, dtype: datetime64[ns]
0                10449408
1                22698294
2                30983111
3     2142-01-14 23:00:00
4     2142-01-14 23:01:00
5                  226452
6                   120.0
7                      ml
8                     NaN
9                     NaN
10                   80.4
Name: 150801, dtype: object


100%|██████████| 25494/25494 [01:27<00:00, 290.77it/s]


intime    2127-07-01 07:20:08
outtime   2127-07-03 16:15:29
Name: 0, dtype: datetime64[ns]
0               12407894
1               20375008
2               30014984
3    2127-07-05 18:00:00
4                 226559
5                  300.0
6                     ml
Name: 3464, dtype: object


100%|██████████| 12893/12893 [00:24<00:00, 525.85it/s]

intime    2115-02-13 00:23:12
outtime   2115-02-15 17:28:17
Name: 0, dtype: datetime64[ns]
0               18083893
1               27616048
2               31068539
3    2115-02-16 21:47:00
4    2115-02-16 22:28:00
5                 225792
6                   41.0
Name: 13509, dtype: object





In [13]:
# 2 hour
tmp_data = []
for data_dir in dirs[2:]:
    drop_idx = []
    df = pd.read_csv(data_dir)
    try:
        df.charttime = pd.to_datetime(df.charttime)
        cond_ct = 1
    except:
        df.starttime = pd.to_datetime(df.starttime)
        df.endtime = pd.to_datetime(df.endtime)
        cond_ct = 0
        
    td_list = []
    for stay in tqdm(df.stay_id.unique()):
        tmp_cond = icustays.stay_id == stay
        it, ot = icustays.loc[tmp_cond, ('intime', 'outtime')].reset_index(drop=True).loc[0]

        tmp_cond = df.stay_id == stay
        part_df = df.loc[tmp_cond]

        if cond_ct == 1:
            tmp_cond = (part_df.charttime > (ot+datetime.timedelta(days=2))) | (part_df.charttime < (it-datetime.timedelta(days=2)))
            drop_idx += tmp_cond[tmp_cond].index.tolist()
            td_list += ((part_df.loc[(part_df.charttime > (ot+datetime.timedelta(days=2))), 'charttime'] - ot)/datetime.timedelta(hours=1)).tolist()
            td_list += ((it - part_df.loc[(part_df.charttime < (it-datetime.timedelta(days=2))), 'charttime'])/datetime.timedelta(hours=1)).tolist()

        else:
            tmp_cond = (part_df.starttime > (ot+datetime.timedelta(days=2))) | (part_df.endtime < (it-datetime.timedelta(days=2)))
            drop_idx += tmp_cond[tmp_cond].index.tolist()
            td_list += ((part_df.loc[(part_df.starttime > (ot+datetime.timedelta(days=2))), 'starttime'] - ot)/datetime.timedelta(hours=1)).tolist()
            td_list += ((it - part_df.loc[(part_df.endtime < (it-datetime.timedelta(days=2))), 'endtime'])/datetime.timedelta(hours=1)).tolist()

    print(icustays.loc[icustays.stay_id == df.loc[drop_idx, 'stay_id'][drop_idx[0]], ('intime', 'outtime')].reset_index(drop=True).loc[0])
    print(df.loc[drop_idx[0]].reset_index(drop=True))
    tmp_data.append([data_dir[-8:-4], df.shape[0], len(drop_idx), len(df.stay_id.unique()), len(df.loc[drop_idx].stay_id.unique()), df.loc[drop_idx].itemid.unique(), pd.Series(td_list).describe()])
    df = df.drop(drop_idx).reset_index(drop=True)
    df.to_csv(data_dir[:-5]+'TEH_v3.csv', index=False)

pd.DataFrame(tmp_data, columns=['df', 'b4_drop', 'a_drop', 'b4_stay', 'drop_stays', 'drop_iis', 'drop_td']).to_csv('tmp_2.csv')

100%|██████████| 26622/26622 [06:23<00:00, 69.45it/s]


intime    2174-09-29 12:09:00
outtime   2174-10-01 03:26:10
Name: 0, dtype: datetime64[ns]
0               12466550
1               23998182
2               30000153
3    2174-10-04 19:14:00
4                 220179
5                  119.0
6                   mmHg
Name: 277, dtype: object


100%|██████████| 26446/26446 [01:53<00:00, 233.97it/s]


intime    2142-01-17 09:13:46
outtime   2142-01-25 01:26:14
Name: 0, dtype: datetime64[ns]
0                10449408
1                22698294
2                30983111
3     2142-01-14 23:00:00
4     2142-01-14 23:01:00
5                  226452
6                   120.0
7                      ml
8                     NaN
9                     NaN
10                   80.4
Name: 150801, dtype: object


100%|██████████| 25494/25494 [01:26<00:00, 293.69it/s]


intime    2127-07-01 07:20:08
outtime   2127-07-03 16:15:29
Name: 0, dtype: datetime64[ns]
0               12407894
1               20375008
2               30014984
3    2127-07-05 18:00:00
4                 226559
5                  300.0
6                     ml
Name: 3464, dtype: object


100%|██████████| 12893/12893 [00:24<00:00, 530.66it/s]

intime    2192-06-29 14:40:21
outtime   2192-07-03 23:27:24
Name: 0, dtype: datetime64[ns]
0               19651093
1               26488509
2               30067309
3    2192-05-25 18:40:00
4    2192-06-08 15:59:00
5                 225792
6                19999.0
Name: 16022, dtype: object





In [14]:
admissions = pd.read_csv('hosp/admissions.csv')
admissions.admittime = pd.to_datetime(admissions.admittime)
admissions.dischtime = pd.to_datetime(admissions.dischtime)

In [17]:
tmp_data = []
for data_dir in dirs[2:]:
    drop_idx = []
    df = pd.read_csv(data_dir)
    try:
        df.charttime = pd.to_datetime(df.charttime)
        cond_ct = 1
    except:
        df.starttime = pd.to_datetime(df.starttime)
        df.endtime = pd.to_datetime(df.endtime)
        cond_ct = 0
        
    td_list = []
    for stay in tqdm(df.stay_id.unique()):
        tmp_cond = icustays.stay_id == stay
        adm = icustays.loc[tmp_cond, ('hadm_id')].reset_index(drop=True).loc[0]
        
        tmp_cond = admissions.hadm_id == adm
        at, dt = admissions.loc[tmp_cond, ('admittime', 'dischtime')].reset_index(drop=True).loc[0]

        tmp_cond = df.stay_id == stay
        part_df = df.loc[tmp_cond]

        if cond_ct == 1:
            tmp_cond = (part_df.charttime > (dt+datetime.timedelta(days=2))) | (part_df.charttime < (at-datetime.timedelta(days=2)))
            drop_idx += tmp_cond[tmp_cond].index.tolist()
            td_list += ((part_df.loc[(part_df.charttime > (dt+datetime.timedelta(days=2))), 'charttime'] - dt)/datetime.timedelta(hours=1)).tolist()
            td_list += ((at - part_df.loc[(part_df.charttime < (at-datetime.timedelta(days=2))), 'charttime'])/datetime.timedelta(hours=1)).tolist()

        else:
            tmp_cond = (part_df.starttime > (dt+datetime.timedelta(days=2))) | (part_df.endtime < (at-datetime.timedelta(days=2)))
            drop_idx += tmp_cond[tmp_cond].index.tolist()
            td_list += ((part_df.loc[(part_df.starttime > (dt+datetime.timedelta(days=2))), 'starttime'] - dt)/datetime.timedelta(hours=1)).tolist()
            td_list += ((at - part_df.loc[(part_df.endtime < (at-datetime.timedelta(days=2))), 'endtime'])/datetime.timedelta(hours=1)).tolist()

    if len(drop_idx) == 0:
        tmp_data.append([data_dir[-8:-4], df.shape[0], len(drop_idx), len(df.stay_id.unique()), len(df.loc[drop_idx].stay_id.unique()), df.loc[drop_idx].itemid.unique(), pd.Series(td_list).describe()])
        df = df.drop(drop_idx).reset_index(drop=True)
        df.to_csv(data_dir[:-5]+'TEH_v4.csv', index=False)
    else:
        print(icustays.loc[icustays.stay_id == df.loc[drop_idx, 'stay_id'][drop_idx[0]], ('intime', 'outtime')].reset_index(drop=True).loc[0])
        print(df.loc[drop_idx[0]].reset_index(drop=True))
        tmp_data.append([data_dir[-8:-4], df.shape[0], len(drop_idx), len(df.stay_id.unique()), len(df.loc[drop_idx].stay_id.unique()), df.loc[drop_idx].itemid.unique(), pd.Series(td_list).describe()])
        df = df.drop(drop_idx).reset_index(drop=True)
        df.to_csv(data_dir[:-5]+'TEH_v4.csv', index=False)

pd.DataFrame(tmp_data, columns=['df', 'b4_drop', 'a_drop', 'b4_stay', 'drop_stays', 'drop_iis', 'drop_td']).to_csv('tmp_3.csv')

100%|██████████| 26622/26622 [06:39<00:00, 66.68it/s]


intime    2185-04-19 22:59:00
outtime   2185-05-07 19:05:25
Name: 0, dtype: datetime64[ns]
0               13077594
1               22574617
2               30056766
3    2185-05-09 22:00:00
4                 220228
5                    7.6
6                   g/dl
Name: 139742, dtype: object


100%|██████████| 26446/26446 [02:03<00:00, 213.56it/s]


intime    2116-08-27 19:15:00
outtime   2116-09-03 15:04:09
Name: 0, dtype: datetime64[ns]
0                11320106
1                21001406
2                33963040
3     2116-09-02 16:50:00
4     2116-09-02 16:51:00
5                  220949
6                   200.0
7                      ml
8                     NaN
9                     NaN
10                   84.9
Name: 451361, dtype: object


100%|██████████| 25494/25494 [01:40<00:00, 252.54it/s]


intime    2193-11-21 19:44:00
outtime   2193-11-28 07:41:08
Name: 0, dtype: datetime64[ns]
0               11950244
1               28956626
2               30794450
3    2193-11-26 17:00:00
4                 226559
5                    5.0
6                     ml
Name: 162394, dtype: object


100%|██████████| 12893/12893 [00:30<00:00, 421.84it/s]
  tmp_data.append([data_dir[-8:-4], df.shape[0], len(drop_idx), len(df.stay_id.unique()), len(df.loc[drop_idx].stay_id.unique()), df.loc[drop_idx].itemid.unique(), pd.Series(td_list).describe()])


In [19]:
tmp_data = []
for data_dir in dirs[2:]:
    drop_idx = []
    df = pd.read_csv(data_dir)
    try:
        df.charttime = pd.to_datetime(df.charttime)
        cond_ct = 1
    except:
        df.starttime = pd.to_datetime(df.starttime)
        df.endtime = pd.to_datetime(df.endtime)
        cond_ct = 0
        
    td_list = []
    for stay in tqdm(df.stay_id.unique()):
        tmp_cond = icustays.stay_id == stay
        adm = icustays.loc[tmp_cond, ('hadm_id')].reset_index(drop=True).loc[0]
        
        tmp_cond = admissions.hadm_id == adm
        at, dt = admissions.loc[tmp_cond, ('admittime', 'dischtime')].reset_index(drop=True).loc[0]

        tmp_cond = df.stay_id == stay
        part_df = df.loc[tmp_cond]

        if cond_ct == 1:
            tmp_cond = (part_df.charttime > (dt+datetime.timedelta(days=3))) | (part_df.charttime < (at-datetime.timedelta(days=3)))
            drop_idx += tmp_cond[tmp_cond].index.tolist()
            td_list += ((part_df.loc[(part_df.charttime > (dt+datetime.timedelta(days=3))), 'charttime'] - dt)/datetime.timedelta(hours=1)).tolist()
            td_list += ((at - part_df.loc[(part_df.charttime < (at-datetime.timedelta(days=3))), 'charttime'])/datetime.timedelta(hours=1)).tolist()

        else:
            tmp_cond = (part_df.starttime > (dt+datetime.timedelta(days=3))) | (part_df.endtime < (at-datetime.timedelta(days=3)))
            drop_idx += tmp_cond[tmp_cond].index.tolist()
            td_list += ((part_df.loc[(part_df.starttime > (dt+datetime.timedelta(days=3))), 'starttime'] - dt)/datetime.timedelta(hours=1)).tolist()
            td_list += ((at - part_df.loc[(part_df.endtime < (at-datetime.timedelta(days=3))), 'endtime'])/datetime.timedelta(hours=1)).tolist()

    if len(drop_idx) == 0:
        tmp_data.append([data_dir[-8:-4], df.shape[0], len(drop_idx), len(df.stay_id.unique()), len(df.loc[drop_idx].stay_id.unique()), np.nan, np.nan])
        df = df.drop(drop_idx).reset_index(drop=True)
        df.to_csv(data_dir[:-5]+'TEH_v5.csv', index=False)
    else:
        print(admissions.loc[admissions.hadm_id == df.loc[drop_idx, 'hadm_id'][drop_idx[0]], ('admittime', 'dischtime')].reset_index(drop=True).loc[0])
        print(df.loc[drop_idx[0]].reset_index(drop=True))
        tmp_data.append([data_dir[-8:-4], df.shape[0], len(drop_idx), len(df.stay_id.unique()), len(df.loc[drop_idx].stay_id.unique()), df.loc[drop_idx].itemid.unique(), pd.Series(td_list).describe()])
        df = df.drop(drop_idx).reset_index(drop=True)
        df.to_csv(data_dir[:-5]+'TEH_v5.csv', index=False)


pd.DataFrame(tmp_data, columns=['df', 'b4_drop', 'a_drop', 'b4_stay', 'drop_stays', 'drop_iis', 'drop_td']).to_csv('tmp_4.csv')

100%|██████████| 26622/26622 [06:46<00:00, 65.51it/s]


intime    2185-04-19 22:59:00
outtime   2185-05-07 19:05:25
Name: 0, dtype: datetime64[ns]
0               13077594
1               22574617
2               30056766
3    2185-05-11 03:01:00
4                 220228
5                    7.3
6                   g/dl
Name: 139766, dtype: object


100%|██████████| 26446/26446 [02:11<00:00, 201.56it/s]
100%|██████████| 25494/25494 [01:46<00:00, 240.09it/s]


intime    2129-05-01 23:15:00
outtime   2129-05-25 14:21:48
Name: 0, dtype: datetime64[ns]
0               19776632
1               23306501
2               31238684
3    2129-04-01 23:00:00
4                 226559
5                 1000.0
6                     ml
Name: 255513, dtype: object


100%|██████████| 12893/12893 [00:31<00:00, 414.30it/s]


In [21]:
tmp_data = []
for data_dir in [dirs[1]]:
    drop_idx = []
    df = pd.read_csv(data_dir)
    df.charttime = pd.to_datetime(df.charttime)
    cond_ct = 1
        
    td_list = []
    for adm in tqdm(df.hadm_id.unique()):
        tmp_cond = admissions.hadm_id == adm
        at, dt = admissions.loc[tmp_cond, ('admittime', 'dischtime')].reset_index(drop=True).loc[0]

        tmp_cond = df.hadm_id == adm
        part_df = df.loc[tmp_cond]

        if cond_ct == 1:
            tmp_cond = (part_df.charttime > (dt+datetime.timedelta(days=3))) | (part_df.charttime < (at-datetime.timedelta(days=3)))
            drop_idx += tmp_cond[tmp_cond].index.tolist()
            td_list += ((part_df.loc[(part_df.charttime > (dt+datetime.timedelta(days=3))), 'charttime'] - dt)/datetime.timedelta(hours=1)).tolist()
            td_list += ((at - part_df.loc[(part_df.charttime < (at-datetime.timedelta(days=3))), 'charttime'])/datetime.timedelta(hours=1)).tolist()

        else:
            tmp_cond = (part_df.starttime > (dt+datetime.timedelta(days=3))) | (part_df.endtime < (at-datetime.timedelta(days=3)))
            drop_idx += tmp_cond[tmp_cond].index.tolist()
            td_list += ((part_df.loc[(part_df.starttime > (dt+datetime.timedelta(days=3))), 'starttime'] - dt)/datetime.timedelta(hours=1)).tolist()
            td_list += ((at - part_df.loc[(part_df.endtime < (at-datetime.timedelta(days=3))), 'endtime'])/datetime.timedelta(hours=1)).tolist()

    if len(drop_idx) == 0:
        tmp_data.append([data_dir[-8:-4], df.shape[0], len(drop_idx), len(df.hadm_id.unique()), len(df.loc[drop_idx].hadm_id.unique()), np.nan, np.nan])
        df = df.drop(drop_idx).reset_index(drop=True)
        df.to_csv(data_dir[:-5]+'TEH_v5.csv', index=False)
    else:
        print(admissions.loc[admissions.hadm_id == df.loc[drop_idx, 'hadm_id'][drop_idx[0]], ('admittime', 'dischtime')].reset_index(drop=True).loc[0])
        print(df.loc[drop_idx[0]].reset_index(drop=True))
        tmp_data.append([data_dir[-8:-4], df.shape[0], len(drop_idx), len(df.hadm_id.unique()), len(df.loc[drop_idx].hadm_id.unique()), df.loc[drop_idx].itemid.unique(), pd.Series(td_list).describe()])
        df = df.drop(drop_idx).reset_index(drop=True)
        df.to_csv(data_dir[:-5]+'TEH_v5.csv', index=False)

pd.read_csv(dirs[0]).to_csv(dirs[0][:-5]+'TEH_v5.csv', index=False)
pd.DataFrame(tmp_data, columns=['df', 'b4_drop', 'a_drop', 'b4_stay', 'drop_stays', 'drop_iis', 'drop_td']).to_csv('tmp_5.csv')

100%|██████████| 21912/21912 [01:48<00:00, 202.65it/s]


### 1.3 feature observation analysis

In [None]:
'''
# omr needs hadm ids
dirs = [str('processed_data/sepsis/%s_TEH_v5.csv' %i) for i in ['omr', 'LE', 'CE', 'IE', 'OE', 'PE']]
admissions = pd.read_csv('hosp/admissions.csv')
admissions.admittime = pd.to_datetime(admissions.admittime)
admissions.dischtime = pd.to_datetime(admissions.dischtime)

df = pd.read_csv(dirs[0])
df.charttime = pd.to_datetime(df.charttime)
df['hadm_id'] = np.nan

for sbj in tqdm(df.subject_id.unique()):
    tmp_cond = admissions.subject_id == sbj
    adms_list = admissions.loc[tmp_cond].hadm_id.tolist()

    for adm in adms_list:
        tmp_cond = admissions.hadm_id == adm
        at, dt = admissions.loc[tmp_cond, ('admittime', 'dischtime')].reset_index(drop=True).loc[0]
        
        tmp_cond = (df.subject_id == sbj) & (df.charttime <= dt) & (df.charttime >= at)
        df.loc[tmp_cond, 'hadm_id'] = int(adm)

df.loc[:, ['hadm_id']+df.columns[0:-1].tolist()].to_csv(dirs[0], index=False)
df.loc[:, ['hadm_id']+df.columns[0:-1].tolist()]
'''

before check which of the features is rarely observed, make sure that the GCS itemid from CE_R, urine output itemid from OE_R, and vent itemid from LE_R are editted.
1. GCS itemid: [220739,223900,223901] -> 668540
2. urine itemid: 227488 -> deleted
3. vent itemid: 50828 -> deleted

In [2]:
dirs = [str('processed_data/sepsis/%s_R.csv' %i) for i in ['omr', 'LE', 'CE', 'IE', 'OE', 'PE']]
icustays = pd.read_csv('processed_data/sepsis/icustays_wsusinf.csv')
icustays.intime = pd.to_datetime(icustays.intime)
icustays.outtime = pd.to_datetime(icustays.outtime)
d_predictors = pd.read_csv('processed_data/sepsis/d_predictors_R.csv')
tmp = icustays.loc[icustays.suspected_infection == 1].hadm_id.unique().tolist()
df_prd_obs = pd.DataFrame([[0 if j !=0 else tmp[i] for j in range(d_predictors.shape[0]+1)] for i in range(len(tmp))], columns=['hadm_id']+d_predictors['items'].to_numpy().tolist())

In [3]:
for item in tqdm(d_predictors['items']):
    tmp_cond = d_predictors['items'] == item
    df_names = d_predictors.iloc[tmp_cond[tmp_cond].index, 1:].T.dropna().index.tolist()
    itemid_list = d_predictors.iloc[tmp_cond[tmp_cond].index, 1:].dropna(axis=1).to_numpy()[0].tolist()
    for idx, dn in enumerate(df_names):
        df = pd.read_csv(str('processed_data/sepsis/%s_TEH_v5.csv' %dn))
        if dn == 'omr':
            tmp_itemid = itemid_list[idx]
            tmp_cond = df.result_name == tmp_itemid
        else:
            tmp_itemid = np.array(itemid_list[idx].split(',')).astype(int).tolist() if itemid_list[idx].__contains__(',') else [np.array(itemid_list[idx]).astype(int).tolist()]
            tmp_cond = df.itemid.isin(tmp_itemid)
        
        hadmids = df.loc[tmp_cond].hadm_id.unique().tolist()

        tmp_cond = df_prd_obs.hadm_id.isin(hadmids)
        df_prd_obs.loc[tmp_cond, item] = 1

df_prd_obs

100%|██████████| 33/33 [05:45<00:00, 10.47s/it]


Unnamed: 0,hadm_id,heart_rate,resp_rate,temperature,sbp,dbp,cvp,pao2,fio2,gcs,bilirubin,platelets,creatinine,lactate,bun,arterial_ph,wbc,paco2,hemoglobin,hematocrit,potassium,epinephrine,dopamine,dobutamine,norepinephrine,phenylephrine,vasopressin,urine output,sodium,crp,ventilator,fluid,weight,height
0,24597018,1,1,1,1,1,0,0,0,1,0,1,1,0,1,0,1,0,1,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0
1,26184834,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,1,1,0,1,1,1,1
2,23473524,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,1,1,0,1,1,1,0
3,28662225,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,1,0,1,1,1,1
4,24982426,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,0,0,1,0,1,1,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21920,29356037,1,1,1,1,1,0,0,0,1,1,1,0,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0
21921,22997012,1,1,1,1,1,0,0,0,1,0,1,1,0,1,0,1,0,1,1,1,0,0,0,0,0,0,1,1,0,0,1,1,1
21922,21439025,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,1,0,0,1,0,1
21923,25744818,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,1,0,0,1,1,0


In [4]:
df_prd_obs.to_csv('processed_data/sepsis/analysis_results/FOA.csv', index=False)

In [5]:
# missing rate per feature
(df_prd_obs.loc[:,df_prd_obs.columns[1:]].sum()/df_prd_obs.shape[0]).to_csv('processed_data/sepsis/analysis_results/FOA_missing_rate.csv')

In [20]:
# LVNM case analysis
labeling_variables = ['sbp', 'dbp', 'pao2', 'fio2', 'gcs', 'bilirubin', 'platelets', 'creatinine', 'lactate']
tmp_cond = (df_prd_obs.loc[:, labeling_variables] == 0).sum(axis=1) == 0
df_prd_obs.loc[tmp_cond].to_csv('processed_data/sepsis/analysis_results/FOA_LV_nonmissing.csv', index=False)
(df_prd_obs.loc[tmp_cond ,df_prd_obs.columns[1:]].sum()/sum(tmp_cond)).to_csv('processed_data/sepsis/analysis_results/FOA_missing_rate_LVNM.csv')

In [21]:
# LVNM case cohorts number
tmp_cond = (df_prd_obs.loc[:, labeling_variables] == 0).sum(axis=1) == 0
hadmids = df_prd_obs.loc[tmp_cond].hadm_id.tolist()
tmp_cond = icustays.hadm_id.isin(hadmids)
icustays['LVNM'] = 0
icustays.loc[tmp_cond, 'LVNM'] = 1

tmp_cond = icustays.LVNM == 1
print(len(icustays.loc[tmp_cond].stay_id.unique()))
print(len(icustays.loc[tmp_cond].hadm_id.unique()))
print(len(icustays.loc[tmp_cond].subject_id.unique()))

13737
10341
9502


In [22]:
# save the LVNM case cohorts recorded icustays
icustays.to_csv('processed_data/sepsis/icustays_LVNM.csv', index=False)

### 1.4 wrap up and preparation

In [2]:
dirs = [str('processed_data/sepsis/%s_TEH_v5.csv' %i) for i in ['omr', 'LE', 'CE', 'IE', 'OE', 'PE']]

In [3]:
# we decided to delete CRP from predictors
d_predictors = pd.read_csv('processed_data/sepsis/d_predictors_R.csv')
tmp_cond = (d_predictors['items']=='crp')
d_predictors.drop(tmp_cond[tmp_cond].index).to_csv('processed_data/sepsis/d_predictors_BFCC.csv', index=False)

In [4]:
# cohort selection
icustays = pd.read_csv('processed_data/sepsis/icustays_LVNM.csv')
hadmids = icustays.loc[icustays.LVNM == 1].hadm_id.unique().tolist()
df_dict = {}

for idx, df_dir in enumerate(dirs):
    df = pd.read_csv(df_dir)

    tmp_cond = df.hadm_id.isin(hadmids)
    df = df.loc[tmp_cond].reset_index(drop=True)

    df_dict[idx] = df
    print(df.columns)

Index(['hadm_id', 'subject_id', 'charttime', 'result_name', 'result_value'], dtype='object')
Index(['subject_id', 'hadm_id', 'itemid', 'charttime', 'storetime', 'valuenum',
       'valueuom'],
      dtype='object')
Index(['subject_id', 'hadm_id', 'stay_id', 'charttime', 'itemid', 'valuenum',
       'valueuom'],
      dtype='object')
Index(['subject_id', 'hadm_id', 'stay_id', 'starttime', 'endtime', 'itemid',
       'amount', 'amountuom', 'rate', 'rateuom', 'patientweight'],
      dtype='object')
Index(['subject_id', 'hadm_id', 'stay_id', 'charttime', 'itemid', 'value',
       'valueuom'],
      dtype='object')
Index(['subject_id', 'hadm_id', 'stay_id', 'starttime', 'endtime', 'itemid',
       'value'],
      dtype='object')


In [5]:
# SBP, DBP, non inv and inv overlapping case identification
df = df_dict[2]

sbp_inv_ii = [220050,225309]
sbp_ninv_ii = [220179]
dbp_inv_ii = [220051,225310]
dbp_ninv_ii = [220180]

drop_idx = []
# SBP
tmp_cond = df.itemid.isin(sbp_inv_ii+sbp_ninv_ii)
part_df = df.loc[tmp_cond]
for adm in tqdm(df.hadm_id.unique()):
    tmp_cond = part_df.hadm_id == adm
    part_df_1 = part_df.loc[tmp_cond]

    for ct in part_df_1.charttime.unique():
        tmp_cond = part_df_1.charttime == ct
        if sum(tmp_cond) >=2 :
            if (sum(part_df_1.loc[tmp_cond].itemid == sbp_ninv_ii[0]) > 0) & (sum(part_df_1.loc[tmp_cond].itemid == sbp_inv_ii[0]) > 0):
                tmp_cond = part_df_1.loc[tmp_cond].itemid == sbp_ninv_ii[0]
                drop_idx += tmp_cond[tmp_cond].index.tolist()

# DBP
tmp_cond = df.itemid.isin(dbp_inv_ii+dbp_ninv_ii)
part_df = df.loc[tmp_cond]
for adm in tqdm(df.hadm_id.unique()):
    tmp_cond = part_df.hadm_id == adm
    part_df_1 = part_df.loc[tmp_cond]

    for ct in part_df_1.charttime.unique():
        tmp_cond = part_df_1.charttime == ct
        if sum(tmp_cond) >=2 :
            if (sum(part_df_1.loc[tmp_cond].itemid == dbp_ninv_ii[0]) > 0) & (sum(part_df_1.loc[tmp_cond].itemid == dbp_inv_ii[0]) > 0):
                tmp_cond = part_df_1.loc[tmp_cond].itemid == dbp_ninv_ii[0]
                drop_idx += tmp_cond[tmp_cond].index.tolist()

# SBP, DBP, non inv and inv overlapping case deletion
df_dict[2] = df.drop(drop_idx).reset_index(drop=True)

100%|██████████| 10341/10341 [05:40<00:00, 30.37it/s]
100%|██████████| 10341/10341 [05:32<00:00, 31.07it/s]


In [25]:
# LE urine output OE urine output overlapping deletion
# df_dict[1]: LE, df_dict[-2]: OE 
le = df_dict[1]
oe = df_dict[4]

le.charttime = pd.to_datetime(le.charttime)
oe.charttime = pd.to_datetime(oe.charttime)

raw_itemid = d_predictors.loc[(d_predictors['items'].str.lower().str.contains('urin'))&(~pd.isna(d_predictors.LE)), ('LE')].to_numpy()
raw_itemid = raw_itemid[0]
tmp_itemid = np.array(raw_itemid.split(',')).astype('int').tolist() if (str(raw_itemid).__contains__(',')) else [int(raw_itemid)]

tmp_cond = le.itemid.isin(tmp_itemid)
part_df_le = le.loc[tmp_cond]
drop_idx = []
for adm in tqdm(part_df_le.hadm_id.unique()):
    tmp_cond = oe.hadm_id == adm
    part_df_oe = oe.loc[tmp_cond]

    if part_df_oe.shape[0] == 0:
        continue

    tmp_cond = (part_df_le.hadm_id == adm) & (part_df_le.charttime <= max(part_df_oe.charttime)) & (part_df_le.charttime >= min(part_df_oe.charttime))
    drop_idx += tmp_cond[tmp_cond].index.tolist()
print(f'len of drop_idx: {len(drop_idx)}')
df_dict[1] = le.drop(drop_idx).reset_index(drop=True)

100%|██████████| 263/263 [00:00<00:00, 478.65it/s]


len of drop_idx: 237


In [26]:
# column selection and name alteration
# omr
df_dict[0].columns = ['hadm_id', 'subject_id', 'charttime', 'itemid', 'value']

# LE
df_dict[1] = df_dict[1].loc[:, ['subject_id', 'hadm_id', 'itemid', 'charttime', 'valuenum', 'valueuom']]
df_dict[1].columns = ['subject_id', 'hadm_id', 'itemid', 'charttime', 'value', 'valueuom']

# CE
df_dict[2].columns = ['subject_id', 'hadm_id', 'stay_id', 'charttime', 'itemid', 'value', 'valueuom']

In [27]:
# save all dataframes
for idx, df_dir in enumerate(dirs):
    df_dict[idx].to_csv(df_dir[:-10]+'BFCC.csv', index=False)

In [9]:
d_predictors = pd.read_csv('processed_data/sepsis/d_predictors_BFCC.csv')
d_predictors.loc[26, 'items'] = 'urine_output'
d_predictors.to_csv('processed_data/sepsis/d_predictors_BFCC.csv', index=False)

## 2. concatenation

In [2]:
icustays = pd.read_csv('processed_data/sepsis/icustays_LVNM.csv')
tmp_cond = (icustays.LVNM == 1)
icustays = icustays.loc[tmp_cond].reset_index(drop = True)
icustays['intime'] = pd.to_datetime(icustays['intime'])
icustays['outtime'] = pd.to_datetime(icustays['outtime'])
hadmids = icustays.hadm_id.unique().tolist()

df_names = ['omr', 'LE', 'CE', 'IE', 'OE', 'PE']
df_dirs = [str('processed_data/sepsis/%s_BFCC.csv' %i) for i in df_names]
df_dict = {}

for idx, df_dir in enumerate(df_dirs):
    df = pd.read_csv(df_dir)

    if df_names[idx] in ['IE', 'PE']:
        df.starttime = pd.to_datetime(df.starttime)
        df.endtime = pd.to_datetime(df.endtime)
    else:
        df.charttime = pd.to_datetime(df.charttime)

    df_dict[df_names[idx]] = df

df_dict['icustays'] = icustays

In [3]:
concat_data = {}
d_predictors = pd.read_csv('processed_data/sepsis/d_predictors_BFCC.csv')
empty_dict = {
    'hospadm_id' : None,
    'age' : {'charttime': [], 'value': []},
    'gender' : {'value': []},
    'race' : {'value': []},
    'height': {'charttime': [], 'value': []},
    'weight': {'charttime': [], 'value': []},
    'bmi': {'charttime': [], 'value': []},
    'heart_rate' : {'charttime': [], 'value': []},
    'resp_rate' : {'charttime': [], 'value': []},
    'temperature' : {'charttime': [], 'value': []},
    'sbp' : {'charttime': [], 'value': []},
    'dbp' : {'charttime': [], 'value': []},
    'map' : {'charttime': [], 'value': []},
    'cvp' : {'charttime': [], 'value': []},
    'paco2' : {'charttime': [], 'value': []},
    'pao2' : {'charttime': [], 'value': []},
    'fio2' : {'charttime': [], 'value': []},
    'gcs' : {'charttime': [], 'value': []},
    'bilirubin' : {'charttime': [], 'value': []},
    'platelets' : {'charttime': [], 'value': []},
    'creatinine' : {'charttime': [], 'value': []},
    'lactate' : {'charttime': [], 'value': []},
    'bun' : {'charttime': [], 'value': []},
    'arterial_ph' : {'charttime': [], 'value': []},
    'wbc' : {'charttime': [], 'value': []},
    'hemoglobin' : {'charttime': [], 'value': []},
    'hematocrit' : {'charttime': [], 'value': []},
    'potassium' : {'charttime': [], 'value': []},
    'sodium' : {'charttime': [], 'value': []},
    'urine_output' : {'charttime': [], 'value': []},
    'epinephrine' : {'starttime': [], 'endtime': [], 'value': [], 'rate': []},
    'dopamine' : {'starttime': [], 'endtime': [], 'value': [], 'rate': []},
    'dobutamine' : {'starttime': [], 'endtime': [], 'value': [], 'rate': []},
    'norepinephrine' : {'starttime': [], 'endtime': [], 'value': [], 'rate': []},
    'phenylephrine' : {'starttime': [], 'endtime': [], 'value': [], 'rate': []},
    'vasopressin' : {'starttime': [], 'endtime': [], 'value': [], 'rate': []},
    'fluid' : {'starttime': [], 'endtime': [], 'value': [], 'rate': []},
    'ventilator' : {'starttime': [], 'endtime': [], 'value': []}
}

# concatenation-----------------------------------------------------------------------------------------------------------------------------
#for i in tqdm(range(len(hadmids))):
#     concat_data[hadmids[i]] = data_concatenation(hadmid = hadmids[i], filling_dict=empty_dict, data_sources_dict=df_dict, predictor_df=d_predictors)
# data_concatenation(hadmid = hadmids[0], filling_dict=empty_dict, data_sources_dict=df_dict, predictor_df=d_predictors)

with mp.Pool(10) as pool:
    pooled_result = pool.map(partial(data_concatenation, filling_dict=empty_dict, data_sources_dict=df_dict, predictor_df=d_predictors), tqdm(hadmids)) # 5:24:59
pool.close()
pool.join()

100%|██████████| 10341/10341 [5:34:39<00:00,  1.94s/it]  


In [6]:
for i in tqdm(pooled_result):
    concat_data[i['hospadm_id']] = i

with open('sepsis_condata_new.pkl','wb') as f:
    pickle.dump(concat_data, f)


id_list = list(concat_data.keys())
with open('sepsis_condata_new_0.pkl','wb') as f:
    pickle.dump(concat_data[id_list[0]], f)

100%|██████████| 10341/10341 [00:00<00:00, 1036597.14it/s]


## after concat

after concat, the things that should be done is following:
1. sepsis and shock timing analysis: which comes first? how fast does it comes compared with different time standards?

In [5]:
import pickle
with open('sepsis_condata_new.pkl','rb') as f:
    concat_data = pickle.load(f)

id_list = list(concat_data.keys())
item_list = list(concat_data[id_list[0]].keys())
item_list = item_list[1:len(item_list)]

demo_list = ['age', 'gender', 'race', 'height', 'weight', 'bmi']
vital_list = ['heart_rate', 'resp_rate', 'temperature', 'sbp', 'dbp', 'map', 'cvp' 'paco2', 'pao2', 'fio2']
lab_list = ['gcs', 'bilirubin', 'platelets', 'creatinine', 'lactate', 'bun', 'arterial_ph', 'wbc', 'hemoglobin', 'hematocrit','potassium', 'sodium']
vaso_list = ['epinephrine', 'dopamine', 'dobutamine', 'norepinephrine', 'phenylephrine', 'vasopressin']
fluid_list = ['fluid']
urine_list = ['urine_output']
vent_list = ['ventilator']
SOFA_list = ['CNS_SOFA', 'CARDIO_SOFA', 'RESP_SOFA', 'COAG_SOFA', 'LIVER_SOFA', 'RENAL_SOFA']
label_list = ['SEPSIS', 'SHOCK']

item_dict = {
    'ct': demo_list[3:]+vital_list+lab_list+urine_list,
    'stet': fluid_list+vaso_list+vent_list,
    'demo': demo_list[3:],
    'vital': vital_list,
    'lab': lab_list,
    'urine': urine_list, 
    'vaso': vaso_list,
    'fluid': fluid_list,
    'vent': vent_list
}

In [6]:
sepshock_adm = pd.DataFrame(columns=['hadm_id', 'sepsis', 'shock', '0tosep', '0tosho', 'septhensho', 'shothensep', 'td_sepsho'])
sepshock_adm['hadm_id'] = id_list
#----------------------------------------------------------------------------------------------------------------------------------------------------

for i in tqdm(range(len(id_list))):
    tmp_adm = id_list[i]
    tmp_adm_data = concat_data[tmp_adm]

    timepoints_list = []
    for j in range(len(item_list[3:])):
        tmp_item = item_list[3:][j]
        tmp_item_data = tmp_adm_data[tmp_item]
        try :
            timepoints_list = timepoints_list + tmp_item_data['charttime']
        except:
            timepoints_list = timepoints_list + tmp_item_data['starttime']
            timepoints_list = timepoints_list + tmp_item_data['endtime']
    timepoints_list = pd.to_datetime(timepoints_list)
    timepoints_list = timepoints_list.sort_values()
    tmp_0t = timepoints_list[0]


    # 1. sepsis 0tosep 
    tmp_item_data = tmp_adm_data['SEPSIS']
    tmp_cond = sepshock_adm.hadm_id == tmp_adm
    sepshock_adm.loc[tmp_cond, 'sepsis'] = int((sum(tmp_item_data['value']) > 0))

    if sepshock_adm.loc[tmp_cond, 'sepsis'].tolist()[0] == 1:
        tmp_idx = tmp_item_data['value'].index(1)
        tmp_st = pd.to_datetime(tmp_item_data['starttime'])[tmp_idx]
        sepshock_adm.loc[tmp_cond, '0tosep'] = (tmp_st-tmp_0t)/datetime.timedelta(hours=1)
    

    # 2. shock 0tosho
    tmp_item_data = tmp_adm_data['SHOCK']
    tmp_cond = sepshock_adm.hadm_id == tmp_adm
    sepshock_adm.loc[tmp_cond, 'shock'] = int((sum(tmp_item_data['value']) > 0))
    
    if sepshock_adm.loc[tmp_cond, 'shock'].tolist()[0] == 1:
        tmp_idx = tmp_item_data['value'].index(1)
        tmp_st = pd.to_datetime(tmp_item_data['starttime'])[tmp_idx]
        sepshock_adm.loc[tmp_cond, '0tosho'] = (tmp_st-tmp_0t)/datetime.timedelta(hours=1)


    # 3. septhensho shothensep td_sepsho
    if ((sepshock_adm.loc[tmp_cond, 'sepsis'][i] == 1) & (sepshock_adm.loc[tmp_cond, 'shock'][i] == 1)):
        sepshock_adm.loc[tmp_cond, 'septhensho'] = int((sepshock_adm.loc[tmp_cond, '0tosho'] > sepshock_adm.loc[tmp_cond, '0tosep']))
        sepshock_adm.loc[tmp_cond, 'shothensep'] = int((sepshock_adm.loc[tmp_cond, '0tosho'] <= sepshock_adm.loc[tmp_cond, '0tosep']))
        sepshock_adm.loc[tmp_cond, 'td_sepsho'] = abs(sepshock_adm.loc[tmp_cond, '0tosho'] - sepshock_adm.loc[tmp_cond, '0tosep'])

100%|██████████| 10341/10341 [04:44<00:00, 36.40it/s]


In [7]:
sepshock_adm.loc[sepshock_adm.septhensho == 1 ,'0tosho'].astype('float').describe([i*0.01 for i in [1, 5, 10, 25, 50, 75, 90, 95, 99]])#.to_csv('processed_data/sepsis/analysis_results/sepshock_sish_0tosho_admver.csv')

count    5236.000000
mean      112.551308
std       234.841894
min         0.016667
1%          0.116667
5%          0.566667
10%         1.258333
25%         4.029167
50%        22.291667
75%       133.195833
90%       319.666806
95%       490.070833
99%       992.528333
max      4926.516667
Name: 0tosho, dtype: float64

In [9]:
print(f'sepsis num, ratio: {sum(sepshock_adm.sepsis)}, {sum(sepshock_adm.sepsis)/sepshock_adm.shape[0]}')
print(f'shock num, ratio: {sum(sepshock_adm.shock)}, {sum(sepshock_adm.shock)/sepshock_adm.shape[0]}')
print(f'septhensho num, ratio(/total): {np.nansum(sepshock_adm.septhensho)}, {np.nansum(sepshock_adm.septhensho)/sum((sepshock_adm.sepsis==1)&(sepshock_adm.shock==1))}')
print(f'shothensep num, ratio(/total): {np.nansum(sepshock_adm.shothensep)}, {np.nansum(sepshock_adm.shothensep)/sum((sepshock_adm.sepsis==1)&(sepshock_adm.shock==1))}')
print(f'septhensho ratio(/total): {np.nansum(sepshock_adm.septhensho)/sepshock_adm.shape[0]}')
print(f'shothensep ratio(/total): {np.nansum(sepshock_adm.shothensep)/sepshock_adm.shape[0]}')
print(f"\n==========0 to sep========== \n{sepshock_adm['0tosep'].astype('float').describe()}\n")
print(f"\n==========0 to shock========== \n{sepshock_adm['0tosho'].astype('float').describe()}\n")
print(f"\n==========td sepsho========== \n{sepshock_adm['td_sepsho'].astype('float').describe()}\n")

sepsis num, ratio: 10285, 0.9945846629919737
shock num, ratio: 5412, 0.5233536408471134
septhensho num, ratio(/total): 5236, 0.9676584734799483
shothensep num, ratio(/total): 175, 0.03234152652005175
septhensho ratio(/total): 0.5063340102504593
shothensep ratio(/total): 0.016922928150082197

count    10285.000000
mean        10.371926
std         50.308824
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max       1862.216667
Name: 0tosep, dtype: float64


count    5412.000000
mean      111.039890
std       232.135914
min         0.000000
25%         3.933333
50%        22.041667
75%       129.608333
max      4926.516667
Name: 0tosho, dtype: float64


count    5411.000000
mean      103.380732
std       228.251217
min         0.000000
25%         3.200000
50%        17.333333
75%       113.766667
max      4926.516667
Name: td_sepsho, dtype: float64



In [10]:
icustays = pd.read_csv('processed_data/sepsis/icustays_LVNM.csv')
tmp_cond = icustays.LVNM == 1
stayids = icustays.loc[tmp_cond].stay_id.unique().tolist()
icustays.intime = pd.to_datetime(icustays.intime)
icustays.outtime = pd.to_datetime(icustays.outtime)

In [11]:
sepshock = pd.DataFrame(columns=['stay_id', 'sepsis', 'shock', '0tosep', '0tosho', 'septhensho', 'shothensep', 'td_sepsho', 'td_stit'])
sepshock['stay_id'] = stayids
#----------------------------------------------------------------------------------------------------------------------------------------------------

for i in tqdm(range(len(stayids))):
    tmp_stay = stayids[i]
    tmp_cond = icustays.stay_id == tmp_stay
    tmp_adm = icustays.loc[tmp_cond].hadm_id.tolist()[0]
    it, ot = icustays.loc[tmp_cond, ('intime', 'outtime')].reset_index(drop=True).loc[0]
    tmp_adm_data = concat_data[tmp_adm]

    # 1. sepsis 0tosep 
    tmp_item_data = pd.DataFrame(tmp_adm_data['SEPSIS'])
    tmp_cond = (tmp_item_data.endtime >= it) & (tmp_item_data.starttime <= ot)
    tmp_stay_item_df = tmp_item_data.loc[tmp_cond].reset_index(drop=True)
    tmp_0t = tmp_stay_item_df.starttime[0]

    tmp_cond = sepshock.stay_id == tmp_stay
    sepshock.loc[tmp_cond, 'sepsis'] = int((sum(tmp_stay_item_df['value']) > 0)) # sepsis
    sepshock.loc[tmp_cond, 'td_stit'] = (it-tmp_0t)/datetime.timedelta(hours=1)
    
    if sepshock.loc[tmp_cond, 'sepsis'].tolist()[0] == 1:
        tmp_idx = tmp_stay_item_df['value'].tolist().index(1)
        tmp_st = pd.to_datetime(tmp_stay_item_df['starttime'])[tmp_idx]
        sepshock.loc[tmp_cond, '0tosep'] = (tmp_st-tmp_0t)/datetime.timedelta(hours=1)


    # 2. shock 0tosho
    tmp_item_data = pd.DataFrame(tmp_adm_data['SHOCK'])
    tmp_cond = (tmp_item_data.endtime >= it) & (tmp_item_data.starttime <= ot)
    tmp_stay_item_df = tmp_item_data.loc[tmp_cond].reset_index(drop=True)

    tmp_cond = sepshock.stay_id == tmp_stay
    sepshock.loc[tmp_cond, 'shock'] = int((sum(tmp_stay_item_df['value']) > 0)) # sepsis
    
    if sepshock.loc[tmp_cond, 'shock'].tolist()[0] == 1:
        tmp_idx = tmp_stay_item_df['value'].tolist().index(1)
        tmp_st = tmp_stay_item_df['starttime'][tmp_idx]
        sepshock.loc[tmp_cond, '0tosho'] = (tmp_st-tmp_0t)/datetime.timedelta(hours=1)


    # 3. septhensho shothensep td_sepsho
    if ((sepshock.loc[tmp_cond, 'sepsis'][i] == 1) & (sepshock.loc[tmp_cond, 'shock'][i] == 1)):
        sepshock.loc[tmp_cond, 'septhensho'] = int((sepshock.loc[tmp_cond, '0tosho'] > sepshock.loc[tmp_cond, '0tosep']))
        sepshock.loc[tmp_cond, 'shothensep'] = int((sepshock.loc[tmp_cond, '0tosho'] <= sepshock.loc[tmp_cond, '0tosep']))
        sepshock.loc[tmp_cond, 'td_sepsho'] = abs(sepshock.loc[tmp_cond, '0tosho'] - sepshock.loc[tmp_cond, '0tosep'])

100%|██████████| 13737/13737 [01:39<00:00, 138.55it/s]


In [12]:
sepsis_stays = sepshock.stay_id[sepshock.sepsis == 1].tolist()
shock_stays = sepshock.stay_id[sepshock.shock == 1].tolist()
septhensho_stays = sepshock.stay_id[sepshock.septhensho == 1].tolist()
shothensep_stays = sepshock.stay_id[sepshock.shothensep == 1].tolist()

icustays.loc[icustays.stay_id.isin(sepsis_stays), 'sepsis'] = 1
icustays.loc[icustays.stay_id.isin(shock_stays), 'shock'] = 1
icustays.loc[icustays.stay_id.isin(septhensho_stays), 'septhensho'] = 1
icustays.loc[icustays.stay_id.isin(shothensep_stays), 'shothensep'] = 1
icustays.loc[((icustays.sepsis == 1) & (icustays.shothensep != 1)), 'cohort_stays'] = 1

icustays.to_csv('processed_data/sepsis/icustays_sepshock.csv', index=False)

In [13]:
print(f'sepsis num, ratio: {sum(sepshock.sepsis)}, {sum(sepshock.sepsis)/sepshock.shape[0]}')
print(f'shock num, ratio: {sum(sepshock.shock)}, {sum(sepshock.shock)/sepshock.shape[0]}')
print(f'septhensho num, ratio(/total): {np.nansum(sepshock.septhensho)}, {np.nansum(sepshock.septhensho)/sum((sepshock.sepsis==1)&(sepshock.shock==1))}')
print(f'shothensep num, ratio(/total): {np.nansum(sepshock.shothensep)}, {np.nansum(sepshock.shothensep)/sum((sepshock.sepsis==1)&(sepshock.shock==1))}')
print(f'septhensho ratio(/total): {np.nansum(sepshock.septhensho)/sepshock.shape[0]}')
print(f'shothensep ratio(/total): {np.nansum(sepshock.shothensep)/sepshock.shape[0]}')
print(f"\n==========0 to sep========== \n{sepshock['0tosep'].astype('float').describe()}\n")
print(f"\n==========0 to shock========== \n{sepshock['0tosho'].astype('float').describe()}\n")
print(f"\n==========td sepsho========== \n{sepshock['td_sepsho'].astype('float').describe()}\n")

sepsis num, ratio: 13285, 0.9670961636456286
shock num, ratio: 5871, 0.42738589211618255
septhensho num, ratio(/total): 5400, 0.9204022498721663
shothensep num, ratio(/total): 467, 0.07959775012783364
septhensho ratio(/total): 0.3930989298973575
shothensep ratio(/total): 0.03399577782630851

count    13285.000000
mean         2.401771
std         10.308149
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max        245.000000
Name: 0tosep, dtype: float64


count    5871.000000
mean       28.612400
std        75.720241
min         0.000000
25%         1.066667
50%         4.383333
75%        19.100000
max      1530.183333
Name: 0tosho, dtype: float64


count    5867.000000
mean       27.417059
std        74.722382
min         0.000000
25%         0.866667
50%         3.883333
75%        16.958333
max      1530.183333
Name: td_sepsho, dtype: float64



In [14]:
sepshock.loc[sepshock.septhensho == 1 ,'0tosho'].astype('float').describe([i*0.01 for i in [1, 5, 10, 25, 50, 75, 90, 95, 99]])#.to_csv('processed_data/sepsis/analysis_results/sepshock_sish_0tosho_stayver.csv')

count    5400.000000
mean       30.603762
std        78.460404
min         0.016667
1%          0.050000
5%          0.183333
10%         0.400000
25%         1.400000
50%         4.900000
75%        21.141667
90%        80.613333
95%       150.537500
99%       380.435667
max      1530.183333
Name: 0tosho, dtype: float64

In [15]:
sepshock.loc[sepshock.septhensho == 1 ,'td_stit'].astype('float').describe([i*0.01 for i in [1, 5, 10, 25, 50, 75, 90, 95, 99]])

count    5400.000000
mean        0.594082
std         2.050999
min        -3.451944
1%         -0.683622
5%          0.000278
10%         0.001944
25%         0.006944
50%         0.014722
75%         0.400000
90%         1.465917
95%         2.549208
99%        10.234167
max        47.662778
Name: td_stit, dtype: float64

In [16]:
sepshock.loc[: ,'td_stit'].astype('float').describe([i*0.01 for i in [1, 5, 10, 25, 50, 75, 90, 95, 99]])

count    13737.000000
mean         0.744949
std          2.632894
min         -5.339722
1%          -1.002478
5%           0.000000
10%          0.001944
25%          0.007222
50%          0.016667
75%          0.500000
90%          1.683333
95%          3.468167
99%         12.780289
max         91.792222
Name: td_stit, dtype: float64

# EON