In [None]:
import numpy as np
import pandas as pd
import pickle

from tqdm.notebook import tqdm

In [None]:
# Load the preprocessed data.
obs_path = './obs_periodic.p'
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open(obs_path, 'rb') as obs_file:
    obs = pickle.load(obs_file)

infusion_path = './infusionDrug.csv'

In [None]:
print(len(obs[2760705]))  # patient 2760705 has 1113 observations

In [None]:
# Preview the first five (timestamp, measures) entries recorded for patient 2760705.
obs[2760705][:5]

In [None]:
# Every patient ID that appears as a key of the observation dictionary.
pids = [patient_id for patient_id in obs]
pids[:5]

In [None]:
# Keep only the three blood-pressure measures (Systolic BP, Diastolic BP, Mean BP)
# of every observation, dropping observations where any of them is negative.
bp_slice = slice(6, 9)  # positions 6, 7 and 8 of each measure vector are the BP values
pids = list(obs)
obs_bp = {}
for patient_id in tqdm(pids):
    bp_series = []
    for when, values in obs[patient_id]:
        bp_values = np.array(values)[bp_slice]
        if bp_values.min() < 0:
            # At least one BP value is negative: skip this observation.
            continue
        bp_series.append((when, bp_values))
    obs_bp[patient_id] = bp_series

In [None]:
# Preview the first five (timestamp, BP values) entries kept for patient 2760705.
obs_bp[2760705][:5]

In [None]:
print(len(obs_bp[2760705]))  # after dropping negative-BP rows: 1113 observations => 217 observations

In [None]:
# Keep only the patients that have 50 or more BP measurements.
lens = np.array([len(series) for series in obs_bp.values()])
obs_bp_50 = {
    patient_id: series
    for patient_id, series in obs_bp.items()
    if len(series) >= 50
}
pids_50 = list(obs_bp_50)

In [None]:
# List of medications that affect BP.
# Each entry is matched as a lowercase substring of the infusion drug name.
# The duplicated 'nicardipine' entry has been removed; every name now appears once.
# NOTE(review): 'esomolol' looks like a misspelling of 'esmolol' (also listed below);
# it is kept in case the raw drug names contain that spelling -- confirm.
vaso_list = [
    'epoprostenol',
    'nitroglycerin',
    'nitroprusside',
    'labetalol',
    'esomolol',
    'nicardipine',
    'clevidipine',
    'milrinone',
    'esmolol',
    'isoproterenol',
    'isoprenaline',
    'furosemide',
    'diltiazem',
    'cardizem',
    'amiodarone',
    'propofol',
    'norepinephrine',
    'dopamine',
    'phenylephrine',
    'vasopressin',
    'dobutamine',
]
vaso_set = set(vaso_list)

In [None]:
# Extract BP-related medication info from infusionDrug.csv for patients with at least 50 BP measures.
# Result: infusionDict maps each target ICU stay ID to a sorted list of
# (offset, drugName, drugRate) tuples, where drugName is the matching entry of
# vaso_list and drugRate is -1.0 when the rate field is empty.
targetIcuIdSet = set(pids_50)
infusionDict = {icuId: [] for icuId in targetIcuIdSet}

with open(infusion_path, 'r') as infd:
    infd.readline()  # skip the header row
    # Iterate over the rows the file actually contains rather than a hard-coded
    # row count, which crashed on shorter files and truncated longer ones.
    for line in tqdm(infd):
        line = line.strip()
        if not line:
            # Blank line (e.g. trailing newline at end of file): nothing to parse.
            continue
        tokens = line.split(',')
        icuId = int(tokens[1])
        if icuId not in targetIcuIdSet:
            continue
        # The rate is addressed from the end of the row because the drug name
        # may itself contain commas and shift the columns after it.
        drugRate = tokens[-5]
        offset = int(tokens[2])

        if len(tokens) > 9:
            # Drug name was split by embedded commas: glue the pieces back
            # together and strip the surrounding quote characters.
            drugName = " ".join(tokens[3:-5])[1:-1]
        else:
            drugName = tokens[3]

        # Keep the row only if the name mentions one of the BP-affecting drugs,
        # and normalise the name to that list entry (first match wins).
        drugName = drugName.lower()
        matched = next((vaso for vaso in vaso_list if vaso in drugName), None)
        if matched is None:
            continue
        drugName = matched

        if drugRate in ('""', ''):
            # Empty rate field (quoted or unquoted): use the -1.0 sentinel.
            drugRate = -1.0
        else:
            drugRate = float(drugRate)
        infusionDict[icuId].append((offset, drugName, drugRate))

# Order every patient's records by offset (then drug name, then rate).
for records in infusionDict.values():
    records.sort()

In [None]:
# The ten patients with the largest number of BP measurements,
# as (measurement count, patient ID) pairs in descending order.
lens = [(len(series), patient_id) for patient_id, series in obs_bp_50.items()]
lens.sort(reverse=True)
lens[:10]

In [None]:
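# Merge medication records into groups: a record whose offset is within 10 of the previous record's offset joins the current group (rates of the same drug are summed). This is required to create the 90-min training samples.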

def merge_ivs(iv_list, new_iv, missing_rate=-1.0):
    """Fold one infusion record into a group of records, in place.

    Args:
        iv_list: list of (offset, drug, rate) tuples forming the current group.
            It is modified in place.
        new_iv: the (offset, drug, rate) tuple to merge into the group.
        missing_rate: sentinel value marking an unknown rate. A rate equal to
            this value is never added arithmetically to a real rate.

    Returns:
        None. If the drug is already in the group, its rate is increased by the
        new rate and the original offset is kept; otherwise ``new_iv`` is
        appended unchanged.
    """
    new_drug = new_iv[1]
    new_rate = new_iv[2]
    found = False
    for i in range(len(iv_list)):
        entry = iv_list[i]
        if entry[1] != new_drug:
            continue
        found = True
        if new_rate == missing_rate:
            # Unknown new rate: keep whatever the group already holds.
            continue
        if entry[2] == missing_rate:
            # Existing rate was unknown: the new rate is the first real value.
            merged_rate = new_rate
        else:
            merged_rate = entry[2] + new_rate
        iv_list[i] = (entry[0], entry[1], merged_rate)
    if not found:
        iv_list.append(new_iv)
    
# Build merged_iv_dict: patient ID -> list of infusion groups, where each group
# is a list of (offset, drug, rate) tuples produced by merge_ivs.
merged_iv_dict = {}
for pid, ivs in infusionDict.items():
    groups = []
    merged_iv_dict[pid] = groups
    if not ivs:
        # Patient has no BP-related infusions: leave the group list empty.
        continue
    iv_list = [ivs[0]]
    # Walk consecutive record pairs; the gap is measured against the
    # immediately preceding record, not against the start of the group.
    for prev_iv, iv in zip(ivs, ivs[1:]):
        if iv[0] - prev_iv[0] <= 10:
            merge_ivs(iv_list, iv)
        else:
            # Gap larger than 10: close the current group and start a new one.
            groups.append(iv_list)
            iv_list = [iv]
    groups.append(iv_list)

In [None]:
# Preview the first five merged infusion groups for patient 2885883.
merged_iv_dict[2885883][:5]

In [None]:
# Select one patient and create the 90-min training samples.
#
# For every infusion group that has both a predecessor and a successor, collect
# the BP observations whose offset is at least 30 past the first record of the
# previous group and not beyond the first record of the next group. Groups that
# yield no observation are dropped, so new_obs[k] always pairs with new_ivs[k].

pid = 2885883
# Other patient IDs that can be substituted here:
# 2776825, 3158943, 984944, 1028643, 1037580

# A dedicated name is used for this patient's observations so the raw `obs`
# dictionary loaded from the pickle is not overwritten by this cell.
patient_obs = obs_bp_50[pid]
ivs = merged_iv_dict[pid]
new_obs = []
new_ivs = []
# The first and last groups are skipped because they lack a previous or a next group.
for i in tqdm(range(1, len(ivs) - 1)):
    prev_iv_offset = ivs[i - 1][0][0]
    next_iv_offset = ivs[i + 1][0][0]

    obs_sample = []
    for ob in patient_obs:
        ob_offset = ob[0]
        if ob_offset - prev_iv_offset < 30:
            # Too close to (or before) the previous infusion group.
            continue
        if ob_offset > next_iv_offset:
            # Past the next infusion group: stop scanning.
            # NOTE(review): assumes patient_obs is ordered by ascending offset -- confirm.
            break
        obs_sample.append(ob)
    if obs_sample:
        new_ivs.append(ivs[i])
        new_obs.append(obs_sample)

In [None]:
# Histogram of sample lengths: how many training samples have each sequence
# length. (18 is the 90-min samples)
freqs = {}
for sample in new_obs:
    n_steps = len(sample)
    freqs[n_steps] = freqs.get(n_steps, 0) + 1
# Sample position -> sequence length.
len_dict = {position: len(sample) for position, sample in enumerate(new_obs)}
freqs

In [None]:
# Start with two independent empty containers for the retained samples and
# their infusion groups.
final_obs, final_ivs = [], []

In [None]:
# Keep only the samples with exactly 18 observations, together with the
# infusion group found at the same position in new_ivs.
# Both lists are rebuilt from scratch so that re-running this cell does not
# append the same samples a second time.
kept_positions = [i for i, sample in enumerate(new_obs) if len(sample) == 18]
final_obs = [new_obs[i] for i in kept_positions]
final_ivs = [new_ivs[i] for i in kept_positions]

In [None]:
# Flatten the samples into one row per observation:
# [sample index, step within the sample, observation timestamp, BP values...].
final_obs_array = [
    np.concatenate((np.array([sample_id, step, stamp]), bp_values))
    for sample_id, sample in enumerate(final_obs)
    for step, (stamp, bp_values) in enumerate(sample)
]

In [None]:
# Stack the per-observation rows into a single DataFrame (one row per observation).
final_obs_df = pd.DataFrame(np.array(final_obs_array))

In [None]:
# Name the columns: sample index, step within the sample, observation timestamp,
# then the three BP values.
final_obs_df.columns = ['ID', 'Time', 'Real_Time', 'Systolic_BP', 'Diastolic_BP', 'Mean_BP']

In [None]:
# Distinct drug names used in the retained infusion groups, in order of first
# appearance (dict keys keep insertion order and drop repeats).
itv_list = list(dict.fromkeys(record[1] for group in final_ivs for record in group))

In [None]:
# Append one zero-initialised column per infused drug. The block of zeros is
# sized from the table itself and from itv_list instead of a hard-coded
# (4518, 5) shape, so it stays valid when another patient is selected.
# `axis` is passed by keyword because recent pandas versions accept only
# `objs` as a positional argument of pd.concat.
drug_columns = pd.DataFrame(0.0, index=final_obs_df.index, columns=itv_list)
final_obs_df = pd.concat([final_obs_df, drug_columns], axis=1)

In [None]:
# For each sample, find the observation whose timestamp equals the offset of the
# sample's first infusion record minus 4, and write every drug rate of that
# infusion group into the matching drug column of that observation's row.
# NOTE(review): the "- 4" alignment and the stride of 18 are hard-coded; 18 matches
# the sample-length filter applied when final_obs was built -- confirm the meaning of 4.
for idx, obs in enumerate(final_obs):
    for i in range(len(obs)):
        if obs[i][0] == (final_ivs[idx][0][0] - 4):
            for j in range(len(final_ivs[idx])):
                # Row idx * 18 + i is step i of sample idx (18 rows per sample).
                # The row is copied, one drug column is set, and the whole row is written back.
                temp = final_obs_df.iloc[idx * 18 + i].copy()
                temp[final_ivs[idx][j][1]] = final_ivs[idx][j][2]
                final_obs_df.iloc[idx * 18 + i] = temp

In [None]:
# Save the table (BP values plus drug-rate columns) before any scaling is applied.
final_obs_df.to_csv('final_patient.csv', index=False)

In [None]:
# Work on a copy of the BP and drug-rate columns that are going to be scaled.
columns_to_scale = [
    'Systolic_BP', 'Diastolic_BP', 'Mean_BP',
    'norepinephrine', 'vasopressin', 'propofol', 'amiodarone', 'phenylephrine',
]
temp = final_obs_df[columns_to_scale].copy()

In [None]:
# Min-max scale each selected column using its own minimum and maximum.
temp_min, temp_max = temp.min(axis=0), temp.max(axis=0)
value_range = temp_max - temp_min

temp = (temp - temp_min) / value_range

In [None]:
# Write the scaled columns back into final_obs_df in place; from here on these
# columns of final_obs_df hold the min-max scaled values rather than the raw ones.
final_obs_df[['Systolic_BP', 'Diastolic_BP', 'Mean_BP', 'norepinephrine', 
              'vasopressin', 'propofol', 'amiodarone', 'phenylephrine']] = temp

In [None]:
# Save the table with the min-max scaled BP and drug-rate columns.
final_obs_df.to_csv('final_patient_std.csv', index=False)

In [None]:
# Splice every pair of consecutive samples (ID i and ID i + 1) into one 30-row
# sample: the rows of both samples are stacked, rows 17..22 of the stack are
# dropped, the result is labelled with ID i and 'Time' is renumbered from 0.
# NOTE(review): when the cells run top to bottom, final_obs_df already holds
# min-max scaled values at this point -- confirm that is intended.
# The number of samples is read from the table instead of being hard-coded.
n_samples = final_obs_df['ID'].nunique()
spliced_samples = []
for i in range(n_samples - 1):
    pair = pd.concat([final_obs_df[final_obs_df['ID'] == i], final_obs_df[final_obs_df['ID'] == (i + 1)]])
    pair['ID'] = i
    pair = pd.concat([pair[:17], pair[23:]])
    pair['Time'] = np.arange(len(pair))
    spliced_samples.append(pair)

# Concatenate once at the end rather than growing the frame on every iteration.
if spliced_samples:
    final_obs_df_30 = pd.concat(spliced_samples)
else:
    # Fewer than two samples: produce an empty frame with the same columns.
    final_obs_df_30 = final_obs_df.iloc[0:0].copy()

In [None]:
# Save the spliced samples.
# NOTE(review): final_obs_df was scaled in place before the splicing step, so when
# the cells run in order this file does not contain raw values -- confirm intent.
final_obs_df_30.to_csv('final_patient_30.csv', index=False)

In [None]:
# Work on a copy of the BP and drug-rate columns of the spliced samples.
columns_to_scale_30 = [
    'Systolic_BP', 'Diastolic_BP', 'Mean_BP',
    'norepinephrine', 'vasopressin', 'propofol', 'amiodarone', 'phenylephrine',
]
temp = final_obs_df_30[columns_to_scale_30].copy()

In [None]:
# Column-wise maximum and minimum of the selected columns.
temp_max, temp_min = temp.max(axis=0), temp.min(axis=0)

In [None]:
# Min-max scale every selected column with its own minimum and maximum.
# A column whose maximum equals its minimum divides by zero here.
temp = (temp - temp_min) / (temp_max - temp_min)

In [None]:
# Write the scaled columns back into final_obs_df_30 in place.
final_obs_df_30[['Systolic_BP', 'Diastolic_BP', 'Mean_BP', 'norepinephrine', 
                        'vasopressin', 'propofol', 'amiodarone', 'phenylephrine']] = temp

In [None]:
# Save the spliced samples after the second min-max scaling.
final_obs_df_30.to_csv('final_patient_30_std.csv', index=False)