In [1]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

from projects.data_cleaning.common import *
from projects.data_cleaning.utils.data_io import *
from projects.data_cleaning.utils.utils import *

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Parameter mapping sheet (tab-separated, first row is the header).
data_mapping = pd.read_csv(
    'processed/DatasetOverview - Inputs.tsv',
    header=0,
    sep='\t',
)

# Plain arrays of patient ids.
pid_total = np.load('processed/patient_ids_all.npy')
pid_acute_renal_failure = np.load('processed/acute_renal_failure_cases_valid.npy')

# This file holds a pickled Python object wrapped in a 0-d array; `.item()`
# unwraps it. NOTE(review): allow_pickle=True executes arbitrary code on load,
# so only point this at files produced by this project.
pid_diagnosis = np.load(
    'processed/patient_grouped_by_diagnosis.npy',
    allow_pickle=True,
).item()
# Entry for the acute myocardial infarction group (presumably patient ids — confirm).
pid_infarction = pid_diagnosis['Infarction, acute myocardial (MI)']

In [4]:
# TableID -> TableSource, one entry per distinct (TableID, TableSource) pair.
table_pairs = data_mapping[['TableID', 'TableSource']].drop_duplicates()
table_dict = {}
for row in range(len(table_pairs)):
    table_dict[table_pairs.iloc[row, 0]] = table_pairs.iloc[row, 1]

# ParamNameOrigin -> TableUID; when a name occurs more than once the last row wins.
uid_pairs = data_mapping[['ParamNameOrigin', 'TableUID']]
uid_dict = {}
for row in range(len(uid_pairs)):
    uid_dict[uid_pairs.iloc[row, 0]] = uid_pairs.iloc[row, 1]

In [7]:
def convert_lab_data_to_num(lab_data):
    """Convert textual lab results to floats.

    The markers ``<``, ``>`` and ``%`` are removed before parsing, so
    ``'<0.5'`` becomes ``0.5`` and ``'45%'`` becomes ``45.0``. Plain numbers
    such as ``'7.2'`` are parsed as well.

    Parameters
    ----------
    lab_data : iterable of str, or a single str / scalar
        Lab result text(s).

    Returns
    -------
    list of float
        For an iterable input: the parsed values in input order. Blank
        entries and entries that are not numeric after removing the markers
        are skipped, so the result may be shorter than the input.
    float
        For a single string/scalar input: the parsed value, or ``nan`` when
        it is blank or not numeric (a scalar always yields exactly one value).
    """
    def _parse(text):
        # Returns None for anything that is not a number once the markers are gone.
        cleaned = str(text).strip()
        for marker in '<>%':
            cleaned = cleaned.replace(marker, '')
        try:
            return float(cleaned)
        except ValueError:
            return None

    # A bare string must not be iterated character by character.
    if isinstance(lab_data, str) or not hasattr(lab_data, '__iter__'):
        value = _parse(lab_data)
        return float('nan') if value is None else value

    data_num = []
    for d in lab_data:
        value = _parse(d)
        if value is not None:
            data_num.append(value)
    return data_num

In [42]:
# Multiplicative factors that bring a drug rate to a per-minute base unit:
# mass rates -> mg/min, 'units' rates -> units/min, volume rates -> ml/min.
# Per-kg units share the factor of their non-per-kg counterpart; the weight
# itself is applied in unify_drugrate_unit.
drugrate_unit_dict = {
    'mcg/min': 1/1000,
    'mcg/hr': 1/60/1000,
    'mcg/kg/min': 1/1000,
    'mcg/kg/hr': 1/60/1000,

    # mg is already the target mass unit, so only the time base is converted.
    'mg/min': 1,
    'mg/hr': 1/60,
    'mg/kg/min': 1,
    'mg/kg/hr': 1/60,

    'units/min': 1,
    'units/hr': 1/60,

    'ml/min': 1,
    'ml/hr': 1/60,
}


def unify_drugrate_unit(rate, name, weight):
    """Convert drug rate(s) to the unified per-minute unit.

    Parameters
    ----------
    rate : scalar or array-like
        Rate value(s); numeric or numeric strings.
    name : str
        Text that contains the unit (matched by substring against the keys of
        ``drugrate_unit_dict``). If several keys match, the last one in
        dictionary order is used; if none matches, the rate is only cast to
        float.
    weight : scalar or array-like
        Patient weight(s). An empty string counts as 1. Only used when the
        matched unit is per kilogram. The argument is never modified.

    Returns
    -------
    numpy.ndarray or numpy.float64
        Converted rate(s), with the same shape as ``rate``.

    Raises
    ------
    ValueError
        If a rate (or a weight that is needed) cannot be parsed as a float.
    """
    convert_coeff = 1
    per_kg = False
    for unit, coeff in drugrate_unit_dict.items():
        if unit in name:
            convert_coeff = coeff
            per_kg = '/kg/' in unit

    rate_num = np.asarray(rate).astype(float)
    if not per_kg:
        # Absolute rates must not be scaled by body weight.
        return rate_num * convert_coeff

    # Build a fresh numeric weight instead of overwriting the caller's data.
    if np.ndim(weight) == 0:
        weight_num = 1.0 if weight == '' else float(weight)
    else:
        weight_num = np.array([1.0 if w == '' else float(w) for w in weight], dtype=float)

    return rate_num * weight_num * convert_coeff

# Save patient data in .CSV file

In [34]:
def _append_rows(table, offset, uid, value, unit):
    """Append one row (scalars) or several aligned rows (sequences) to ``table``."""
    table['Offset'] = np.append(table['Offset'], offset)
    table['UID'] = np.append(table['UID'], uid)
    table['Value'] = np.append(table['Value'], value)
    table['Unit'] = np.append(table['Unit'], unit)


def _uid_mask(table, low, high=None):
    """Boolean mask of rows with ``low <= UID < high`` (no upper bound if ``high`` is None)."""
    if high is None:
        return table['UID'] >= low
    return np.logical_and(table['UID'] >= low, table['UID'] < high)


def _sort_rows_by_offset(table, row_mask):
    """Reorder, in place, the rows selected by ``row_mask`` by ascending Offset."""
    # The order is computed once, before any column is rewritten.
    order = np.argsort(table['Offset'][row_mask])
    for column in table:
        table[column][row_mask] = table[column][row_mask][order]


def _unified_rate_unit(drug_name):
    """Label of the unit a drug rate has after unify_drugrate_unit (nan if no unit matches).

    Uses the same substring match (last match wins) as unify_drugrate_unit;
    mcg/mg rates are labelled 'mg/min', others '<quantity>/min'.
    """
    label = np.nan
    for unit in drugrate_unit_dict:
        if unit in drug_name:
            quantity = unit.split('/')[0]
            label = ('mg' if quantity == 'mcg' else quantity) + '/min'
    return label


save_dir_info = 'processed_dataset/all/info/'
save_dir_data = 'processed_dataset/all/data/'

for pid in pid_total:
    data = load_patient_data_by_id(pid)

    # Long-format record table; every column is grown with np.append.
    data_table = {
        'Offset': np.array([]),
        'UID': np.array([]),
        'Value': np.array([]),
        'Unit': np.array([]),
    }

    patient_info = {
        'UID': np.array([]),
        'Value': np.array([]),
    }

    for table_id in table_dict.keys():
        entry_mapping = data_mapping[data_mapping['TableID']==table_id]
        table_source = table_dict[table_id]
        # UIDs of a table are selected below as [table_id * 1e5, (table_id + 1) * 1e5).
        table_uid_start = table_id * 1e5
        table_uid_end = (table_id + 1) * 1e5

        # patient information
        if table_id == 0:
            for _, entry in entry_mapping.iterrows():
                entry_name_eicu = entry['ParamNameOrigin']
                entry_uid = entry['TableUID']
                # only the first stored value of each field is kept
                entry_value = data[table_source][entry_name_eicu][0]

                patient_info['UID'] = np.append(patient_info['UID'], entry_uid)
                patient_info['Value'] = np.append(patient_info['Value'], entry_value)

        # periodic and aperiodic vitals
        elif table_id in [1,2]:
            # all mapped columns share 'observationoffset'; sort once, reuse the order
            sorted_ids = np.argsort(data[table_source]['observationoffset'])
            sorted_offset = np.array(data[table_source]['observationoffset'])[sorted_ids]
            for _, entry in entry_mapping.iterrows():
                entry_name_eicu = entry['ParamNameOrigin']
                entry_uid = entry['TableUID']
                sorted_vals = np.array(data[table_source][entry_name_eicu])[sorted_ids]

                _append_rows(
                    data_table,
                    sorted_offset,
                    [entry_uid]*len(sorted_offset),
                    sorted_vals,
                    [np.nan]*len(sorted_offset),
                )

                # diagnostic: report columns that contain the literal string 'nan'
                if 'nan' in sorted_vals:
                    print(entry_name_eicu)
            # NOTE(review): debugging alias of data_table; kept in case another
            # cell inspects it — remove if unused.
            check_table = data_table

        # intake & output
        elif table_id ==3:
            # entries with TableUID below this value are the running totals
            first_cell_uid = table_uid_start + 5

            # intakeoutputoffset shared by [intaketotal, outtaketotal, dialysistotal, nettotal]
            sorted_ids = np.argsort(data[table_source]['intakeoutputoffset'])
            sorted_offset = np.array(data[table_source]['intakeoutputoffset'])[sorted_ids]
            for _, entry in entry_mapping[entry_mapping['TableUID'] < first_cell_uid].iterrows():
                entry_name_eicu = entry['ParamNameOrigin']
                entry_uid = entry['TableUID']
                sorted_vals = np.array(data[table_source][entry_name_eicu])[sorted_ids]

                _append_rows(
                    data_table,
                    sorted_offset,
                    [entry_uid]*len(sorted_offset),
                    sorted_vals,
                    [np.nan]*len(sorted_offset),
                )

            # other intake-output entries, identified by their cell label
            for i, entry_name_eicu in enumerate(data[table_source]['celllabel']):
                if entry_name_eicu not in uid_dict:
                    continue
                entry_uid = uid_dict[entry_name_eicu]
                entry_offset = data[table_source]['intakeoutputentryoffset'][i]
                entry_value = data[table_source]['cellvaluenumeric'][i]
                _append_rows(data_table, entry_offset, entry_uid, entry_value, np.nan)

            # only the per-cell entries need sorting; the totals were appended sorted
            _sort_rows_by_offset(
                data_table,
                _uid_mask(data_table, first_cell_uid, table_uid_end),
            )

        # lab
        elif table_id == 4:
            for i, entry_name_eicu in enumerate(data[table_source]['labname']):
                if entry_name_eicu not in uid_dict:
                    continue
                entry_uid = uid_dict[entry_name_eicu]
                entry_offset = data[table_source]['labresultoffset'][i]
                entry_text = data[table_source]['labresulttext'][i]
                # convert lab data to float type; the converter works on a
                # sequence and drops entries it cannot use, so an empty result
                # means this row has no usable value and must be skipped to
                # keep the four columns aligned
                entry_value = convert_lab_data_to_num([entry_text])
                if len(entry_value) == 0:
                    continue
                _append_rows(data_table, entry_offset, entry_uid, entry_value, np.nan)

            _sort_rows_by_offset(
                data_table,
                _uid_mask(data_table, table_uid_start, table_uid_end),
            )

        # infusion drug
        elif table_id == 5:
            for i, entry_name_eicu in enumerate(data[table_source]['drugname']):
                if entry_name_eicu not in uid_dict:
                    continue
                entry_uid = uid_dict[entry_name_eicu]
                entry_offset = data[table_source]['infusionoffset'][i]
                entry_value = data[table_source]['drugrate'][i]
                entry_weight = data[table_source]['patientweight'][i]
                if entry_value == '':
                    continue
                # unify drugrate unit to: mg/min, units/min, ml/min
                # unify_drugrate_unit returns only the converted rate, so the
                # single entry is wrapped in one-element containers and the
                # unit label is derived separately
                entry_value = unify_drugrate_unit(
                    np.array([entry_value]), entry_name_eicu, [entry_weight],
                )
                entry_unit = _unified_rate_unit(entry_name_eicu)

                _append_rows(data_table, entry_offset, entry_uid, entry_value, entry_unit)

            _sort_rows_by_offset(
                data_table,
                _uid_mask(data_table, table_uid_start, table_uid_end),
            )

        # nurse charting
        elif table_id == 6:
            for _, entry in entry_mapping.iterrows():
                entry_name_eicu = entry['ParamNameOrigin']
                entry_label_eicu = entry['ParamLabel']
                entry_uid = entry['TableUID']

                # keep the rows that match both the value name and the value label
                entry_sub_ids = select_entry_subset(data, table_source, 'nursingchartcelltypevalname', entry_name_eicu)[0]
                entry_sub_ids = entry_sub_ids[np.isin(
                    entry_sub_ids,
                    select_entry_subset(data, table_source, 'nursingchartcelltypevallabel', entry_label_eicu)[0],
                )]
                entry_offset = select_list_subset_with_index(data[table_source]['nursingchartentryoffset'], entry_sub_ids)
                entry_value = select_list_subset_with_index(data[table_source]['nursingchartvalue'], entry_sub_ids)

                _append_rows(
                    data_table,
                    entry_offset,
                    [entry_uid]*len(entry_offset),
                    entry_value,
                    [np.nan]*len(entry_offset),
                )

            _sort_rows_by_offset(
                data_table,
                _uid_mask(data_table, table_uid_start, table_uid_end),
            )

        # diagnosis
        elif table_id == 7:
            entry_offset_all = data[table_source]['diagnosisoffset']
            entry_name_all = data[table_source]['icd9code']
            entry_value_all = data[table_source]['diagnosispriority']

            for offset, name, val in zip(entry_offset_all, entry_name_all, entry_value_all):
                if name == '':
                    continue
                # NOTE(review): unlike the other tables, an unmapped code raises KeyError here
                entry_uid = uid_dict[name]
                val = DIAGNOSIS_PRIORITY_DICT[val]
                _append_rows(data_table, offset, entry_uid, val, np.nan)

            _sort_rows_by_offset(data_table, _uid_mask(data_table, table_uid_start))

    # drop rows without a value, except diagnosis rows (UID >= 7e5), which are always kept
    mask = np.logical_or(
        np.logical_and(data_table['UID'] < 7e5, ~pd.isnull(data_table['Value'])),
        data_table['UID'] >= 7e5
    )
    for k in data_table:
        data_table[k] = data_table[k][mask]

    # save data to csv
    df_info = pd.DataFrame(patient_info)
    df_data = pd.DataFrame(data_table)
    # delete duplicated diagnosis rows (identical in every column, including the timestamp);
    # the split uses >= 7e5 so it agrees with the masks above and no row is lost
    df_data = pd.concat([
        df_data[df_data['UID'] < 7e5],
        df_data[df_data['UID'] >= 7e5].drop_duplicates(keep='first'),
    ])

    save_csv(save_dir_info+str(pid)+'.csv', df_info)
    save_csv(save_dir_data+str(pid)+'.csv', df_data)