# Generate patient data in the input format required by Clairvoyance

In [1]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

from utils.data_io import *
from utils.utils import *

In [2]:
# Root folders: the pre-processed source data and the destination for the
# Clairvoyance-formatted CSV files.
data_path_origin, data_path_save = (
    'processed_dataset/all/',
    'processed_dataset/clairvoyance/',
)

# Sub-folders of the source root, read by the export cells below.
patient_data_path, patient_info_path = (
    data_path_origin + subdir for subdir in ('data/', 'info/')
)

In [3]:
# Overview sheet describing the input parameters (tab-separated, with header).
data_mapping = pd.read_csv('processed/DatasetOverview - Inputs.tsv', sep='\t', header=0)

# TableID -> TableSource, built from the distinct (TableID, TableSource) pairs.
table_pairs = data_mapping[['TableID', 'TableSource']].drop_duplicates()
table_dict = dict(zip(table_pairs['TableID'].values, table_pairs['TableSource'].values))

# Original parameter name -> TableUID (a later row wins on duplicate names).
uid_dict = dict(zip(data_mapping['ParamNameOrigin'].values, data_mapping['TableUID'].values))

# Reverse lookup: TableUID -> original parameter name.
uid2str_dict = dict(zip(uid_dict.values(), uid_dict.keys()))

In [4]:
# Print "<uid>: <original name>" for every parameter whose TableUID is below 1e5.
low_uid_mask = data_mapping['TableUID'] < 1e5
for uid in data_mapping.loc[low_uid_mask, 'TableUID']:
    print(f'{uid}: {uid2str_dict[uid]}')

1: patientunitstayid
2: patienthealthsystemstayid
3: gender
4: age
5: ethnicity
6: hospitalid
7: wardid
8: apacheadmissiondx
9: admissionheight
10: hospitaladmittime24
11: hospitaladmitoffset
12: hospitaladmitsource
13: hospitaldischargeyear
14: hospitaldischargetime24
15: hospitaldischargeoffset
16: hospitaldischargelocation
17: hospitaldischargestatus
18: unittype
19: unitadmittime24
20: unitadmitsource
21: unitvisitnumber
22: unitstaytype
23: admissionweight
24: dischargeweight
25: unitdischargetime24
26: unitdischargeoffset
27: unitdischargelocation
28: unitdischargestatus
29: uniquepid


# Export all static data to CSV

In [96]:
# Static entries looked up for every patient. A patient is dropped as soon as
# one of them is missing or invalid.
SELECTED_STATIC_DATA = [
    'gender',
    'age',
#     'admissionheight',
#     'admissionweight',
]

# Numeric encoding of the gender strings; any other value drops the patient.
GENDER_DICT = {
    'Female': 0,
    'Male': 1,
}

# Column layout of the exported static table.
STATIC_COLUMNS = ['id', 'gender', 'age']

# Rows are collected in a plain list and converted to a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call, which made this loop quadratic.
static_rows = []

for filename in os.listdir(patient_info_path):

    info = pd.read_csv(patient_info_path+filename, sep='\t', header=0)
    # UID 1 is patientunitstayid. The value goes through float() first,
    # presumably because it can be stored as float-formatted text -- TODO confirm.
    pid = int(float(info[info['UID']==1]['Value'].iloc[0]))

    info_data = [pid]
    for entry in SELECTED_STATIC_DATA:
        uid = uid_dict[entry]
        val = info[info['UID']==uid]['Value'].iloc[0]
        # Keep only patients whose selected static entries are all present.
        if pd.isnull(val):
            break
        if entry == 'gender':
            if val not in GENDER_DICT:
                break
            info_data.append(GENDER_DICT[val])
        if entry == 'age':
            # Keep patients aged 18 to 89 (both inclusive); ages that cannot
            # be converted to an integer drop the patient.
            try:
                val = int(val)
            except (ValueError, TypeError, OverflowError):
                break
            if val < 18 or val > 89:
                break
            info_data.append(val)

    # A complete row has one value per output column; anything shorter means
    # the inner loop bailed out early on an invalid entry.
    if len(info_data) == len(STATIC_COLUMNS):
        static_rows.append(info_data)

df_static = pd.DataFrame(static_rows, columns=STATIC_COLUMNS)

In [None]:
# Sort patients by id and write the static table into the output folder.
# The folder is created first because to_csv does not create missing
# directories.
os.makedirs(data_path_save, exist_ok=True)
df_static = df_static.sort_values(by='id')
df_static.to_csv(data_path_save + 'static_data.csv', sep=',', index=False)

In [19]:
# Reload the exported static table from disk so the following cells can run
# without repeating the extraction above.
static_csv_path = 'processed_dataset/clairvoyance/static_data.csv'
df_static = pd.read_csv(static_csv_path, sep=',')
df_static

Unnamed: 0,id,gender,age
0,141194,1,68
1,141233,0,81
2,141244,1,59
3,141284,1,63
4,141313,1,45
...,...,...,...
42209,3353201,0,66
42210,3353216,0,50
42211,3353226,0,79
42212,3353235,1,50


In [16]:
# Number of patients (rows) in the static table; used for progress reporting.
total_patient_num = len(df_static.index)

# Export all temporal data to CSV

In [7]:
# Start the temporal CSV from scratch with only its header row.
# mode='w' truncates any file left over from an earlier run; with mode='a' a
# re-run would add a second header line and duplicate rows to the old file.
os.makedirs(data_path_save, exist_ok=True)  # to_csv does not create folders
df_temporal = pd.DataFrame(columns=['id', 'time', 'variable', 'value'])
df_temporal.to_csv(data_path_save + 'temporal_data.csv',
                   sep=',', index=False, mode='w')

In [17]:
# Append every selected patient's measurements to the temporal CSV in long
# format (id, time, variable, value). One patient is written at a time, so
# the full table is never held in memory.
temporal_csv_path = data_path_save + 'temporal_data.csv'

for i, pid in enumerate(df_static['id']):
    data = pd.read_csv(patient_data_path+str(pid)+'.csv', sep='\t', header=0)
    data['id'] = pid

    # Select and rename the columns directly. The earlier concat onto an empty
    # frame added nothing and triggers the pandas FutureWarning about empty
    # entries in concat.
    df_temporal = data[['id', 'Offset', 'UID', 'Value']].rename(
        columns={'Offset': 'time', 'UID': 'variable', 'Value': 'value'})

    # No header here: the header row is written once by the preceding cell.
    df_temporal.to_csv(temporal_csv_path,
                       sep=',', index=False, header=False, mode='a')

    if i % 100 == 0:
        print(f'{i} / {total_patient_num}')
    

0 / 42214
100 / 42214
200 / 42214
300 / 42214
400 / 42214
500 / 42214
600 / 42214
700 / 42214
800 / 42214
900 / 42214
1000 / 42214
1100 / 42214
1200 / 42214
1300 / 42214
1400 / 42214
1500 / 42214
1600 / 42214
1700 / 42214
1800 / 42214
1900 / 42214
2000 / 42214
2100 / 42214
2200 / 42214
2300 / 42214
2400 / 42214
2500 / 42214
2600 / 42214
2700 / 42214
2800 / 42214
2900 / 42214
3000 / 42214
3100 / 42214
3200 / 42214
3300 / 42214
3400 / 42214
3500 / 42214
3600 / 42214
3700 / 42214
3800 / 42214
3900 / 42214
4000 / 42214
4100 / 42214
4200 / 42214
4300 / 42214
4400 / 42214
4500 / 42214
4600 / 42214
4700 / 42214
4800 / 42214
4900 / 42214
5000 / 42214
5100 / 42214
5200 / 42214
5300 / 42214
5400 / 42214
5500 / 42214
5600 / 42214
5700 / 42214
5800 / 42214
5900 / 42214
6000 / 42214
6100 / 42214
6200 / 42214
6300 / 42214
6400 / 42214
6500 / 42214
6600 / 42214
6700 / 42214
6800 / 42214
6900 / 42214
7000 / 42214
7100 / 42214
7200 / 42214
7300 / 42214
7400 / 42214
7500 / 42214
7600 / 42214
7700 / 4221

In [2]:
# Load the complete temporal table for inspection.
# NOTE(review): this reads from 'clairvoyance_formatted/', while the export
# cells in this notebook write to 'processed_dataset/clairvoyance/' -- confirm
# which file is intended.
# NOTE(review): the captured output shows data values as column names, so this
# file appears to have no header row and its first record is consumed as the
# header -- verify, and pass header=None / names=... if so.
formatted_temporal_path = 'processed_dataset/clairvoyance_formatted/temporal_data.csv'
df_temporal = pd.read_csv(formatted_temporal_path)
df_temporal

Unnamed: 0,141194,6.0,100002.0,112.0
0,141194,11.0,100002.0,112.0
1,141194,16.0,100002.0,112.0
2,141194,21.0,100002.0,112.0
3,141194,26.0,100002.0,117.0
4,141194,31.0,100002.0,114.0
...,...,...,...,...
187264046,3353251,12374.0,700007.0,1.0
187264047,3353251,12374.0,700026.0,2.0
187264048,3353251,12374.0,700005.0,2.0
187264049,3353251,12374.0,700009.0,2.0
