In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm.auto import tqdm
tqdm.pandas()
%store -r raw_path

  from .autonotebook import tqdm as notebook_tqdm


Strategy:  
1. Feature space is: 7-day SCr trajectories and all labs prior to prediction point.✅  
2. The prediction point was 1-day prior to onset for AKI patients and 1-day prior to the last SCr record for non-AKI patients.✅  
3. Each patient should have at least 2 SCr measurement within the window.✅
4. Exclude those SCr baseline > 3.5✅   
5. Lab missingness > 50% dropped.✅  
6. Each patient/encounter is unique✅  

# Read KUMC Patients' ONSET, LAB, SCr and COMO

In [2]:
# define data storage path
raw_path = '/blue/yonghui.wu/hoyinchan/Data/data2022raw/'
data_path = raw_path + "KUMC_ORCALE" + '/raw/'

In [3]:
#Read in Onsets data and only use KUMC data
All_onsets = pd.read_csv('/blue/yonghui.wu/lideyi/Personalization_Methodology/NEW_ONSETS.csv')
ONSET = All_onsets.loc[All_onsets.CENTER_NAME == "KUMC"].copy(deep = True)

  All_onsets = pd.read_csv('/blue/yonghui.wu/lideyi/Personalization_Methodology/NEW_ONSETS.csv')


In [None]:
#Read in Lab test results
with open(data_path + 'AKI_LAB.csv', 'r', encoding='utf-8', errors='ignore') as file:
    LAB = pd.read_csv(data_path + 'AKI_LAB.csv', 
                      delimiter=",", usecols=['PATID', 'LAB_LOINC', 
                                              'SPECIMEN_DATE"+PD.DATE_SHIFT"', 'RESULT_NUM'],
                      encoding='unicode_escape')
    
LAB.rename(columns = {'SPECIMEN_DATE"+PD.DATE_SHIFT"': "SPECIMEN_DATE"}, inplace = True)

In [None]:
# Read in SCr trajectories
SCR_use_cols = ['ONSETS_ENCOUNTERID','PATID','ENCOUNTERID',
                        'SPECIMEN_DATE"+PD.DATE_SHIFT"','RESULT_NUM', 'DAYS_SINCE_ADMIT']
SCR = pd.read_csv(data_path + "AKI_LAB_SCR.csv", delimiter = ',', usecols=SCR_use_cols)
SCR.rename(columns = {'SPECIMEN_DATE"+PD.DATE_SHIFT"': "SPECIMEN_DATE"}, inplace = True)

# Construct a Full DataFrame

Process ONSET

In [None]:
#format datatype for merge
#exclude those baseline SCr > 3.5, which indicate poor renal functions
ONSET = ONSET.loc[ONSET.SERUM_CREAT_BASE < 3.5, :]

ONSET.loc[:, ["PATID", "ONSETS_ENCOUNTERID"]] = ONSET[["PATID", "ONSETS_ENCOUNTERID"]].astype(str)

time_cols = ["ADMIT_DATE", "DISCHARGE_DATE", "AKI1_ONSET", "AKI2_ONSET", "AKI3_ONSET"]
for time_col in time_cols:
    ONSET[time_col] = pd.to_datetime(ONSET[time_col], format = "mixed")
    
# binary predictiton task
ONSET.loc[:, "EARLIEST_ONSET_DATE"] = np.min(ONSET[["AKI1_ONSET", "AKI2_ONSET", "AKI3_ONSET"]],
                                      axis = 1)
ONSET.loc[:, "AKI_LABEL"] = ONSET["EARLIEST_ONSET_DATE"].notna().astype(int)

ONSET.drop(["CENTER_NAME", "SERUM_CREAT_BASE", "NONAKI_SINCE_ADMIT", "AKI1_ONSET",
           "AKI2_ONSET", "AKI3_ONSET"], axis = 1, inplace = True)

Merge Onset with SCr

In [None]:
#process data type 
SCR["PATID"] = SCR["PATID"].astype(str)

In [None]:
ONSET_SCR = ONSET.merge(SCR[["PATID", "SPECIMEN_DATE", "RESULT_NUM"]], on = "PATID",
                       how = "left")

In [None]:
#after merging, process date time
ONSET_SCR["SPECIMEN_DATE"] = pd.to_datetime(ONSET_SCR["SPECIMEN_DATE"], format = "mixed")

In [None]:
#filter out those beyond this hospitalization (we also need history prior to this hospitalization)
ONSET_SCR = ONSET_SCR.loc[ONSET_SCR.SPECIMEN_DATE <= ONSET_SCR.DISCHARGE_DATE, :]
ONSET_SCR = ONSET_SCR.sort_values(by=['PATID', 'ADMIT_DATE', 'SPECIMEN_DATE'])

# get average SCr on the same day
ONSET_SCR_avg = ONSET_SCR.groupby(['PATID', 'ONSETS_ENCOUNTERID', 'SPECIMEN_DATE'])['RESULT_NUM'].mean().reset_index()

In [None]:
# append the info back
ONSET_SCR_app = ONSET_SCR.loc[:, ["PATID", "ONSETS_ENCOUNTERID", "ADMIT_DATE",
                                  "DISCHARGE_DATE", "EARLIEST_ONSET_DATE", "AKI_LABEL"]]
ONSET_SCR_app.drop_duplicates(inplace = True)
ONSET_SCR_avg = ONSET_SCR_app.merge(ONSET_SCR_avg, on = ["PATID", "ONSETS_ENCOUNTERID"],
                                   how = "left")

In [None]:
# get the prediction point for non-AKI patient
non_AKI_pat = ONSET_SCR_avg.loc[ONSET_SCR_avg.AKI_LABEL == 0, 
                                ["PATID", "ONSETS_ENCOUNTERID", "SPECIMEN_DATE"]]

In [None]:
non_AKI_pat.drop_duplicates(subset = ["PATID", "ONSETS_ENCOUNTERID"], keep = "last",
                           inplace = True)

In [None]:
non_AKI_pat.rename(columns = {"SPECIMEN_DATE": "PREDICTION_POINT"}, inplace = True)

In [None]:
ONSET_SCR_avg = ONSET_SCR_avg.merge(non_AKI_pat, on = ["PATID", "ONSETS_ENCOUNTERID"],
                                   how = "left")

In [None]:
ONSET_SCR_avg.loc[ONSET_SCR_avg.AKI_LABEL == 1, "PREDICTION_POINT"] = \
ONSET_SCR_avg.loc[ONSET_SCR_avg.AKI_LABEL == 1, "EARLIEST_ONSET_DATE"]

In [None]:
#check that we have predicition point for each encounter
assert(ONSET_SCR_avg.PREDICTION_POINT.isna().mean() == 0)

In [None]:
# the time frame we need for SCr is the -8 to -2 days prior to prediction point
ONSET_SCR_avg = ONSET_SCR_avg[((ONSET_SCR_avg.SPECIMEN_DATE <= (ONSET_SCR_avg.PREDICTION_POINT) - pd.Timedelta(days=2))) & \
                             (ONSET_SCR_avg.SPECIMEN_DATE >= ONSET_SCR_avg.PREDICTION_POINT - pd.Timedelta(days=8))]

In [None]:
#drop patients with less than 2 SCr measurements during the 7-day window
# group them and calcualte number of measurements
measure_num = ONSET_SCR_avg.groupby('ONSETS_ENCOUNTERID').size()
encounterID_to_drop = measure_num[measure_num < 2].index
ONSET_SCR_avg = ONSET_SCR_avg.loc[~ONSET_SCR_avg.ONSETS_ENCOUNTERID.isin(encounterID_to_drop), :]

In [None]:
#pivot all the SCr values, that is create features -8 ~ -2 and entries are RESULT_NUM
ONSET_SCR_avg["DAYS_BEFORE_PREDICTION_POINT"] = (ONSET_SCR_avg["SPECIMEN_DATE"] - \
ONSET_SCR_avg["PREDICTION_POINT"]).dt.days

#prepare a skleleton to merge on
unique_encounterids = list(ONSET_SCR_avg['ONSETS_ENCOUNTERID'].unique())
time_window = np.arange(-8, -1)  # from -8 to -2
skeleton = pd.MultiIndex.from_product([unique_encounterids, time_window], 
                                              names=['ONSETS_ENCOUNTERID', 
                                                     'DAYS_BEFORE_PREDICTION_POINT']).to_frame(index=False)
#merge on
skeleton = pd.merge(skeleton, ONSET_SCR_avg, 
                     on=['ONSETS_ENCOUNTERID', 'DAYS_BEFORE_PREDICTION_POINT'],
                     how='left')

#pivot
ONSET_SCR_formatted = skeleton.pivot(index='ONSETS_ENCOUNTERID', 
                                          columns='DAYS_BEFORE_PREDICTION_POINT', 
                                          values='RESULT_NUM').reset_index()

# get other info back
ONSET_SCR_app2 = ONSET_SCR_avg.loc[:, ["PATID", "ONSETS_ENCOUNTERID", "ADMIT_DATE", "DISCHARGE_DATE", 
                                       "PREDICTION_POINT", "AKI_LABEL"]]
ONSET_SCR_app2.drop_duplicates(inplace = True)
ONSET_SCR_formatted = ONSET_SCR_formatted.merge(ONSET_SCR_app2, on = "ONSETS_ENCOUNTERID", how = "left")

In [None]:
# only keep the earliest encounter of each patient
ONSET_SCR_formatted = ONSET_SCR_formatted.sort_values(by=['PATID', 'ADMIT_DATE'])
ONSET_SCR_formatted = ONSET_SCR_formatted.drop_duplicates(subset='PATID', keep='first')

Merge Onset with Labs

In [None]:
LAB["PATID"] = LAB["PATID"].astype(str)

In [None]:
# merge the lab 
ONSET_SCR_LAB = ONSET_SCR_formatted.merge(LAB, on = "PATID", how = "left")

In [None]:
ONSET_SCR_LAB["SPECIMEN_DATE"] = \
pd.to_datetime(ONSET_SCR_LAB["SPECIMEN_DATE"], format = "mixed")

In [None]:
ONSET_SCR_LAB = ONSET_SCR_LAB[(ONSET_SCR_LAB.SPECIMEN_DATE <= (ONSET_SCR_LAB.PREDICTION_POINT - pd.Timedelta(days=2))) & \
                              (ONSET_SCR_LAB.SPECIMEN_DATE >= ONSET_SCR_LAB.ADMIT_DATE  - pd.Timedelta(days=8))]

In [None]:
#we only keep the lastest result of a certain lab within the time window
ONSET_SCR_LAB = \
ONSET_SCR_LAB.sort_values(by=['PATID', 'ONSETS_ENCOUNTERID', 
                              'LAB_LOINC', 'SPECIMEN_DATE'])
ONSET_SCR_LAB = \
ONSET_SCR_LAB.groupby(['PATID', 'ONSETS_ENCOUNTERID', 
                       'LAB_LOINC']).last().reset_index()

In [None]:
#turn lab into feature columns
LAB_info = ONSET_SCR_LAB.pivot(index='ONSETS_ENCOUNTERID', 
                                              columns='LAB_LOINC', 
                                              values='RESULT_NUM')
LAB_info = LAB_info.reset_index()

In [None]:
# drop lab with missing rate > 0.5
nan_rate = LAB_info.isnull().mean()
columns_to_drop = list(nan_rate[nan_rate > 0.5].index)

# we also drop SCR, GFR that directly indicate renal functions
columns_to_drop += ["2160-0", "48642-3", "48643-1"]

# drop these labs
LAB_info.drop(columns=columns_to_drop, inplace = True)


In [None]:
#merge them back to the original dataframe
ONSET_SCR_LAB = ONSET_SCR_formatted.merge(LAB_info, 
                                      on = 'ONSETS_ENCOUNTERID', 
                                      how = 'left')

In [None]:
lab_feature_space = list(LAB_info.columns[1:])
len(lab_feature_space)

42

# Each Patient Should Be Unique: Just use the first Encounter of Each Patients

In [None]:
ONSET_SCR_LAB = ONSET_SCR_LAB.sort_values(by=['PATID', 'ADMIT_DATE'], ascending=True)

In [None]:
# drop duplicates based on PATID and ENCOUTNERID and keep the first records
ONSET_SCR_LAB = ONSET_SCR_LAB.drop_duplicates(subset='PATID', keep='first')

# Get just a Fraction of Patients for Development Purpose

In [None]:
ONSET_SCR_LAB = ONSET_SCR_LAB.sort_values(by='ADMIT_DATE', ascending=True)
ONSET_SCR_LAB

Unnamed: 0,ONSETS_ENCOUNTERID,-8,-7,-6,-5,-4,-3,-2,PATID,ADMIT_DATE,...,736-9,742-7,751-8,770-8,777-3,785-6,786-4,787-2,788-0,789-8
94509,4680,0.91,0.89,0.79,0.74,0.820,0.82,0.85,82622,2009-01-10,...,24.0,0.38,1.03,52.0,199.0,33.0,34.0,97.0,13.8,3.50
71848,1752329,,,,,,0.62,0.60,2809717,2009-01-11,...,,,,,256.0,19.0,31.0,60.0,20.5,5.50
70220,1412185,,,,,,0.79,0.83,2750504,2009-01-11,...,12.0,0.73,6.30,75.0,162.0,27.0,33.0,84.0,16.3,4.20
6230,634850,,,,,,1.48,1.57,1172190,2009-01-12,...,11.0,0.82,6.81,79.0,100.0,33.0,33.0,98.0,15.4,3.10
22916,1872565,1.13,1.08,1.09,0.79,0.930,1.05,0.85,1521393,2009-01-13,...,24.0,0.67,4.58,64.0,194.0,31.0,33.0,92.0,15.2,3.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73405,3913682,,,,,1.030,1.03,1.02,315796,2021-11-23,...,,,,,291.0,30.6,33.8,90.7,14.6,4.13
66849,5532091,,,,,0.630,0.55,0.60,2578284,2021-11-23,...,14.0,2.60,11.83,69.0,223.0,39.4,35.1,112.4,15.5,2.11
67053,3915237,,,,1.32,1.220,1.15,1.10,2600563,2021-11-24,...,,,,,216.0,31.1,32.4,96.1,14.8,4.03
90979,10390858,,,,,,1.13,0.98,74521,2021-11-25,...,4.0,0.50,17.47,92.0,158.0,29.8,33.4,89.1,13.9,3.67


In [None]:
# records after train-test split date will be used as test set
start_date = pd.to_datetime('2015-1-1')
split_date = pd.to_datetime('2016-1-1')
end_date = pd.to_datetime('2017-1-1')

%store start_date
%store split_date
%store end_date

Stored 'start_date' (Timestamp)
Stored 'split_date' (Timestamp)
Stored 'end_date' (Timestamp)


In [None]:
# take a half as train set AKI and the rest are test set AKI
# get their admission times
dataset_train = ONSET_SCR_LAB[(ONSET_SCR_LAB.ADMIT_DATE < split_date) & (ONSET_SCR_LAB.ADMIT_DATE >= start_date)]
dataset_test = ONSET_SCR_LAB[(ONSET_SCR_LAB.ADMIT_DATE >= split_date) & (ONSET_SCR_LAB.ADMIT_DATE < end_date)]

print(len(dataset_train))
print(len(dataset_test))
print(dataset_train.AKI_LABEL.value_counts())
print(dataset_test.AKI_LABEL.value_counts())

train_len = len(dataset_train)
test_len = len(dataset_test)
%store train_len
%store test_len

8637
8542
AKI_LABEL
0    7940
1     697
Name: count, dtype: int64
AKI_LABEL
0    7860
1     682
Name: count, dtype: int64
Stored 'train_len' (int)
Stored 'test_len' (int)


In [None]:
dataset_sampled = pd.concat([dataset_train, dataset_test], axis = 0)

In [None]:
time_window = np.arange(-8, -1)  # from -8 to -2

In [None]:
# save a copy for demographics chararcterization
dataset_sampled.to_csv("/blue/yonghui.wu/lideyi/Personalization_Methodology/DEMO_KUMC.csv", index = False)

In [None]:
%store -r train_len

In [None]:
# extract used cols
dataset_sampled = dataset_sampled.loc[:, list(time_window) + list(lab_feature_space) + ['AKI_LABEL']]

In [None]:
dataset_sampled.AKI_LABEL.value_counts()

AKI_LABEL
0    15800
1     1379
Name: count, dtype: int64

In [None]:
dataset_sampled.reset_index(drop = True, inplace = True)

In [None]:
dataset_sampled

Unnamed: 0,-8,-7,-6,-5,-4,-3,-2,14979-9,1742-6,17861-6,...,742-7,751-8,770-8,777-3,785-6,786-4,787-2,788-0,789-8,AKI_LABEL
0,,,,,0.72,0.95,0.88,,,9.3,...,1.1,7.9,60.0,239.0,32.5,33.5,97.1,13.8,4.01,0
1,,1.210,,0.97,1.04,0.93,0.96,,23.0,8.9,...,,,,177.0,31.8,33.3,95.5,14.9,3.89,0
2,1.03,0.990,,,1.01,0.91,0.82,,26.0,9.1,...,0.6,8.8,75.0,223.0,28.8,30.9,93.2,16.3,3.66,0
3,,,,0.69,0.90,0.73,0.62,32.2,28.0,8.6,...,,,,211.0,29.7,34.2,86.9,14.3,3.63,0
4,0.60,0.615,0.685,0.62,0.50,0.53,0.56,97.6,10.0,8.1,...,1.5,12.4,10.0,69.0,30.5,32.7,98.5,18.2,2.26,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17174,1.11,1.170,1.170,1.19,1.16,1.09,1.10,67.3,4.0,9.2,...,0.6,8.7,85.0,247.0,30.8,32.8,94.1,16.7,2.83,0
17175,,,,,,1.25,1.21,,,9.1,...,,,,248.0,29.9,32.7,91.3,15.4,3.18,1
17176,,,,1.32,0.91,0.94,1.06,28.9,19.0,9.8,...,1.2,6.8,65.0,446.0,28.2,32.3,87.2,16.6,3.88,0
17177,1.31,1.280,1.460,1.36,1.52,1.60,1.62,133.6,3.0,9.5,...,0.8,3.7,58.0,228.0,30.5,34.3,88.8,13.7,3.74,0


# Use RF to Detect Lab Importance 

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# drop patients for test set, drop patients contains nan
dataset_clean = dataset_sampled.iloc[:train_len, :]
dataset_clean = dataset_clean.dropna() 

In [None]:
dataset_clean.AKI_LABEL.value_counts()

AKI_LABEL
0    415
1     68
Name: count, dtype: int64

In [None]:
X = dataset_clean.drop(columns=['AKI_LABEL']).loc[:, lab_feature_space]
y = dataset_clean['AKI_LABEL']

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

In [None]:
feature_importances = rf.feature_importances_

# ascending order
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

In [None]:
assert(len(feature_importance_df) == len(lab_feature_space))

In [None]:
# get the lab overlap weighting, that is top 20% features normalized according to feature importance and
# set ohters to 0
top_rate = 0.5
threshold = feature_importance_df['Importance'].quantile(1 - top_rate)
top_features = feature_importance_df['Importance'] >= threshold
feature_importance_df['Weighting'] = 0
feature_importance_df.loc[top_features, 'Weighting'] = \
feature_importance_df.loc[top_features, 'Importance'] / feature_importance_df.loc[top_features, 'Importance'].sum()

 0.07809331 0.08882308 0.04473739 0.03453632 0.1254981  0.03571204
 0.04551276 0.03418316 0.03025788 0.03491163 0.03508939 0.0482373
 0.03718249 0.03223167 0.03612296]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  feature_importance_df.loc[top_features, 'Weighting'] = \


In [None]:
lab_overlap_weighting = list(feature_importance_df['Weighting'])

In [None]:
dataset_sampled.reset_index(drop = True, inplace = True)
dataset_sampled.to_csv("/blue/yonghui.wu/lideyi/Personalization_Methodology/dataset.csv", index = False)

In [None]:
dataset_sampled

Unnamed: 0,-8,-7,-6,-5,-4,-3,-2,14979-9,1742-6,17861-6,...,742-7,751-8,770-8,777-3,785-6,786-4,787-2,788-0,789-8,AKI_LABEL
0,,,,,0.72,0.95,0.88,,,9.3,...,1.1,7.9,60.0,239.0,32.5,33.5,97.1,13.8,4.01,0
1,,1.210,,0.97,1.04,0.93,0.96,,23.0,8.9,...,,,,177.0,31.8,33.3,95.5,14.9,3.89,0
2,1.03,0.990,,,1.01,0.91,0.82,,26.0,9.1,...,0.6,8.8,75.0,223.0,28.8,30.9,93.2,16.3,3.66,0
3,,,,0.69,0.90,0.73,0.62,32.2,28.0,8.6,...,,,,211.0,29.7,34.2,86.9,14.3,3.63,0
4,0.60,0.615,0.685,0.62,0.50,0.53,0.56,97.6,10.0,8.1,...,1.5,12.4,10.0,69.0,30.5,32.7,98.5,18.2,2.26,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17174,1.11,1.170,1.170,1.19,1.16,1.09,1.10,67.3,4.0,9.2,...,0.6,8.7,85.0,247.0,30.8,32.8,94.1,16.7,2.83,0
17175,,,,,,1.25,1.21,,,9.1,...,,,,248.0,29.9,32.7,91.3,15.4,3.18,1
17176,,,,1.32,0.91,0.94,1.06,28.9,19.0,9.8,...,1.2,6.8,65.0,446.0,28.2,32.3,87.2,16.6,3.88,0
17177,1.31,1.280,1.460,1.36,1.52,1.60,1.62,133.6,3.0,9.5,...,0.8,3.7,58.0,228.0,30.5,34.3,88.8,13.7,3.74,0


In [None]:
time_window = [str(i) for i in time_window]
%store time_window
%store lab_feature_space
%store lab_overlap_weighting

Stored 'time_window' (list)
Stored 'lab_feature_space' (list)
Stored 'lab_overlap_weighting' (list)
