In [57]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm.auto import tqdm
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()
# Restore `raw_path` from the IPython %store database.
# NOTE(review): `raw_path` is reassigned to a hard-coded string in the
# "define data storage path" cell, so this restore looks redundant — confirm.
%store -r raw_path

Strategy:  
1. Feature space is: 7-day SCr trajectories and all labs prior to prediction point.✅  
2. The prediction point was 1-day prior to onset for AKI patients and 1-day prior to the last SCr record for non-AKI patients.✅  
3. Each patient should have at least 2 SCr measurements within the window.✅
4. Exclude patients whose baseline SCr is > 3.5.✅   
5. Lab missingness > 50% dropped.✅  
6. Each patient/encounter is unique✅  

# Read KUMC Patients' ONSET, LAB, SCr and COMO

In [58]:
# Define where the raw extracts live.
# This hard-coded root replaces whatever value `%store -r raw_path` restored
# at the top of the notebook.
raw_path = '/blue/yonghui.wu/hoyinchan/Data/data2022raw/'
# Directory holding the raw KUMC tables (AKI_LAB.csv, AKI_LAB_SCR.csv).
# NOTE(review): "KUMC_ORCALE" may be a misspelling of "ORACLE" — confirm it
# matches the directory name on disk before changing it.
data_path = raw_path + "KUMC_ORCALE" + '/raw/'

In [59]:
# Load the onset table for all centers, then keep an independent copy of the
# rows whose CENTER_NAME is "KUMC".
# NOTE(review): the saved output under this cell echoes the read_csv line, which
# looks like a pandas warning raised by this read — check the column dtypes.
All_onsets = pd.read_csv(
    '/blue/yonghui.wu/lideyi/Personalization_Methodology/NEW_ONSETS.csv'
)
ONSET = All_onsets[All_onsets["CENTER_NAME"].eq("KUMC")].copy()

  All_onsets = pd.read_csv('/blue/yonghui.wu/lideyi/Personalization_Methodology/NEW_ONSETS.csv')


In [60]:
# Read in lab test results, loading only the four columns used downstream.
# pandas opens the path itself and decodes it with 'unicode_escape'. The former
# `with open(..., encoding='utf-8', errors='ignore')` wrapper created a file
# handle that was never handed to read_csv, so it had no effect on parsing and
# has been removed.
LAB = pd.read_csv(
    data_path + 'AKI_LAB.csv',
    delimiter=",",
    usecols=['PATID', 'LAB_LOINC',
             'SPECIMEN_DATE"+PD.DATE_SHIFT"', 'RESULT_NUM'],
    encoding='unicode_escape',
)

# The raw header is the literal string 'SPECIMEN_DATE"+PD.DATE_SHIFT"';
# rename it to a plain column name.
LAB.rename(columns = {'SPECIMEN_DATE"+PD.DATE_SHIFT"': "SPECIMEN_DATE"}, inplace = True)

In [61]:
# Load the serum-creatinine (SCr) measurements, restricted to the listed
# columns, and give the specimen-date column a plain name.
SCR_use_cols = [
    'ONSETS_ENCOUNTERID', 'PATID', 'ENCOUNTERID',
    'SPECIMEN_DATE"+PD.DATE_SHIFT"', 'RESULT_NUM', 'DAYS_SINCE_ADMIT',
]
SCR = (
    pd.read_csv(data_path + "AKI_LAB_SCR.csv", delimiter=',', usecols=SCR_use_cols)
    .rename(columns={'SPECIMEN_DATE"+PD.DATE_SHIFT"': "SPECIMEN_DATE"})
)

# Construct a Full DataFrame

Process ONSET

In [62]:
# Prepare the KUMC onset table for merging and derive the binary AKI label.
#
# Cohort filter: keep encounters whose baseline SCr is below 3.5 (higher
# baselines indicate poor renal function). Rows with a missing baseline fail
# the comparison and are dropped as well.
# `.copy()` makes ONSET an independent frame, so the column assignments below
# do not raise SettingWithCopyWarning.
# NOTE: this cell drops SERUM_CREAT_BASE at the end, so re-running it requires
# re-running the cell that loads ONSET first.
ONSET = ONSET.loc[ONSET.SERUM_CREAT_BASE < 3.5, :].copy()

# Cast the merge keys to str (PATID is merged against the SCr and lab tables later).
ONSET[["PATID", "ONSETS_ENCOUNTERID"]] = ONSET[["PATID", "ONSETS_ENCOUNTERID"]].astype(str)

time_cols = ["ADMIT_DATE", "DISCHARGE_DATE", "AKI1_ONSET", "AKI2_ONSET", "AKI3_ONSET"]
for time_col in time_cols:
    ONSET[time_col] = pd.to_datetime(ONSET[time_col], format = "mixed")

# Binary prediction task: the earliest onset across the three AKI stages.
# The row-wise min skips missing dates, so it is NaT only when no stage has an
# onset, and the label is 1 exactly when some onset date exists.
ONSET["EARLIEST_ONSET_DATE"] = ONSET[["AKI1_ONSET", "AKI2_ONSET", "AKI3_ONSET"]].min(axis = 1)
ONSET["AKI_LABEL"] = ONSET["EARLIEST_ONSET_DATE"].notna().astype(int)

# Drop the columns that are not used after this point.
ONSET = ONSET.drop(columns = ["CENTER_NAME", "SERUM_CREAT_BASE", "NONAKI_SINCE_ADMIT",
                              "AKI1_ONSET", "AKI2_ONSET", "AKI3_ONSET"])

Merge Onset with SCr

In [63]:
# Cast PATID to str so it has the same dtype as ONSET.PATID in the merge that follows.
SCR = SCR.astype({"PATID": str})

In [64]:
# Attach SCr records to every encounter of the same patient. The join key is
# PATID only, so records from outside the encounter are attached as well;
# they are filtered by date in later cells.
ONSET_SCR = pd.merge(
    ONSET,
    SCR[["PATID", "SPECIMEN_DATE", "RESULT_NUM"]],
    how="left",
    on="PATID",
)

In [65]:
# Parse the specimen date now that the tables are merged.
ONSET_SCR = ONSET_SCR.assign(
    SPECIMEN_DATE=pd.to_datetime(ONSET_SCR["SPECIMEN_DATE"], format="mixed")
)

In [66]:
# Drop SCr records drawn after discharge. Records from before admission are
# kept because the patient's history prior to this hospitalization is needed.
# Rows with a missing SPECIMEN_DATE fail the comparison and are removed too.
ONSET_SCR = (
    ONSET_SCR[ONSET_SCR["SPECIMEN_DATE"] <= ONSET_SCR["DISCHARGE_DATE"]]
    .sort_values(by=['PATID', 'ADMIT_DATE', 'SPECIMEN_DATE'])
)

# Collapse repeated measurements into one mean SCr value per patient,
# encounter and specimen date.
# NOTE(review): this averages "per day" only if SPECIMEN_DATE carries no
# time-of-day component — confirm against the raw data.
scr_by_day = ONSET_SCR.groupby(['PATID', 'ONSETS_ENCOUNTERID', 'SPECIMEN_DATE'])
ONSET_SCR_avg = scr_by_day['RESULT_NUM'].mean().reset_index()

In [None]:
# Re-attach the encounter-level columns that the groupby above discarded:
# build one row per distinct combination of them, then join the averaged SCr
# values onto it.
ONSET_SCR_app = ONSET_SCR[
    ["PATID", "ONSETS_ENCOUNTERID", "ADMIT_DATE",
     "DISCHARGE_DATE", "EARLIEST_ONSET_DATE", "AKI_LABEL"]
].drop_duplicates()
ONSET_SCR_avg = pd.merge(
    ONSET_SCR_app,
    ONSET_SCR_avg,
    how="left",
    on=["PATID", "ONSETS_ENCOUNTERID"],
)

In [None]:
# Start deriving the prediction point for non-AKI encounters: take their
# (patient, encounter, specimen date) rows.
non_AKI_pat = ONSET_SCR_avg.loc[
    ONSET_SCR_avg["AKI_LABEL"].eq(0),
    ["PATID", "ONSETS_ENCOUNTERID", "SPECIMEN_DATE"],
]

In [None]:
# Keep only the last row of each encounter.
# NOTE(review): this is the latest SCr date only if the rows are still in
# ascending SPECIMEN_DATE order within each encounter — confirm.
non_AKI_pat = non_AKI_pat.drop_duplicates(
    subset=["PATID", "ONSETS_ENCOUNTERID"], keep="last"
)

In [None]:
# The retained specimen date becomes the non-AKI prediction point.
non_AKI_pat = non_AKI_pat.rename(columns={"SPECIMEN_DATE": "PREDICTION_POINT"})

In [None]:
# Add PREDICTION_POINT to every SCr row. AKI encounters have no match in
# non_AKI_pat, so their value is missing after this left join.
ONSET_SCR_avg = pd.merge(
    ONSET_SCR_avg,
    non_AKI_pat,
    how="left",
    on=["PATID", "ONSETS_ENCOUNTERID"],
)

In [None]:
# For AKI encounters the prediction point is the earliest onset date.
is_aki = ONSET_SCR_avg["AKI_LABEL"] == 1
ONSET_SCR_avg.loc[is_aki, "PREDICTION_POINT"] = ONSET_SCR_avg.loc[is_aki, "EARLIEST_ONSET_DATE"]

In [None]:
# Sanity check: every row must now have a prediction point.
missing_share = ONSET_SCR_avg["PREDICTION_POINT"].isna().mean()
assert missing_share == 0

In [None]:
# Keep the SCr values drawn 8 to 2 days (both inclusive) before the prediction point.
# The 2-day offset has to be subtracted from PREDICTION_POINT itself. Previously
# a misplaced parenthesis subtracted it from the boolean result of the `<=`
# comparison, so the upper bound of the window was never applied as intended.
window_start = ONSET_SCR_avg.PREDICTION_POINT - pd.Timedelta(days=8)
window_end = ONSET_SCR_avg.PREDICTION_POINT - pd.Timedelta(days=2)
ONSET_SCR_avg = ONSET_SCR_avg[(ONSET_SCR_avg.SPECIMEN_DATE <= window_end) &
                              (ONSET_SCR_avg.SPECIMEN_DATE >= window_start)]

In [None]:
# Drop encounters with fewer than 2 rows left inside the SCr window.
# Rows were averaged per specimen date earlier, so this counts distinct dates.
measure_num = ONSET_SCR_avg.groupby('ONSETS_ENCOUNTERID').size()
encounterID_to_drop = measure_num.index[measure_num < 2]
has_enough_scr = ~ONSET_SCR_avg["ONSETS_ENCOUNTERID"].isin(encounterID_to_drop)
ONSET_SCR_avg = ONSET_SCR_avg[has_enough_scr]

In [None]:
# Reshape the SCr rows into one row per encounter with one column per day
# offset (-8 ... -2) relative to the prediction point; cells hold RESULT_NUM.

# Day offset of each measurement (negative = before the prediction point).
ONSET_SCR_avg = ONSET_SCR_avg.assign(
    DAYS_BEFORE_PREDICTION_POINT=(
        ONSET_SCR_avg["SPECIMEN_DATE"] - ONSET_SCR_avg["PREDICTION_POINT"]
    ).dt.days
)

# Skeleton holding every (encounter, day offset) pair, so each offset becomes a
# column even when an encounter has no measurement on that day.
unique_encounterids = list(ONSET_SCR_avg['ONSETS_ENCOUNTERID'].unique())
time_window = np.arange(-8, -1)  # -8, -7, ..., -2
encounter_day_grid = pd.MultiIndex.from_product(
    [unique_encounterids, time_window],
    names=['ONSETS_ENCOUNTERID', 'DAYS_BEFORE_PREDICTION_POINT'],
)
skeleton = encounter_day_grid.to_frame(index=False)

# Fill the skeleton with the measurements that exist.
skeleton = skeleton.merge(
    ONSET_SCR_avg,
    how='left',
    on=['ONSETS_ENCOUNTERID', 'DAYS_BEFORE_PREDICTION_POINT'],
)

# Long -> wide.
ONSET_SCR_formatted = skeleton.pivot(
    index='ONSETS_ENCOUNTERID',
    columns='DAYS_BEFORE_PREDICTION_POINT',
    values='RESULT_NUM',
).reset_index()

# Bring the encounter-level columns back onto the wide table.
ONSET_SCR_app2 = ONSET_SCR_avg[
    ["PATID", "ONSETS_ENCOUNTERID", "ADMIT_DATE", "DISCHARGE_DATE",
     "PREDICTION_POINT", "AKI_LABEL"]
].drop_duplicates()
ONSET_SCR_formatted = pd.merge(
    ONSET_SCR_formatted, ONSET_SCR_app2, how="left", on="ONSETS_ENCOUNTERID"
)

In [None]:
# One row per patient: of the encounters still present, keep the one with the
# earliest ADMIT_DATE.
ONSET_SCR_formatted = (
    ONSET_SCR_formatted
    .sort_values(by=['PATID', 'ADMIT_DATE'])
    .drop_duplicates(subset='PATID', keep='first')
)

Merge Onset with Labs

In [None]:
# Cast PATID to str so it matches the key dtype used on the onset side.
LAB = LAB.astype({"PATID": str})

In [None]:
# Attach every lab record of the patient to the patient's retained encounter
# (long format: one row per lab record).
ONSET_SCR_LAB = pd.merge(ONSET_SCR_formatted, LAB, how="left", on="PATID")

In [None]:
# Parse the lab specimen date.
ONSET_SCR_LAB = ONSET_SCR_LAB.assign(
    SPECIMEN_DATE=pd.to_datetime(ONSET_SCR_LAB["SPECIMEN_DATE"], format="mixed")
)

In [None]:
# Keep labs drawn between admission and the prediction point (both inclusive).
# NOTE(review): labs taken on the prediction point itself are kept, whereas the
# SCr window ends 2 days before it — confirm this difference is intended.
lab_in_window = ONSET_SCR_LAB["SPECIMEN_DATE"].between(
    ONSET_SCR_LAB["ADMIT_DATE"], ONSET_SCR_LAB["PREDICTION_POINT"]
)
ONSET_SCR_LAB = ONSET_SCR_LAB[lab_in_window]

In [None]:
# Keep one row per (patient, encounter, lab code): the most recent one inside
# the window.
# NOTE: GroupBy.last() takes the last non-null entry of each column, so when the
# newest record has no RESULT_NUM, the value from an earlier record is used.
lab_sort_keys = ['PATID', 'ONSETS_ENCOUNTERID', 'LAB_LOINC', 'SPECIMEN_DATE']
ONSET_SCR_LAB = ONSET_SCR_LAB.sort_values(by=lab_sort_keys)
ONSET_SCR_LAB = (
    ONSET_SCR_LAB
    .groupby(['PATID', 'ONSETS_ENCOUNTERID', 'LAB_LOINC'])
    .last()
    .reset_index()
)

In [None]:
# Long -> wide: one row per encounter and one column per LOINC code holding the
# retained RESULT_NUM. ONSETS_ENCOUNTERID ends up as the first column.
LAB_info = (
    ONSET_SCR_LAB
    .pivot(index='ONSETS_ENCOUNTERID', columns='LAB_LOINC', values='RESULT_NUM')
    .reset_index()
)

In [None]:
# Remove labs that are missing for more than half of the encounters.
nan_rate = LAB_info.isna().mean()
columns_to_drop = nan_rate.index[nan_rate > 0.5].tolist()

# Also remove the SCr and GFR codes, which indicate renal function directly.
columns_to_drop = columns_to_drop + ["2160-0", "48642-3", "48643-1"]

# NOTE: this raises KeyError if any listed column is absent, so the cell cannot
# be run twice on the same LAB_info.
LAB_info = LAB_info.drop(columns=columns_to_drop)


In [None]:
# Join the lab feature columns onto the per-encounter SCr table.
# ONSET_SCR_LAB is rebound here: from this point on it is the wide modelling
# table, no longer the long table of lab records.
ONSET_SCR_LAB = pd.merge(
    ONSET_SCR_formatted, LAB_info, how='left', on='ONSETS_ENCOUNTERID'
)

In [None]:
# Names of the lab feature columns: every LAB_info column except the first.
lab_columns = LAB_info.columns[1:]
lab_feature_space = list(lab_columns)
len(lab_feature_space)

40

# Each Patient Should Be Unique: Just Use the First Encounter of Each Patient

In [None]:
# Order encounters chronologically within each patient (ascending is the default).
ONSET_SCR_LAB = ONSET_SCR_LAB.sort_values(['PATID', 'ADMIT_DATE'])

In [None]:
# Keep the first row of each PATID (PATID is the only deduplication key).
# The SCr table was already reduced to one encounter per patient in an earlier
# cell, so this acts as a safeguard.
repeated_patient = ONSET_SCR_LAB.duplicated(subset='PATID', keep='first')
ONSET_SCR_LAB = ONSET_SCR_LAB[~repeated_patient]

# Get just a Fraction of Patients for Development Purpose

In [None]:
# Order the cohort by admission date (ascending is the default) and display it.
ONSET_SCR_LAB = ONSET_SCR_LAB.sort_values('ADMIT_DATE')
ONSET_SCR_LAB

Unnamed: 0,ONSETS_ENCOUNTERID,-7,-6,-5,-4,-3,-2,-1,PATID,ADMIT_DATE,...,736-9,742-7,751-8,770-8,777-3,785-6,786-4,787-2,788-0,789-8
113510,4680,0.89,0.79,0.74,0.820,0.82,0.850000,0.79,82622,2009-01-10,...,16.0,0.28,1.95,71.0,190.0,33.0,34.0,97.0,14.1,3.20
84313,1412185,,,,,0.79,0.830000,0.49,2750504,2009-01-11,...,12.0,0.73,6.30,75.0,167.0,27.0,32.0,84.0,16.1,4.30
86307,1752329,,,,,0.62,0.600000,0.56,2809717,2009-01-11,...,,,,,251.0,19.0,31.0,59.0,21.3,5.70
7442,634850,,,,,1.48,1.570000,1.80,1172190,2009-01-12,...,11.0,0.74,8.10,82.0,137.0,34.0,34.0,99.0,15.2,2.30
27475,1872565,1.08,1.09,0.79,0.930,1.05,0.850000,1.20,1521393,2009-01-13,...,24.0,0.67,4.58,64.0,213.0,31.0,33.0,93.0,15.7,4.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80629,3915237,,,1.32,1.220,1.15,1.100000,1.13,2600563,2021-11-24,...,,,,,235.0,31.2,32.7,95.4,14.6,4.14
109227,10390858,,,,,1.13,0.980000,0.94,74521,2021-11-25,...,4.0,0.50,17.47,92.0,127.0,29.6,32.9,89.9,14.1,3.39
88237,7153075,,,,,,0.723333,0.68,315983,2021-11-26,...,,,,,87.0,28.7,34.7,82.9,14.5,2.71
3081,7153387,,,,1.345,1.12,1.270000,1.14,1081553,2021-11-27,...,18.0,0.78,4.09,62.0,333.0,31.7,33.7,94.1,13.6,3.07


In [None]:
# records after train-test split date will be used as test set
start_date = pd.to_datetime('2015-1-1')
split_date = pd.to_datetime('2016-1-1')
end_date = pd.to_datetime('2017-1-1')

%store start_date
%store split_date
%store end_date

Stored 'start_date' (Timestamp)
Stored 'split_date' (Timestamp)
Stored 'end_date' (Timestamp)


In [None]:
# take a half as train set AKI and the rest are test set AKI
# get their admission times
dataset_train = ONSET_SCR_LAB[(ONSET_SCR_LAB.ADMIT_DATE < split_date) & (ONSET_SCR_LAB.ADMIT_DATE >= start_date)]
dataset_test = ONSET_SCR_LAB[(ONSET_SCR_LAB.ADMIT_DATE >= split_date) & (ONSET_SCR_LAB.ADMIT_DATE < end_date)]

print(len(dataset_train))
print(len(dataset_test))
print(dataset_train.AKI_LABEL.value_counts())
print(dataset_test.AKI_LABEL.value_counts())

train_len = len(dataset_train)
test_len = len(dataset_test)
%store train_len
%store test_len

10281
10256
AKI_LABEL
0    9339
1     942
Name: count, dtype: int64
AKI_LABEL
0    9388
1     868
Name: count, dtype: int64
Stored 'train_len' (int)
Stored 'test_len' (int)


In [None]:
# Stack train on top of test. The row order matters: a later cell takes the
# first `train_len` rows as the training part.
dataset_sampled = pd.concat((dataset_train, dataset_test))

In [None]:
# Day offsets -8 ... -2 (the stop value -1 is exclusive); these are the names
# of the SCr trajectory columns.
time_window = np.arange(start=-8, stop=-1)

In [None]:
# Save a copy for demographics characterization.
# It is written before the feature columns are selected in a later cell, so it
# still contains the identifier and date columns.
dataset_sampled.to_csv("/blue/yonghui.wu/lideyi/Personalization_Methodology/DEMO_KUMC.csv", index = False)

In [None]:
# Restore `train_len` from the IPython %store database (it was stored by the
# train/test split cell above).
%store -r train_len

In [None]:
# Keep only the modelling columns: SCr day offsets, lab features, then the label.
model_columns = [*time_window, *lab_feature_space, 'AKI_LABEL']
dataset_sampled = dataset_sampled[model_columns]

In [None]:
# Class balance over the combined train + test rows.
dataset_sampled.AKI_LABEL.value_counts()

AKI_LABEL
0    18727
1     1810
Name: count, dtype: int64

In [None]:
# Replace the inherited index with 0..n-1; the row order is unchanged.
dataset_sampled = dataset_sampled.reset_index(drop=True)

In [None]:
# Display the modelling table.
dataset_sampled

Unnamed: 0,-7,-6,-5,-4,-3,-2,-1,14979-9,1742-6,17861-6,...,742-7,751-8,770-8,777-3,785-6,786-4,787-2,788-0,789-8,AKI_LABEL
0,,,,,0.91,0.790000,0.91,,,8.1,...,,,,162.0,30.6,32.9,93.2,13.5,3.70,0
1,,,,,0.74,0.680000,0.71,,,8.2,...,,,,236.0,30.5,34.3,88.9,13.7,3.66,0
2,0.300,0.290,0.30,0.29,0.29,0.300000,0.30,,20.0,8.4,...,0.3,3.6,62.0,308.0,28.1,32.6,86.0,15.4,2.90,0
3,0.615,0.685,0.62,0.50,0.53,0.560000,0.76,32.9,9.0,7.8,...,3.9,16.0,9.0,98.0,35.0,34.2,102.3,17.4,1.74,1
4,,,,,0.88,0.800000,1.00,27.9,14.0,8.5,...,0.8,3.5,56.0,154.0,24.9,32.4,76.8,16.5,4.25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20532,,,,,,2.030000,2.02,,23.0,9.0,...,0.9,7.6,75.0,271.0,29.9,34.1,87.9,15.8,3.14,1
20533,,,,,,0.800000,0.57,,20.0,7.8,...,,,,393.0,23.5,31.4,75.0,19.0,3.72,0
20534,,,,,0.84,0.880000,0.80,,8.0,8.4,...,0.5,3.3,47.0,182.0,32.8,34.9,94.1,14.4,3.86,0
20535,,,,,0.88,0.800000,0.90,,15.0,9.4,...,,,,251.0,30.2,33.1,91.4,14.5,4.51,0


# Use RF to Detect Lab Importance 

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Take the training rows (the first `train_len` rows) and keep only those with
# no missing value in any column.
# NOTE(review): dropna() also looks at the SCr day-offset columns, although the
# random forest below is fitted on the lab features only — confirm that
# discarding rows for missing SCr values is intended.
dataset_clean = dataset_sampled.iloc[:train_len].dropna()

In [None]:
# Class balance of the complete-case training rows.
dataset_clean.AKI_LABEL.value_counts()

AKI_LABEL
0    665
1     86
Name: count, dtype: int64

In [None]:
# Predictors are the lab feature columns only; the target is the AKI label.
y = dataset_clean['AKI_LABEL']
X = dataset_clean[lab_feature_space]

In [None]:
# Fit a 100-tree random forest with a fixed seed so the importances are reproducible.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X=X, y=y)

In [None]:
# Tabulate the forest's importance for each lab feature.
# The rows follow the column order of X (no sorting is applied), so they stay
# aligned with the order of the features.
feature_importances = rf.feature_importances_
feature_importance_df = pd.DataFrame(
    dict(Feature=X.columns, Importance=feature_importances)
)

In [None]:
# There must be exactly one importance row per lab feature.
assert feature_importance_df.shape[0] == len(lab_feature_space)

In [None]:
# Lab overlap weighting: features whose importance is at or above the
# (1 - top_rate) quantile (the top 50% with top_rate = 0.5) get their importance
# normalised to sum to 1; all other features get weight 0.
top_rate = 0.5
threshold = feature_importance_df['Importance'].quantile(1 - top_rate)
top_features = feature_importance_df['Importance'] >= threshold

# Initialise with a float so that assigning the fractional weights below does
# not have to upcast an integer column (pandas 2.1+ warns about that upcast).
feature_importance_df['Weighting'] = 0.0
top_importance = feature_importance_df.loc[top_features, 'Importance']
feature_importance_df.loc[top_features, 'Weighting'] = top_importance / top_importance.sum()

In [None]:
# Weights as a plain list, in the same row order as feature_importance_df.
lab_overlap_weighting = list(feature_importance_df.loc[:, 'Weighting'])

In [None]:
# Write the modelling table (SCr offsets, lab features, label) to disk with a
# clean 0..n-1 index that is not saved as a column.
dataset_sampled = dataset_sampled.reset_index(drop=True)
dataset_sampled.to_csv(
    "/blue/yonghui.wu/lideyi/Personalization_Methodology/dataset.csv",
    index=False,
)

In [None]:
# Display the table that was just written to dataset.csv.
dataset_sampled

Unnamed: 0,-7,-6,-5,-4,-3,-2,-1,14979-9,1742-6,17861-6,...,742-7,751-8,770-8,777-3,785-6,786-4,787-2,788-0,789-8,AKI_LABEL
0,,,,,0.91,0.790000,0.91,,,8.1,...,,,,162.0,30.6,32.9,93.2,13.5,3.70,0
1,,,,,0.74,0.680000,0.71,,,8.2,...,,,,236.0,30.5,34.3,88.9,13.7,3.66,0
2,0.300,0.290,0.30,0.29,0.29,0.300000,0.30,,20.0,8.4,...,0.3,3.6,62.0,308.0,28.1,32.6,86.0,15.4,2.90,0
3,0.615,0.685,0.62,0.50,0.53,0.560000,0.76,32.9,9.0,7.8,...,3.9,16.0,9.0,98.0,35.0,34.2,102.3,17.4,1.74,1
4,,,,,0.88,0.800000,1.00,27.9,14.0,8.5,...,0.8,3.5,56.0,154.0,24.9,32.4,76.8,16.5,4.25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20532,,,,,,2.030000,2.02,,23.0,9.0,...,0.9,7.6,75.0,271.0,29.9,34.1,87.9,15.8,3.14,1
20533,,,,,,0.800000,0.57,,20.0,7.8,...,,,,393.0,23.5,31.4,75.0,19.0,3.72,0
20534,,,,,0.84,0.880000,0.80,,8.0,8.4,...,0.5,3.3,47.0,182.0,32.8,34.9,94.1,14.4,3.86,0
20535,,,,,0.88,0.800000,0.90,,15.0,9.4,...,,,,251.0,30.2,33.1,91.4,14.5,4.51,0


In [None]:
time_window = [str(i) for i in time_window]
%store time_window
%store lab_feature_space
%store lab_overlap_weighting

Stored 'time_window' (list)
Stored 'lab_feature_space' (list)
Stored 'lab_overlap_weighting' (list)
