In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm.auto import tqdm
tqdm.pandas()

Procedures:  
1. Feature space is: 7-day SCr trajectories and all labs prior to prediction point.✅  
2. The prediction point was 1-day prior to onset for AKI patients and 1-day prior to the last SCr record for non-AKI patients.✅  
3. Each patient should have at least 2 SCr measurements within the window.✅
4. Exclude patients whose baseline SCr is ≥ 3.5 (only baseline SCr < 3.5 is kept).✅   
5. Lab missingness > 50% dropped.✅  
6. Each patient/encounter is unique✅  

# Read KUMC Patients' ONSET, LAB, SCr and COMO

In [None]:
# Storage locations: root of the raw 2022 extract and the site-specific raw folder inside it.
raw_path = '/blue/yonghui.wu/hoyinchan/Data/data2022raw/'
data_path = f"{raw_path}KUMC_ORCALE/raw/"

In [None]:
# Load the onset table for all centers, then keep an independent copy of the KUMC rows only.
All_onsets = pd.read_csv('/blue/yonghui.wu/lideyi/Personalization_Methodology/NEW_ONSETS.csv')
ONSET = All_onsets[All_onsets["CENTER_NAME"] == "KUMC"].copy(deep=True)

In [None]:
# Read in lab test results (only the four columns used below).
# The previous version wrapped this call in `with open(...)`, but the handle was never passed
# to read_csv, so the file was opened twice and the handle's errors='ignore' had no effect.
# read_csv opens the path itself; the effective decoding is still 'unicode_escape'.
LAB = pd.read_csv(
    data_path + 'AKI_LAB.csv',
    delimiter=",",
    usecols=['PATID', 'LAB_LOINC', 'SPECIMEN_DATE"+PD.DATE_SHIFT"', 'RESULT_NUM'],
    encoding='unicode_escape',
)

# Give the date-shifted specimen column a plain name.
LAB = LAB.rename(columns={'SPECIMEN_DATE"+PD.DATE_SHIFT"': "SPECIMEN_DATE"})

In [None]:
# Load the serum creatinine (SCr) records, restricted to the columns listed in SCR_use_cols,
# and give the date-shifted specimen column a plain name.
SCR_use_cols = [
    'ONSETS_ENCOUNTERID',
    'PATID',
    'ENCOUNTERID',
    'SPECIMEN_DATE"+PD.DATE_SHIFT"',
    'RESULT_NUM',
    'DAYS_SINCE_ADMIT',
]
SCR = (
    pd.read_csv(data_path + "AKI_LAB_SCR.csv", delimiter=',', usecols=SCR_use_cols)
    .rename(columns={'SPECIMEN_DATE"+PD.DATE_SHIFT"': "SPECIMEN_DATE"})
)

# Construct a Full DataFrame

Process ONSET

In [None]:
# NOTE: this cell overwrites ONSET and drops SERUM_CREAT_BASE at the end, so it cannot be
# re-run without first re-running the cell that loads ONSET.

# Keep only encounters with baseline SCr below 3.5 (higher values indicate poor renal function).
# Rows with a missing baseline fail the comparison and are dropped as well.
# .copy() gives ONSET its own data so the assignments below never write into a slice
# (avoids SettingWithCopyWarning).
ONSET = ONSET.loc[ONSET.SERUM_CREAT_BASE < 3.5, :].copy()

# Format the ID columns as strings for merging.
# Whole-column assignment replaces the column together with its dtype; the previous
# `ONSET.loc[:, cols] = ...` form tries to write the strings into the existing columns in place.
for id_col in ["PATID", "ONSETS_ENCOUNTERID"]:
    ONSET[id_col] = ONSET[id_col].astype(str)

# Parse every date column to datetime.
time_cols = ["ADMIT_DATE", "DISCHARGE_DATE", "AKI1_ONSET", "AKI2_ONSET", "AKI3_ONSET"]
for time_col in time_cols:
    ONSET[time_col] = pd.to_datetime(ONSET[time_col], format="mixed")

# Binary prediction task: the earliest onset across the three AKI stage columns defines the
# onset date; an encounter is labelled 1 when any onset date exists, otherwise 0.
ONSET["EARLIEST_ONSET_DATE"] = ONSET[["AKI1_ONSET", "AKI2_ONSET", "AKI3_ONSET"]].min(axis=1)
ONSET["AKI_LABEL"] = ONSET["EARLIEST_ONSET_DATE"].notna().astype(int)

# Drop the columns that are no longer needed.
ONSET = ONSET.drop(columns=["CENTER_NAME", "SERUM_CREAT_BASE", "NONAKI_SINCE_ADMIT",
                            "AKI1_ONSET", "AKI2_ONSET", "AKI3_ONSET"])

Merge Onset with SCr

In [None]:
# Cast the SCr patient ID to str before merging on it.
SCR = SCR.astype({"PATID": str})

In [None]:
# Left-join SCr records onto the onset table. The join key is PATID only, so every SCr record
# of a patient is attached to each of that patient's encounters.
ONSET_SCR = pd.merge(
    ONSET,
    SCR[["PATID", "SPECIMEN_DATE", "RESULT_NUM"]],
    how="left",
    on="PATID",
)

In [None]:
# Parse the SCr specimen date to datetime now that the merge is done.
scr_specimen_dates = pd.to_datetime(ONSET_SCR["SPECIMEN_DATE"], format="mixed")
ONSET_SCR["SPECIMEN_DATE"] = scr_specimen_dates

In [None]:
# Discard SCr records taken after discharge. Records from before admission are kept on
# purpose, because history prior to this hospitalization is also needed.
# Rows with no SCr record (missing SPECIMEN_DATE) fail the comparison and are removed here.
not_after_discharge = ONSET_SCR["SPECIMEN_DATE"] <= ONSET_SCR["DISCHARGE_DATE"]
ONSET_SCR = ONSET_SCR[not_after_discharge].sort_values(by=['PATID', 'ADMIT_DATE', 'SPECIMEN_DATE'])

# Collapse records sharing the same patient, encounter and SPECIMEN_DATE into their mean.
scr_group_keys = ['PATID', 'ONSETS_ENCOUNTERID', 'SPECIMEN_DATE']
ONSET_SCR_avg = ONSET_SCR.groupby(scr_group_keys)['RESULT_NUM'].mean().reset_index()

In [None]:
# Re-attach the encounter-level columns lost in the groupby: take one row per distinct
# combination of these columns and left-join the averaged SCr values onto it.
encounter_info_cols = ["PATID", "ONSETS_ENCOUNTERID", "ADMIT_DATE",
                       "DISCHARGE_DATE", "EARLIEST_ONSET_DATE", "AKI_LABEL"]
ONSET_SCR_app = ONSET_SCR[encounter_info_cols].drop_duplicates()
ONSET_SCR_avg = pd.merge(
    ONSET_SCR_app,
    ONSET_SCR_avg,
    how="left",
    on=["PATID", "ONSETS_ENCOUNTERID"],
)

In [None]:
# Start deriving the prediction point for non-AKI encounters: pull their SCr dates.
is_non_aki = ONSET_SCR_avg["AKI_LABEL"] == 0
non_AKI_pat = ONSET_SCR_avg.loc[is_non_aki, ["PATID", "ONSETS_ENCOUNTERID", "SPECIMEN_DATE"]]

In [None]:
# Keep only the last row of each (patient, encounter) pair.
# NOTE(review): "last" is positional, so this is the latest SCr date only if the rows are
# still ordered by SPECIMEN_DATE within each encounter — confirm.
non_AKI_pat = non_AKI_pat.drop_duplicates(
    subset=["PATID", "ONSETS_ENCOUNTERID"],
    keep="last",
)

In [None]:
# The retained SCr date becomes the prediction point of the non-AKI encounter.
non_AKI_pat = non_AKI_pat.rename(columns={"SPECIMEN_DATE": "PREDICTION_POINT"})

In [None]:
# Attach PREDICTION_POINT to every SCr row; AKI encounters have no match and get NaT for now.
ONSET_SCR_avg = pd.merge(
    ONSET_SCR_avg,
    non_AKI_pat,
    how="left",
    on=["PATID", "ONSETS_ENCOUNTERID"],
)

In [None]:
# For AKI encounters the prediction point is the earliest onset date.
is_aki = ONSET_SCR_avg["AKI_LABEL"] == 1
ONSET_SCR_avg.loc[is_aki, "PREDICTION_POINT"] = ONSET_SCR_avg.loc[is_aki, "EARLIEST_ONSET_DATE"]

In [None]:
# Sanity check: every row must have a prediction point (share of missing values is exactly 0).
missing_prediction_rate = ONSET_SCR_avg["PREDICTION_POINT"].isna().mean()
assert missing_prediction_rate == 0

In [None]:
# Keep only SCr records dated from 8 to 2 days before the prediction point (both ends inclusive).
scr_window_start = ONSET_SCR_avg["PREDICTION_POINT"] - pd.Timedelta(days=8)
scr_window_end = ONSET_SCR_avg["PREDICTION_POINT"] - pd.Timedelta(days=2)
in_scr_window = (ONSET_SCR_avg["SPECIMEN_DATE"] <= scr_window_end) & \
                (ONSET_SCR_avg["SPECIMEN_DATE"] >= scr_window_start)
ONSET_SCR_avg = ONSET_SCR_avg[in_scr_window]

In [None]:
# Drop encounters that have fewer than 2 SCr rows left inside the 7-day window.
# Count the rows per encounter, collect the IDs below the minimum, then filter them out.
measure_num = ONSET_SCR_avg.groupby('ONSETS_ENCOUNTERID').size()
encounterID_to_drop = measure_num[measure_num.lt(2)].index
has_enough_scr = ~ONSET_SCR_avg["ONSETS_ENCOUNTERID"].isin(encounterID_to_drop)
ONSET_SCR_avg = ONSET_SCR_avg.loc[has_enough_scr, :]

In [None]:
# Pivot the SCr values into one row per encounter, with one feature column per day offset
# (-8 ... -2 relative to the prediction point) holding RESULT_NUM.

# Signed day offset of every SCr record relative to its prediction point.
# .assign returns a new frame; the previous `frame[col] = ...` wrote into a filtered slice
# (chained assignment, SettingWithCopyWarning).
ONSET_SCR_avg = ONSET_SCR_avg.assign(
    DAYS_BEFORE_PREDICTION_POINT=(
        ONSET_SCR_avg["SPECIMEN_DATE"] - ONSET_SCR_avg["PREDICTION_POINT"]
    ).dt.days
)

# Skeleton with every (encounter, day offset) pair, so days without a record become NaN
# after the left merge instead of being absent.
unique_encounterids = list(ONSET_SCR_avg['ONSETS_ENCOUNTERID'].unique())
time_window = np.arange(-8, -1)  # from -8 to -2
skeleton = pd.MultiIndex.from_product(
    [unique_encounterids, time_window],
    names=['ONSETS_ENCOUNTERID', 'DAYS_BEFORE_PREDICTION_POINT'],
).to_frame(index=False)
skeleton = pd.merge(skeleton, ONSET_SCR_avg,
                    on=['ONSETS_ENCOUNTERID', 'DAYS_BEFORE_PREDICTION_POINT'], how='left')

# Wide format: rows are encounters, columns are the integer day offsets.
ONSET_SCR_formatted = skeleton.pivot(index='ONSETS_ENCOUNTERID',
                                     columns='DAYS_BEFORE_PREDICTION_POINT',
                                     values='RESULT_NUM').reset_index()

# Bring the encounter-level columns back. drop_duplicates is reassigned rather than run
# in place on a .loc slice, for the same chained-assignment reason as above.
ONSET_SCR_app2 = ONSET_SCR_avg[["PATID", "ONSETS_ENCOUNTERID", "ADMIT_DATE", "DISCHARGE_DATE",
                                "PREDICTION_POINT", "AKI_LABEL"]].drop_duplicates()
ONSET_SCR_formatted = ONSET_SCR_formatted.merge(ONSET_SCR_app2, on="ONSETS_ENCOUNTERID", how="left")

In [None]:
# One row per patient: order by admission date within patient and keep the first encounter.
ONSET_SCR_formatted = (
    ONSET_SCR_formatted
    .sort_values(by=['PATID', 'ADMIT_DATE'])
    .drop_duplicates(subset='PATID', keep='first')
)

Merge Onset with Labs

In [None]:
# Cast the lab patient ID to str before merging on it.
LAB = LAB.astype({"PATID": str})

In [None]:
# Left-join every lab record of a patient onto that patient's retained encounter (key: PATID).
ONSET_SCR_LAB = pd.merge(ONSET_SCR_formatted, LAB, how="left", on="PATID")

In [None]:
# Parse the lab specimen date to datetime.
lab_specimen_dates = pd.to_datetime(ONSET_SCR_LAB["SPECIMEN_DATE"], format="mixed")
ONSET_SCR_LAB["SPECIMEN_DATE"] = lab_specimen_dates

In [None]:
# Keep labs dated no later than 2 days before the prediction point and no earlier than
# 8 days before admission (both ends inclusive).
# Note the lower bound is anchored on ADMIT_DATE, not on the prediction point.
lab_window_end = ONSET_SCR_LAB["PREDICTION_POINT"] - pd.Timedelta(days=2)
lab_window_start = ONSET_SCR_LAB["ADMIT_DATE"] - pd.Timedelta(days=8)
in_lab_window = (ONSET_SCR_LAB["SPECIMEN_DATE"] <= lab_window_end) & \
                (ONSET_SCR_LAB["SPECIMEN_DATE"] >= lab_window_start)
ONSET_SCR_LAB = ONSET_SCR_LAB[in_lab_window]

In [None]:
# Keep only the latest result of each lab (LOINC) per encounter within the time window.
# Rows are ordered by specimen date inside each group first; groupby(...).last() then takes,
# column by column, the last non-null value of the group.
lab_group_keys = ['PATID', 'ONSETS_ENCOUNTERID', 'LAB_LOINC']
ONSET_SCR_LAB = (
    ONSET_SCR_LAB
    .sort_values(by=lab_group_keys + ['SPECIMEN_DATE'])
    .groupby(lab_group_keys)
    .last()
    .reset_index()
)

In [None]:
# Wide lab table: one row per encounter, one column per LOINC code, values are RESULT_NUM.
# After reset_index the first column is ONSETS_ENCOUNTERID.
LAB_info = (
    ONSET_SCR_LAB
    .pivot(index='ONSETS_ENCOUNTERID', columns='LAB_LOINC', values='RESULT_NUM')
    .reset_index()
)

In [None]:
# Drop labs whose missing rate is above 0.5.
# The rate is computed over the rows of LAB_info, i.e. over encounters that appear in the
# pivoted lab table.
nan_rate = LAB_info.isnull().mean()
columns_to_drop = list(nan_rate[nan_rate > 0.5].index)

# Also drop SCr and GFR, which directly indicate renal function.
# Only codes that are actually present (and not already selected) are added: drop() raises
# KeyError on an unknown label, which used to break this cell whenever one of these codes was
# absent from the pivot or the cell was run a second time.
renal_function_loincs = ["2160-0", "48642-3", "48643-1"]
columns_to_drop += [
    loinc for loinc in renal_function_loincs
    if loinc in LAB_info.columns and loinc not in columns_to_drop
]

# Drop these labs.
LAB_info = LAB_info.drop(columns=columns_to_drop)


In [None]:
# Join the lab feature columns onto the SCr-formatted encounters.
# This rebinds ONSET_SCR_LAB, replacing the long-format frame built in the cells above.
ONSET_SCR_LAB = pd.merge(ONSET_SCR_formatted, LAB_info, how='left', on='ONSETS_ENCOUNTERID')

In [None]:
# Lab feature names: every LAB_info column except the first one (the encounter ID column).
lab_feature_space = LAB_info.columns[1:].tolist()
len(lab_feature_space)

# Each Patient Should Be Unique: Use Only the First Encounter of Each Patient

In [None]:
# Order by patient, then by admission date (ascending), so each patient's first encounter comes first.
ONSET_SCR_LAB = ONSET_SCR_LAB.sort_values(by=['PATID', 'ADMIT_DATE'])

In [None]:
# Drop duplicates based on PATID only, keeping the first record of each patient.
ONSET_SCR_LAB = ONSET_SCR_LAB.drop_duplicates(subset=['PATID'])

# Get Just a Fraction of Patients for Temporal Cross Validation

In [None]:
# Order all encounters chronologically by admission date, then display the frame.
ONSET_SCR_LAB = ONSET_SCR_LAB.sort_values(by=['ADMIT_DATE'])
ONSET_SCR_LAB

In [None]:
# Temporal split boundaries: admissions in [start_date, split_date) form the train set,
# admissions in [split_date, end_date) form the test set.
start_date = pd.Timestamp('2015-01-01')
split_date = pd.Timestamp('2016-01-01')
end_date = pd.Timestamp('2017-01-01')


In [None]:
# Split by admission date: [start_date, split_date) is the train set and
# [split_date, end_date) is the test set.
admit_dates = ONSET_SCR_LAB["ADMIT_DATE"]
dataset_train = ONSET_SCR_LAB[(admit_dates < split_date) & (admit_dates >= start_date)]
dataset_test = ONSET_SCR_LAB[(admit_dates >= split_date) & (admit_dates < end_date)]

# Sizes are stored because later cells use train_len to locate the train rows.
train_len = len(dataset_train)
test_len = len(dataset_test)

# Report the split sizes and the label balance of each part.
print(train_len)
print(test_len)
print(dataset_train.AKI_LABEL.value_counts())
print(dataset_test.AKI_LABEL.value_counts())

In [None]:
# Stack train rows first, then test rows; a later cell relies on this order when it takes the
# first train_len rows as the train set.
dataset_sampled = pd.concat([dataset_train, dataset_test])

In [None]:
# SCr feature columns: integer day offsets from -8 to -2 (the stop value -1 is excluded).
time_window = np.arange(start=-8, stop=-1)

In [None]:
# Save the full frame (all columns, before feature selection) for demographic characterization.
dataset_sampled.to_csv(
    "/blue/yonghui.wu/lideyi/Personalization_Methodology/DEMO_KUMC.csv",
    index=False,
)

In [None]:
# Keep only the model columns: the SCr day-offset columns, the lab features, then the label.
used_cols = [*time_window, *lab_feature_space, 'AKI_LABEL']
dataset_sampled = dataset_sampled.loc[:, used_cols]

In [None]:
# Label balance of the combined train + test data.
dataset_sampled["AKI_LABEL"].value_counts()

In [None]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
dataset_sampled = dataset_sampled.reset_index(drop=True)

In [None]:
# Display the final feature table (SCr offsets, lab features, AKI_LABEL).
dataset_sampled

# Use RF to Detect Lab Importance 

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Use train rows only (the first train_len rows), and among them only rows without any NaN.
dataset_clean = dataset_sampled.iloc[:train_len, :].dropna()

In [None]:
# Label balance of the complete-case train rows used to fit the random forest.
dataset_clean["AKI_LABEL"].value_counts()

In [None]:
# Design matrix: lab features only (SCr offset columns are not selected). Target: AKI label.
features_without_label = dataset_clean.drop(columns=['AKI_LABEL'])
X = features_without_label[lab_feature_space]
y = dataset_clean.AKI_LABEL

In [None]:
# Fit a 100-tree random forest with a fixed seed for reproducibility; fit() returns the
# estimator itself, so the fitted model is bound to rf and shown as the cell output.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X, y)
rf

In [None]:
# Pair each lab feature with its random forest importance.
# The rows are NOT sorted: they stay in the column order of X, which lets the weights derived
# from this table line up position by position with lab_feature_space.
feature_importances = rf.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
)

In [None]:
# There must be exactly one importance row per lab feature.
n_importance_rows = len(feature_importance_df)
assert n_importance_rows == len(lab_feature_space)

In [None]:
# Lab overlap weighting: features whose importance is at or above the (1 - top_rate) quantile
# get their importance normalized to sum to 1 across those features; all others get 0.
# NOTE(review): the earlier comment said "top 20%", but top_rate is 0.5 — confirm which is intended.
top_rate = 0.5
threshold = feature_importance_df['Importance'].quantile(1 - top_rate)
top_features = feature_importance_df['Importance'] >= threshold
top_importance = feature_importance_df.loc[top_features, 'Importance']

feature_importance_df['Weighting'] = 0.0
feature_importance_df.loc[top_features, 'Weighting'] = top_importance / top_importance.sum()

In [None]:
# Plain list of weights, in the same row order as feature_importance_df.
lab_overlap_weighting = feature_importance_df['Weighting'].tolist()

In [None]:
# Write the modelling table (train rows first, then test rows) to disk with a fresh index.
dataset_sampled = dataset_sampled.reset_index(drop=True)
dataset_sampled.to_csv(
    "/blue/yonghui.wu/lideyi/Personalization_Methodology/dataset.csv",
    index=False,
)

In [None]:
# Turn the day offsets into strings so they can be stored as JSON below.
time_window = list(map(str, time_window))

# Use Gaussian Distribution to Compute SCR Overlap Rates

If two patients both have an SCr record on the same day, the corresponding entry of their overlap vector is True; otherwise it is False. We weight the overlap vector with a Gaussian distribution, so that days closer to the prediction point contribute more overlap weight.

In [None]:
import scipy.stats as stats

In [None]:
# Parameters of the normal distribution used to weight the SCr days.
mean = 0.0
sd_SCR = 1.5

# One weight per entry of time_window. The last entry gets the probability mass of the
# interval [0, 1], the one before it [1, 2], and so on; earlier days therefore get less weight.
n_scr_days = len(time_window)
SCR_overlap_weighting = [
    stats.norm.cdf(offset + 1, loc=mean, scale=sd_SCR) - stats.norm.cdf(offset, loc=mean, scale=sd_SCR)
    for offset in range(n_scr_days - 1, -1, -1)
]

# Only the non-negative half of the distribution is used, so the masses are doubled.
SCR_overlap_weighting = list(np.array(SCR_overlap_weighting) * 2)

In [None]:
import json

# Collect the split dates, split sizes, feature names and overlap weights in one dictionary.
date_format = '%Y-%m-%d'
data = dict(
    start_date=start_date.strftime(date_format),
    split_date=split_date.strftime(date_format),
    end_date=end_date.strftime(date_format),
    train_len=train_len,
    test_len=test_len,
    SCR_feature_space=time_window,
    LAB_feature_space=lab_feature_space,
    SCR_overlap_weighting=SCR_overlap_weighting,
    LAB_overlap_weighting=lab_overlap_weighting,
)

# Save to a JSON file (path is relative to the working directory; ./utils must already exist).
with open('./utils/variables.json', 'w') as file:
    json.dump(data, file)