In [1]:
import json
import multiprocessing
import os

import numpy as np
import pandas as pd

In [2]:
# Read the dataset CSV into a DataFrame; later cells slice and subset it.
dataset_csv_path = "/blue/yonghui.wu/lideyi/Personalization_Methodology/dataset.csv"
dataset = pd.read_csv(dataset_csv_path)

In [17]:
# Number of rows in the loaded table.
dataset.shape[0]

17179

In [3]:
# Load the shared notebook variables (feature-space column lists and the
# train/test split sizes) from the project's JSON file.
with open('./utils/variables.json', 'r') as file:
    variables = json.load(file)

SCR_feature_space = variables['SCR_feature_space']
LAB_feature_space = variables['LAB_feature_space']
train_len = variables['train_len']
test_len = variables['test_len']
print(len(SCR_feature_space), len(LAB_feature_space))
print(train_len, test_len)

# Number of worker processes for the parallel overlap computation.
# multiprocessing.cpu_count() reports every CPU on the machine, which can be
# more than this process is allowed to use (e.g. under a job scheduler or a
# restricted CPU affinity mask). Prefer the usable-CPU count where the
# platform exposes it and fall back to the machine-wide count otherwise.
try:
    num_processors = len(os.sched_getaffinity(0))
except AttributeError:
    # os.sched_getaffinity is not available on every platform.
    num_processors = multiprocessing.cpu_count()

7 40
8637 8542


In [4]:
# Folder name for this notebook's figures.
# NOTE(review): not referenced by any cell visible in this section - confirm where it is used.
figure_folder = "Sensitivity_analysis"

In [5]:
# Training rows only: keep the first train_len rows (everything after them is
# left out here), then discard every row that contains at least one NaN.
train_rows = dataset.iloc[:train_len].copy()
data_train = train_rows.dropna(axis=0, how="any")

In [6]:
# Column subsets of the full table (all rows, not just the training slice):
# one frame per feature space.
SCR_part = dataset[SCR_feature_space]
LAB_part = dataset[LAB_feature_space]

In [7]:
# Presence masks: 1 where a value is recorded, 0 where it is missing (NaN).
SCR_part_bin = (~SCR_part.isna()).astype(int)
LAB_part_bin = (~LAB_part.isna()).astype(int)

# SCr Features: Overlap Weightings for Normal Standard Deviations [1.2, 1.5, 1.7, 2.0, 2.5]

In [8]:
import scipy.stats as stats

In [9]:
mean = 0.0
sd_SCR_space = [1.2, 1.5, 1.7, 2.0, 2.5]
SCR_overlap_weighting_space = []

n_scr_features = len(SCR_feature_space)
for sd in sd_SCR_space:
    # The k-th feature (0-based) is weighted by the N(mean, sd) probability mass
    # of the unit interval [n-1-k, n-k]: the first feature gets the band
    # farthest from the mean, the last feature gets the band [0, 1].
    band_masses = [
        stats.norm.cdf(lower + 1, loc=mean, scale=sd) - stats.norm.cdf(lower, loc=mean, scale=sd)
        for lower in range(n_scr_features - 1, -1, -1)
    ]

    # With mean 0 only half of the normal mass lies at or above 0, so doubling
    # rescales the one-sided band masses to a half-normal distribution.
    SCR_overlap_weighting = list(np.array(band_masses) * 2)
    SCR_overlap_weighting_space.append(SCR_overlap_weighting)

# Lab Features: Overlap Weightings for Top Feature-Importance Rates [0.1, 0.3, 0.5, 0.7, 1.0]

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [11]:
# Training inputs: the lab feature columns of the NaN-free training rows,
# selected after the label column has been removed.
train_features = data_train.drop(columns=['AKI_LABEL'])
LAB_train = train_features[LAB_feature_space].copy()

# Training target: the AKI label column.
y_train = data_train['AKI_LABEL'].copy()

In [12]:
# Fit a 100-tree random forest with a fixed seed on the lab features; the
# fitted model's feature importances are read in the next cell.
forest_params = {"n_estimators": 100, "random_state": 42}
rf = RandomForestClassifier(**forest_params)
rf.fit(LAB_train, y_train)

In [13]:
feature_importances = rf.feature_importances_

# One row per lab feature, kept in LAB_train column order (no sorting is
# applied), pairing each feature name with its importance from the forest.
importance_columns = dict(Feature=LAB_train.columns, Importance=feature_importances)
feature_importance_df = pd.DataFrame(importance_columns)

# Every lab feature must have exactly one importance row.
assert len(feature_importance_df) == len(LAB_feature_space)

In [14]:
top_rates_space = [0.1, 0.3, 0.5, 0.7, 1.0]
lab_overlap_weighting_space = []

importance = feature_importance_df['Importance']
for top_rate in top_rates_space:
    # Features whose importance reaches the (1 - top_rate) quantile are kept.
    threshold = importance.quantile(1 - top_rate)
    print(top_rate, threshold)
    top_features = importance >= threshold

    # Kept features share a total weight of 1 in proportion to their
    # importance; all remaining features are weighted 0.
    kept_total = importance[top_features].sum()
    feature_importance_df['Weighting'] = 0.0
    feature_importance_df.loc[top_features, 'Weighting'] = importance[top_features] / kept_total

    lab_overlap_weighting_space.append(list(feature_importance_df['Weighting']))

0.1 0.03520589568681677
0.3 0.02904920087257196
0.5 0.024957272857172513
0.7 0.02241519084167006
1.0 0.006289815663413658


# Main Computing

In [15]:
from utils.Data_Overlap_Rates_Computing import parallel_overlap_matrix_comp, check_matrix_sanity, \
calculate_overlap_rate_SCR, calculate_overlap_rate_LAB

In [None]:
# Output file pattern: one .npy file per SCr weighting, numbered by its
# position in SCR_overlap_weighting_space.
SCR_overlap_path = '/blue/yonghui.wu/lideyi/Personalization_Methodology/sensitivity_analysis/SCR_overlap_%s.npy'

# Make sure the output folder exists before any matrix is computed, so a
# missing directory cannot make np.save fail only after the computation.
os.makedirs(os.path.dirname(SCR_overlap_path), exist_ok=True)

for i, SCR_overlap_weighting in enumerate(SCR_overlap_weighting_space):
    # Compute the SCr overlap matrix for this weighting from the presence mask
    # (project helper; presumably pairwise overlap rates - see
    # utils.Data_Overlap_Rates_Computing), validate it, then save it.
    SCR_overlap = parallel_overlap_matrix_comp(SCR_part_bin, num_processors,
                                               calculate_overlap_rate_SCR, SCR_overlap_weighting)
    check_matrix_sanity(SCR_overlap)
    np.save(SCR_overlap_path % (i), SCR_overlap)

In [16]:
# Output file pattern: one .npy file per lab weighting, numbered by its
# position in lab_overlap_weighting_space.
LAB_overlap_path = '/blue/yonghui.wu/lideyi/Personalization_Methodology/sensitivity_analysis/lab_overlap_%s.npy'

# Each matrix takes roughly ten minutes to compute (see the progress bars
# below), so make sure the output folder exists before starting rather than
# letting np.save fail on a missing directory after the first computation.
os.makedirs(os.path.dirname(LAB_overlap_path), exist_ok=True)

for i, LAB_overlap_weighting in enumerate(lab_overlap_weighting_space):
    # Compute the lab overlap matrix for this weighting from the presence mask
    # (project helper; presumably pairwise overlap rates - see
    # utils.Data_Overlap_Rates_Computing), validate it, then save it.
    LAB_overlap = parallel_overlap_matrix_comp(LAB_part_bin, num_processors,
                                               calculate_overlap_rate_LAB, LAB_overlap_weighting)
    check_matrix_sanity(LAB_overlap)
    np.save(LAB_overlap_path % (i), LAB_overlap)

100%|██████████| 17178/17178 [09:42<00:00, 29.49it/s] 
100%|██████████| 17178/17178 [10:16<00:00, 27.87it/s] 
100%|██████████| 17178/17178 [10:04<00:00, 28.40it/s] 
100%|██████████| 17178/17178 [10:02<00:00, 28.52it/s] 
100%|██████████| 17178/17178 [09:59<00:00, 28.64it/s] 


In [22]:
# Extra baseline: an all-ones square matrix (one row and column per dataset
# row) standing for "no overlap weighting". It is stored under the index right
# after the last computed weighting, once for SCr and once for lab.
n_rows = len(dataset)
ONE_arr = np.ones(shape=(n_rows, n_rows))

extra_SCR_index = len(SCR_overlap_weighting_space)
extra_lab_index = len(lab_overlap_weighting_space)
np.save('/blue/yonghui.wu/lideyi/Personalization_Methodology/sensitivity_analysis/SCR_overlap_%s.npy' % extra_SCR_index, ONE_arr)
np.save('/blue/yonghui.wu/lideyi/Personalization_Methodology/sensitivity_analysis/lab_overlap_%s.npy' % extra_lab_index, ONE_arr)

# Read Back All Overlap Rate Matrices

# Read Pickle File for Necessary Data

nw_fea_arrs_dict (just full), SCR_train, SCR_test, LAB_train, LAB_test, y_test, k_sizes_test