In [1]:
import multiprocessing
import os
from multiprocessing import Pool

import numpy as np
import pandas as pd
import scipy.stats as stats
from tqdm import tqdm
from tqdm.auto import tqdm
tqdm.pandas()
%store -r raw_path
%store -r time_window
%store -r lab_overlap_weighting
%store -r lab_feature_space

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Quick check of the restored settings: number of entries in the time window
# and in the lab feature space.
print(len(time_window))
print(len(lab_feature_space))

7
40


In [3]:
# Load the dataset from a hard-coded absolute path (adjust when running elsewhere).
data_dev = pd.read_csv("/blue/yonghui.wu/lideyi/Personalization_Methodology/dataset.csv")

In [4]:
# Display the loaded frame.
data_dev

Unnamed: 0,-8,-7,-6,-5,-4,-3,-2,14979-9,1742-6,17861-6,...,742-7,751-8,770-8,777-3,785-6,786-4,787-2,788-0,789-8,AKI_LABEL
0,,,,,0.72,0.95,0.88,,,9.3,...,1.0,5.6,51.0,228.0,32.2,33.4,96.6,13.5,4.04,0
1,,1.210,,0.97,1.04,0.93,0.96,,22.0,8.9,...,,,,168.0,31.8,33.3,95.6,14.6,4.20,0
2,1.03,0.990,,,1.01,0.91,0.82,,22.0,8.8,...,1.3,14.2,82.0,165.0,29.1,31.4,92.8,15.7,3.82,0
3,,,,0.69,0.90,0.73,0.62,32.2,28.0,7.8,...,,,,152.0,29.5,33.8,87.1,14.0,3.73,0
4,0.60,0.615,0.685,0.62,0.50,0.53,0.56,,12.0,7.8,...,0.4,1.8,7.0,66.0,32.0,32.7,97.8,16.8,2.32,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17174,1.11,1.170,1.170,1.19,1.16,1.09,1.10,67.3,10.0,8.7,...,0.6,8.7,85.0,186.0,32.5,35.1,92.6,17.3,2.82,0
17175,,,,,,1.25,1.21,,,9.2,...,,,,248.0,30.8,34.2,90.1,15.7,3.62,1
17176,,,,1.32,0.91,0.94,1.06,28.9,14.0,9.7,...,1.0,5.0,65.0,460.0,28.7,33.2,86.4,16.7,3.98,0
17177,1.31,1.280,1.460,1.36,1.52,1.60,1.62,103.8,3.0,9.0,...,0.8,3.7,58.0,213.0,29.7,32.4,91.8,13.9,3.81,0


In [5]:
# Number of rows in the loaded frame.
len(data_dev)

17179

# Calculate Pairwise Data Overlap Rates for SCr

For each day in the time window, the overlap vector is True if both patients have a record on that day and False otherwise. We then weight the overlap vector with a Gaussian distribution, so that days closer to the prediction point contribute more to the overlap rate.

In [6]:
# Parameters of the normal distribution used to weight the SCr overlap vector.
mean = 0.0
sd_SCR = 1.0

# One weight per entry of time_window. Entry k is the area under the normal
# curve between pos and pos + 1, where pos = len(time_window) - 1 - k, so the
# last entry of the window receives the largest weight.
normal_distribution_SCR = [
    stats.norm.cdf(pos + 1, loc=mean, scale=sd_SCR) - stats.norm.cdf(pos, loc=mean, scale=sd_SCR)
    for pos in reversed(range(len(time_window)))
]

# Only one side of the bell curve is used, so double the areas.
normal_distribution_SCR = np.array(normal_distribution_SCR) * 2
%store normal_distribution_SCR

Stored 'normal_distribution_SCR' (ndarray)


In [7]:
def check_vec_overlap(u, v):
    """Return the element-wise logical AND of two equal-length vectors.

    Both inputs are copied into NumPy arrays first; the result is a boolean
    array that is True wherever both `u` and `v` are truthy.

    Raises:
        AssertionError: if `u` and `v` differ in length.
    """
    assert len(u) == len(v)
    left, right = np.array(u), np.array(v)
    return np.logical_and(left, right)

In [8]:
def calculate_overlap_rate_SCR(u, v, norm_distr):
    """Weighted SCr overlap rate between two presence vectors.

    Builds the boolean overlap vector of `u` and `v` with `check_vec_overlap`
    and scores it against `norm_distr` with `overlap_rate_SCR`.
    """
    return overlap_rate_SCR(check_vec_overlap(u, v), norm_distr)

In [9]:
def calculate_overlap_rate_LAB(u, v, norm_distr):
    """Weighted lab overlap rate between two presence vectors.

    Builds the boolean overlap vector of `u` and `v` with `check_vec_overlap`
    and scores it against `norm_distr` with `overlap_rate_LAB`.
    """
    return overlap_rate_LAB(check_vec_overlap(u, v), norm_distr)

In [10]:
def overlap_rate_LAB(overlap_vec, norm_distr):
    """Sum the per-position weights in `norm_distr` selected by `overlap_vec`.

    The two inputs are multiplied element-wise and the products are summed.
    """
    weighted = norm_distr * overlap_vec
    return np.sum(weighted)

In [11]:
#apply a Gaussian distribution on SCr overlap vec
def overlap_rate_SCR(overlap_vec, norm_distr):
    """Sum the per-position weights in `norm_distr` selected by `overlap_vec`.

    The two inputs are multiplied element-wise and the products are summed.
    """
    weighted = norm_distr * overlap_vec
    return np.sum(weighted)

In [12]:
def calculate_overlap(args):
    """Score one row of `df` against every row below it.

    Args:
        args: tuple `(index, df, metric, norm_distr)`, packed into a single
            argument so the function can be used with `Pool.imap`.
            `metric(u, v, norm_distr)` is called with two rows of `df`.

    Returns:
        List of `metric` values for the pairs `(index, j)` with
        `j` in `index + 1 .. len(df) - 1`; empty when `index` is the last row.
    """
    index, df, metric, norm_distr = args
    # The anchor row does not change inside the loop, so look it up once
    # instead of rebuilding it for every comparison. `metric` must not
    # mutate its arguments.
    anchor = df.iloc[index]
    return [metric(anchor, df.iloc[j], norm_distr) for j in range(index + 1, len(df))]

In [13]:
#parallel computing
def parallel_overlap_matrix_comp(df, num_processes, metric, norm_distr):
    """Compute the symmetric pairwise overlap matrix of `df`'s rows in parallel.

    Args:
        df: frame whose rows are compared pairwise.
        num_processes: number of worker processes in the pool.
        metric: callable `metric(u, v, norm_distr)` applied to each row pair.
        norm_distr: weights forwarded unchanged to `metric`.

    Returns:
        Square array with one row/column per row of `df`, ones on the
        diagonal and `metric` values mirrored across it.
    """
    total = len(df)
    if total < 2:
        # No pairs to compare: return the trivial matrix directly instead of
        # handing an empty result list to create_similarity_matrix.
        return np.ones((total, total))

    tasks = [(i, df, metric, norm_distr) for i in range(total - 1)]
    # The context manager tears the pool down even when a worker raises or
    # the run is interrupted, so no worker processes are left behind.
    with Pool(num_processes) as pool:
        # list() drains every result before the pool is terminated on exit.
        results = list(tqdm(pool.imap(calculate_overlap, tasks), total=total - 1))
    return create_similarity_matrix(results)

In [14]:
def create_similarity_matrix(distance_list):
    """Assemble a symmetric matrix from its strict upper-triangle rows.

    Args:
        distance_list: sequence where entry `i` holds the values for the
            pairs `(i, i + 1), (i, i + 2), ...`. The first entry fixes the
            matrix size as `len(distance_list[0]) + 1`.

    Returns:
        `(n, n)` float array with ones on the diagonal and the given values
        mirrored across it. An empty `distance_list` yields a `(1, 1)` matrix.
    """
    if len(distance_list) == 0:
        # A single item has no pairs; avoid indexing distance_list[0].
        return np.ones((1, 1))

    n = len(distance_list[0]) + 1
    matrix = np.ones((n, n))

    for i in range(n - 1):
        row = distance_list[i]
        stop = i + 1 + len(row)
        matrix[i, i + 1:stop] = row
        # Mirror into the lower triangle with a slice write, rather than a
        # separate element-by-element Python loop over all pairs.
        matrix[i + 1:stop, i] = row
    return matrix

In [15]:
# Split the frame into the columns listed in time_window and the columns
# listed in lab_feature_space (presumably the daily SCr values and the lab
# features respectively — confirm against where those settings are stored).
SCR_part = data_dev.loc[:, time_window]
LAB_part = data_dev.loc[:, lab_feature_space]

In [16]:
# Presence indicators: 1 where a value is recorded, 0 where it is missing.
SCR_part_bin = SCR_part.notna().astype(int)
LAB_part_bin = LAB_part.notna().astype(int)

# Main Computing

In [17]:
def check_matrix_sanity(matrix):
    """Assert that `matrix` is square and that all entries lie in [0, 1].

    Entries are rounded to 3 decimals before the range check so that small
    floating-point overshoot does not trip the assertion.

    Raises:
        AssertionError: if the matrix is not 2-D and square, or any rounded
            entry falls outside [0, 1] (NaN entries fail the check).
    """
    assert matrix.ndim == 2 and matrix.shape[0] == matrix.shape[1], \
        f"expected a square 2-D matrix, got shape {matrix.shape}"
    # Round once and reuse: the matrix can be large and each rounding pass
    # allocates a full copy.
    rounded = np.round(matrix, 3)
    assert np.all((rounded >= 0) & (rounded <= 1)), \
        "matrix entries must lie in [0, 1] after rounding to 3 decimals"

In [18]:
# Size the worker pool from the CPUs this process is allowed to run on.
# multiprocessing.cpu_count() reports every CPU on the machine, which
# oversubscribes when the job is restricted to a subset of cores (e.g. by a
# scheduler or taskset). os.sched_getaffinity is not available on every
# platform, so fall back to the machine-wide count there.
try:
    cpu_count = len(os.sched_getaffinity(0))
except AttributeError:
    cpu_count = multiprocessing.cpu_count()

In [19]:
# Pairwise SCr overlap matrix over the presence indicators, weighted by
# normal_distribution_SCR and computed across cpu_count worker processes.
SCR_overlap = parallel_overlap_matrix_comp(SCR_part_bin, cpu_count, 
                                       calculate_overlap_rate_SCR, normal_distribution_SCR)

# Must be square with all entries in [0, 1]; then summarise the distribution.
check_matrix_sanity(SCR_overlap)
print(np.median(SCR_overlap))
print(np.mean(SCR_overlap))

# Persist the matrix (hard-coded absolute path).
np.save('/blue/yonghui.wu/lideyi/Personalization_Methodology/SCR_overlap.npy', SCR_overlap)

100%|██████████| 17178/17178 [12:22<00:00, 23.13it/s] 


0.9544997361036416
0.899984593064256


In [20]:
# Pairwise lab overlap matrix over the presence indicators, weighted by
# lab_overlap_weighting and computed across cpu_count worker processes.
LAB_overlap = parallel_overlap_matrix_comp(LAB_part_bin, cpu_count, 
                                       calculate_overlap_rate_LAB, lab_overlap_weighting)

# Must be square with all entries in [0, 1].
check_matrix_sanity(LAB_overlap)

# What is the median of each matrix? Is the penalty too large?
print(np.median(LAB_overlap))
print(np.mean(LAB_overlap))

# Persist the matrix (hard-coded absolute path).
np.save('/blue/yonghui.wu/lideyi/Personalization_Methodology/lab_overlap.npy', LAB_overlap)

100%|██████████| 17178/17178 [13:31<00:00, 21.18it/s] 


0.8686669751439782
0.8610430657594513
