In [4]:
import multiprocessing
import os
from multiprocessing import Pool

import numpy as np
import pandas as pd
import scipy.stats as stats
from tqdm import tqdm
from tqdm.auto import tqdm
tqdm.pandas()
%store -r raw_path
%store -r time_window
%store -r lab_overlap_weighting
%store -r lab_feature_space

In [5]:
# Sizes of the two restored containers, printed one per line.
print(len(time_window), len(lab_feature_space), sep="\n")

7
40


In [6]:
# Load the development dataset from its CSV file.
# NOTE(review): `raw_path` is restored with %store -r but is not used in the
# visible cells -- confirm whether this location should be derived from it.
dataset_csv = "/blue/yonghui.wu/lideyi/Personalization_Methodology/dataset.csv"
data_dev = pd.read_csv(dataset_csv)

In [7]:
# Number of rows in the development dataset.
data_dev.shape[0]

17179

# Calculate Pairwise Data Overlap Rates for SCr

If two patients both have a record on the same day, the corresponding entry of the overlap vector is True; otherwise it is False. We then weight the overlap vector with a Gaussian distribution: the closer a day is to the prediction point, the more weight its overlap receives.

In [8]:
mean = 0.0
sd_SCR = 1.5

normal_distribution_SCR = []
for i in range(len(time_window)):
    pos = len(time_window) - 1 - i
    AUC = stats.norm.cdf(pos + 1, loc=mean, scale=sd_SCR) - stats.norm.cdf(pos, loc=mean, scale=sd_SCR)
    normal_distribution_SCR.append(AUC)
    
normal_distribution_SCR = np.array(normal_distribution_SCR) * 2
%store normal_distribution_SCR

Stored 'normal_distribution_SCR' (ndarray)


In [9]:
def check_vec_overlap(u, v):
    """Return the element-wise logical AND of two equal-length vectors.

    Parameters
    ----------
    u, v : array-like
        Vectors of the same length; an entry counts as present when truthy.

    Returns
    -------
    numpy.ndarray of bool
        True at the positions where both ``u`` and ``v`` are truthy.

    Raises
    ------
    AssertionError
        If ``u`` and ``v`` differ in length.
    """
    assert len(u) == len(v)
    return np.logical_and(np.array(u), np.array(v))

In [10]:
def calculate_overlap_rate_SCR(u, v, norm_distr):
    """Weighted SCr overlap rate between two presence vectors.

    Builds the boolean overlap vector of ``u`` and ``v`` with
    ``check_vec_overlap`` and scores it with ``overlap_rate_SCR`` using the
    weights in ``norm_distr``.
    """
    return overlap_rate_SCR(check_vec_overlap(u, v), norm_distr)

In [11]:
def calculate_overlap_rate_LAB(u, v, norm_distr):
    """Weighted lab overlap rate between two presence vectors.

    Builds the boolean overlap vector of ``u`` and ``v`` with
    ``check_vec_overlap`` and scores it with ``overlap_rate_LAB`` using the
    weights in ``norm_distr``.
    """
    return overlap_rate_LAB(check_vec_overlap(u, v), norm_distr)

In [12]:
def overlap_rate_LAB(overlap_vec, norm_distr):
    """Sum of the element-wise product of the weights and the overlap vector.

    Parameters
    ----------
    overlap_vec : array-like
        Overlap indicators, one per feature position.
    norm_distr : array-like
        Weight applied to each position of ``overlap_vec``.

    Returns
    -------
    The summed weighted overlap.
    """
    weighted_overlap = np.multiply(norm_distr, overlap_vec)
    return np.sum(weighted_overlap)

In [13]:
def overlap_rate_SCR(overlap_vec, norm_distr):
    """Apply the Gaussian weights to an SCr overlap vector and sum the result.

    Parameters
    ----------
    overlap_vec : array-like
        Overlap indicators, one per time-window position.
    norm_distr : array-like
        Weight applied to each position of ``overlap_vec``.

    Returns
    -------
    The summed weighted overlap.
    """
    weighted_overlap = np.multiply(norm_distr, overlap_vec)
    return np.sum(weighted_overlap)

In [14]:
def calculate_overlap(args):
    """Score one row of ``df`` against every row that comes after it.

    Parameters
    ----------
    args : tuple
        ``(index, df, metric, norm_distr)`` packed into a single argument so
        the function can be used with ``Pool.imap``. ``metric`` is called as
        ``metric(row_a, row_b, norm_distr)``.

    Returns
    -------
    list
        ``metric`` values for the pairs ``(index, index + 1)`` through
        ``(index, len(df) - 1)``, in that order; empty when no later row exists.
    """
    index, df, metric, norm_distr = args
    n_rows = len(df)
    if index + 1 >= n_rows:
        # No later rows to compare against.
        return []
    # The anchor row is the same for every pair, so fetch it once instead of
    # repeating the positional lookup on each iteration.
    anchor_row = df.iloc[index]
    return [metric(anchor_row, df.iloc[j], norm_distr) for j in range(index + 1, n_rows)]

In [15]:
def parallel_overlap_matrix_comp(df, num_processes, metric, norm_distr):
    """Compute the pairwise overlap matrix of the rows of ``df`` in parallel.

    One task is submitted per row (except the last); each task scores that row
    against all later rows via ``calculate_overlap``. The per-row results are
    assembled into a full matrix by ``create_similarity_matrix``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per entity to compare.
    num_processes : int
        Number of worker processes in the pool.
    metric : callable
        Called as ``metric(row_a, row_b, norm_distr)``.
    norm_distr : array-like
        Weights forwarded unchanged to ``metric``.

    Returns
    -------
    numpy.ndarray
        Square matrix with one row/column per row of ``df``. With fewer than
        two rows there are no pairs, so an all-ones matrix of shape
        ``(len(df), len(df))`` is returned without starting a pool.
    """
    total = len(df)
    if total < 2:
        # Nothing to compare; avoids handing an empty result list to
        # create_similarity_matrix.
        return np.ones((total, total))

    # NOTE(review): every task tuple carries the whole frame, so it is
    # presumably serialized once per task -- consider sharing it through a
    # pool initializer if task dispatch turns out to dominate the runtime.
    tasks = ((i, df, metric, norm_distr) for i in range(total - 1))

    # The context manager tears the workers down even when a task raises;
    # on success the pool is still closed and joined gracefully first.
    with Pool(num_processes) as pool:
        results = list(tqdm(pool.imap(calculate_overlap, tasks), total=total - 1))
        pool.close()
        pool.join()
    return create_similarity_matrix(results)

In [16]:
def create_similarity_matrix(distance_list):
    """Build a symmetric matrix with a unit diagonal from upper-triangle rows.

    Parameters
    ----------
    distance_list : sequence of sequences
        ``distance_list[i]`` holds the values for the pairs ``(i, i + 1)``,
        ``(i, i + 2)``, ... The matrix size is taken from the first row:
        ``n = len(distance_list[0]) + 1``.

    Returns
    -------
    numpy.ndarray
        ``(n, n)`` float matrix with ones on the diagonal and each value
        mirrored across it. An empty ``distance_list`` yields a ``1 x 1``
        matrix of ones.
    """
    n = len(distance_list[0]) + 1 if len(distance_list) else 1

    # Start from ones so the diagonal (self-overlap) is already in place.
    matrix = np.ones((n, n))

    for i in range(n - 1):
        row_values = distance_list[i]
        stop = i + 1 + len(row_values)
        # Write the row into the upper triangle and mirror it straight into
        # the lower triangle; this replaces an element-by-element Python loop.
        matrix[i, i + 1:stop] = row_values
        matrix[i + 1:stop, i] = row_values
    return matrix

In [17]:
# Column-wise views of the dataset: the `time_window` columns feed the SCr
# overlap and the `lab_feature_space` columns feed the lab overlap.
SCR_part, LAB_part = (
    data_dev.loc[:, selected_columns]
    for selected_columns in (time_window, lab_feature_space)
)

In [18]:
# Presence indicators: 1 where a value is recorded, 0 where it is missing.
SCR_part_bin = (~SCR_part.isna()).astype(int)
LAB_part_bin = (~LAB_part.isna()).astype(int)

# Main Computation

In [19]:
def check_matrix_sanity(matrix):
    """Validate that ``matrix`` is square with all entries in ``[0, 1]``.

    Entries are compared after rounding to 3 decimals, so values that overshoot
    the range by floating-point noise are tolerated. Because rounding is
    monotonic, only the minimum and maximum need to be rounded, which avoids
    building rounded and boolean copies of a potentially very large matrix.

    Parameters
    ----------
    matrix : array-like
        The matrix to validate.

    Raises
    ------
    AssertionError
        If ``matrix`` is not a square 2-D array, or if any entry (including
        NaN) falls outside ``[0, 1]`` after rounding. Raised explicitly so the
        check is not stripped when Python runs with ``-O``.
    """
    values = np.asarray(matrix)
    if values.ndim != 2 or values.shape[0] != values.shape[1]:
        raise AssertionError(f"expected a square 2-D matrix, got shape {values.shape}")
    if values.size == 0:
        # An empty matrix has no entries that could be out of range.
        return

    lowest = np.round(values.min(), 3)
    highest = np.round(values.max(), 3)
    # NaN propagates through min/max and fails both comparisons.
    if not (lowest >= 0 and highest <= 1):
        raise AssertionError(
            f"matrix entries must lie in [0, 1]; found min={lowest}, max={highest}"
        )

In [20]:
# Size the pool by the CPUs this process is allowed to run on.
# multiprocessing.cpu_count() reports every CPU in the machine, which can be
# more than the process's affinity mask permits (e.g. under a job scheduler).
try:
    cpu_count = len(os.sched_getaffinity(0))
except AttributeError:
    # os.sched_getaffinity is not available on every platform.
    cpu_count = multiprocessing.cpu_count()

In [21]:
# Pairwise SCr overlap between all rows, weighted by the Gaussian profile.
SCR_overlap = parallel_overlap_matrix_comp(
    df=SCR_part_bin,
    num_processes=cpu_count,
    metric=calculate_overlap_rate_SCR,
    norm_distr=normal_distribution_SCR,
)

# Stop here if the matrix is not square or has entries outside [0, 1].
check_matrix_sanity(SCR_overlap)

# Median and mean over all matrix entries, one per line.
print(np.median(SCR_overlap), np.mean(SCR_overlap), sep="\n")

# Persist the matrix as a .npy file.
np.save('/blue/yonghui.wu/lideyi/Personalization_Methodology/SCR_overlap.npy', SCR_overlap)

100%|██████████| 17178/17178 [05:55<00:00, 48.34it/s] 


0.8175775605482642
0.8232253746466658


In [22]:
# Pairwise lab overlap between all rows, weighted by `lab_overlap_weighting`.
LAB_overlap = parallel_overlap_matrix_comp(
    df=LAB_part_bin,
    num_processes=cpu_count,
    metric=calculate_overlap_rate_LAB,
    norm_distr=lab_overlap_weighting,
)

# Stop here if the matrix is not square or has entries outside [0, 1].
check_matrix_sanity(LAB_overlap)

# What is the median of each matrix? Is the penalty too large?
print(np.median(LAB_overlap), np.mean(LAB_overlap), sep="\n")

# Persist the matrix as a .npy file.
np.save('/blue/yonghui.wu/lideyi/Personalization_Methodology/lab_overlap.npy', LAB_overlap)

100%|██████████| 17178/17178 [06:38<00:00, 43.14it/s] 


0.8686669751439782
0.8610430657594513
