In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os

### Normalize the Data and Separate Patients into Individual Files

In [2]:
# Load Datasets
uc = pd.read_csv('/Users/emmadyer/Desktop/ibd_long_project/data/uc_expanded.csv')
cd = pd.read_csv('/Users/emmadyer/Desktop/ibd_long_project/data/cd_expanded.csv')
healthy = pd.read_csv('/Users/emmadyer/Desktop/ibd_long_project/data/healthy_expanded.csv')
test_ibd = pd.read_csv('/Users/emmadyer/Desktop/ibd_long_project/data/test_ibd_expanded.csv')

In [3]:
# Make a list of the lab values (column headers)
lab_value_names = list(healthy.columns.values)
lab_value_names = lab_value_names[1:-1]

# Make a list of the DataFrames
all_dfs = [uc, cd, healthy]

In [4]:
# Normalize Each Patient's Data
# Min/Max scaling [0,1] and account for missing values
def minmax_scale(x, min, max):
    ''' 
    Min/Max Scaler [0,1] that also handles when the max=min.
    Inputs:
    x (int) - Value to be normalized
    min (int) - minmum value
    max (int) - maximum value

    Outputs:
    norm (int) - normalized value [0,1]
    '''
    if min != max:
        scaled = max - min
    else:
        scaled = 1
    norm = (x - min) / scaled
    return norm

In [5]:
dataset_norms = []
for df in all_dfs:
    pts_lst = []
    mean_norm_values = []
    for lab in lab_value_names:
        #print("Lab Name: ", lab)
        values = df[lab].fillna(0)
        #values = values.astype(np.float16)
        min_lab = min(values)
        #print("Minimum Value: ", min_lab)
        max_lab = max(values)
        #print("Maximum Value: ", max_lab)
        mean_value = values.mean()
        norm_lab_val = minmax_scale(mean_value, min_lab, max_lab)
        #print('Scaled Value: ', norm_lab_val, '\n')
        mean_norm_values.append(norm_lab_val)
    dataset_norms.append(mean_norm_values)

In [6]:
# Need to calculate a mean normalized value in the event that a given patient only has one recorded value or no values.
# Mean Noramlized Values

# Lists to store patient normalized dataframes
all_patients_lsts = []
file_names = ['uc', 'cd', 'healthy']
for pt_population, df in enumerate(all_dfs):
    mean_norm_values = dataset_norms[pt_population]
    patient_id_lst = list(df.patient_id.unique())
    print(len(patient_id_lst))
    for patient in patient_id_lst:
        patient_df = df[df['patient_id'] == patient].reset_index()
        # Patient Normalized Values
        visits = list(range(0,5))
        na_reference_df = patient_df.copy()
        na_reference_df = na_reference_df.isnull()
        #norm_df = patient_df.copy().reset_index()
        for i, lab in enumerate(lab_value_names):
            #print("\n Lab Name: ", lab)
            total_missing = patient_df[lab].isnull().sum()
            if total_missing == 5: 
                continue
            else:
                values = patient_df[lab].fillna(0)
                min_lab = min(values)
                max_lab = max(values)
                #print("Minimum Value: ", min_lab)
                #print("Maximum Value: ", max_lab)
                for v in visits:
                    is_missing = na_reference_df.loc[v, lab]
                    #print("Missing???: ", is_missing)
                    if is_missing == False:
                        value = patient_df.loc[v, lab]
                        norm_val = minmax_scale(value, min_lab, max_lab)
                        patient_df.loc[v, lab] = norm_val
                    #print("Lab Value: ", value)
                    #print('Scaled Value: ', norm_val)
        pts_lst.append(patient_df)
        file_path = '/Users/emmadyer/Desktop/ibd_long_project/data/' + file_names[pt_population] + '_pts/' + str(patient) + '.csv'
        patient_df.to_csv(file_path, index=False)
        #print(norm_df)
    all_patients_lsts.append(pts_lst)
        

1688
2197
5415


### Calculate Distance Correlation
This is calculated on the whole dataset. We are solving for the distance between two vectors. In this case, we are calculating the distance correlation for the pairwise vectors for each analyte. 
1. Create individual arrays of each analyte with NA values removed.

In [2]:
import dcor
import math

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [3]:
# Paths for Emma's Macbook 

# Load Datasets
uc = pd.read_csv('/Users/emmadyer/Desktop/data/uc_expanded.csv')
cd = pd.read_csv('/Users/emmadyer/Desktop/data/cd_expanded.csv')
healthy = pd.read_csv('/Users/emmadyer/Desktop/data/healthy_expanded.csv')
test_ibd = pd.read_csv('/Users/emmadyer/Desktop/data/test_ibd_expanded.csv')

# Make a list of the lab values (column headers)
lab_value_names = list(healthy.columns.values)
lab_value_names = lab_value_names[1:-1]

# Make a list of the DataFrames
all_dfs = [uc, cd, healthy]

In [4]:
df = test_ibd.copy()
individual_labs = []
for lab in lab_value_names: 
    lab_data = df[lab].dropna()
    lab_data = lab_data.to_numpy()
    individual_labs.append(lab_data)

2. Calculate the distance between each array. Need to adjust the arrays to be the same size before calculating the distance correlation.

In [5]:
# Create a dictionary of dictionaries. Structure is that the outer key is a lab name,
# the inner key is a lab name with a value of the distance correlation between the 
# two labs specified by the keys

whole_correlation_dict = dict()
for i, lab_1 in enumerate(individual_labs):
    all_cors = []
    len_l1 = len(lab_1)
    sub_correlation_dict = dict()
    for j, lab_2 in enumerate(individual_labs):
        len_l2 = len(lab_2)
        min_len = min(len_l1, len_l2)
        red_1 = lab_1[:min_len]
        red_2 = lab_2[:min_len]
        correlation = dcor.distance_correlation(red_1, red_2)
        sub_correlation_dict[lab_value_names[j]] = correlation
    whole_correlation_dict[lab_value_names[i]] = sub_correlation_dict

# To test that this is working correctly, we can see that, as expected, the correlation 
# of crp to crp is 1
whole_correlation_dict['crp']['crp']


1.0

In [6]:
# Formatted with a list of correlation values


l_whole_correlation_dict = dict()
for i, lab_1 in enumerate(individual_labs):
    all_cors = []
    len_l1 = len(lab_1)
    cor_lst = []
    for j, lab_2 in enumerate(individual_labs):
        len_l2 = len(lab_2)
        min_len = min(len_l1, len_l2)
        red_1 = lab_1[:min_len]
        red_2 = lab_2[:min_len]
        correlation = dcor.distance_correlation(red_1, red_2)
        cor_lst.append(correlation)
    l_whole_correlation_dict[lab_value_names[i]] = cor_lst

In [7]:
# Calculate the distance from the missing data point (to be imputed) to ALL
# available values

# Create a DataFrame with the correlation values (correlation matrix in a dataframe format)
correlation_matrix = pd.DataFrame()
correlation_matrix['lab_name'] = lab_value_names
correlation_matrix = correlation_matrix.set_index('lab_name')

for labs, cor_vals in l_whole_correlation_dict.items():
    correlation_matrix[labs] = cor_vals


#### Imputation Steps
1. Land on a missing value in the patient matrix.
2. Identify if the patient has a non-missing value for the given lab at another visit.
3. If there are no data for this lab in any visit, impute with the population mean.
4. If there is another data point for this lab value at another visit, calculate the weight d(u,v) between all values, including multiple values of the same lab (that have a correlation of 1)
   
Note: The summed MIC helps account for temporal relationships. The sum constitutes the sum of MIC for non-missing values that exist in the two vists that are being compared when calculating d(v,u). For example if a patient is missing a value for cal, but they have measurements for crp, fol, hgb, and wbc in visits 1 and 2, the the summed MIC would be calculated as the sum of the MIC values for cal|fol, cal|hgb, etc.

In [8]:
# Open each patient file and impute!!!!

uc_pt_paths = os.listdir('/Users/emmadyer/Desktop/data/uc_pts')
test_pt = ['/Users/emmadyer/Desktop/data/uc_pts/22158.csv']
pt_file_lst = test_pt

In [None]:
def compute_weights(patient_matrix, mask_matrix, value):
    '''
    Helper function to compute the weights of a given lab value and
    all other available lab values. Must have correlation matrix pre-calculated. 

    Inputs:
    patient_matrix (DataFrame): contains labs as columns and visits as rows
    mask_matrix (DataFrame): Boolean mask of the patient matrix where True = null
    and False = int. 
    value

    '''


In [31]:
def find_neighbors(patient_matrix, visit_v_idx, visit_u_idx, lab_to_impute, whole_correlation_dict=whole_correlation_dict):
    ''' 
    Find the distance correlation sum between two given visits. This function
    compares the lab values that are mutually present between two visits, accesses
    the distance correlation between those labs, and sums the distance correlation
    between the lab value that is missing and the lab values that contain measurments
    in both visits. 

    Example: Visit 1 has lab values for cal, pro, fol, fer, hgb, hct, and wbc. Visit 2 has
    values for alt, pro, alb, hgb, pmn, wbc, fer, btwelve, and hct. The value being imputed is 
    plt. This function identifies that both visits have lab values for pro, wbc, fer, hct, and
    hgb. Then, it identifies the distance correlation between pro-plt, wbc-plt, fer-plt, hct-plt,
    and hgb-plt. Finally, the identified distance correlations are summed together, which is then
    used to calculate the distance metric. The inverse distance metric and the 
    value of the comparator visit (visit_u) are saved and used for the output.

    Inputs:
    patient_matrix (DataFrame): contains labs as columns and visits as rows
    visit_d_idx (int): index for the visit that contains the value to be imputed.
    visit_u_idx (int): index for the visit being compared to the visit with the value
    that is to be imputed. 
    lab_to_impute (str): represents the lab type (column header) of the value being imputed

    Outputs:
    weights, values (tuple [list, list]): Weights of each neighbor and the value
    of the neighbor, each stored in a list where the index of the list 
    corresponds to matching weights and values. 
    '''
    visit_v = patient_matrix.loc[visit_v_idx,:].dropna()
    visit_u = patient_matrix.loc[visit_u_idx, :].dropna()
    v_labs = list(visit_v.index)
    u_labs = list(visit_v.index)
    shared_labs = [l for l in v_labs and u_labs if l in v_labs and l in u_labs]
    correlation_vals = []
    for l in shared_labs:
        val = whole_correlation_dict[lab_to_impute][l]
        correlation_vals.append(val)
    sum_dis_cors = sum(correlation_vals)
    #print("Distance Correlations Sum: ", sum_dis_cors, '\n')
    weights_values = []
    for l in shared_labs:
        #print('Lab: ', l)
        v_j = visit_v.loc[l]
        u_j = visit_u.loc[l]
        distance_metric = math.sqrt(sum_dis_cors * (v_j - u_j)**2) / sum_dis_cors
        #print("Distance Metric: ", distance_metric)
        # Calculate the inverse of the distance metric because the smaller the distance
        # the larger the weight (i.e. closer neighbor), we can then say that larger 
        # weights are closer, instead of working with smaller distances.
        weight = 1 / (distance_metric + 1e-6) # add epsilon to avoid division by 0
        #print("Weight: ", weight, '\n')
        weights_values.append((weight, u_j))
    return weights_values


In [32]:
# Testing the find_neighbors() function:
patient_matrix = pd.read_csv('/Users/emmadyer/Desktop/data/uc_pts/22158.csv')
pt_id = patient_matrix.loc[0:'patient_id']
patient_matrix = patient_matrix.drop(['index', 'patient_id', 'year'], axis=1)

w_v = find_neighbors(patient_matrix, 0, 1, 'crp')
print(w_v)

[(11.468403116549295, 0.609375), (37.90090810812778, 0.9385382059800664), (5.451533630035259, 0.572682925995478), (10.249897775099143, 0.2272727272727272), (2.3295406724504337, 1.0), (2.795447504518787, 0.8333333333333338), (3.0284007577507444, 0.8461538461538459), (3.979625413416238, 0.804878049173111), (3.9064543520976662, 0.7706422018348625), (5.1767423105843635, 0.5499999999999999), (7.3910706917175375, 0.6848184818481847), (10.649240188922873, 0.78125)]


In [None]:
def impute_wknn(k, patient_matrix, mask_matrix, lab, visit_idx, correlation_matrix=correlation_matrix):
    ''' 
    Impute missing lab values given a patient matrix.

    Inputs:
    k (integer): k-number of neighbors to use when imputing
    patient_matrix (DataFrame): contains labs as columns and visits as rows
    mask_matrix (DataFrame): Boolean mask of the patient matrix where True = null
    and False = int. 
    lab (str): lab type (column header) of the missing value
    visit_idx (int): index in the patient matrix of the visit from which the missing value arises

    Output:
    imputed_value (int): Imputed value for the missing values of a given lab
    for a given patient.
    '''
    # Loop and get data pairs to calculate d(u,v)
    all_weights_values = []
    value = mask_matrix.loc[visit_idx, lab]
    # impute missing value
    for i in range(0, len(patient_matrix.rows.values)):
        if visit_idx == i: # Do not want to compare a visit to itself
            continue
        if mask_matrix.loc[i, lab] == True: # Make sure the comparing visit has a value
            continue
        else:
            weights_values = find_neighbors(patient_matrix, visit_idx, i, lab)
        all_weights_values = all_weights_values + weights_values
    sorted_weights_vals = sorted(all_weights_values)
    neighbors = sorted_weights_vals[:k]  
    w_knn, v_knn = map(list,zip(*sorted_weights_vals))

            

In [43]:
l = [(5,4), (1,2), (3,4)]
w = [(1,2), (2,1)]
s = l[:3]
#print(s)
w, v = map(list,zip(*l))
print(w)
print(v)

[5, 1, 3]
[4, 2, 4]


In [53]:
v_num = list(range(0,5))

for pt in pt_file_lst:
    visits = pd.read_csv(pt)
    visits = visits.reset_index()
    pt_id = visits.loc[0:'patient_id']
    visits = visits.drop(['index', 'patient_id', 'year'], axis=1)
    mask_visits = visits.isnull()
    for lab in lab_value_names: # look through each lab in a given visit
        for n in v_num: # check the value of the lab
            value = mask_visits.loc[n, lab]
            if value == True: # value is missing â€” impute
                impute_wknn()

