In [2]:
import pandas as pd
import numpy as np


In [3]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 20)

In [4]:
# Load the longitudinal export and give the unnamed leading CSV column the name 'Visit'.
all_time_points = (
    pd.read_csv('all_time_points_07.05.22.csv')
    .rename(columns={'Unnamed: 0': 'Visit'})
)
all_time_points.head()

Unnamed: 0,Visit,CandID,PSCID_AD8,CandID.1,Study_visit_label_AD8,Visit_label_AD8,Date_taken,Candidate_Age,AD8_total_score,1_judgment_problems,...,50,anosmia_diagnosis_Mild Microsmia,anosmia_diagnosis_Moderate Microsmia,anosmia_diagnosis_Normosmia,anosmia_diagnosis_Severe Microsmia,anosmia_diagnosis_Total Anosmia,RBANS_version_A,RBANS_version_B,RBANS_version_C,RBANS_version_D
0,BL00,108583,MTL0392,108583,NAPBL00,BL00,2014-08-07,813.1,0.0,0.0,...,1.379539,0,0,1,0,0,1,0,0,0
1,BL00,113451,MTL0415,113451,NAPBL00,BL00,2014-10-02,764.7,0.0,0.0,...,-0.594918,0,0,1,0,0,1,0,0,0
2,BL00,115095,MTL0380,115095,PREBL00,BL00,2014-07-22,780.2,0.0,0.0,...,-0.099748,0,0,1,0,0,1,0,0,0
3,BL00,117603,MTL0482,117603,PREBL00,BL00,2015-08-25,723.9,0.0,0.0,...,0.861089,0,0,1,0,0,1,0,0,0
4,BL00,122650,MTL0008,122650,PREBL00,BL00,2013-01-30,885.3,0.0,0.0,...,-0.299726,1,0,0,0,0,1,0,0,0


In [5]:
# Setting multilevel indices: the first two columns (Visit, CandID) become the row MultiIndex.
# set_index keeps the dtype of every remaining column; rebuilding the frame from
# np.array(...) would coerce all columns of this mixed-type table to `object`.
all_time_points = all_time_points.set_index(list(all_time_points.columns[:2]))

In [6]:
#selecting all columns with age 
age_cols = ['Candidate_Age','Candidate_Age_Aud_pro','Candidate_Age_BP_Pulse_Weight','Candidate_Age_CSF_Proteins','Candidate_Age_lab','Candidate_Age_Med_use','Candidate_Age_RBANS','Candidate_Age_Smell']

In [7]:
#making sure they are relatively the same except for some missing values 
all_time_points[age_cols].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Candidate_Age,Candidate_Age_Aud_pro,Candidate_Age_BP_Pulse_Weight,Candidate_Age_CSF_Proteins,Candidate_Age_lab,Candidate_Age_Med_use,Candidate_Age_RBANS,Candidate_Age_Smell
Visit,CandID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BL00,108583,813.1,,813.1,814.5,813.1,813.1,813.1,813.1
BL00,113451,764.7,765.3,765.3,765.6,765.3,765.3,765.3,765.3
BL00,115095,780.2,,780.2,,780.2,780.2,780.2,780.2
BL00,117603,723.9,,723.8,,723.8,723.8,723.8,722.8
BL00,122650,885.3,,885.0,,885.0,885.0,885.0,885.0


In [8]:
all_time_points[age_cols].isna().sum()

Candidate_Age                     341
Candidate_Age_Aud_pro             830
Candidate_Age_BP_Pulse_Weight     306
Candidate_Age_CSF_Proteins       1148
Candidate_Age_lab                 301
Candidate_Age_Med_use             306
Candidate_Age_RBANS                 0
Candidate_Age_Smell               324
dtype: int64

All participants have an age value for Candidate_Age_RBANS, so this is the age we will use to compute the time difference between visits (relative to each participant's initial visit).

In [9]:
# Time elapsed (in months) since each participant's first visit, derived from the
# age at the RBANS assessment.
#
# The values are computed per participant but kept in the SAME row order as
# `all_time_points`, so `differences` can be attached to the frame positionally.
# Appending participant by participant would group the values by participant, which
# does not match a frame whose rows are grouped by visit (all BL00 rows first).
rbans_age = pd.to_numeric(all_time_points['Candidate_Age_RBANS'])

# Age at the first visit listed for each participant, broadcast to all of that
# participant's rows. 'first' takes the first non-null value; Candidate_Age_RBANS
# has no missing values, so this is simply the first listed visit.
first_visit_age = rbans_age.groupby(level='CandID').transform('first')

# One entry per row of all_time_points: 0 for the first visit, months since it otherwise.
differences = (rbans_age - first_visit_age).tolist()
ids = all_time_points.index.get_level_values('CandID').tolist()

diff_df = pd.DataFrame(np.array(differences), columns = ['Time_diff_months'], index = ids)

In [10]:
#adding a time difference column to our dataset
# NOTE(review): assigning a plain list is purely positional - pandas does not align it on
# CandID - so `differences` must be ordered exactly like the rows of `all_time_points`.
all_time_points['time_diff'] = differences

In [11]:
# Manual column inspection: list the columns that would remain after excluding the
# PSCID / duplicated CandID columns, the visit-label and Date_taken columns, and
# every Candidate_Age_* column other than Candidate_Age_RBANS.
all_time_points.drop(
    labels=[
        # PSCID_* identifiers, duplicated CandID, raw RBANS_version
        'PSCID_AD8', 'CandID.1', 'PSCID_APS', 'PSCID',
        'PSCID_BP_Pulse_Weight', 'PSCID_CSF_Proteins', 'PSCID_lab',
        'PSCID_Med_use', 'PSCID_Smell', 'PSCID_RBANS',
        'RBANS_version',
        # visit labels and assessment dates
        'Study_visit_label_AD8', 'Visit_label_AD8', 'Date_taken',
        'Study_visit_label_APS', 'Visit_label_APS',
        'Study_visit_label', 'Visit_label', 'Date_taken_Aud_pro',
        'Study_visit_label_BP_Pulse_Weight', 'Visit_label_BP_Pulse_Weight', 'Date_taken_BP_Pulse_Weight',
        'Study_visit_label_lab', 'Visit_label_lab', 'Date_taken_lab',
        'Study_visit_label_Med_use', 'Visit_label_Med_use', 'Date_taken_Med_use',
        'Study_visit_label_RBANS', 'Visit_label_RBANS', 'Date_taken_RBANS',
        'Study_visit_label_CSF_Proteins', 'Visit_label_CSF_Proteins', 'Date_taken_CSF_Proteins',
        # age columns other than Candidate_Age_RBANS
        'Candidate_Age', 'Candidate_Age_Aud_pro', 'Candidate_Age_BP_Pulse_Weight',
        'Candidate_Age_CSF_Proteins', 'Candidate_Age_lab', 'Candidate_Age_Med_use',
        'Candidate_Age_Smell',
    ],
    axis='columns',
).columns.tolist()

['AD8_total_score',
 '1_judgment_problems',
 '2_less_interest',
 '3_repeat',
 '4_trouble_learning_tool',
 '5_forget_month_year',
 '6_trouble_financial_affairs',
 '7_trouble_appointments',
 '8_daily_memory_trouble',
 'APS_score',
 'diagnosed_impairment',
 'hearing_aid',
 'subjective_hearing_impairment',
 'subjective_processing_impairment',
 '500_hz_left',
 '500_hz_right',
 '1000_hz_left',
 '1000_hz_right',
 '4000_hz_left',
 '4000_hz_right',
 'dsi_left',
 'dsi_right',
 'dsi_right_left',
 'worse_ear_dsi',
 'worse_ear_minus10db',
 'Systolic_blood_pressure',
 'Diastolic_blood_pressure',
 'Pulse',
 'Weight',
 'tau',
 'ptau',
 'Amyloid_beta_1_42',
 'ApoE',
 'G_CSF',
 'IL_15',
 'IL_8',
 'VEGF',
 'PCSK9',
 'hba1c_value',
 'tsh_value',
 'b12_value',
 'total_cholesterol_value',
 'HDL_value',
 'LDL_value',
 'SU_medication',
 'PRN_medication',
 'Candidate_Age_RBANS',
 'probable_MCI_visit',
 'immediate_memory_index_score',
 'visuospatial_constructional_index_score',
 'language_index_score',
 'attent

In [12]:
# Dropping columns: PSCID / duplicated CandID columns, visit labels, assessment dates,
# every Candidate_Age_* column other than Candidate_Age_RBANS, and the remaining
# columns listed in the last group.
all_time_points = all_time_points.drop(
    labels=[
        # PSCID_* identifiers, duplicated CandID, raw RBANS_version
        'PSCID_AD8', 'CandID.1', 'PSCID_APS', 'PSCID',
        'PSCID_BP_Pulse_Weight', 'PSCID_CSF_Proteins', 'PSCID_lab',
        'PSCID_Med_use', 'PSCID_Smell', 'PSCID_RBANS',
        'RBANS_version',
        # visit labels and assessment dates
        'Study_visit_label_AD8', 'Visit_label_AD8', 'Date_taken',
        'Study_visit_label_APS', 'Visit_label_APS',
        'Study_visit_label', 'Visit_label', 'Date_taken_Aud_pro',
        'Study_visit_label_BP_Pulse_Weight', 'Visit_label_BP_Pulse_Weight', 'Date_taken_BP_Pulse_Weight',
        'Study_visit_label_lab', 'Visit_label_lab', 'Date_taken_lab',
        'Study_visit_label_Med_use', 'Visit_label_Med_use', 'Date_taken_Med_use',
        'Study_visit_label_RBANS', 'Visit_label_RBANS', 'Date_taken_RBANS',
        'Study_visit_label_CSF_Proteins', 'Visit_label_CSF_Proteins', 'Date_taken_CSF_Proteins',
        # age columns other than Candidate_Age_RBANS
        'Candidate_Age', 'Candidate_Age_Aud_pro', 'Candidate_Age_BP_Pulse_Weight',
        'Candidate_Age_CSF_Proteins', 'Candidate_Age_lab', 'Candidate_Age_Med_use',
        'Candidate_Age_Smell',
        # smell-test bookkeeping, comments and medication columns
        'Study_visit_label_Smell', 'Visit_label_Smell', 'Date_taken_Smell',
        'diagnosis_anosmia', 'comments_uncategorized',
        'SU_medication', 'PRN_medication',
    ],
    axis='columns',
)

In [13]:
all_time_points = all_time_points.rename(columns={'Candidate_Age_RBANS':'Candidate_Age'})

In [14]:
list(all_time_points.columns)

['AD8_total_score',
 '1_judgment_problems',
 '2_less_interest',
 '3_repeat',
 '4_trouble_learning_tool',
 '5_forget_month_year',
 '6_trouble_financial_affairs',
 '7_trouble_appointments',
 '8_daily_memory_trouble',
 'APS_score',
 'diagnosed_impairment',
 'hearing_aid',
 'subjective_hearing_impairment',
 'subjective_processing_impairment',
 '500_hz_left',
 '500_hz_right',
 '1000_hz_left',
 '1000_hz_right',
 '4000_hz_left',
 '4000_hz_right',
 'dsi_left',
 'dsi_right',
 'dsi_right_left',
 'worse_ear_dsi',
 'worse_ear_minus10db',
 'Systolic_blood_pressure',
 'Diastolic_blood_pressure',
 'Pulse',
 'Weight',
 'tau',
 'ptau',
 'Amyloid_beta_1_42',
 'ApoE',
 'G_CSF',
 'IL_15',
 'IL_8',
 'VEGF',
 'PCSK9',
 'hba1c_value',
 'tsh_value',
 'b12_value',
 'total_cholesterol_value',
 'HDL_value',
 'LDL_value',
 'Candidate_Age',
 'probable_MCI_visit',
 'immediate_memory_index_score',
 'visuospatial_constructional_index_score',
 'language_index_score',
 'attention_index_score',
 'delayed_memory_index_sc

In [15]:
all_time_points.isna().sum()

AD8_total_score                                   391
1_judgment_problems                               344
2_less_interest                                   344
3_repeat                                          344
4_trouble_learning_tool                           344
5_forget_month_year                               344
6_trouble_financial_affairs                       344
7_trouble_appointments                            344
8_daily_memory_trouble                            344
APS_score                                        1069
diagnosed_impairment                              830
hearing_aid                                       831
subjective_hearing_impairment                     831
subjective_processing_impairment                  832
500_hz_left                                       830
500_hz_right                                      830
1000_hz_left                                      830
1000_hz_right                                     830
4000_hz_left                

In [16]:
all_time_points.index.get_level_values('CandID').isna().sum()

0

In [17]:
all_time_points.shape

(1562, 144)

# Getting fixed demographic measures from baseline assessment

In [18]:
import pandas as pd 

In [19]:
fixed = pd.read_csv('EN00_07.05.22.csv')

In [20]:
list(fixed.columns)

['CandID',
 'Education_years',
 'Height',
 'Handedness_result',
 'Handedness_left_total',
 'Handedness_right_total',
 'Handedness_right_left_difference',
 'Handedness_cumulative_total',
 'father_dx_ad_dementia',
 'mother_dx_ad_dementia',
 'sibling_dx_ad_dementia',
 'Candidate_Age_x',
 'Candidate_Age_y',
 'hba1c_value',
 'tsh_value',
 'total_cholesterol_value',
 'HDL_value',
 'LDL_value',
 'Candidate_Age_x.1',
 'Systolic_blood_pressure',
 'Diastolic_blood_pressure',
 'Pulse',
 'Weight',
 'Candidate_Age_y.1',
 'treatment_hypertension',
 'treatment_hyperlipidemia',
 'treatment_diabetes',
 'past_cancer',
 'past_depression',
 'past_osteoporosis',
 'past_migraine',
 'past_asthma',
 'past_atrial_fibrillation',
 'past_arthritis',
 'Candidate_Age_x.2',
 'CAIDE_total_score',
 'CAIDE_age_subscore',
 'CAIDE_education_subscore',
 'CAIDE_sex_subscore',
 'CAIDE_systolic_bp_subscore',
 'CAIDE_BMI_subscore',
 'CAIDE_cholesterol_subscore',
 'CAIDE_activity_subscore',
 'CAIDE_APOE4_subscore',
 'Candidate

In [21]:
# Attach the fixed baseline variables to the visit rows, matching on CandID.
# An outer join keeps keys found in either frame; column names present in both
# frames come back with pandas' default _x / _y suffixes.
df = pd.merge(all_time_points, fixed, how='outer', on='CandID')

In [22]:
df.shape

(1562, 333)

In [23]:
# Keep only the rows with a value in column '50'
# (original note: drops visits without a brain imaging recording).
df = df.loc[df['50'].notna()]

In [24]:
df.shape

(916, 333)

In [25]:
# Keep only the rows with a value in column 'APOE'
# (original note: drops visits without APOE screening).
df = df.loc[df['APOE'].notna()]

In [26]:
df.shape

(799, 333)

In [27]:
candids = list(df.CandID)

In [28]:
df = df.drop(columns=['probable_MCI_visit'])

In [29]:
len(df.CandID.unique())

318

In [30]:
list(df.columns)

['CandID',
 'AD8_total_score',
 '1_judgment_problems',
 '2_less_interest',
 '3_repeat',
 '4_trouble_learning_tool',
 '5_forget_month_year',
 '6_trouble_financial_affairs',
 '7_trouble_appointments',
 '8_daily_memory_trouble',
 'APS_score',
 'diagnosed_impairment',
 'hearing_aid',
 'subjective_hearing_impairment',
 'subjective_processing_impairment',
 '500_hz_left',
 '500_hz_right',
 '1000_hz_left',
 '1000_hz_right',
 '4000_hz_left',
 '4000_hz_right',
 'dsi_left',
 'dsi_right',
 'dsi_right_left',
 'worse_ear_dsi',
 'worse_ear_minus10db',
 'Systolic_blood_pressure_x',
 'Diastolic_blood_pressure_x',
 'Pulse_x',
 'Weight_x',
 'tau',
 'ptau',
 'Amyloid_beta_1_42',
 'ApoE',
 'G_CSF',
 'IL_15',
 'IL_8',
 'VEGF',
 'PCSK9',
 'hba1c_value_x',
 'tsh_value_x',
 'b12_value',
 'total_cholesterol_value_x',
 'HDL_value_x',
 'LDL_value_x',
 'Candidate_Age',
 'immediate_memory_index_score',
 'visuospatial_constructional_index_score',
 'language_index_score',
 'attention_index_score',
 'delayed_memory_in

In [31]:
len(df)

799

# Missing Data Imputation

In [32]:
#checking rows where sex is missing
np.where((df.Sex_Female==0) & (df.Sex_Male==0))

(array([], dtype=int64),)

In [33]:
#checking rows where age is missing
np.where((df.Candidate_Age.isna()))

(array([], dtype=int64),)

In [34]:
from tqdm import tqdm

In [35]:
np.random.seed(0)
def my_impute(df, allow_negative=()):
    """Fill missing values by sampling from each column's own observed values.

    Every column of ``df`` except the first and the last is converted with
    ``pd.to_numeric``. Entries that are NaN, or negative, are replaced by values
    drawn at random (with replacement) from the remaining entries of the same
    column. Draws use NumPy's global random state, which is seeded right above
    this definition.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Its first and last columns are skipped and do not appear
        in the result.
    allow_negative : str or iterable of column labels, optional
        Columns whose negative values are genuine measurements (for example
        signed or standardised scores). In these columns only NaN is treated as
        missing. Defaults to empty, i.e. negatives are treated as missing in
        every column, as before.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the imputed columns in their original order, on a
        fresh 0..n-1 index (the index of ``df`` is not carried over).

    Raises
    ------
    ValueError
        If a column needs imputation but has no valid value to sample from, or
        if ``pd.to_numeric`` cannot convert a column.
    """
    try:
        from tqdm import tqdm as progress
    except ImportError:
        # The progress bar is cosmetic; fall back to plain iteration without it.
        def progress(iterable):
            return iterable

    # A bare string is one column label, not an iterable of characters.
    if isinstance(allow_negative, str):
        allow_negative = [allow_negative]
    signed_cols = set(allow_negative)

    # Collect the columns first and build the frame once: inserting hundreds of
    # columns one at a time into an empty DataFrame fragments it.
    filled = {}
    for col in progress(df.columns[1:-1]):
        arr = np.array(pd.to_numeric(df[col]))
        b_nan = np.isnan(arr)
        if col in signed_cols:
            b_bad = b_nan
        else:
            # Comparing NaN with 0 is always False; silence the warning some
            # NumPy versions emit for it.
            with np.errstate(invalid='ignore'):
                b_bad = b_nan | (arr < 0)

        n_bad = int(np.sum(b_bad))
        if n_bad:
            donors = arr[~b_bad]
            if donors.size == 0:
                raise ValueError(
                    f"Cannot impute column {col!r}: it has no valid values to sample from"
                )
            arr[b_bad] = np.random.choice(donors, n_bad)
        filled[col] = arr
    return pd.DataFrame(filled)

In [36]:
# Impute every column of `df` except the first and the last (the slice my_impute iterates over).
# NOTE(review): my_impute treats negative values as missing. Column '50' shows negative
# values in the all_time_points.head() preview (e.g. -0.594918); these would be replaced by
# randomly drawn non-negative values - confirm this is intended.
imputed_df = my_impute(df)

  
100%|███████████████████████████████████████| 330/330 [00:00<00:00, 1694.56it/s]


In [37]:
imputed_df['APOE'] = list(df['APOE'])

  """Entry point for launching an IPython kernel.


In [38]:
imputed_df['CandID'] = list(df.CandID)

  """Entry point for launching an IPython kernel.


In [39]:
imputed_df.to_csv('prevent_AD_data_aug_2022.csv')

In [40]:
imputed_df.isna().sum()

AD8_total_score                                                   0
1_judgment_problems                                               0
2_less_interest                                                   0
3_repeat                                                          0
4_trouble_learning_tool                                           0
5_forget_month_year                                               0
6_trouble_financial_affairs                                       0
7_trouble_appointments                                            0
8_daily_memory_trouble                                            0
APS_score                                                         0
diagnosed_impairment                                              0
hearing_aid                                                       0
subjective_hearing_impairment                                     0
subjective_processing_impairment                                  0
500_hz_left                                     