In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.cross_decomposition import PLSRegression, PLSCanonical
from scipy.stats import pearsonr
from tqdm import tqdm
from nilearn.signal import clean
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge

In [2]:
# Set the pandas display option 'display.max_rows' to 20 for this session.
pd.set_option('display.max_rows', 20)

In [3]:
# Set the pandas display option 'display.max_columns' to 20 for this session.
pd.set_option('display.max_columns', 20)

# Fixed data

In [4]:
# Retrieve the fixed data (demographics, genetics, etc.) recorded at the initial visit.
# index_col=0 uses the first CSV column as the row index.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs on a machine where that exact file exists.
EN00 = pd.read_csv(os.path.abspath('/Users/chloesavignac/_bzdok_lab_notebooks/Prevent-AD/internal/non-imaging-data/EL00.csv'), index_col=0)

In [5]:
# Index the table by participant ID. drop=False keeps the ID available as a
# regular column too; that column is then renamed from 'PSCID_x' to 'PSCID'.
# NOTE: not idempotent — a second run fails because 'PSCID_x' no longer
# exists as a column.
EN00 = EN00.set_index('PSCID_x', drop=False)
EN00 = EN00.rename(columns={'PSCID_x': 'PSCID'})

In [6]:
# Display the six age columns side by side so they can be compared by eye.
# NOTE(review): this is a visual check only — nothing here asserts that the
# columns are actually equal.
EN00[['Candidate_Age_x','Candidate_Age_y','Candidate_Age_x.1','Candidate_Age_y.1','Candidate_Age_x.2','Candidate_Age_y.2']]

Unnamed: 0_level_0,Candidate_Age_x,Candidate_Age_y,Candidate_Age_x.1,Candidate_Age_y.1,Candidate_Age_x.2,Candidate_Age_y.2
PSCID_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MTL0001,846.6,846.6,846.6,846.6,846.6,846.6
MTL0002,864.3,864.3,864.3,864.3,864.3,864.3
MTL0003,832.7,832.7,832.7,832.7,832.7,832.7
MTL0004,750.4,750.4,750.4,750.4,750.4,750.4
MTL0005,734.5,734.5,734.5,734.5,734.5,734.5
...,...,...,...,...,...,...
MTL0699,741.0,741.0,741.0,741.0,741.0,741.0
MTL0703,837.2,837.2,837.2,837.2,837.2,837.2
MTL0705,720.1,720.1,720.1,720.1,720.1,720.1
MTL0706,767.3,767.3,767.3,767.3,767.3,767.3


In [7]:
# Copy 'Candidate_Age_x' into a single, clearly named age column.
# The original Candidate_Age_* columns are not removed by this cell.
# NOTE(review): the "months" unit is implied by the new column name only —
# confirm against the data dictionary.
EN00['Age_baseline_months'] = EN00['Candidate_Age_x']

In [8]:
# Build the cleaned fixed-visit table in one chain:
#   1. drop the columns listed below (note that 'CandID' and 'PSCID' are
#      among them; 'CandID' is re-attached from EN00 in a later cell);
#   2. drop every row that has a missing value in any of the `subset` columns.
EN00_cleaned = (
    EN00
    .drop(columns=[
        'father_onset_age',
        'mother_onset_age',
        'sibling_dx_ad_dementia_count',
        'sibling_onset_age_1',
        'sibling_onset_age_2',
        'sibling_onset_age_3',
        'sibling_onset_age_4',
        'sibling_onset_age_5',
        'other_family_members_AD',
        'other_maternal_family_members_AD',
        'other_paternal_family_members_AD',
        'b12_value',
        '7_memory_NoPoint',
        'CandID',
        'PSCID',
        'PSCID_y',
        'PSCID_x.1',
        'Study_visit_label_x',
        'Visit_label_x',
        'Date_taken_x',
        'PSCID_y.1',
        'Study_visit_label_y',
        'Visit_label_y',
        'Date_taken_y',
        'PSCID_x.2',
        'Study_visit_label_x.1',
        'Visit_label_x.1',
        'Date_taken_x.1',
        'PSCID_y.2',
        'Study_visit_label_y.1',
        'Visit_label_y.1',
        'Date_taken_y.1',
        'PSCID_x.3',
        'Study_visit_label_x.2',
        'Visit_label_x.2',
        'Date_taken_x.2',
        'PSCID_y.3',
        'Study_visit_label_y.2',
        'Visit_label_y.2',
        'Date_taken_y.2',
        'Work',
        'SU_medication',
        'PRN_medication',
    ])
    .dropna(subset=[
        'Handedness_result',
        'hba1c_value',
        'tsh_value',
        'LDL_value',
        'Pulse',
        'Weight',
        'CAIDE_total_score',
        'MOCA_total_score',
    ])
)

In [9]:
# Keep the raw APOE values as a plain list (in current row order) so they can
# be re-attached after the object columns have been dummy-coded.
apoe = list(EN00_cleaned['APOE'])

In [10]:
# Split the table by dtype: object-dtype columns go to the "categorical"
# frame, every other column to the "numerical" frame.
EN00_cleaned_cat = EN00_cleaned.select_dtypes(include=['object'])
EN00_cleaned_num = EN00_cleaned.select_dtypes(exclude=['object'])

In [11]:
# Dummy-code (one-hot encode) all categorical data.
# dummy_na=False: no extra indicator column is created for missing values.
EN00_cleaned_cat = pd.get_dummies(EN00_cleaned_cat, dummy_na=False)

In [12]:
# (rows, columns) of the dummy-coded categorical frame.
EN00_cleaned_cat.shape

(332, 95)

In [13]:
# Recombine the numerical columns with the dummy-coded categorical columns
# (index-aligned left join), then show the resulting (rows, columns).
EN00_cleaned = EN00_cleaned_num.join(EN00_cleaned_cat, how='left')
EN00_cleaned.shape

(332, 187)

In [None]:
# NOTE(review): this cell has no execution count and repeats, in one place,
# the split / dummy-code / join steps of the three preceding cells. Consider
# deleting it or keeping only one of the two versions.

# Separate numerical and non-numerical (object-dtype) data.
EN00_cleaned_num = EN00_cleaned.select_dtypes(exclude=['object'])
EN00_cleaned_cat = EN00_cleaned.select_dtypes(include=['object'])

# Dummy-code all categorical data (no indicator column for missing values).
EN00_cleaned_cat = pd.get_dummies(EN00_cleaned_cat, dummy_na=False)

# Merge the categorical and numerical variables back together.
EN00_cleaned = EN00_cleaned_num.join(EN00_cleaned_cat)
# Fixed typo: `.shapedu` is not a DataFrame attribute and raised AttributeError.
EN00_cleaned.shape

In [14]:
# Re-attach 'CandID' (dropped while building EN00_cleaned) from the original
# table; the Series assignment is aligned on the row index.
EN00_cleaned['CandID'] = EN00['CandID']

In [15]:
# Re-attach the raw APOE values saved earlier as a plain list.
# NOTE(review): a list is assigned by position, not by index — this assumes
# the row order of EN00_cleaned has not changed since `apoe` was captured.
EN00_cleaned['APOE'] = apoe

In [16]:
# Re-index the cleaned table by 'CandID' (the column is moved into the index)
# and order the rows by that index.
EN00_cleaned = EN00_cleaned.set_index('CandID')
EN00_cleaned = EN00_cleaned.sort_index()

In [17]:
# Display the re-attached APOE column, now indexed by CandID.
EN00_cleaned['APOE']

CandID
108583    3 3
113451    3 2
115095    4 3
117603    3 3
125043    3 3
         ... 
988974    4 3
992335    3 3
996215    4 3
996554    3 3
999919    3 3
Name: APOE, Length: 332, dtype: object

In [18]:
# Write the cleaned fixed-visit table to a CSV file at this relative path
# (index included).
EN00_cleaned.to_csv('EN00_07.05.22.csv')

# Longitudinal data 

In [19]:
# Retrieve the longitudinal data: one CSV per visit, each read with its first
# column as the row index.
# NOTE(review): the paths are absolute and machine-specific.
BL00, FU12, FU24, FU36, FU48 = [
    pd.read_csv(os.path.abspath(csv_path), index_col=0)
    for csv_path in (
        '/Users/chloesavignac/_bzdok_lab_notebooks/Prevent-AD/internal/non-imaging-data/BL00_outter_merge.csv',
        '/Users/chloesavignac/_bzdok_lab_notebooks/Prevent-AD/internal/non-imaging-data/FU12_outter_merge.csv',
        '/Users/chloesavignac/_bzdok_lab_notebooks/Prevent-AD/internal/non-imaging-data/FU24_outter_merge.csv',
        '/Users/chloesavignac/_bzdok_lab_notebooks/Prevent-AD/internal/non-imaging-data/FU36_outter_merge.csv',
        '/Users/chloesavignac/_bzdok_lab_notebooks/Prevent-AD/internal/non-imaging-data/FU48_outter_merge.csv',
    )
]

In [20]:
# Canonical variates: one CSV per visit, read from the working directory with
# the first column as the row index. The column labelled '0' is renamed to
# 'PSCID' so it can serve as the merge key.
BL00_CCA, FU12_CCA, FU24_CCA, FU36_CCA, FU48_CCA = [
    pd.read_csv(modes_csv, index_col=0).rename(columns={'0': 'PSCID'})
    for modes_csv in (
        'BL00_CCA_modes.csv',
        'FU12_CCA_modes.csv',
        'FU24_CCA_modes.csv',
        'FU36_CCA_modes.csv',
        'FU48_CCA_modes.csv',
    )
]

In [21]:
# Give every visit table a 'PSCID' column copied from its 'PSCID_RBANS'
# column, so it shares a key name with the canonical-variate tables.
for _visit in (BL00, FU12, FU24, FU36, FU48):
    _visit['PSCID'] = _visit['PSCID_RBANS']

In [22]:
# Outer-merge each visit table with its canonical variates on 'PSCID', so
# rows present in only one of the two tables are kept (with NaN elsewhere).
BL00_merged, FU12_merged, FU24_merged, FU36_merged, FU48_merged = [
    visit.merge(modes, on='PSCID', how='outer')
    for visit, modes in (
        (BL00, BL00_CCA),
        (FU12, FU12_CCA),
        (FU24, FU24_CCA),
        (FU36, FU36_CCA),
        (FU48, FU48_CCA),
    )
]

In [23]:
# Overwrite 'PSCID' in every merged table with its 'PSCID_RBANS' column.
# NOTE(review): after the outer merge, rows that exist only in the
# canonical-variate table presumably have no 'PSCID_RBANS', so this would
# blank their 'PSCID' — confirm that this is intended.
for _merged in (BL00_merged, FU12_merged, FU24_merged, FU36_merged, FU48_merged):
    _merged['PSCID'] = _merged['PSCID_RBANS']

In [24]:
# Index every merged table by 'CandID' (drop=False keeps it as a column too)
# and order the rows by that index.
BL00_merged, FU12_merged, FU24_merged, FU36_merged, FU48_merged = [
    merged.set_index('CandID', drop=False).sort_index()
    for merged in (BL00_merged, FU12_merged, FU24_merged, FU36_merged, FU48_merged)
]

In [25]:
# Per-visit tables, ordered from BL00 through FU48, ready to be stacked.
dfs = [BL00_merged,FU12_merged,FU24_merged,FU36_merged,FU48_merged]

In [26]:
# Stack the per-visit tables row-wise. `keys` adds an outer index level that
# labels every row with the visit it came from.
all_time_points = pd.concat(
    objs=dfs,
    keys=["BL00", "FU12", "FU24", "FU36", 'FU48'],
)
all_time_points

Unnamed: 0_level_0,Unnamed: 1_level_0,PSCID_AD8,CandID,Study_visit_label_AD8,Visit_label_AD8,Date_taken,Candidate_Age,AD8_total_score,1_judgment_problems,2_less_interest,3_repeat,...,41,42,43,44,45,46,47,48,49,50
Unnamed: 0_level_1,CandID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
BL00,108583,MTL0392,108583,NAPBL00,BL00,2014-08-07,813.1,0.0,0.0,0.0,0.0,...,1.165130,-0.782472,-0.056009,-0.513515,1.240126,0.538546,0.009536,-0.480883,-1.134830,1.379539
BL00,113451,MTL0415,113451,NAPBL00,BL00,2014-10-02,764.7,0.0,0.0,0.0,0.0,...,0.421054,0.207186,0.570846,-2.411117,0.615360,-1.162505,-0.502572,-0.222151,-0.262458,-0.594918
BL00,115095,MTL0380,115095,PREBL00,BL00,2014-07-22,780.2,0.0,0.0,0.0,0.0,...,1.301543,0.315062,-0.184577,0.675348,0.201383,1.217526,-0.027366,-1.329216,-0.574692,-0.099748
BL00,117603,MTL0482,117603,PREBL00,BL00,2015-08-25,723.9,0.0,0.0,0.0,0.0,...,-0.756919,-1.575811,0.436410,0.278309,0.392172,-0.074069,-0.292730,-0.085280,-0.073419,0.861089
BL00,122650,MTL0008,122650,PREBL00,BL00,2013-01-30,885.3,0.0,0.0,0.0,0.0,...,0.227245,0.177550,-0.287491,-0.512066,0.362787,0.292497,0.265293,0.411346,-0.388128,-0.299726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FU48,981116,MTL0014,981116,PREFU48,FU48,2017-01-24,794.4,0.0,0.0,0.0,0.0,...,-0.334663,-0.235534,-0.593212,-0.263434,0.111491,-0.560572,-0.870231,0.331414,-0.632706,0.044158
FU48,996215,MTL0265,996215,NAPFU48,FU48,2017-10-03,750.4,0.0,0.0,0.0,0.0,...,,,,,,,,,,
FU48,996554,MTL0122,996554,NAPFU48,FU48,2017-04-12,1058.3,0.0,0.0,0.0,0.0,...,,,,,,,,,,
FU48,999145,MTL0002,999145,NAPFU48,FU48,2017-04-12,929.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [27]:
# List the object-dtype columns, i.e. the ones that are not numeric yet.
all_time_points.select_dtypes(include=['object']).columns

Index(['PSCID_AD8', 'Study_visit_label_AD8', 'Visit_label_AD8', 'Date_taken',
       'PSCID_APS', 'Study_visit_label_APS', 'Visit_label_APS', 'PSCID',
       'Study_visit_label', 'Visit_label', 'Date_taken_Aud_pro',
       'PSCID_BP_Pulse_Weight', 'Study_visit_label_BP_Pulse_Weight',
       'Visit_label_BP_Pulse_Weight', 'Date_taken_BP_Pulse_Weight',
       'PSCID_CSF_Proteins', 'Study_visit_label_CSF_Proteins',
       'Visit_label_CSF_Proteins', 'Date_taken_CSF_Proteins', 'PSCID_lab',
       'Study_visit_label_lab', 'Visit_label_lab', 'Date_taken_lab',
       'PSCID_Med_use', 'Study_visit_label_Med_use', 'Visit_label_Med_use',
       'Date_taken_Med_use', 'SU_medication', 'PRN_medication', 'PSCID_RBANS',
       'Study_visit_label_RBANS', 'Visit_label_RBANS', 'Date_taken_RBANS',
       'probable_MCI_visit', 'RBANS_version', 'PSCID_Smell',
       'Study_visit_label_Smell', 'Visit_label_Smell', 'Date_taken_Smell',
       'diagnosis', 'comments_uncategorized'],
      dtype='object')

In [28]:
# Rename the 'diagnosis' column to 'diagnosis_anosmia'.
all_time_points = all_time_points.rename(columns={'diagnosis':'diagnosis_anosmia'})

In [29]:
# Dummy-code the anosmia diagnosis into 'anosmia_diagnosis_*' indicator
# columns. With dummy_na=False a missing diagnosis gets no column of its own,
# so such rows are 0 in every indicator.
diagnosis_anosmia = pd.get_dummies(
    all_time_points['diagnosis_anosmia'],
    prefix='anosmia_diagnosis',
    dummy_na=False,
)
diagnosis_anosmia.shape

(1562, 5)

In [30]:
# Display the dummy-coded anosmia diagnosis table.
diagnosis_anosmia

Unnamed: 0_level_0,Unnamed: 1_level_0,anosmia_diagnosis_Mild Microsmia,anosmia_diagnosis_Moderate Microsmia,anosmia_diagnosis_Normosmia,anosmia_diagnosis_Severe Microsmia,anosmia_diagnosis_Total Anosmia
Unnamed: 0_level_1,CandID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BL00,108583,0,0,1,0,0
BL00,113451,0,0,1,0,0
BL00,115095,0,0,1,0,0
BL00,117603,0,0,1,0,0
BL00,122650,1,0,0,0,0
...,...,...,...,...,...,...
FU48,981116,0,0,1,0,0
FU48,996215,1,0,0,0,0
FU48,996554,0,0,0,1,0
FU48,999145,0,1,0,0,0


In [31]:
# Dummy-code the RBANS version into 'RBANS_version_*' indicator columns.
# With dummy_na=False, rows with a missing version are 0 in every indicator.
RBANS_version = pd.get_dummies(
    all_time_points['RBANS_version'],
    prefix='RBANS_version',
    dummy_na=False,
)

In [32]:
# Display the dummy-coded RBANS version table.
RBANS_version

Unnamed: 0_level_0,Unnamed: 1_level_0,RBANS_version_A,RBANS_version_B,RBANS_version_C,RBANS_version_D
Unnamed: 0_level_1,CandID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BL00,108583,1,0,0,0
BL00,113451,1,0,0,0
BL00,115095,1,0,0,0
BL00,117603,1,0,0,0
BL00,122650,1,0,0,0
...,...,...,...,...,...
FU48,981116,0,0,0,1
FU48,996215,0,0,0,1
FU48,996554,0,1,0,0
FU48,999145,0,1,0,0


In [33]:
# Append the anosmia indicator columns to the stacked table (column-wise).
# NOTE: re-running this cell appends the same columns a second time.
all_time_points = pd.concat(
    objs=[all_time_points, diagnosis_anosmia],
    axis='columns',
)

In [34]:
# Append the RBANS-version indicator columns to the stacked table (column-wise).
# NOTE: re-running this cell appends the same columns a second time.
all_time_points = pd.concat(
    objs=[all_time_points, RBANS_version],
    axis='columns',
)

In [35]:
# Count the missing values in each non-object column.
# The dummy-coded indicator columns show 0 here because get_dummies
# (dummy_na=False) encodes a missing category as all zeros rather than NaN.
all_time_points.select_dtypes(exclude=['object']).isna().sum()

CandID                               0
Candidate_Age                      341
AD8_total_score                    391
1_judgment_problems                344
2_less_interest                    344
                                  ... 
anosmia_diagnosis_Total Anosmia      0
RBANS_version_A                      0
RBANS_version_B                      0
RBANS_version_C                      0
RBANS_version_D                      0
Length: 150, dtype: int64

In [36]:
# Preview the first rows of the combined table, including the new indicator columns.
all_time_points.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PSCID_AD8,CandID,Study_visit_label_AD8,Visit_label_AD8,Date_taken,Candidate_Age,AD8_total_score,1_judgment_problems,2_less_interest,3_repeat,...,50,anosmia_diagnosis_Mild Microsmia,anosmia_diagnosis_Moderate Microsmia,anosmia_diagnosis_Normosmia,anosmia_diagnosis_Severe Microsmia,anosmia_diagnosis_Total Anosmia,RBANS_version_A,RBANS_version_B,RBANS_version_C,RBANS_version_D
Unnamed: 0_level_1,CandID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
BL00,108583,MTL0392,108583,NAPBL00,BL00,2014-08-07,813.1,0.0,0.0,0.0,0.0,...,1.379539,0,0,1,0,0,1,0,0,0
BL00,113451,MTL0415,113451,NAPBL00,BL00,2014-10-02,764.7,0.0,0.0,0.0,0.0,...,-0.594918,0,0,1,0,0,1,0,0,0
BL00,115095,MTL0380,115095,PREBL00,BL00,2014-07-22,780.2,0.0,0.0,0.0,0.0,...,-0.099748,0,0,1,0,0,1,0,0,0
BL00,117603,MTL0482,117603,PREBL00,BL00,2015-08-25,723.9,0.0,0.0,0.0,0.0,...,0.861089,0,0,1,0,0,1,0,0,0
BL00,122650,MTL0008,122650,PREBL00,BL00,2013-01-30,885.3,0.0,0.0,0.0,0.0,...,-0.299726,1,0,0,0,0,1,0,0,0


In [37]:
# Distinct participant IDs (the 'CandID' index level) across all visits.
all_time_points.index.get_level_values('CandID').unique()

Int64Index([108583, 113451, 115095, 117603, 122650, 125043, 132967, 138233,
            139940, 141584,
            ...
            976111, 978946, 981116, 981909, 988974, 992335, 996215, 996554,
            999145, 999919],
           dtype='int64', name='CandID', length=386)

In [38]:
# Write the combined longitudinal table to a CSV file at this relative path
# (index levels included).
all_time_points.to_csv('all_time_points_07.05.22.csv')