In [1]:
import os
import errno
import sys
import time

In [2]:
import pandas as pd
import sklearn 
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Location of the main workbook: ROOT_PATH is the base directory (the parent of
# the current working directory) and ORIGINAL_DATA is the file path below it.
ROOT_PATH = '../'
ORIGINAL_DATA = 'data/DataforGe_v2.xlsx'

In [4]:
# Full path to the main workbook: base directory joined with the relative file path.
orig_data_path = os.path.join(ROOT_PATH, ORIGINAL_DATA)

In [5]:
# Stop with a standard "no such file" error when the workbook is missing.
if not os.path.isfile(orig_data_path):
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), orig_data_path)

# sheet_name=None reads every sheet; the result is a dict keyed by sheet name.
dfs = pd.read_excel(orig_data_path, sheet_name=None)

In [6]:
# Show which sheets were loaded (the keys of the dict returned by read_excel).
sheet_names = dfs.keys()
print('The loaded panda frames are', sheet_names)

The loaded panda frames are dict_keys(['DTIConnectData', 'restingstatedata', 'otherdata', 'outcome'])


# Inspect outcome

In [7]:
# Report the size of the outcome sheet. One column is left out of the feature
# count, presumably the src_subject_id identifier.
column_names = dfs['outcome'].columns.tolist()
row_count = dfs['outcome'].shape[0]
print('The DataFrame has {} rows.'.format(row_count))
print('The DataFrame has {} features.'.format(len(column_names) - 1))

The DataFrame has 5237 rows.
The DataFrame has 1 features.


In [8]:
# Preview the first rows of the outcome sheet (head() defaults to five rows).
data_top = dfs['outcome'].head()
print(data_top)

     src_subject_id  becomeCHR_3yr
0  NDAR_INV00CY2MDM              0
1  NDAR_INV00HEV6HB              0
2  NDAR_INV00LH735Y              0
3  NDAR_INV014RTM1V              0
4  NDAR_INV019DXLU4              0


In [9]:
# Count the missing values in each column of the outcome sheet.
column_nan_counts = dfs['outcome'].isna().sum()
print(column_nan_counts)

src_subject_id    0
becomeCHR_3yr     0
dtype: int64


In [10]:
# True when any subject ID appears more than once in the outcome sheet.
has_duplicates = dfs['outcome']['src_subject_id'].duplicated().any()
print(f'There are duplicated subjects {has_duplicates}')

There are duplicated subjects False


# Inspect DTIConnectData

In [11]:
# Report the size of the DTIConnectData sheet. One column is left out of the
# feature count, presumably the src_subject_id identifier.
# NOTE(review): the count still includes imgincl_FA_include, which looks like an
# inclusion flag rather than a measurement — confirm.
column_names = dfs['DTIConnectData'].columns.tolist()
row_count = dfs['DTIConnectData'].shape[0]
print('The DataFrame has {} rows.'.format(row_count))
print('The DataFrame has {} features.'.format(len(column_names) - 1))

The DataFrame has 7266 rows.
The DataFrame has 33 features.


In [12]:
# Keep the first rows of the DTIConnectData sheet available for inspection
# (head() defaults to five rows); the preview print is left disabled.
data_top = dfs['DTIConnectData'].head()
# print(data_top)

In [13]:
# Count the missing values in each column of the DTIConnectData sheet.
column_nan_counts = dfs['DTIConnectData'].isna().sum()
print(column_nan_counts)

src_subject_id                                      0
imgincl_FA_include                                  0
FA_All_agesexsite                                   0
FA_cingulatecingulum_R_agesexsite                   0
FA_CinguluteCingulum_L_agesexsite                   0
FA_corpuscallosum_agesexsite                        0
FA_Corticospinal_L_agesexsite                       0
FA_Corticospinal_R_agesexsite                       0
FA_forecepsmajor_agesexsite                         0
FA_forecepsminor_agesexsite                         0
FA_Fornix_L_agesexsite                              0
FA_Fornix_R_agesexsite                              0
FA_IFC_SupFrontal_L_agesexsite                      0
FA_IFC_SupFrontal_R_agesexsite                      0
FA_inferiorfrontooccipitalfasiculus_L_agesexsite    0
FA_inferiorfrontooccipitalfasiculus_R_agesexsite    0
FA_inferiorlongfascic_L_agesexsite                  0
FA_inferiorlongfascic_R_agesexsite                  0
FA_ParahippocampalCingulum_L

In [14]:
# True when any subject ID appears more than once in the DTIConnectData sheet.
has_duplicates = dfs['DTIConnectData']['src_subject_id'].duplicated().any()
print(f'There are duplicated subjects {has_duplicates}')

There are duplicated subjects False


# Inspect restingstatedata

In [15]:
# Report the size of the restingstatedata sheet. One column is left out of the
# feature count, presumably the src_subject_id identifier.
# NOTE(review): the count still includes imgincl_rsfmri_include, which looks
# like an inclusion flag rather than a measurement — confirm.
column_names = dfs['restingstatedata'].columns.tolist()
row_count = dfs['restingstatedata'].shape[0]
print('The DataFrame has {} rows.'.format(row_count))
print('The DataFrame has {} features.'.format(len(column_names) - 1))

The DataFrame has 6752 rows.
The DataFrame has 271 features.


In [16]:
# Keep the first rows of the restingstatedata sheet available for inspection
# (head() defaults to five rows); the preview print is left disabled.
data_top = dfs['restingstatedata'].head()
# print(data_top)

In [17]:
# Count the missing values in each column of the restingstatedata sheet.
column_nan_counts = dfs['restingstatedata'].isna().sum()
print(column_nan_counts)

src_subject_id                         0
imgincl_rsfmri_include                 0
auditory_auditory_agesexsite           0
auditory_cingulooper_agesexsite        0
auditory_cinguloparietal_agesexsite    0
                                      ..
VIS_pallidumR_agesexsite               0
VIS_putamenL_agesexsite                0
VIS_putamenR_agesexsite                0
VIS_thalamusL_agesexsite               0
VIS_thalamusR_agesexsite               0
Length: 272, dtype: int64


In [18]:
# True when any subject ID appears more than once in the restingstatedata sheet.
has_duplicates = dfs['restingstatedata']['src_subject_id'].duplicated().any()
print(f'There are duplicated subjects {has_duplicates}')

There are duplicated subjects False


# Inspect otherdata

In [19]:
# Report the size of the otherdata sheet. One column is left out of the feature
# count, presumably the src_subject_id identifier.
column_names = dfs['otherdata'].columns.tolist()
row_count = dfs['otherdata'].shape[0]
print('The DataFrame has {} rows.'.format(row_count))
print('The DataFrame has {} features.'.format(len(column_names) - 1))

The DataFrame has 5237 rows.
The DataFrame has 7 features.


In [20]:
# Preview the first rows of the otherdata sheet (head() defaults to five rows).
data_top = dfs['otherdata'].head()
print(data_top)

     src_subject_id  ageat2yr  ravlt_sumtc_2yr  nihtbx_pattern_raw_2yr  \
0  NDAR_INV00CY2MDM      12.0               58                      46   
1  NDAR_INV00HEV6HB      12.0               56                      45   
2  NDAR_INV00LH735Y      11.0               83                      40   
3  NDAR_INV014RTM1V      11.0               76                      45   
4  NDAR_INV019DXLU4      12.0               74                      42   

   SumNegLifeEvents  N_Trauma_Types  fam_history_8_yes_no  dropingrades_2yr  
0                 6               2                     0                 1  
1                 3               1                     0                 1  
2                 4               0                     0                 0  
3                 7               0                     0                 0  
4                 7               3                     0                 0  


In [21]:
# Count the missing values in each column of the otherdata sheet.
column_nan_counts = dfs['otherdata'].isna().sum()
print(column_nan_counts)

src_subject_id            0
ageat2yr                  0
ravlt_sumtc_2yr           0
nihtbx_pattern_raw_2yr    0
SumNegLifeEvents          0
N_Trauma_Types            0
fam_history_8_yes_no      0
dropingrades_2yr          0
dtype: int64


In [22]:
# True when any subject ID appears more than once in the otherdata sheet.
has_duplicates = dfs['otherdata']['src_subject_id'].duplicated().any()
print(f'There are duplicated subjects {has_duplicates}')

There are duplicated subjects False


# Shared subjects

In [23]:
# Subject IDs that occur in every one of the four sheets of the main workbook.
shared_subjects = set.intersection(
    *(
        set(dfs[sheet]['src_subject_id'])
        for sheet in ('outcome', 'DTIConnectData', 'restingstatedata', 'otherdata')
    )
)

In [24]:
# Restrict every sheet to the rows whose subject ID is in shared_subjects.
filtered_dfs = {
    name: frame.loc[frame['src_subject_id'].isin(shared_subjects)]
    for name, frame in dfs.items()
}

In [25]:
# Print the number of rows left in each filtered sheet.
print('In the filtered data frames, the number of subjects in each sheet is')
for sheet_name, df in filtered_dfs.items():
    print(f'{sheet_name} {df.shape[0]}')

In the filtered data frames, the number of subjects in each sheet is
DTIConnectData 3946
restingstatedata 3946
otherdata 3946
outcome 3946


# Load diffusivity & corticalthickness data

In [26]:
# Path, relative to ROOT_PATH, of the diffusivity / cortical-thickness workbook.
DIFnCOR_DATA = 'data/diffusivityandcorticalthicknessdataforGe.xlsx'

In [27]:
# Full path to the second workbook: base directory joined with the relative file path.
difncor_data_path = os.path.join(ROOT_PATH, DIFnCOR_DATA)

In [28]:
# Stop with a standard "no such file" error when the workbook is missing.
if not os.path.isfile(difncor_data_path):
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), difncor_data_path)

# sheet_name=None reads every sheet; the result is a dict keyed by sheet name.
difncor_dfs = pd.read_excel(difncor_data_path, sheet_name=None)

In [29]:
# Show which sheets were loaded from the second workbook.
# Note: this rebinds sheet_names, which previously held the main workbook's keys.
sheet_names = difncor_dfs.keys()
print('The loaded panda frames are', sheet_names)

The loaded panda frames are dict_keys(['diffusivity', 'corticalthickness'])


# Inspect diffusivity data

In [33]:
# Report the size of the diffusivity sheet. One column is left out of the
# feature count, presumably the src_subject_id identifier.
column_names = difncor_dfs['diffusivity'].columns.tolist()
row_count = difncor_dfs['diffusivity'].shape[0]
print('The DataFrame has {} rows.'.format(row_count))
print('The DataFrame has {} features.'.format(len(column_names) - 1))

The DataFrame has 4261 rows.
The DataFrame has 37 features.


In [34]:
# Preview the first rows of the diffusivity sheet (head() defaults to five rows).
data_top = difncor_dfs['diffusivity'].head()
print(data_top)

     src_subject_id  diffusivity_L_ anterior thalamic radiations_site  \
0  NDAR_INVM15U5KMV                                          0.516047   
1  NDAR_INVAWG2NZC4                                          0.532019   
2  NDAR_INV0TEH16CM                                          0.543361   
3  NDAR_INVF1XKPBT6                                          0.511484   
4  NDAR_INV019DXLU4                                          0.528977   

   diffusivity_R_anterior thalamic radiations_site  \
0                                         0.533702   
1                                         0.538652   
2                                         0.568602   
3                                         0.505654   
4                                         0.530402   

   diffusivity_corpus callosum_site  diffusivity_L_ cingulate cingulum_site  \
0                          0.494924                                0.506804   
1                          0.510121                                0.523930   

In [35]:
# Count the missing values in each column of the diffusivity sheet.
column_nan_counts = difncor_dfs['diffusivity'].isna().sum()
print(column_nan_counts)

src_subject_id                                                       0
diffusivity_L_ anterior thalamic radiations_site                    15
diffusivity_R_anterior thalamic radiations_site                     15
diffusivity_corpus callosum_site                                    15
diffusivity_L_ cingulate cingulum_site                              15
diffusivity_R_cingulate cingulum_site                               15
diffusivity_L_ parahippocampal cingulum_site                        15
diffusivity_R_parahippocampal cingulum_site                         15
diffusivity_L_ corticospinal/pyramidal_site                         15
diffusivity_R_corticospinal/pyramidal_site                          15
diffusivity_foreceps major_site                                     15
diffusivity_foreceps minor_site                                     15
diffusivity_L_ superior corticostriate-frontal cortex only_site     15
diffusivity_R_superior corticostriate-frontal cortex only_site      15
diffus

In [36]:
# True when any subject ID appears more than once in the diffusivity sheet.
has_duplicates = difncor_dfs['diffusivity']['src_subject_id'].duplicated().any()
print(f'There are duplicated subjects {has_duplicates}')

There are duplicated subjects False


In [37]:
# Drop every diffusivity row that has at least one missing value, then report
# the sheet's new size. The cleaned frame replaces the loaded one in
# difncor_dfs, so later cells only see the rows without missing values.
difncor_dfs['diffusivity'] = difncor_dfs['diffusivity'].dropna(axis=0, how='any')
column_names = difncor_dfs['diffusivity'].columns.tolist()
row_count = difncor_dfs['diffusivity'].shape[0]
print('The DataFrame has {} rows.'.format(row_count))
print('The DataFrame has {} features.'.format(len(column_names) - 1))

The DataFrame has 4246 rows.
The DataFrame has 37 features.


# Inspect corticalthickness data

In [38]:
# Report the size of the corticalthickness sheet. One column is left out of the
# feature count, presumably the src_subject_id identifier.
column_names = difncor_dfs['corticalthickness'].columns.tolist()
row_count = difncor_dfs['corticalthickness'].shape[0]
print('The DataFrame has {} rows.'.format(row_count))
print('The DataFrame has {} features.'.format(len(column_names) - 1))

The DataFrame has 4400 rows.
The DataFrame has 148 features.


In [39]:
# Preview the first rows of the corticalthickness sheet (head() defaults to five rows).
data_top = difncor_dfs['corticalthickness'].head()
print(data_top)

     src_subject_id  CortThk_L_fronto-marginal gyrus and sulcus_agesexsite  \
0  NDAR_INV00LH735Y                                           2.454222       
1  NDAR_INV1YWMB9TV                                           2.550441       
2  NDAR_INV25RHG3PJ                                           2.280261       
3  NDAR_INV2DZUL8LC                                           2.811705       
4  NDAR_INV2K3JH38W                                           2.431327       

   CortThk_L_inferior occipital gyrus and sulcus_agesexsite  \
0                                           2.572241          
1                                           2.501550          
2                                           2.506715          
3                                           2.545938          
4                                           2.159324          

   CortThk_L_paracentral lobule and sulcus_agesexsite  \
0                                           2.912427    
1                                     

In [40]:
# Count the missing values in each column of the corticalthickness sheet.
column_nan_counts = difncor_dfs['corticalthickness'].isna().sum()
print(column_nan_counts)

src_subject_id                                              0
CortThk_L_fronto-marginal gyrus and sulcus_agesexsite       0
CortThk_L_inferior occipital gyrus and sulcus_agesexsite    0
CortThk_L_paracentral lobule and sulcus_agesexsite          0
CortThk_L_subcentral gyrus and sulci_agesexsite             0
                                                           ..
CortThk_R_suborbital sulcus_agesexsite                      0
CortThk_R_subparietal sulcus_agesexsite                     0
CortThk_R_inferior temporal sulcus_agesexsite               0
CortThk_R_superior temporal sulcus_agesexsite               0
CortThk_R_transverse temporal sulcus_agesexsite             0
Length: 149, dtype: int64


In [41]:
# True when any subject ID appears more than once in the corticalthickness sheet.
has_duplicates = difncor_dfs['corticalthickness']['src_subject_id'].duplicated().any()
print(f'There are duplicated subjects {has_duplicates}')

There are duplicated subjects False


# Shared subjects across both workbooks (all six sheets)

In [42]:
# Subject IDs that occur in all six sheets: the four sheets of the main
# workbook plus the diffusivity and corticalthickness sheets.
shared_subjects = set.intersection(
    *(
        set(frame['src_subject_id'])
        for frame in (
            dfs['outcome'],
            dfs['DTIConnectData'],
            dfs['restingstatedata'],
            dfs['otherdata'],
            difncor_dfs['diffusivity'],
            difncor_dfs['corticalthickness'],
        )
    )
)

In [43]:
# Restrict every sheet of the main workbook to the subjects in shared_subjects.
filtered_dfs = {
    name: frame.loc[frame['src_subject_id'].isin(shared_subjects)]
    for name, frame in dfs.items()
}

In [45]:
# Restrict the diffusivity and corticalthickness sheets to the subjects in
# shared_subjects.
# NOTE(review): filtering keeps each sheet's own row order, and the head()
# previews show the diffusivity sheet starting with a different subject than
# the outcome sheet — align on src_subject_id before combining sheets.
filtered_difncor_dfs = {
    name: frame.loc[frame['src_subject_id'].isin(shared_subjects)]
    for name, frame in difncor_dfs.items()
}

In [46]:
# Print the number of rows left in each filtered sheet of the main workbook.
print('In the filtered data frames, the number of subjects in each sheet is')
for sheet_name, df in filtered_dfs.items():
    print(f'{sheet_name} {df.shape[0]}')

In the filtered data frames, the number of subjects in each sheet is
DTIConnectData 3946
restingstatedata 3946
otherdata 3946
outcome 3946


In [47]:
# Print the number of rows left in each filtered sheet of the second workbook.
print('In the filtered data frames, the number of subjects in each sheet is')
for sheet_name, df in filtered_difncor_dfs.items():
    print(f'{sheet_name} {df.shape[0]}')

In the filtered data frames, the number of subjects in each sheet is
diffusivity 3946
corticalthickness 3946
