In [1]:
import os
import errno
import sys
import time

In [2]:
import pandas as pd
import sklearn 
import numpy as np
import matplotlib.pyplot as plt

In [3]:
ROOT_PATH = '../'
ORIGINAL_DATA = 'data/RawData.xlsx'

In [4]:
orig_data_path = os.path.join(ROOT_PATH, ORIGINAL_DATA)

In [5]:
if os.path.isfile(orig_data_path):
    dfs = pd.read_excel(orig_data_path, sheet_name=None)
else:
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), orig_data_path)

In [6]:
sheet_names = dfs.keys()
print('The loaded panda frames are', sheet_names)

dict_keys(['otherdata', 'RestingState', 'Corticalthickness', 'DTIconnectivity', 'outcome'])

# Inspect outcome

In [7]:
row_count = len(dfs['outcome'])
print(f'The DataFrame has {row_count} rows.')
column_names = list(dfs['outcome'].columns.values)
print(f'The DataFrame has {len(column_names)-1} features.')

The DataFrame has 3946 rows.
The DataFrame has 10 features.


In [8]:
data_top = dfs['outcome'].head()
print(data_top)

   Unnamed: 0    src_subject_id  becomeCHR_3yr  diffusivity_all fibers_site  \
0           0  NDAR_INV00CY2MDM              0                     0.516813   
1           1  NDAR_INV00HEV6HB              0                     0.504674   
2           2  NDAR_INV014RTM1V              0                     0.499351   
3           3  NDAR_INV01AJ15N9              0                     0.499413   
4           4  NDAR_INV01NAYMZH              0                     0.484313   

   diffusivity_L_ hemisphere fibers_site  \
0                               0.516055   
1                               0.503342   
2                               0.493832   
3                               0.497758   
4                               0.482630   

   diffusivity_L_ hemisphere fibers without corpus callosum_site  \
0                                           0.516193               
1                                           0.503085               
2                                           0.500397    

In [9]:
column_nan_counts = dfs['outcome'].isna().sum()
print(column_nan_counts)

Unnamed: 0                                                       0
src_subject_id                                                   0
becomeCHR_3yr                                                    0
diffusivity_all fibers_site                                      0
diffusivity_L_ hemisphere fibers_site                            0
diffusivity_L_ hemisphere fibers without corpus callosum_site    0
diffusivity_R_hemisphere fibers without corpus callosum_site     0
diffusivity_R_hemisphere fibers_site                             0
CortThk_L_mean_agesexsite                                        0
CortThk_R_mean_agesexsite                                        0
CortThkcortical Destrieux ROI mean_agesexsite                    0
dtype: int64


In [10]:
print('There are duplicated subjects {}'.format(dfs['outcome']['src_subject_id'].duplicated().any()))

There are duplicated subjects False


# Inspect DTIconnectivity

In [11]:
row_count = len(dfs['DTIconnectivity'])
print(f'The DataFrame has {row_count} rows.')
column_names = list(dfs['DTIconnectivity'].columns.values)
print(f'The DataFrame has {len(column_names)-1} features.')

The DataFrame has 7266 rows.
The DataFrame has 36 features.


In [12]:
data_top = dfs['DTIconnectivity'].head()
# print(data_top)

In [13]:
column_nan_counts = dfs['DTIconnectivity'].isna().sum()
print(column_nan_counts)

Unnamed: 0                               0
src_subject_id                           0
imgincl_dmri_include                     0
age                                      0
sex_1isM                                 0
site                                     0
FA_cingulatecingulum_R                   0
FA_CinguluteCingulum_L                   0
FA_corpuscallosum                        0
FA_Corticospinal_L                       0
FA_Corticospinal_R                       0
FA_forecepsmajor                         0
FA_forecepsminor                         0
FA_Fornix_L                              0
FA_Fornix_R                              0
FA_IFC_SupFrontal_L                      0
FA_IFC_SupFrontal_R                      0
FA_inferiorfrontooccipitalfasiculus_L    0
FA_inferiorfrontooccipitalfasiculus_R    0
FA_inferiorlongfascic_L                  0
FA_inferiorlongfascic_R                  0
FA_ParahippocampalCingulum_L             0
FA_ParahippocampalCingulum_R             0
FA_parietal

In [14]:
print('There are duplicated subjects {}'.format(dfs['DTIconnectivity']['src_subject_id'].duplicated().any()))

There are duplicated subjects False


# Inspect restingstatedata

In [15]:
row_count = len(dfs['RestingState'])
print(f'The DataFrame has {row_count} rows.')
column_names = list(dfs['RestingState'].columns.values)
print(f'The DataFrame has {len(column_names)-1} features.')

The DataFrame has 6835 rows.
The DataFrame has 83 features.


In [16]:
data_top = dfs['RestingState'].head()
# print(data_top)

In [17]:
column_nan_counts = dfs['RestingState'].isna().sum()
print(column_nan_counts)

Unnamed: 0                   0
src_subject_id               0
imgincl_rsfmri_include       0
age                          0
sex_1isM                     0
                            ..
sensorymotormouth_VAN_ABS    0
sensorymotormouth_VIS_ABS    0
VAN_VAN_ABS                  0
VAN_VIS_ABS                  0
VAN_VIS_ABS.1                0
Length: 84, dtype: int64


In [18]:
print('There are duplicated subjects {}'.format(dfs['RestingState']['src_subject_id'].duplicated().any()))

There are duplicated subjects False


# Inspect otherdata

In [19]:
row_count = len(dfs['otherdata'])
print(f'The DataFrame has {row_count} rows.')
column_names = list(dfs['otherdata'].columns.values)
print(f'The DataFrame has {len(column_names)-1} features.')

The DataFrame has 5237 rows.
The DataFrame has 8 features.


In [20]:
data_top = dfs['otherdata'].head()
print(data_top)

   Unnamed: 0    src_subject_id  ageat2yr  ravlt_sumtc_2yr  \
0           0  NDAR_INV00CY2MDM      12.0               58   
1           1  NDAR_INV00HEV6HB      12.0               56   
2           2  NDAR_INV00LH735Y      11.0               83   
3           3  NDAR_INV014RTM1V      11.0               76   
4           4  NDAR_INV019DXLU4      12.0               74   

   nihtbx_pattern_raw_2yr  SumNegLifeEvents  N_Trauma_Types  \
0                      46                 6               2   
1                      45                 3               1   
2                      40                 4               0   
3                      45                 7               0   
4                      42                 7               3   

   fam_history_8_yes_no  dropingrades_2yr  
0                     0                 1  
1                     0                 1  
2                     0                 0  
3                     0                 0  
4                     0     

In [21]:
column_nan_counts = dfs['otherdata'].isna().sum()
print(column_nan_counts)

Unnamed: 0                0
src_subject_id            0
ageat2yr                  0
ravlt_sumtc_2yr           0
nihtbx_pattern_raw_2yr    0
SumNegLifeEvents          0
N_Trauma_Types            0
fam_history_8_yes_no      0
dropingrades_2yr          0
dtype: int64


In [22]:
print('There are duplicated subjects {}'.format(dfs['otherdata']['src_subject_id'].duplicated().any()))

There are duplicated subjects False


# Inspect Corticalthickness

In [23]:
row_count = len(dfs['Corticalthickness'])
print(f'The DataFrame has {row_count} rows.')
column_names = list(dfs['Corticalthickness'].columns.values)
print(f'The DataFrame has {len(column_names)-1} features.')

The DataFrame has 7682 rows.
The DataFrame has 154 features.


In [24]:
data_top = dfs['Corticalthickness'].head()
print(data_top)

   Unnamed: 0    src_subject_id                 eventname  yr2_age  sex  site  \
0         270  NDAR_INV00CY2MDM  2_year_follow_up_y_arm_1     12.0    1    20   
1         271  NDAR_INV00HEV6HB  2_year_follow_up_y_arm_1     12.0    1    12   
2         272  NDAR_INV00LH735Y  2_year_follow_up_y_arm_1     11.0    1     3   
3         273  NDAR_INV014RTM1V  2_year_follow_up_y_arm_1     11.0    1    17   
4         274  NDAR_INV019DXLU4  2_year_follow_up_y_arm_1     12.0    2    19   

   imgincl_t1w_include  CortThk_L_fronto-marginal gyrus and sulcus  \
0                    1                                       2.564   
1                    1                                       2.360   
2                    1                                       2.451   
3                    1                                       2.478   
4                    1                                       2.577   

   CortThk_L_inferior occipital gyrus and sulcus  \
0                                       

In [25]:
column_nan_counts = dfs['Corticalthickness'].isna().sum()
print(column_nan_counts)

Unnamed: 0                              0
src_subject_id                          0
eventname                               0
yr2_age                                 0
sex                                     0
                                       ..
CortThk_R_suborbital sulcus             0
CortThk_R_subparietal sulcus            0
CortThk_R_inferior temporal sulcus      0
CortThk_R_superior temporal sulcus      0
CortThk_R_transverse temporal sulcus    0
Length: 155, dtype: int64


In [26]:
print('There are duplicated subjects {}'.format(dfs['Corticalthickness']['src_subject_id'].duplicated().any()))

There are duplicated subjects False


# Shared subjects

In [29]:
shared_subjects = set(dfs['outcome']['src_subject_id']).intersection(\
    set(dfs['DTIconnectivity']['src_subject_id']),\
    set(dfs['RestingState']['src_subject_id']),\
    set(dfs['Corticalthickness']['src_subject_id']),\
    set(dfs['otherdata']['src_subject_id']))

In [30]:
filtered_dfs = {sheet_name: df[df['src_subject_id'].isin(shared_subjects)] for sheet_name, df in dfs.items()}

In [31]:
print('In the filtered data frames, the number of subjects in each sheet is')
for sheet_name, df in filtered_dfs.items():
    print(sheet_name, len(df))

In the filtered data frames, the number of subjects in each sheet is
otherdata 3941
RestingState 3941
Corticalthickness 3941
DTIconnectivity 3941
outcome 3941


In [40]:
print(dfs['Corticalthickness']['imgincl_t1w_include'])

0       1
1       1
2       1
3       1
4       1
       ..
7677    1
7678    1
7679    1
7680    1
7681    1
Name: imgincl_t1w_include, Length: 7682, dtype: int64


In [42]:
len(dfs['RestingState'][dfs['RestingState']['imgincl_rsfmri_include']==1])

6835