In [1]:
import os
import errno
import sys
import time

In [2]:
import pandas as pd
import sklearn 
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Project root, given relative to the notebook's working directory.
ROOT_PATH = '../'
# Excel workbook to load; this path is joined onto ROOT_PATH in the next cell.
ORIGINAL_DATA = 'data/DataforGe_v2.xlsx'

In [4]:
# Full path to the workbook: ROOT_PATH joined with ORIGINAL_DATA.
orig_data_path = os.path.join(ROOT_PATH, ORIGINAL_DATA)

In [5]:
# Fail fast with a standard ENOENT error when the workbook is missing.
if not os.path.isfile(orig_data_path):
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), orig_data_path)

# sheet_name=None loads every sheet into a dict keyed by sheet name.
dfs = pd.read_excel(orig_data_path, sheet_name=None)

In [8]:
# dfs maps sheet name -> DataFrame (read_excel was called with sheet_name=None),
# so its keys are the names of the loaded sheets.
sheet_names = dfs.keys()
print('The loaded panda frames are', sheet_names)

The loaded panda frames are dict_keys(['DTIConnectData', 'restingstatedata', 'otherdata', 'outcome'])


# Inspect outcome

In [12]:
# Report the size of the 'outcome' sheet.
row_count = dfs['outcome'].shape[0]
print(f'The DataFrame has {row_count} rows.')
column_names = list(dfs['outcome'].columns)
# One column is left out of the feature count (presumably the src_subject_id key).
print(f'The DataFrame has {len(column_names)-1} features.')

The DataFrame has 5237 rows.
The DataFrame has 1 features.


In [14]:
# Preview the first five rows of the 'outcome' sheet.
data_top = dfs['outcome'].head(n=5)
print(data_top)

     src_subject_id  becomeCHR_3yr
0  NDAR_INV00CY2MDM              0
1  NDAR_INV00HEV6HB              0
2  NDAR_INV00LH735Y              0
3  NDAR_INV014RTM1V              0
4  NDAR_INV019DXLU4              0


In [28]:
# Number of missing values in each column of the 'outcome' sheet.
column_nan_counts = dfs['outcome'].isna().sum(axis=0)
print(column_nan_counts)

src_subject_id    0
becomeCHR_3yr     0
dtype: int64


In [44]:
# True if any subject ID occurs more than once in the 'outcome' sheet.
has_duplicate_ids = dfs['outcome']['src_subject_id'].duplicated().any()
print('There are duplicated subjects {}'.format(has_duplicate_ids))

There are duplicated subjects False


# Inspect DTIConnectData

In [30]:
# Report the size of the 'DTIConnectData' sheet.
row_count = dfs['DTIConnectData'].shape[0]
print(f'The DataFrame has {row_count} rows.')
column_names = list(dfs['DTIConnectData'].columns)
# One column is left out of the feature count (presumably the src_subject_id key).
# NOTE(review): the count still includes imgincl_FA_include, which looks like an
# inclusion flag rather than a measurement - confirm it should count as a feature.
print(f'The DataFrame has {len(column_names)-1} features.')

The DataFrame has 7266 rows.
The DataFrame has 33 features.


In [31]:
# Keep the first five rows of the 'DTIConnectData' sheet for inspection.
data_top = dfs['DTIConnectData'].head(n=5)
# The preview is not printed here; uncomment the next line to show it.
# print(data_top)

In [33]:
# Number of missing values in each column of the 'DTIConnectData' sheet.
column_nan_counts = dfs['DTIConnectData'].isna().sum(axis=0)
print(column_nan_counts)

src_subject_id                                      0
imgincl_FA_include                                  0
FA_All_agesexsite                                   0
FA_cingulatecingulum_R_agesexsite                   0
FA_CinguluteCingulum_L_agesexsite                   0
FA_corpuscallosum_agesexsite                        0
FA_Corticospinal_L_agesexsite                       0
FA_Corticospinal_R_agesexsite                       0
FA_forecepsmajor_agesexsite                         0
FA_forecepsminor_agesexsite                         0
FA_Fornix_L_agesexsite                              0
FA_Fornix_R_agesexsite                              0
FA_IFC_SupFrontal_L_agesexsite                      0
FA_IFC_SupFrontal_R_agesexsite                      0
FA_inferiorfrontooccipitalfasiculus_L_agesexsite    0
FA_inferiorfrontooccipitalfasiculus_R_agesexsite    0
FA_inferiorlongfascic_L_agesexsite                  0
FA_inferiorlongfascic_R_agesexsite                  0
FA_ParahippocampalCingulum_L

In [45]:
# True if any subject ID occurs more than once in the 'DTIConnectData' sheet.
has_duplicate_ids = dfs['DTIConnectData']['src_subject_id'].duplicated().any()
print('There are duplicated subjects {}'.format(has_duplicate_ids))

There are duplicated subjects False


# Inspect restingstatedata

In [34]:
# Report the size of the 'restingstatedata' sheet.
row_count = dfs['restingstatedata'].shape[0]
print(f'The DataFrame has {row_count} rows.')
column_names = list(dfs['restingstatedata'].columns)
# One column is left out of the feature count (presumably the src_subject_id key).
# NOTE(review): the count still includes imgincl_rsfmri_include, which looks like
# an inclusion flag rather than a measurement - confirm it should count as a feature.
print(f'The DataFrame has {len(column_names)-1} features.')

The DataFrame has 6752 rows.
The DataFrame has 271 features.


In [35]:
# Keep the first five rows of the 'restingstatedata' sheet for inspection.
data_top = dfs['restingstatedata'].head(n=5)
# The preview is not printed here; uncomment the next line to show it.
# print(data_top)

In [46]:
# Number of missing values in each column of the 'restingstatedata' sheet.
column_nan_counts = dfs['restingstatedata'].isna().sum(axis=0)
print(column_nan_counts)

src_subject_id                         0
imgincl_rsfmri_include                 0
auditory_auditory_agesexsite           0
auditory_cingulooper_agesexsite        0
auditory_cinguloparietal_agesexsite    0
                                      ..
VIS_pallidumR_agesexsite               0
VIS_putamenL_agesexsite                0
VIS_putamenR_agesexsite                0
VIS_thalamusL_agesexsite               0
VIS_thalamusR_agesexsite               0
Length: 272, dtype: int64


In [47]:
# True if any subject ID occurs more than once in the 'restingstatedata' sheet.
has_duplicate_ids = dfs['restingstatedata']['src_subject_id'].duplicated().any()
print('There are duplicated subjects {}'.format(has_duplicate_ids))

There are duplicated subjects False


# Inspect otherdata

In [37]:
# Report the size of the 'otherdata' sheet.
row_count = dfs['otherdata'].shape[0]
print(f'The DataFrame has {row_count} rows.')
column_names = list(dfs['otherdata'].columns)
# One column is left out of the feature count (presumably the src_subject_id key).
print(f'The DataFrame has {len(column_names)-1} features.')

The DataFrame has 5237 rows.
The DataFrame has 7 features.


In [38]:
# Preview the first five rows of the 'otherdata' sheet.
data_top = dfs['otherdata'].head(n=5)
print(data_top)

     src_subject_id  ageat2yr  ravlt_sumtc_2yr  nihtbx_pattern_raw_2yr  \
0  NDAR_INV00CY2MDM      12.0               58                      46   
1  NDAR_INV00HEV6HB      12.0               56                      45   
2  NDAR_INV00LH735Y      11.0               83                      40   
3  NDAR_INV014RTM1V      11.0               76                      45   
4  NDAR_INV019DXLU4      12.0               74                      42   

   SumNegLifeEvents  N_Trauma_Types  fam_history_8_yes_no  dropingrades_2yr  
0                 6               2                     0                 1  
1                 3               1                     0                 1  
2                 4               0                     0                 0  
3                 7               0                     0                 0  
4                 7               3                     0                 0  


In [39]:
# Number of missing values in each column of the 'otherdata' sheet.
column_nan_counts = dfs['otherdata'].isna().sum(axis=0)
print(column_nan_counts)

src_subject_id            0
ageat2yr                  0
ravlt_sumtc_2yr           0
nihtbx_pattern_raw_2yr    0
SumNegLifeEvents          0
N_Trauma_Types            0
fam_history_8_yes_no      0
dropingrades_2yr          0
dtype: int64


In [48]:
# True if any subject ID occurs more than once in the 'otherdata' sheet.
has_duplicate_ids = dfs['otherdata']['src_subject_id'].duplicated().any()
print('There are duplicated subjects {}'.format(has_duplicate_ids))

There are duplicated subjects False


# Shared subjects

In [50]:
# Subject IDs that occur in all four sheets.
shared_subjects = set.intersection(*(
    set(dfs[sheet]['src_subject_id'])
    for sheet in ('outcome', 'DTIConnectData', 'restingstatedata', 'otherdata')
))

In [57]:
# For every sheet, keep only the rows whose subject ID is in shared_subjects.
filtered_dfs = {
    name: frame.loc[frame['src_subject_id'].isin(shared_subjects)]
    for name, frame in dfs.items()
}

In [55]:
# Print the row count of each filtered sheet.
# NOTE(review): the output saved under this cell shows type reprs rather than
# names and counts, so it appears to come from an earlier version of the cell;
# restart the kernel and run all cells to refresh it.
print('In the filtered data frames, the number of subjects in each sheet is')
for sheet_name, df in filtered_dfs.items():
    print(f'{sheet_name} {df.shape[0]}')

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
