In [None]:
import pandas as pd

In [None]:
# Root folder holding the ABCD table read below; '/path/to/data' is a
# placeholder to edit before running. Result files go to its outputs/ subfolder.
base_dir = "/path/to/data"

In [None]:
# Load the imaging-inclusion table. skiprows=[1] drops the second physical line
# of the file, so only the first line is used as the column header.
inc_table_path = '{}/abcd_imgincl01.txt'.format(base_dir)
df_inc = pd.read_table(inc_table_path, skiprows=[1])

# Quick look at the table size and at the event names it contains.
print(df_inc.shape)
print(df_inc['eventname'].unique())

In [None]:
# Split the table into one frame per event, each ordered by subject and then
# by interview date.
# NOTE(review): interview_date is sorted as loaded; if it is a text column the
# order is lexical rather than chronological — confirm if row order matters.
sort_keys = ['subjectkey', 'interview_date']

is_baseline = df_inc['eventname'] == 'baseline_year_1_arm_1'
is_year2 = df_inc['eventname'] == '2_year_follow_up_y_arm_1'

df_inc_bsl = df_inc.loc[is_baseline].sort_values(by=sort_keys)
df_inc_y2 = df_inc.loc[is_year2].sort_values(by=sort_keys)

In [None]:
# Table dimensions for each event, then the number of distinct subject keys.
for df_event in (df_inc_bsl, df_inc_y2):
    print(df_event.shape)

n_subj_bsl = len(df_inc_bsl['subjectkey'].unique())
n_subj_y2 = len(df_inc_y2['subjectkey'].unique())
print('Baseline unique ID: {}'.format(n_subj_bsl))
print('Year2 unique ID: {}'.format(n_subj_y2))

In [None]:
# Preview the first rows of the sorted baseline table.
df_inc_bsl.head()

# Multiple entries per subject?
Even within a single event, a subject commonly has two or more entries for its inclusion recommendation. Why? Let's find out.

## Baseline

In [None]:
# Count how many rows each baseline subject has. One groupby pass replaces
# re-filtering the whole table once per subject; sort=False keeps subjects in
# order of first appearance, matching the order of the table.
n_rows_per_subj = df_inc_bsl.groupby('subjectkey', sort=False).size()

# `l` holds one row count per subject and is read by the summary cells below.
l = n_rows_per_subj.tolist()

# Print the subjects that have exactly three entries.
for subjid in n_rows_per_subj.index[n_rows_per_subj == 3]:
    print(subjid)

In [None]:
# For every distinct entry count, report how many subjects have that many rows.
for num_entry in set(l):
    n_subjects = l.count(num_entry)
    print("{} subjects has {} entries.".format(n_subjects, num_entry))

# The per-subject counts should add up to the number of rows in the event table.
print("Total number of entries: {}".format(sum(l)))

## Year 2

In [None]:
# Count how many rows each year-2 subject has. One groupby pass replaces
# re-filtering the whole table once per subject; sort=False keeps subjects in
# order of first appearance, matching the order of the table.
n_rows_per_subj = df_inc_y2.groupby('subjectkey', sort=False).size()

# `l` holds one row count per subject and is read by the summary cells below.
l = n_rows_per_subj.tolist()

# Print the subjects that have exactly three entries.
for subjid in n_rows_per_subj.index[n_rows_per_subj == 3]:
    print(subjid)

In [None]:
# Distinct per-subject entry counts found in the year-2 table.
set(l)

In [None]:
# For every distinct entry count, report how many subjects have that many rows.
for num_entry in set(l):
    n_subjects = l.count(num_entry)
    print("{} subjects has {} entries.".format(n_subjects, num_entry))

# The per-subject counts should add up to the number of rows in the event table.
print("Total number of entries: {}".format(sum(l)))

# Make subject list for all task fMRI
Include a subject only if inclusion is recommended for all three tasks (MID, SST, and nBack) in every one of its entries. For example, if subject A has three inclusion entries at baseline, all three entries must be 1 for all three tasks.

## Baseline

In [None]:
# Inclusion flags for the three fMRI tasks (MID, nBack, SST).
task_cols = ['imgincl_mid_include','imgincl_nback_include','imgincl_sst_include']

lst_bsl = []
# One groupby pass hands over each subject's rows directly instead of
# re-filtering the whole table for every subject; sort=False keeps the
# subjects in order of first appearance.
for subjid, df_tmp in df_inc_bsl.groupby('subjectkey', sort=False):
    flags = df_tmp[task_cols]
    # A subject's entries are expected to carry identical flags; report any
    # subject whose rows differ so it can be inspected by hand.
    if len(flags.drop_duplicates()) != 1:
        print("{} Rows are not identical?".format(subjid))
    # Keep the subject only when every flag in every entry equals 1. Testing
    # "== 1" (rather than "no 0 present") also rejects missing values, which
    # would otherwise be treated as included.
    if (flags == 1).all(axis=None):
        lst_bsl.append(subjid)

In [None]:
# Number of baseline subjects kept for task fMRI.
len(lst_bsl)

In [None]:
# Save the baseline task-fMRI inclusion list, one subject key per line.
# The outputs/ folder under base_dir must already exist; open() does not create it.
out_path = '{}/outputs/lst_subjkey_include_bsl.txt'.format(base_dir)
with open(out_path, 'w+') as f:
    f.writelines("{}\n".format(key) for key in lst_bsl)

## Year 2

In [None]:
# Inclusion flags for the three fMRI tasks (MID, nBack, SST).
task_cols = ['imgincl_mid_include','imgincl_nback_include','imgincl_sst_include']

lst_y2 = []
# One groupby pass hands over each subject's rows directly instead of
# re-filtering the whole table for every subject; sort=False keeps the
# subjects in order of first appearance.
for subjid, df_tmp in df_inc_y2.groupby('subjectkey', sort=False):
    flags = df_tmp[task_cols]
    # A subject's entries are expected to carry identical flags; report any
    # subject whose rows differ so it can be inspected by hand.
    if len(flags.drop_duplicates()) != 1:
        print("{} Rows are not identical?".format(subjid))
    # Keep the subject only when every flag in every entry equals 1. Testing
    # "== 1" (rather than "no 0 present") also rejects missing values, which
    # would otherwise be treated as included.
    if (flags == 1).all(axis=None):
        lst_y2.append(subjid)

In [None]:
# Number of year-2 subjects kept for task fMRI.
len(lst_y2)

In [None]:
# Save the year-2 task-fMRI inclusion list, one subject key per line.
# The outputs/ folder under base_dir must already exist; open() does not create it.
out_path = '{}/outputs/lst_subjkey_include_y2.txt'.format(base_dir)
with open(out_path, 'w+') as f:
    f.writelines("{}\n".format(key) for key in lst_y2)

# Check inconsistent duplicate rows

In [None]:
# Show every baseline row for one subject.
# NOTE(review): presumably a subject reported by the "Rows are not identical?"
# check — confirm against that cell's output.
subjid = 'NDAR_INVJZ6BX4DY'
is_subject = df_inc_bsl['subjectkey'] == subjid
df_tmp = df_inc_bsl.loc[is_subject]
df_tmp

In [None]:
# Show every baseline row for one subject.
# NOTE(review): presumably a subject reported by the "Rows are not identical?"
# check — confirm against that cell's output.
subjid = 'NDAR_INVZ2BXG0C8'
is_subject = df_inc_bsl['subjectkey'] == subjid
df_tmp = df_inc_bsl.loc[is_subject]
df_tmp

# Make subject list for T1 and T2 MRI
Include a subject only if inclusion is recommended for both T1 and T2 in every one of its entries. For example, if subject A has three inclusion entries at baseline, all three entries must be 1 for both T1 and T2.

In [None]:
# Inclusion-flag columns (T1w and T2w) checked by the loops in this section.
sub_headers = [
    'imgincl_t1w_include',
    'imgincl_t2w_include',
]

## Baseline

In [None]:
# Build the baseline inclusion list for the columns named in sub_headers.
# sub_headers is reassigned further down the notebook, so re-run the cell that
# sets it for this section before re-running this one.
lst_bsl = []
# One groupby pass hands over each subject's rows directly instead of
# re-filtering the whole table for every subject; sort=False keeps the
# subjects in order of first appearance.
for subjid, df_tmp in df_inc_bsl.groupby('subjectkey', sort=False):
    flags = df_tmp[sub_headers]
    # A subject's entries are expected to carry identical flags; report any
    # subject whose rows differ so it can be inspected by hand.
    if len(flags.drop_duplicates()) != 1:
        print("{} Rows are not identical?".format(subjid))
    # Keep the subject only when every flag in every entry equals 1. Testing
    # "== 1" (rather than "no 0 present") also rejects missing values, which
    # would otherwise be treated as included.
    if (flags == 1).all(axis=None):
        lst_bsl.append(subjid)

In [None]:
# Number of baseline subjects kept for T1 and T2.
len(lst_bsl)

In [None]:
# Save the baseline T1/T2 inclusion list, one subject key per line.
# The outputs/ folder under base_dir must already exist; open() does not create it.
out_path = '{}/outputs/lst_subjkey_include_T1_and_T2_bsl.txt'.format(base_dir)
with open(out_path, 'w+') as f:
    f.writelines("{}\n".format(key) for key in lst_bsl)

## Year 2

In [None]:
# Build the year-2 inclusion list for the columns named in sub_headers.
# sub_headers is reassigned further down the notebook, so re-run the cell that
# sets it for this section before re-running this one.
lst_y2 = []
# One groupby pass hands over each subject's rows directly instead of
# re-filtering the whole table for every subject; sort=False keeps the
# subjects in order of first appearance.
for subjid, df_tmp in df_inc_y2.groupby('subjectkey', sort=False):
    flags = df_tmp[sub_headers]
    # A subject's entries are expected to carry identical flags; report any
    # subject whose rows differ so it can be inspected by hand.
    if len(flags.drop_duplicates()) != 1:
        print("{} Rows are not identical?".format(subjid))
    # Keep the subject only when every flag in every entry equals 1. Testing
    # "== 1" (rather than "no 0 present") also rejects missing values, which
    # would otherwise be treated as included.
    if (flags == 1).all(axis=None):
        lst_y2.append(subjid)

In [None]:
# Number of year-2 subjects kept for T1 and T2.
len(lst_y2)

In [None]:
# Save the year-2 T1/T2 inclusion list, one subject key per line.
# The outputs/ folder under base_dir must already exist; open() does not create it.
out_path = '{}/outputs/lst_subjkey_include_T1_and_T2_y2.txt'.format(base_dir)
with open(out_path, 'w+') as f:
    f.writelines("{}\n".format(key) for key in lst_y2)

# Make subject list for rs-fMRI
Include a subject only if inclusion is recommended for rs-fMRI in every one of its entries. For example, if subject A has three inclusion entries at baseline, all three entries must be 1.

In [None]:
# Inclusion-flag column (resting-state fMRI) checked by the loops in this
# section; this replaces the value sub_headers held for the previous section.
sub_headers = [
    'imgincl_rsfmri_include',
]

## Baseline

In [None]:
# Build the baseline inclusion list for the columns named in sub_headers.
# sub_headers is also assigned earlier in the notebook with different columns,
# so re-run the cell that sets it for this section before re-running this one.
lst_bsl = []
# One groupby pass hands over each subject's rows directly instead of
# re-filtering the whole table for every subject; sort=False keeps the
# subjects in order of first appearance.
for subjid, df_tmp in df_inc_bsl.groupby('subjectkey', sort=False):
    flags = df_tmp[sub_headers]
    # A subject's entries are expected to carry identical flags; report any
    # subject whose rows differ so it can be inspected by hand.
    if len(flags.drop_duplicates()) != 1:
        print("{} Rows are not identical?".format(subjid))
    # Keep the subject only when every flag in every entry equals 1. Testing
    # "== 1" (rather than "no 0 present") also rejects missing values, which
    # would otherwise be treated as included.
    if (flags == 1).all(axis=None):
        lst_bsl.append(subjid)

In [None]:
# Number of baseline subjects kept for rs-fMRI.
len(lst_bsl)

In [None]:
# Save the baseline rs-fMRI inclusion list, one subject key per line.
# The outputs/ folder under base_dir must already exist; open() does not create it.
out_path = '{}/outputs/lst_subjkey_include_rsfMRI_bsl.txt'.format(base_dir)
with open(out_path, 'w+') as f:
    f.writelines("{}\n".format(key) for key in lst_bsl)

## Year 2

In [None]:
# Build the year-2 inclusion list for the columns named in sub_headers.
# sub_headers is also assigned earlier in the notebook with different columns,
# so re-run the cell that sets it for this section before re-running this one.
lst_y2 = []
# One groupby pass hands over each subject's rows directly instead of
# re-filtering the whole table for every subject; sort=False keeps the
# subjects in order of first appearance.
for subjid, df_tmp in df_inc_y2.groupby('subjectkey', sort=False):
    flags = df_tmp[sub_headers]
    # A subject's entries are expected to carry identical flags; report any
    # subject whose rows differ so it can be inspected by hand.
    if len(flags.drop_duplicates()) != 1:
        print("{} Rows are not identical?".format(subjid))
    # Keep the subject only when every flag in every entry equals 1. Testing
    # "== 1" (rather than "no 0 present") also rejects missing values, which
    # would otherwise be treated as included.
    if (flags == 1).all(axis=None):
        lst_y2.append(subjid)

In [None]:
# Number of year-2 subjects kept for rs-fMRI.
len(lst_y2)

In [None]:
# Save the year-2 rs-fMRI inclusion list, one subject key per line.
# The outputs/ folder under base_dir must already exist; open() does not create it.
out_path = '{}/outputs/lst_subjkey_include_rsfMRI_y2.txt'.format(base_dir)
with open(out_path, 'w+') as f:
    f.writelines("{}\n".format(key) for key in lst_y2)