In this notebook nwe prepare the number of subjects finally used, saving their demographic data for later uses.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from os.path import join as opj
import os
import pickle
import shutil
from scipy.spatial.distance import squareform

In [2]:
data_dir = "../data"

Load demo data from ABIDE

In [3]:
demo_data = pd.read_csv(opj(data_dir, "common_pheno_abide.csv"))
demo_data.shape

(2226, 61)

Get list of subjects with issues during preprocessing, from the list Amaia provided

In [4]:
amaia_subjects_dat = pd.read_csv(opj(data_dir, "structural.csv"))
subject_list = amaia_subjects_dat.sub_id.to_list()

amaia_discarded_ids=set(demo_data.SUB_ID).difference(set(amaia_subjects_dat.sub_id))

len(amaia_discarded_ids)

309

Create new SITE ID column by merging several cohorts, following ABIDE recommendations

In [5]:
site_names = demo_data.SITE_ID.unique()
merged_site_names = ['CALTECH', 'CMU', 'KKI', 'LEUVEN', 'LEUVEN', 
                     'MAX_MUN', 'NYU', 'OHSU', 'OLIN', 'PITT', 'SBL', 'SDSU','STANFORD', 
                     'TCD', 'UCLA',  'UCLA', 'UM', 'UM', 'USM', 
                     'YALE', 'BNI', 'EMC', 'ETH', 'GU', 'IP', 'IU',
                     'KKI', 'KUL_3', 'NYU', 'NYU_2', 'OHSU', 'ONRC', 
                     'SDSU', 'SU_2', 'TCD', 'UCD', 'UCLA', 'MIA_1', 'USM']

demo_data["SITE_ID_MERGED"] = demo_data.SITE_ID.replace(to_replace=site_names, value=merged_site_names)

Subset of data using the list of Amaia's subjects

In [6]:
demo_data_amaia = pd.merge(pd.DataFrame({'SUB_ID':subject_list}), demo_data, on="SUB_ID")
print(demo_data_amaia.shape)
print("number of cohort names",  len(demo_data_amaia.SITE_ID.unique()))
print("After merging cohorts", len(demo_data_amaia.SITE_ID_MERGED.unique()))

(1917, 62)
number of cohort names 35
After merging cohorts 26


Get subject IDS that had too short acquisitions

In [7]:
ab_I_short_ts = [int(subj.split("_")[::-1][0]) for subj in pd.read_csv(opj(data_dir, "data_I_subject2remove.txt"), 
                                                                        header=None).iloc[:,0].values]

ab_II_short_ts = [int(subj.split("_")[::-1][0]) for subj in pd.read_csv(opj(data_dir, "data_II_subject2remove.txt"), 
                                                                       header=None).iloc[:,0].values]
ab_short_ts = ab_I_short_ts + ab_II_short_ts
len(ab_short_ts)

130

A few other subjects that we found did not have full brain coverage when parcellating the time series,
so they showed NaNs in the connectivity matrices

In [8]:
not_full_atlas_cov = [51558, 50653, 50648, 50651, 50652] 

Extract subject from cohorts "KUL_3" and "NYU_2". These will be discarded because they only have ASD subjects, and combat harmonization could have problems with this

In [9]:
kul3_subjects = demo_data_amaia[demo_data_amaia.SITE_ID_MERGED == "KUL_3"].SUB_ID
nyu2_subjects = demo_data_amaia[demo_data_amaia.SITE_ID_MERGED == "NYU_2"].SUB_ID

Then: remove subjects with short TS, subjects corresponding to KUL and NYU_2 and those with no full Desikan coverage

In [10]:
cond = (demo_data_amaia.SUB_ID.isin(ab_short_ts)) | \
(demo_data_amaia.SUB_ID.isin(kul3_subjects)) | \
(demo_data_amaia.SUB_ID.isin(nyu2_subjects)) | demo_data_amaia.SUB_ID.isin(not_full_atlas_cov)
print(sum(cond))

91


Filter data leaving the previous subjects out

In [11]:
demo_data_amaia_filtered = demo_data_amaia.loc[~cond,:]
demo_data_amaia_filtered.shape

(1826, 62)

Add motion to this dataset to filter subjects with large head motion during the scanner

In [12]:
# Read motion parameters
motion_params_df = pd.concat([pd.read_csv(opj(data_dir, "data_I_mov_params.csv"), sep=";"),
                                   pd.read_csv(opj(data_dir, "data_II_mov_params.csv"), sep=";")])
motion_params_df.columns = [col.strip() for col in motion_params_df.columns]
motion_params_df = motion_params_df[~motion_params_df.ID.str.contains("Long")] # Discard longitudinal
motion_params_df['SUB_ID'] = list(map(lambda x: int(x.split("_")[::-1][0]), motion_params_df.ID.to_list()))

Set a very restricted threshold (0.3 mm) above which we are going to discard subjects

In [13]:
thr = 0.3
print("Threshold movement %f" % thr)
demo_data_amaia_filtered_fwd_03 = pd.merge(demo_data_amaia_filtered, motion_params_df, on ="SUB_ID")
# Take only those with FWD < 0.3
demo_data_amaia_filtered_fwd_03 = demo_data_amaia_filtered_fwd_03[demo_data_amaia_filtered_fwd_03.FD_mean < thr] 

print(demo_data_amaia_filtered_fwd_03.DX_GROUP.value_counts())
print(demo_data_amaia_filtered_fwd_03.groupby("DX_GROUP").FD_mean.mean())

demo_data_amaia_filtered_fwd_03.loc[:, demo_data_amaia_filtered_fwd_03.dtypes != "object"] = demo_data_amaia_filtered_fwd_03.loc[:, demo_data_amaia_filtered_fwd_03.dtypes != "object"].apply(lambda x: [pd.NA if ii < -9000 else ii for ii in x])

# Save this demographics to excel file
demo_data_amaia_filtered_fwd_03.to_excel(opj(data_dir, "demo_final_FD%s.xlsx" % (str(int(thr*10)).zfill(2))), index=False)

asd_subjs = demo_data_amaia_filtered_fwd_03.loc[demo_data_amaia_filtered_fwd_03.DX_GROUP==1, "SUB_ID"].to_numpy()
tc_subjs = demo_data_amaia_filtered_fwd_03.loc[demo_data_amaia_filtered_fwd_03.DX_GROUP==2, "SUB_ID"].to_numpy()

subjects_subgroups_fwd_03 = { 'asd_subjs': asd_subjs, 'tc_subjs': tc_subjs}

# Save this subset of subjects to disk
f = open(opj(data_dir, "subjects_subgroups_FD%s.pkl" %  (str(int(thr*10)).zfill(2))), "wb")
pickle.dump(subjects_subgroups_fwd_03, f)
f.close()

Threshold movement 0.300000
2    884
1    657
Name: DX_GROUP, dtype: int64
DX_GROUP
1    0.152385
2    0.134950
Name: FD_mean, dtype: float64


Save IDS in the repository to make them public

In [14]:
demo_data_amaia_filtered_fwd_03.ID.to_csv("../data/ABIDE_ids_used.txt", index=False, header=None)

### Some Descriptive stats

Age:

In [15]:
print("full sample AGE")
print("")
print(demo_data_amaia_filtered_fwd_03.AGE_AT_SCAN.describe().round(2))

full sample AGE

count    1541.00
mean       16.50
std         8.82
min         5.22
25%        10.69
50%        13.81
75%        19.66
max        64.00
Name: AGE_AT_SCAN, dtype: float64


In [16]:
print("full sample AGE")
print("")
print(demo_data_amaia_filtered_fwd_03.AGE_AT_SCAN.describe().round(2))

full sample AGE

count    1541.00
mean       16.50
std         8.82
min         5.22
25%        10.69
50%        13.81
75%        19.66
max        64.00
Name: AGE_AT_SCAN, dtype: float64


In [17]:
print("AGE per group")
print("")
print(demo_data_amaia_filtered_fwd_03.groupby('DX_GROUP').AGE_AT_SCAN.describe().round(2))

AGE per group

          count   mean   std   min    25%   50%    75%   max
DX_GROUP                                                    
1         657.0  16.85  9.18  5.22  10.92  14.0  19.41  59.0
2         884.0  16.25  8.54  5.89  10.44  13.5  19.75  64.0


In [18]:
print("By SITE ID AGE")
print("")
print(demo_data_amaia_filtered_fwd_03.groupby("SITE_ID_MERGED")['AGE_AT_SCAN'].describe().round(2))

By SITE ID AGE

                count   mean    std    min    25%    50%    75%    max
SITE_ID_MERGED                                                        
BNI              40.0  39.85  15.59  18.00  22.00  45.00  53.25  64.00
CALTECH          37.0  27.42   9.76  17.00  20.80  22.90  34.10  56.20
CMU              20.0  25.45   5.29  19.00  21.00  25.00  27.25  40.00
EMC              27.0   8.40   1.09   6.33   7.95   8.52   8.86  10.66
ETH              26.0  22.91   4.57  13.83  19.79  22.71  26.85  30.67
GU               76.0  10.92   1.66   8.06   9.51  10.89  12.31  13.91
IU               37.0  24.43   7.59  17.00  20.00  22.00  25.00  54.00
KKI             205.0  10.29   1.28   8.02   9.30  10.25  11.09  12.99
LEUVEN           61.0  18.18   4.97  12.20  14.20  16.60  22.00  32.00
MAX_MUN          44.0  28.77  11.79   7.00  22.75  29.00  33.00  58.00
NYU             245.0  13.82   6.74   5.22   8.90  11.69  16.31  39.10
OHSU             83.0  10.95   2.05   7.00   9.00  11.00  12.

Sex:

In [19]:
print("full sample Sex (1==Men, 2==Women)")
print("")
demo_data_amaia_filtered_fwd_03.SEX.value_counts()

full sample Sex (1==Men, 2==Women)



1    1237
2     304
Name: SEX, dtype: int64

In [20]:
print("Sex per group")
print("")
print(demo_data_amaia_filtered_fwd_03.groupby('DX_GROUP').SEX.value_counts())

Sex per group

DX_GROUP  SEX
1         1      565
          2       92
2         1      672
          2      212
Name: SEX, dtype: int64


In [21]:
print("Women stratified by site")
print("")
demo_data_amaia_filtered_fwd_03.groupby("SITE_ID_MERGED")['SEX'].apply(lambda x: sum(x==2))

Women stratified by site



SITE_ID_MERGED
BNI          0
CALTECH      8
CMU          4
EMC          4
ETH          0
GU          28
IU           9
KKI         70
LEUVEN       7
MAX_MUN      7
NYU         42
OHSU        31
OLIN         4
ONRC         5
PITT         7
SBL          0
SDSU        15
STANFORD     6
TCD          0
UCD          7
UCLA         8
UM          24
USM          5
YALE        13
Name: SEX, dtype: int64

In [22]:
print("Men stratified by site")
print("")
demo_data_amaia_filtered_fwd_03.groupby("SITE_ID_MERGED")['DX_GROUP'].apply(lambda x: sum(x==1))

Men stratified by site



SITE_ID_MERGED
BNI          19
CALTECH      18
CMU           8
EMC          15
ETH           6
GU           29
IU           17
KKI          48
LEUVEN       26
MAX_MUN      18
NYU         115
OHSU         33
OLIN         12
ONRC          7
PITT         22
SBL          13
SDSU         42
STANFORD     11
TCD          33
UCD          15
UCLA         36
UM           41
USM          52
YALE         21
Name: DX_GROUP, dtype: int64

In [23]:
pd.to_numeric(demo_data_amaia_filtered_fwd_03[demo_data_amaia_filtered_fwd_03.DX_GROUP==1]['FIQ']).describe().round(2)

count    610.00
mean     106.13
std       16.31
min       41.00
25%       95.00
50%      107.00
75%      118.00
max      149.00
Name: FIQ, dtype: float64

In [24]:
pd.to_numeric(demo_data_amaia_filtered_fwd_03[demo_data_amaia_filtered_fwd_03.DX_GROUP==1]['ADOS_TOTAL']).describe().round(2)

count    414.00
mean      11.48
std        3.86
min        2.00
25%        8.00
50%       11.00
75%       14.00
max       23.00
Name: ADOS_TOTAL, dtype: float64