This notebook creates a subject list for subjects that should be fed into baracus.
Criteria:
* demo info and mri
* age >=18
* at least one retest session >=7d
* exclude sites with N<20

The script makes corrections as som things are missing from the raw data:
* drop subjects with retest_duration "150-480". (NYU1)
* fill in retest units for IPCAS4 XHCUMS

Input:
* session_info.tsv (based on : 01_create_dl_file.py; 02_create_onestudy_bids.py; 03_collect_session_info.py ) 

In [1]:
import pandas as pd
import os
from os.path import join as opj
root_dir = "/Users/franzliem/Dropbox/baracorrus_data/data"
out_dir = "/Users/franzliem/Dropbox/baracorrus_data/results"

# get session info from session_info.tsv (derived from bids subject folders)

In [2]:
df_sessions = pd.read_csv(opj(root_dir, "from_1000fc", "session_info.tsv"), sep="\t", na_values="n/a")
df_sessions["session_kind"] = "retest"
df_sessions.loc[df_sessions.session_id=="ses-1", "session_kind"] = "test"
df_sessions = df_sessions[df_sessions.retest_duration != "150-480"]

# drop and take from corr pheno file:
df_sessions.drop(labels=[ "sex", "age_at_scan_1", "handedness"], axis=1, inplace=True)
print(df_sessions.isnull().sum())


df_sessions.head()

participant_id                     0
preceding_condition             3267
resting_state_instruction       3121
retest_design                   2913
retest_duration                 1510
retest_units                    1582
session_id                         0
site                               0
visual_stimulation_condition    3041
session_kind                       0
dtype: int64


Unnamed: 0,participant_id,preceding_condition,resting_state_instruction,retest_design,retest_duration,retest_units,session_id,site,visual_stimulation_condition,session_kind
0,BMB1x0003001,,,,,,ses-1,BMB1,,test
1,BMB1x0003001,,,,10.0,minutes,ses-2,BMB1,,retest
2,BMB1x0003002,,,,,,ses-1,BMB1,,test
3,BMB1x0003002,,,,10.0,minutes,ses-2,BMB1,,retest
4,BMB1x0003004,,,,,,ses-1,BMB1,,test


In [3]:
df_test = df_sessions[df_sessions.session_kind=="test"].copy()
df_test.drop(labels=["retest_duration", "retest_units"], axis=1, inplace=True)

In [4]:
# fill in or drop missing retest units info
df_retest = df_sessions[df_sessions.session_kind=="retest"].copy()

# some retest unist are missing and were derived from Table 1 of the Zuo paper
df_retest.loc[df_retest["site"]=="IPCAS4", ["retest_units"]] = "minutes"
df_retest.loc[df_retest["site"]=="XHCUMS", ["retest_units"]] = "days"
df_retest.replace({"retest_units":{"w":"weeks"}}, inplace=True)
df_retest = df_retest.dropna(subset=["retest_duration"])
df_retest.isnull().sum()

participant_id                     0
preceding_condition             2062
resting_state_instruction       2638
retest_design                   2425
retest_duration                    0
retest_units                       0
session_id                         0
site                               0
visual_stimulation_condition    2558
session_kind                       0
dtype: int64

In [5]:
# convert retest duration into days
def convert_into_days(duration, unit):
    if unit == "minutes":
        f = 1./(24*60)
    elif unit == "days":
        f = 1
    elif unit == "weeks":
        f = 7
    elif pd.isnull(unit):
        f = pd.np.nan
    else:
        raise Exception("Unknown unit {}".format(unit))
    return float(duration)*f

df_retest['retest_duration_days'] = \
 df_retest.apply(lambda row: convert_into_days(row["retest_duration"], row["retest_units"]), axis=1)
df_retest.drop(labels=["retest_duration", "retest_units"], axis=1, inplace=True)
df_retest.head()

Unnamed: 0,participant_id,preceding_condition,resting_state_instruction,retest_design,session_id,site,visual_stimulation_condition,session_kind,retest_duration_days
1,BMB1x0003001,,,,ses-2,BMB1,,retest,0.006944
3,BMB1x0003002,,,,ses-2,BMB1,,retest,0.006944
5,BMB1x0003004,,,,ses-2,BMB1,,retest,0.006944
7,BMB1x0003006,,,,ses-2,BMB1,,retest,0.006944
9,BMB1x0003007,,,,ses-2,BMB1,,retest,0.006944


# get pheno data (age, sex) from corr.csv

In [6]:
df_pheno = pd.read_csv(opj(root_dir, "from_corr", "corr.csv"), na_values=["#", "NoPhenotypicData"], dtype={"SUBID":str})
df_pheno.columns = [c.lower() for c in df_pheno.columns]
df_pheno.site = df_pheno.site.str.replace("_", "")
df_pheno.dropna(subset=["subid"], inplace=True)
df_pheno["subid_orig"] = df_pheno["subid"]
df_pheno["subid"] = df_pheno["subid_orig"].str.rjust(7, "0") #df_pheno["subid_orig"].map("{:010}".format)
df_pheno["participant_id"] = df_pheno["site"] + "x" + df_pheno["subid"] 

demos = df_pheno[["participant_id", "sex", "age_at_scan_1", "handedness"]].copy()
demos.drop_duplicates(subset=["participant_id"], inplace=True)
demos.dropna(subset=["age_at_scan_1"], inplace=True)
demos.head()
demos.to_clipboard()

#print("age info from {} subjects".format(demos.shape[0]))

In [7]:
df = pd.concat((df_test, df_retest))
df.sort_values(by=["participant_id", "session_id"], inplace=True)
df = pd.merge(df, demos, how="inner", on=["participant_id"])
df.to_clipboard()
print(df.shape)
df.head()

(3939, 12)


Unnamed: 0,participant_id,preceding_condition,resting_state_instruction,retest_design,retest_duration_days,session_id,session_kind,site,visual_stimulation_condition,sex,age_at_scan_1,handedness
0,BMB1x0003001,,,,,ses-1,test,BMB1,,2.0,25.13,R
1,BMB1x0003001,,,,0.006944,ses-2,retest,BMB1,,2.0,25.13,R
2,BMB1x0003002,,,,,ses-1,test,BMB1,,1.0,23.96,R
3,BMB1x0003002,,,,0.006944,ses-2,retest,BMB1,,1.0,23.96,R
4,BMB1x0003004,,,,,ses-1,test,BMB1,,2.0,31.15,R


# filter interesting subjects

* demo info and mri 
* age >=18
* at least one retest session >=7d
* exclude sites with N<20

In [8]:
subjects = df.loc[((df.age_at_scan_1 >=18) & (df.retest_duration_days>=7)), "participant_id"].unique()

In [9]:
len(subjects)

679

In [10]:
df_baracus = df[df.participant_id.isin(subjects)].copy()
N = df_baracus.groupby("site").count()["participant_id"]

drop_sites_N = N[N<20]
drop_sites = drop_sites_N.index
print("Drop the following site bc they have less than 20 subjects: {}".format(drop_sites))
print(drop_sites_N)

df_baracus = df_baracus[~df_baracus.site.isin(drop_sites)]

Drop the following site bc they have less than 20 subjects: Index(['SWU1', 'Utah2'], dtype='object', name='site')
site
SWU1      6
Utah2    10
Name: participant_id, dtype: int64


In [11]:
baracus_subjects = df_baracus.participant_id.unique().tolist()
print(len(baracus_subjects))

prepare_dir = opj(out_dir, "prepare")
if not os.path.isdir(prepare_dir):
    os.makedirs(prepare_dir)
out_file = opj(prepare_dir, "baracus_subjects.txt")
with open(out_file, "w") as fi:
    fi.write("\n".join(baracus_subjects))
print("subects written to {}".format(out_file))

677
subects written to /Users/franzliem/Dropbox/baracorrus_data/results/prepare/baracus_subjects.txt


In [12]:
df_ = df_baracus[["participant_id", "site"]].copy()
df_.drop_duplicates(inplace=True)
N = df_.groupby("site").count()["participant_id"]
N

site
BNU1       57
BNU2       61
HNU1       30
IACAS      28
IPCAS1     29
IPCAS8     13
LMU3       25
NYU2       31
SWU4      232
UM         80
UPSM1      26
UWM        25
Utah1      16
XHCUMS     24
Name: participant_id, dtype: int64