# Notebook for creating and saving Subject objects from datasets

In [1]:
%load_ext autoreload
%autoreload 2

import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from subprocess import call

import sys
sys.path.append('../src/features')

from subject import Subject

# Resolve the project directories relative to the notebook's working directory.
# Directory strings keep a trailing '/' because other cells extend them by plain
# string concatenation.
cur_dir = os.getcwd()
asd_diag_dir = os.path.dirname(cur_dir)
abide_dir = os.path.dirname(asd_diag_dir) + '/abide/'
# abide_dir already ends with '/', so the appended part must not start with one
# (the previous form produced '.../abide//data/...').
roi_200_dir = abide_dir + 'data/ABIDEI_preprocessed/Outputs/cpac/filt_noglobal/rois_cc200/'
subjects_dir = asd_diag_dir + '/data/ABIDEI_subjects/'
# Last expression of the cell: displays whether the ROI directory is present.
os.path.exists(roi_200_dir)

True

# Load previously saved subjects for modification

In [2]:
# Load ABIDEI preprocessed rois by loading subjects
def open_pickle(f):
    """Load and return the object stored in the pickle file at path ``f``.

    Args:
        f: path of the pickle file to read.

    Returns:
        The unpickled object.

    Raises:
        OSError: if the file cannot be opened.
        pickle.UnpicklingError (or another exception raised while unpickling):
            if the content is not a valid pickle.
    """
    # NOTE: unpickling can execute arbitrary code; only use this on trusted files.
    # The context manager closes the file even when pickle.load raises.
    with open(f, 'rb') as file:
        return pickle.load(file)

def load_subjects_d(subject_folder):
    """Unpickle every entry of ``subject_folder`` and index the results by subject id.

    Args:
        subject_folder: directory whose entries are all loaded with ``open_pickle``.

    Returns:
        dict mapping each loaded object's ``_sub_id`` to the object itself.
        If two files carry the same ``_sub_id``, the one listed later wins.
    """
    loaded = (
        open_pickle(os.path.join(subject_folder, entry))
        for entry in os.listdir(subject_folder)
    )
    return {subject._sub_id: subject for subject in loaded}

# Dict of every pickled subject found in subjects_dir, keyed by the subject's _sub_id.
subjects_d = load_subjects_d(subjects_dir)

## Subjects from ABIDE I preprocessed

In [28]:
# Phenotypic CSV located directly under abide_dir (abide_dir ends with '/').
phen_file = abide_dir + 'Phenotypic_V1_0b_preprocessed1.csv'
# The subject-building loop reads FILE_ID, SUB_ID, SITE_ID, SEX, AGE_AT_SCAN and DX_GROUP from this frame.
phen_df = pd.read_csv(phen_file)

In [36]:
# Build one Subject per phenotypic row that has a matching cc200 ROI time-series file.
subjects = list()
# List the ROI directory once; a set gives O(1) membership tests instead of
# re-listing the directory for every row of the phenotypic table.
roi_files = set(os.listdir(roi_200_dir))
for index, row in phen_df.iterrows():
    file = row['FILE_ID'] + '_rois_cc200.1D'
    if file not in roi_files:
        # No ROI file for this row, so no Subject is created for it.
        continue
    data = list()
    with open(os.path.join(roi_200_dir, file), newline='') as f:
        # The first line is skipped; every other line is one row of
        # tab-separated values copied into a 200-element vector.
        for line in f.readlines()[1:]:
            ts = np.empty((200,))
            for n, value in enumerate(line.split("\t")):
                ts[n] = value
            data.append(ts)
    data_dict = {'filt_noglobal_roi_200_Cradd' : np.array(data)}
    label_dict = {'dx_group': row['DX_GROUP']}
    s = Subject(row['SUB_ID'], row['SITE_ID'], row['SEX'], row['AGE_AT_SCAN'], data_dict, label_dict)
    subjects.append(s)
# Every file in the ROI directory should have produced exactly one Subject.
assert len(subjects) == len(roi_files)

# Add ABIDE I preprocessed data to existing subjects

In [17]:
def open_1d_rois(file, n_rois=200):
    """Read a tab-separated ROI time-series ``.1D`` file into a 2-D float array.

    The first line of the file is skipped; every remaining line is parsed as
    one row of tab-separated numbers.

    Args:
        file: path of the file to read.
        n_rois: number of values expected on every data line (default 200).

    Returns:
        np.ndarray of floats with one row per data line and ``n_rois`` columns;
        an empty array when the file has no data lines.

    Raises:
        ValueError: if a value is not numeric or a line does not hold exactly
            ``n_rois`` values. Short rows used to leave uninitialised
            ``np.empty`` memory in the result, so they are rejected instead.
    """
    data = list()
    with open(file, newline='') as f:
        next(f, None)  # skip the first line
        for line_no, line in enumerate(f, start=2):
            values = [float(token) for token in line.split("\t")]
            if len(values) != n_rois:
                raise ValueError(
                    f'{file}: line {line_no} has {len(values)} values, expected {n_rois}'
                )
            data.append(values)
    return np.array(data)

def add_data(subjects_d, data_dir, data_name):
    """Attach the ROI data of every file in ``data_dir`` to its subject.

    The subject id is taken from the last '_'-separated part of the file name
    that starts with '005'. Files whose name holds no such part, or whose
    subject id is not in ``subjects_d``, are reported with ``print`` and skipped.

    Args:
        subjects_d: dict mapping integer subject ids to subject objects; each
            matched subject's ``_data_dict`` is modified in place.
        data_dir: directory holding the ROI files read with ``open_1d_rois``.
        data_name: key under which the loaded array is stored in ``_data_dict``.

    Returns:
        None.
    """
    # List the directory once so the reported count matches the files processed.
    file_names = os.listdir(data_dir)
    print(f'Number of files in data dir {len(file_names)}')
    for f in file_names:
        sub_id = None
        for part in f.split('_'):
            if part[0:3] == '005':
                sub_id = int(part)
        if sub_id is None:
            print(f'error getting sub_id from {f}')
            continue
        if sub_id not in subjects_d:
            # Previously a KeyError aborted the whole run on the first unknown id.
            print(f'no subject with sub_id {sub_id} for {f}')
            continue
        subjects_d[sub_id]._data_dict[data_name] = open_1d_rois(os.path.join(data_dir, f))
        
# Select which ABIDE I preprocessing variant to attach to the loaded subjects:
# keep exactly one (data_dir, data_name) pair uncommented.
# abide_dir already ends with '/', so the appended part does not start with one.
# data_dir = abide_dir + 'data/ABIDEI_preprocessed/Outputs/cpac/nofilt_noglobal/rois_cc200/'
# data_name = 'nofilt_noglobal_roi_200_Cradd'
# data_dir = abide_dir + 'data/ABIDEI_preprocessed/Outputs/cpac/nofilt_global/rois_cc200/'
# data_name = 'nofilt_global_roi_200_Cradd'
data_dir = abide_dir + 'data/ABIDEI_preprocessed/Outputs/cpac/filt_global/rois_cc200/'
data_name = 'filt_global_roi_200_Cradd'
add_data(subjects_d, data_dir, data_name)

Number of files in data dir 884


In [18]:
# Sanity check: count the subjects whose _data_dict now contains data_name
g_c = 0
for sub_id, s in subjects_d.items():
    # a bool converts to 0 or 1, so this adds one per subject holding the key
    g_c += int(data_name in s._data_dict)
print(g_c)

884


# Add CPAC preprocessed data

In [3]:
# Project data directory (note: this rebinds `data_dir`, which the ABIDE I cell
# above used for a preprocessed ROI directory).
data_dir = asd_diag_dir + '/data/'
# Alternative C-PAC output locations, kept for reference:
# cpac_output_dir = data_dir + 'cpac_output/output/pipeline_abide_cpac_run_freq-filter_nuisance/'
# cpac_output_dir = data_dir + 'cpac_output_cmu_a/output/pipeline_abide_cpac_run/'
# Directory whose entries are per-subject C-PAC output folders.
cpac_output_dir = data_dir + 'cpac_output_t/output/pipeline_abide_cpac_run/'

In [4]:
# for filter no global
# Path, relative to a subject's output folder, of the folder that holds roi_stats.npz.
# filt_noglobal = 'roi_timeseries/_scan_rest_run-1/_selector_CSF-2mmE-M_aC-CSF+WM-2mm-DPC5_M-SDB_P-2_BP-B0.01-T0.1/_mask_CC200_mask_file_..cpac_templates..CC200.nii.gz'
filt_noglobal = 'roi_timeseries/_scan_rest_run-1/_selector_CSF-2mm-M_aC-CSF+WM-2mm-DPC5_M-SDB_P-2_BP-B0.01-T0.1/_mask_CC200_mask_file_..cpac_templates..CC200.nii.gz'
def get_roi(sub_folder):
    """Load the ROI array stored in a subject's ``roi_stats.npz``.

    Args:
        sub_folder: the subject's output folder; the archive is looked up at
            ``<sub_folder>/<filt_noglobal>/roi_stats.npz``.

    Returns:
        The archive's ``arr_0`` array without its first five rows, or ``None``
        (after printing a message) when the archive does not exist.
    """
    roi_file = os.path.join(sub_folder, filt_noglobal, 'roi_stats.npz')
    if not os.path.exists(roi_file):
        print(f"Couldn't get rois for {sub_folder}")
        return None
    # np.load reads the .npz archive directly, so there is no need to shell out
    # to `unzip` (which wrote arr_0.npy next to the archive and asks before
    # overwriting it on a second run).
    with np.load(roi_file, allow_pickle=True) as archive:
        # first row is nans and skip first 5
        return archive['arr_0'][5:]

# Collect the ROI array of every subject folder found in the C-PAC output directory,
# keyed by integer subject id. Folders without ROI data are left out.
cpac_sub_roi_d = {}
for sub_f in os.listdir(cpac_output_dir):
    # Folder names look like 'sub-0050657_ses-1': the id is the number between
    # the first '-' and the following '_'.
    sub_id = int(sub_f.split('-')[1].split('_')[0])
    sub_folder = os.path.join(cpac_output_dir, sub_f)
    roi = get_roi(sub_folder)
    if roi is not None:
        # Reuse the array loaded above instead of calling get_roi a second time.
        cpac_sub_roi_d[sub_id] = roi

Couldn't get rois for /home/cparish/Capstone/asd_diagnosis_fmri/data/cpac_output_t/output/pipeline_abide_cpac_run/sub-0050657_ses-1
Couldn't get rois for /home/cparish/Capstone/asd_diagnosis_fmri/data/cpac_output_t/output/pipeline_abide_cpac_run/sub-0050649_ses-1
Couldn't get rois for /home/cparish/Capstone/asd_diagnosis_fmri/data/cpac_output_t/output/pipeline_abide_cpac_run/sub-0050651_ses-1


In [5]:
# View over the objects stored in subjects_d (rebinds `subjects`, which the
# ABIDE I cell above builds as a list).
subjects = subjects_d.values()

In [6]:
# Display the subject ids for which C-PAC ROI data was loaded.
cpac_sub_roi_d.keys()

dict_keys([50957, 50955, 50669, 50958, 50653, 50961, 50956, 50954, 50959, 50960])

In [7]:
# Print and collect the ids of all subjects whose site is NYU
sites = set()
g_set = set()
for s in subjects:
    if s._site_id != 'NYU':
        continue  # not an NYU subject
    print(s._sub_id)
    g_set.add(s._sub_id)

51082
51109
51156
51102
50969
51069
50978
51027
50962
51012
51019
51071
51122
51018
51124
51086
50985
50995
51083
50986
51113
51152
51020
50994
51068
51050
51056
50958
50966
51010
50957
51058
51130
51025
51080
50982
51103
51044
51084
50998
51098
50959
51040
51155
51066
51126
51029
51106
51047
51110
51151
51034
51000
51148
51129
51032
51077
51090
51061
51001
50965
51078
50970
51087
51062
51064
50993
51013
51114
51100
51028
51093
51073
51075
51088
51065
51081
51128
51063
51057
51116
51007
51046
51085
51153
50967
51048
50983
51014
51002
51117
50997
51118
50976
51079
51036
51011
50981
51021
50972
50989
51023
50954
51009
51026
51059
51107
50955
51049
50968
51033
50984
51070
51094
51055
51042
50996
51101
51067
51017
51016
50991
51051
50956
51015
51149
51089
50988
51121
51146
50964
50973
50977
50992
51006
51105
51111
51053
50979
51104
51038
50987
51095
51045
51054
51024
51091
51074
51060
51097
51039
51150
51052
50999
51041
51147
51131
51003
51035
51072
50974
50960
51112
51096
51123
50990
5115

In [49]:
# Check whether subject id 50961 is among the NYU ids collected in g_set.
50961 in g_set

True

In [9]:
# Attach the C-PAC ROI array to every subject that has one
for s in subjects:
    if s._sub_id not in cpac_sub_roi_d:
        continue  # no C-PAC output was loaded for this subject
    s._data_dict['cpac_filt_noglobal_t'] = cpac_sub_roi_d[s._sub_id]

h
h
h
h
h
h
h
h
h
h


In [19]:
# Save subjects
# Target directory: <parent of the working directory>/data/ABIDEI_subjects/
save_dir = os.path.dirname(cur_dir) + '/data/ABIDEI_subjects/'
for subject in subjects:
    # Each subject saves itself; see Subject._save_subject for the file name and
    # format (not visible in this notebook).
    subject._save_subject(save_dir)

In [19]:
# Look up the subject with id 50653.
s = subjects_d[50653]

# Site TRs can be found on the ABIDE website or at the following link:
* https://www.researchgate.net/figure/SCANNING-PARAMETERS-OF-DIFFERENT-SITES-OF-ABIDE-1_tbl1_335722272

In [23]:
# TR for each ABIDE I site, keyed by site name (presumably in seconds; confirm
# against the source linked above).
# Every value is numeric: 'KKI' was previously the string '2.5', which would break
# arithmetic on the TRs and be written to JSON as a string.
site_trs = {'CALTECH': 2, 'CMU': 2, 'KKI': 2.5, 'LEUVEN_1': 1.656, 'LEUVEN_2': 1.656, 'MAX_MUN': 3,
           'NYU': 2, 'OHSU': 2.5, 'OLIN': 2.5, 'PITT': 1.5, 'SBL': 2.2, 'SDSU': 2, 'STANFORD': 2,
           'TRINITY': 2, 'UCLA_1': 3, 'UCLA_2': 3, 'UM_1': 2, 'UM_2': 2, 'USM': 2, 'YALE': 2}


In [24]:
import json

# Write the site -> TR mapping as JSON to <parent of the working directory>/data/dicts/.
# Only trs_save_file is assigned: the previous chained assignment also overwrote
# save_dir (the subjects output directory) with this file path.
trs_save_file = os.path.dirname(cur_dir) + '/data/dicts/ABIDEI_site_trs.json'
# Create the target directory if it does not exist yet so open() cannot fail on it.
os.makedirs(os.path.dirname(trs_save_file), exist_ok=True)
with open(trs_save_file, 'w') as fp:
    json.dump(site_trs, fp)