# Notebook for creating and saving Subject objects from datasets

In [27]:
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from subprocess import call

import sys
sys.path.append('../src/features')

from subject import Subject

# Derive every location from the notebook's working directory: the parent of
# the working directory is kept in `asd_diag_dir`, and the ABIDE folder is
# looked up one level above that.
cur_dir = os.getcwd()
asd_diag_dir = os.path.dirname(cur_dir)
abide_dir = f"{os.path.dirname(asd_diag_dir)}/abide/"
roi_200_dir = f"{abide_dir}/data/ABIDEI_preprocessed/Outputs/cpac/filt_noglobal/rois_cc200/"
# Last expression of the cell: shows whether the ROI directory is present.
os.path.exists(roi_200_dir)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


True

## Subjects from ABIDE I preprocessed

In [28]:
# Read the phenotypic CSV that sits directly inside the ABIDE folder
# (`abide_dir` already ends with a path separator).
phen_file = f"{abide_dir}Phenotypic_V1_0b_preprocessed1.csv"
phen_df = pd.read_csv(phen_file)

In [36]:
# Build one Subject per phenotypic row that has a matching ROI time-series
# file in `roi_200_dir`.

# List the directory once; a set gives constant-time membership tests instead
# of re-listing the directory for every row of the phenotypic table.
roi_200_files = set(os.listdir(roi_200_dir))

subjects = list()
for _, row in phen_df.iterrows():
    file = row['FILE_ID'] + '_rois_cc200.1D'
    if file not in roi_200_files:
        # No time-series file for this phenotypic entry.
        continue

    # Each file is tab-separated text whose first line is skipped (as in the
    # original line-by-line parser). `ndmin=2` keeps a (rows, columns) array
    # even when the file holds a single data row.
    roi_ts = np.loadtxt(
        os.path.join(roi_200_dir, file),
        delimiter='\t',
        skiprows=1,
        ndmin=2,
    )

    data_dict = {'filt_noglobal_roi_200_Cradd': roi_ts}
    label_dict = {'dx_group': row['DX_GROUP']}
    s = Subject(row['SUB_ID'], row['SITE_ID'], row['SEX'], row['AGE_AT_SCAN'], data_dict, label_dict)
    subjects.append(s)

# Every file in the ROI directory should have produced exactly one Subject.
assert len(subjects) == len(roi_200_files), (
    f"built {len(subjects)} subjects but found {len(roi_200_files)} files in {roi_200_dir}"
)

# Add CPAC preprocessed data

In [37]:
# Project data folder and, inside it, the output folder of the CPAC pipeline run.
data_dir = f"{asd_diag_dir}/data/"
cpac_output_dir = f"{data_dir}cpac_output/output/pipeline_abide_cpac_run_freq-filter_nuisance/"

In [38]:
# for filter no global: location of the CC200 ROI time series, relative to a
# subject's output folder.
filt_noglobal = 'roi_timeseries/_scan_rest_run-1/_selector_CSF-2mmE-M_aC-CSF+WM-2mm-DPC5_M-SDB_P-2_BP-B0.01-T0.1/_mask_CC200_mask_file_..cpac_templates..CC200.nii.gz'
def get_roi(sub_folder):
    """Load the ROI time series stored under a subject's output folder.

    Reads the ``arr_0`` array straight out of ``roi_stats.npz`` instead of
    shelling out to ``unzip``: no external binary is needed, nothing is
    extracted into the output tree, and re-running the cell cannot stall on an
    "overwrite?" prompt.

    Parameters
    ----------
    sub_folder : str
        Path to a single subject's output folder.

    Returns
    -------
    numpy.ndarray or None
        ``arr_0`` with its first five rows dropped, or ``None`` (after
        printing a message) when ``roi_stats.npz`` does not exist.
    """
    npz_path = os.path.join(sub_folder, filt_noglobal, 'roi_stats.npz')
    if not os.path.exists(npz_path):
        print(f"Couldn't get rois for {sub_folder}")
        return None

    # NOTE(review): allow_pickle=True is kept from the original call; it can
    # execute arbitrary code on untrusted archives, so only use this on
    # pipeline outputs you produced yourself.
    with np.load(npz_path, allow_pickle=True) as archive:
        # first row is nans and skip first 5
        return archive['arr_0'][5:]

# Map integer subject id -> ROI time series for every folder in the CPAC
# output directory that has ROI data.
cpac_sub_roi_d = {}
for sub_f in os.listdir(cpac_output_dir):
    # Folder names look like 'sub-0051459_ses-1'; the id is the number between
    # the first '-' and the following '_'.
    sub_id = int(sub_f.split('-')[1].split('_')[0])
    sub_folder = os.path.join(cpac_output_dir, sub_f)
    roi = get_roi(sub_folder)
    if roi is not None:
        # Reuse the array that was just loaded rather than calling get_roi a
        # second time for the same subject.
        cpac_sub_roi_d[sub_id] = roi

Couldn't get rois for /home/cparish/Capstone/asd_diagnosis_fmri/data/cpac_output/output/pipeline_abide_cpac_run_freq-filter_nuisance/sub-0051459_ses-1
Couldn't get rois for /home/cparish/Capstone/asd_diagnosis_fmri/data/cpac_output/output/pipeline_abide_cpac_run_freq-filter_nuisance/sub-0051469_ses-1
Couldn't get rois for /home/cparish/Capstone/asd_diagnosis_fmri/data/cpac_output/output/pipeline_abide_cpac_run_freq-filter_nuisance/sub-0051472_ses-1


In [39]:
# Attach the CPAC time series to each subject whose id has an entry in
# `cpac_sub_roi_d`; subjects without an entry are left unchanged.
subjects_with_cpac = (subj for subj in subjects if subj._sub_id in cpac_sub_roi_d)
for subj in subjects_with_cpac:
    subj._data_dict['cpac_filt_noglobal'] = cpac_sub_roi_d[subj._sub_id]

In [40]:
# Write every Subject to the ABIDEI_subjects folder, which sits under the
# parent of the working directory.
save_dir = f"{os.path.dirname(cur_dir)}/data/ABIDEI_subjects/"
for subject in subjects:
    subject._save_subject(save_dir)

# Site TRs can be found on the ABIDE website or in the following table:
* https://www.researchgate.net/figure/SCANNING-PARAMETERS-OF-DIFFERENT-SITES-OF-ABIDE-1_tbl1_335722272

In [23]:
# TR value for each ABIDE I site (presumably in seconds — confirm against the
# linked scanning-parameter table). All values are numeric so they serialise
# to JSON numbers; KKI was previously the string '2.5'.
site_trs = {'CALTECH': 2, 'CMU': 2, 'KKI': 2.5, 'LEUVEN_1': 1.656, 'LEUVEN_2': 1.656, 'MAX_MUN': 3,
           'NYU': 2, 'OHSU': 2.5, 'OLIN': 2.5, 'PITT': 1.5, 'SBL': 2.2, 'SDSU': 2, 'STANFORD': 2, 
           'TRINITY': 2, 'UCLA_1': 3, 'UCLA_2': 3, 'UM_1': 2, 'UM_2': 2, 'USM': 2, 'YALE': 2}


In [24]:
import json

# Write the site -> TR mapping to JSON. The path is bound to `trs_save_file`
# only; the earlier chained assignment also overwrote `save_dir` (the subject
# output folder) with this file path.
trs_save_file = os.path.dirname(cur_dir) + '/data/dicts/ABIDEI_site_trs.json'
# Create the target folder if needed so open() does not fail on a fresh checkout.
os.makedirs(os.path.dirname(trs_save_file), exist_ok=True)
with open(trs_save_file, 'w') as fp:
    json.dump(site_trs, fp)