# Notebook for creating data for model training on Colab


In [1]:
%load_ext autoreload
%autoreload 2

import os
import json
import pickle
import random
random.seed(42)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import sys
sys.path.append('../src/features')

from subject import Subject

cur_dir = os.getcwd()
abide_dir = os.path.dirname(os.path.dirname(cur_dir)) + '/abide/'
data_dir = os.path.dirname(cur_dir) +'/data/'
ab_subjects_dir = os.path.dirname(cur_dir) + '/data/ABIDEI_subjects/'
ab2_subjects_dir = os.path.dirname(cur_dir) + '/data/ABIDEII_subjects/'
trs_save_file = save_dir = os.path.dirname(cur_dir) + '/data/dicts/ABIDEI_site_trs.json'

In [2]:
# Dictionary with TRs for each scanning site
with open(trs_save_file) as json_file:
    site_trs = json.load(json_file)

In [3]:
# Load ABIDEI preprocessed rois by loading subjects
def open_pickle(f):
    file = open(f,'rb')
    o = pickle.load(file)
    file.close()
    return o

def load_subjects_d(subject_folder):
    subjects_d = {}
    for f in os.listdir(subject_folder):
        s = open_pickle(os.path.join(subject_folder, f))
        subjects_d[s._sub_id] = s
    return subjects_d

In [4]:
def clean_subjects(subjects, trs=[2, 2.17]):
    clean_s = list()
    trs = set(trs)
    for s in subjects:
        if(s._tr in trs):
            clean_s.append(s)
    print(f'{len(clean_s)} clean out of {len(subjects)}')
    return clean_s

In [5]:
ab_subjects_d = load_subjects_d(ab_subjects_dir)
ab_subjects = list(ab_subjects_d.values())
ab_subjects_c = clean_subjects(ab_subjects)

548 clean out of 884


In [6]:
ab2_subjects_d = load_subjects_d(ab2_subjects_dir)
ab2_subjects = list(ab2_subjects_d.values())
ab2_subjects_c = clean_subjects(ab2_subjects)

401 clean out of 458


In [7]:
adhd_subjects_dir = os.path.dirname(cur_dir) + '/data/ADHD200_subjects/'
adhd_subjects_d = load_subjects_d(adhd_subjects_dir)
adhd_subjects = list(adhd_subjects_d.values())
adhd_subjects_c = clean_subjects(adhd_subjects)

82 clean out of 161


In [8]:
acpi_subjects_dir = os.path.dirname(cur_dir) + '/data/ACPI_subjects/'
acpi_subjects_d = load_subjects_d(acpi_subjects_dir)
acpi_subjects = list(acpi_subjects_d.values())
acpi_subjects_c = clean_subjects(acpi_subjects)

155 clean out of 155


In [9]:
ab_subjects_c.extend(ab2_subjects_c)
ab_subjects_c.extend(adhd_subjects_c)
ab_subjects_c.extend(acpi_subjects_c)
clean_subjects = ab_subjects_c
print(len(clean_subjects))

1186


In [7]:
# For now let's just look at sites with Trs of 2s
# clean_subjects = list()
# asd_c = 0
# for s in subjects:
#     if(s._tr == 2):
#         clean_subjects.append(s)
#         # Note dx group 1 is positive for ASD
#         if(s._label_dict['dx_group'] == 1):
#             asd_c += 1

In [8]:
print(f'{asd_c} subjects with ASD out of {len(clean_subjects)} subjects in clean list')

464 subjects with ASD out of 949 subjects in clean list


# Create Training Data files for Colab 

In [10]:
feat_name = 'filt_noglobal_roi_200_Cradd'

In [15]:
X = list()
Y = list()
s_c = 0
for s in clean_subjects:
    X.append(np.array(s._data_dict[feat_name]))
    asd = s._label_dict['dx_group'] == 1
    # 1 is male
    sex = s._sex == 1
    s_c += sex
    age = s._age
    Y.append(np.array([sex, age, asd]))
assert len(X) == len(Y)


In [16]:
s_c

876

In [12]:
X = np.array(X)
Y = np.array(Y)
print(len(X))

1186


  X = np.array(X)


In [13]:
training_dir = data_dir + 'colab_training/'
np.save(training_dir + 'total_X', X)
np.save(training_dir + 'total_Y', Y)

# Randomly extract sections of even length from scan to use for features
* Doing 3 mins trying to replicate https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5669262/

In [18]:
# Since scans are 2s apart 90 scans is 3 mins
L = 90
# Number of clips per subject
N=10
# Number of ROIs
N_rois = 200
feat_name = 'filt_noglobal_roi_200_Cradd'
def extract_feat_sections(s, feat_name=feat_name, L=L, N=N):
    data = s._data_dict[feat_name]
    feat_secs = list()
    for i in range(N):
        r = int(random.random() * (len(data) - L))
        feat_secs.append(data[r:r+L])
    return np.array(feat_secs)

def create_dataset(subjects, feat_name=feat_name, L=L,N=N):
    X = list()
    Y = list()
    for s in subjects:
        feat_secs = extract_feat_sections(s)
        X.extend(feat_secs)
        # 1 is still classified ASD and 0 is control
        asd = s._label_dict['dx_group'] == 1
        sex = s._sex == 1
        for i in range(len(feat_secs)):
            Y.append(np.array([asd, sex]))
#         # if(s._sex == 1):
#             Y.extend([1]*len(feat_secs))
#         else:
#             Y.extend([0]*len(feat_secs))
    assert len(X) == len(Y)
    X_ar = np.array(X).reshape(len(X), L, N_rois)
    # X_ar = np.array(X)
    Y_ar = np.array(Y)
    return X_ar, Y_ar

In [23]:
val_per = .05
test_per = .1
train_subs, val_subs = train_test_split(clean_subjects, test_size=val_per + test_per, random_state=42)
val_subs, test_subs = train_test_split(val_subs, test_size=test_per/(val_per + test_per), random_state=43)
print(f'{len(train_subs)} subjects for training')
print(f'{len(val_subs)} subjects for validation')
print(f'{len(test_subs)} subjects for testing')

465 subjects for training
27 subjects for validation
56 subjects for testing


In [24]:
train_X, train_Y = create_dataset(train_subs)
val_X, val_Y = create_dataset(val_subs)
test_X, test_Y = create_dataset(test_subs)

In [17]:
training_dir = data_dir + 'colab_training/'
np.save(training_dir + 'train_X', train_X)
np.save(training_dir + 'train_Y', train_Y)
np.save(training_dir + 'val_X', val_X)
np.save(training_dir + 'val_Y', val_Y)