# Notebook for using an LSTM to diagnose ASD 

In [49]:
%load_ext autoreload
%autoreload 2

import os
import json
import pickle
import random
random.seed(42)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.metrics import BinaryAccuracy
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import sys
sys.path.append('../src/features')

from subject import Subject

# All data locations are derived from the directory the notebook is run from.
cur_dir = os.getcwd()
project_root = os.path.dirname(cur_dir)
# Raw ABIDE data lives in a sibling directory of the project root
abide_dir = os.path.dirname(project_root) + '/abide/'
# Pickled subject objects
subjects_dir = project_root + '/data/ABIDEI_subjects/'
# JSON file mapping each scanning site to its TR; save_dir is kept as an alias of the same path
save_dir = project_root + '/data/dicts/ABIDEI_site_trs.json'
trs_save_file = save_dir

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Dictionary with TRs for each scanning site.
# JSON is UTF-8 by specification, so the encoding is given explicitly instead of
# relying on the platform's locale default.
with open(trs_save_file, encoding='utf-8') as json_file:
    site_trs = json.load(json_file)

In [3]:
def open_pickle(f):
    """Load and return the object stored in the pickle file at path ``f``.

    The file is opened in binary mode inside a ``with`` block so the handle is
    closed even when unpickling raises.

    NOTE: ``pickle.load`` can execute arbitrary code while deserialising; only
    call this on files from a trusted source.
    """
    with open(f, 'rb') as file:
        return pickle.load(file)

def load_subjects(subject_folder):
    """Unpickle every regular file in ``subject_folder`` and return them as a list.

    Entries are visited in sorted filename order: ``os.listdir`` returns names
    in arbitrary order, which would otherwise make the subject order (and any
    seeded split performed on it) differ between machines or runs.
    Sub-directories (e.g. ``.ipynb_checkpoints``) are skipped rather than being
    passed to the unpickler.
    """
    subjects = list()
    for f in sorted(os.listdir(subject_folder)):
        path = os.path.join(subject_folder, f)
        if not os.path.isfile(path):
            continue
        subjects.append(open_pickle(path))
    return subjects

In [4]:
# Load one unpickled object per file found in subjects_dir
subjects = load_subjects(subjects_dir)

In [5]:
# For now, restrict the cohort to subjects scanned at sites whose TR is 2 s
clean_subjects = [subj for subj in subjects if site_trs[subj._site_id] == 2]
# Count the retained subjects that are positive for ASD (dx group 1)
asd_c = sum(1 for subj in clean_subjects if subj._label_dict['dx_group'] == 1)

In [6]:
# Report how many of the retained (TR == 2) subjects have dx_group == 1
print(f'{asd_c} subjects with ASD out of {len(clean_subjects)} subjects in clean list')

253 subjects with ASD out of 548 subjects in clean list


# Randomly extract equal-length sections from each scan to use as features
* Using 3-minute sections to try to replicate https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5669262/

In [75]:
# Key of the feature time series read from each subject's _data_dict
feat_name = 'roi_200_Cradd'
# Number of ROIs per time point (used for the model input shape)
N_rois = 200
# Clip length in scans: with scans 2 s apart, 90 scans cover 3 minutes
L = 90
# Number of clips drawn per subject
N = 10
def extract_feat_sections(s, feat_name=feat_name, L=L, N=N):
    """Randomly sample ``N`` clips of ``L`` consecutive time points from one subject.

    Parameters
    ----------
    s : object exposing ``_data_dict``
        ``s._data_dict[feat_name]`` must support ``len()`` and slicing along its
        first axis (presumably a (time, ROI) array -- confirm against Subject).
    feat_name : key of the time series inside ``s._data_dict``.
    L : clip length, in time points.
    N : number of clips to draw (clips may overlap).

    Returns
    -------
    numpy.ndarray holding the ``N`` clips stacked along a new first axis.

    Raises
    ------
    ValueError
        If the time series has fewer than ``L`` time points. Previously this
        produced short or misaligned clips that only failed later, at reshape.
    """
    data = s._data_dict[feat_name]
    max_start = len(data) - L
    if max_start < 0:
        raise ValueError(
            f'time series {feat_name!r} has {len(data)} time points, fewer than the clip length L={L}'
        )
    feat_secs = list()
    for _ in range(N):
        # One random.random() draw per clip, scaled to [0, max_start] so the
        # final valid window (start == max_start) can also be selected.
        r = int(random.random() * (max_start + 1))
        feat_secs.append(data[r:r+L])
    return np.array(feat_secs)

def create_dataset(subjects, feat_name=feat_name, L=L, N=N, target='sex'):
    """Build clip-level feature and label arrays from a list of subjects.

    For every subject, ``N`` random clips of ``L`` time points are drawn with
    ``extract_feat_sections`` and each clip receives that subject's binary label.

    Parameters
    ----------
    subjects : iterable of objects exposing ``_data_dict`` and, depending on
        ``target``, ``_sex`` or ``_label_dict``.
    feat_name, L, N : forwarded to ``extract_feat_sections`` (previously they
        were accepted here but silently ignored in favour of its defaults).
    target : {'sex', 'dx_group'}
        Which attribute defines the positive class.
        'sex' (default; this is what the function has been computing) labels a
        clip 1 when ``s._sex == 1``.
        'dx_group' labels a clip 1 when ``s._label_dict['dx_group'] == 1``,
        i.e. positive for ASD, and 0 for control.
        NOTE(review): the notebook's stated goal is ASD diagnosis, yet the
        default reproduces the sex-based labelling that was in place -- confirm
        which target the reported accuracies are meant to describe.

    Returns
    -------
    X_ar : numpy.ndarray of shape (n_clips, L, n_features)
    Y_ar : numpy.ndarray of shape (n_clips,) with values 0 / 1

    Raises
    ------
    ValueError
        If ``target`` is not one of the supported names.
    """
    if target not in ('sex', 'dx_group'):
        raise ValueError(f"target must be 'sex' or 'dx_group', got {target!r}")
    X = list()
    Y = list()
    for s in subjects:
        feat_secs = extract_feat_sections(s, feat_name=feat_name, L=L, N=N)
        X.extend(feat_secs)
        if target == 'dx_group':
            positive = s._label_dict['dx_group'] == 1
        else:
            positive = s._sex == 1
        Y.extend([1 if positive else 0] * len(feat_secs))
    assert len(X) == len(Y)
    if X:
        # Infer the feature (ROI) dimension from the data so that a feat_name
        # with a different ROI count is not forced into the global N_rois.
        X_ar = np.array(X).reshape(len(X), L, -1)
    else:
        # No clips: keep the historical empty shape
        X_ar = np.empty((0, L, N_rois))
    Y_ar = np.array(Y)
    return X_ar, Y_ar

In [76]:
# The original work used 10-fold cross-validation in which the proportion of subjects
# from each site was approximately the same in every fold.
# As a starting point, subjects are simply split at random into three groups.
val_per = .05
test_per = .1
# First carve off the combined validation + test hold-out, then divide that hold-out.
# Splitting is done per subject, so all clips of one subject end up in the same group.
train_subs, holdout_subs = train_test_split(
    clean_subjects, test_size=val_per + test_per, random_state=42
)
val_subs, test_subs = train_test_split(
    holdout_subs, test_size=test_per/(val_per + test_per), random_state=43
)
print(f'{len(train_subs)} subjects for training')
print(f'{len(val_subs)} subjects for validation')
print(f'{len(test_subs)} subjects for testing')

465 subjects for training
27 subjects for validation
56 subjects for testing


In [77]:
# Build the clip-level arrays for each group. The generator is consumed in order
# (train, val, test), which matters because clip sampling draws from the shared
# `random` stream.
(train_X, train_Y), (val_X, val_Y), (test_X, test_Y) = (
    create_dataset(group) for group in (train_subs, val_subs, test_subs)
)
print(f'{len(train_X)} training examples')
print(f'{len(val_X)} validation examples')
print(f'{len(test_X)} testing examples')

4650 training examples
270 validation examples
560 testing examples


# Create LSTM model

In [78]:
# Define and compile the LSTM classifier (training happens in the next cell).
# An alternative sizing, int(2/3 * (N_rois * L)), is left disabled in favour of a small fixed width.
hidden_nodes = 16
model = Sequential([
    # First recurrent layer returns the full sequence so a second LSTM can be stacked on it
    LSTM(hidden_nodes, input_shape=(L, N_rois), return_sequences=True),
    Dropout(0.25),
    # Second recurrent layer returns only its final state
    LSTM(hidden_nodes),
    Dropout(0.25),
    # Single sigmoid unit for the binary label
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_25 (LSTM)               (None, 90, 16)            13888     
_________________________________________________________________
dropout_11 (Dropout)         (None, 90, 16)            0         
_________________________________________________________________
lstm_26 (LSTM)               (None, 16)                2112      
_________________________________________________________________
dropout_12 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 17        
Total params: 16,017
Trainable params: 16,017
Non-trainable params: 0
_________________________________________________________________


In [79]:
batch_size = 30
epochs = 50
# Keep the History object returned by fit() so the per-epoch training and
# validation metrics can be inspected or plotted later, instead of leaving it
# reachable only through the cell's output repr.
history = model.fit(train_X, train_Y, batch_size=batch_size, epochs=epochs, validation_data=(val_X, val_Y))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1d95f187e50>

In [80]:
# Evaluate the model on the training and test clips.
# The loss is bound to a real name rather than `_`: at notebook top level,
# assigning to `_` shadows IPython's "last output" variable.
# Note that accuracy is computed per clip, and each subject contributes several clips.
train_loss, train_acc = model.evaluate(train_X, train_Y, verbose=0)
test_loss, test_acc = model.evaluate(test_X, test_Y, verbose=0)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

Train: 0.997, Test: 0.832
