In [59]:
import os

import pandas as pd
import numpy as np

from scipy.stats import zscore

In [2]:
def match_cond_to_data(fname_match, conditions_fnames):
    
    if "s2" in fname_match:
        matcher = "2"
    else:
        matcher = "1"
    
    for fname in conditions_fnames:
        if fname[0:4] == fname_match[0:4] and fname[-5] == matcher:
            return fname

In [49]:
# So Tensorflow runs on GPU
import tensorflow as tf

# For running on GPU.
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

### Data Organization and Cleaning

First, I want to enssure that all the file descriptions are paired with their data files.

In [4]:
data_fpath = "../data/fNIRS_Data/"

conditions_fnames = [fname for fname in os.listdir(data_fpath) if "_conditions_" in fname]
deoxy_fnames = [fname for fname in os.listdir(data_fpath) if "_Deoxy" in fname]

file_pairs = {}
for fname in deoxy_fnames:
    file_pairs[f"{fname[0:4]}"] = [fname, match_cond_to_data(fname, conditions_fnames)]
    
print(len(file_pairs))
    

31


### Using the description files to index the data.

In this step I will want to read through the "conditions files" to index and label the fNIRS data.

In [5]:
data_mats = []
data_labels = []
for k, val in file_pairs.items():
    if val[1] != None:
        cond_file = pd.read_csv(f"{data_fpath}{val[1]}")
        data_file = pd.read_csv(f"{data_fpath}{val[0]}", header=34)
        for x in range(1, 25):
            data_mats.append(data_file[int(cond_file[f"Task{str(x)}"][0]):int(cond_file[f"Task{str(x)}"][0]) + int(cond_file[f"Task{str(x)}"][1])])
            if cond_file[f"Task{str(x)}"][2] == "cr":
                data_labels.append(0)
            else:
                data_labels.append(1)
                
print(data_labels)
        

[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 

### Clean DataFrame and Convert to Array

Next, we want to remove some variables from that data that are not useful to the modeling so we are just left with the the deoxy data. Then we will want to store in a way (a format) that is easier to input into the LSTM model.

In [6]:
cols_to_drop = ["Probe1(Deoxy)", "Mark", "Time", "BodyMovement", "RemovalMark", "PreScan"]

data_as_np = []
for df in data_mats:
    df.drop(cols_to_drop, axis=1, inplace=True)
    df = df.reset_index()
    if df.shape[0] != 250:
        df = df[df.index <= 249]
    data_as_np.append(df.values)
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [7]:
# Sanity checking

print(len(data_as_np))
print(len(data_labels))

432
432


### Some extra data cleaning for odd samples

Some samples ended up being empty. Need to remove those samples as well as their labels from the dataset

In [21]:
for indx, elm in enumerate(data_as_np):
    if elm.shape != (250, 53):
        data_as_np = np.delete(data_as_np, indx)
        del data_labels[indx]
        
# Sanity checking

for elm in data_as_np:
    print(elm.shape)
    
print(len(data_as_np))
print(len(data_labels))

(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)
(250, 53)


### Normalizing and Changing Input Shape of Sample

Still unsure of what the best input shape for this model might be. I'm currently passing the entire sample, but it may be wiser to input chunks of ~5 seconds. Thus far, doing this has not helped much. Normalization has not helped much either.

In [70]:
def organize_train_test_sets(data_as_np, data_labels):
    
    from sklearn.preprocessing import MinMaxScaler
    
    sample_factor = 1
    
    from sklearn.model_selection import train_test_split

    data_labels = [[1,0] if elm == 0 else [0,1] for elm in data_labels]
    
    reshaped = []
    for elm in data_as_np:
        elm = elm.reshape(int(250 / sample_factor), 53 * sample_factor)
        scaler = MinMaxScaler()
        normalized = scaler.fit_transform(elm)
        reshaped.append(normalized)

    return train_test_split(reshaped, data_labels, shuffle=True, test_size=0.20)

X_train, X_test, y_train, y_test = organize_train_test_sets(data_as_np, data_labels)

X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

print(X_test[0])

[[0.         0.48363804 0.         ... 0.07120007 0.14512322 0.9089267 ]
 [0.00401606 0.49476386 0.01046718 ... 0.07009006 0.13565887 0.90397037]
 [0.00803213 0.50593395 0.02049871 ... 0.06896917 0.12564947 0.89889755]
 ...
 [0.99196787 0.04727778 1.         ... 0.25999646 1.         0.        ]
 [0.99598394 0.04727778 1.         ... 0.25999646 1.         0.        ]
 [1.         0.04727778 1.         ... 0.25999646 1.         0.        ]]


### The Model

Currently, getting coint flip accuracy on the validation set. Though, it as actually learning the training set (finally). Was having an issue where I was getting coin flip accuracy on both the training and validation set. The model is certainly overfitting, but it does seem that the normalization helped the model to at least do that.

In [72]:
from keras.layers import LSTM, Dense, Dropout
from keras.models import Sequential

time_steps=250

model = Sequential()
model.add(LSTM(32, input_shape=(time_steps, 53), activation="tanh"))
model.add(Dropout(.3))
model.add(Dense(2, activation="softmax"))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
model.fit(X_train, y_train, epochs=20, batch_size=16, verbose=1, validation_data=(X_test, y_test))



Train on 264 samples, validate on 67 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x7f0b9103cef0>