In [1]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tqdm import tqdm
import random

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


## Defining Samples

The problem is basically one of time-series classification. Thus a single sample can be defined as $(X, y)$ where $X$ is a time-series and $y$ is the corresponding label. The labels are as follows:

-   0 - Start Hesitation FoG
-   1 - Turn FoG
-   2 - Walking FoG
-   3 - No FoG

## Constructing Samples
For a given row, say row $i$, take the previous $x$ rows and the succeeding $y$ rows into consideration. The entire set of rows from $i-x$ to $i+y$ together constitute one (time-series) input. The corresponding label for this series is the label for the $i^{th}$ row.

In [2]:
class FoGPrediction(object):
    def __init__(self, train_path, past, future):
        self.train_path = train_path
        self.past = past
        self.future = future
        self.train_inputs = []  # all samples
        self.train_targets = []  # all (corresponding) labels
    
    def sample_normalize(self, sample):
        """
        standardize all values to have
        mean 0 and standard deviation 1
        """
        mean = tf.math.reduce_mean(sample)
        std = tf.math.reduce_std(sample)
        sample = tf.math.divide_no_nan(sample - mean, std)
        return sample.numpy()
    
    def get_class_indices(self):
        """
        returns the list of indices
        corresponding to each class
        """
        start_hes = np.where(self.train_targets[:,0] == 1)[0]
        turn = np.where(self.train_targets[:,1] == 1)[0]
        walk = np.where(self.train_targets[:,2] == 1)[0]
        nothing = np.where(
            (
                (self.train_targets[:,0] == 0) &
                (self.train_targets[:,1] == 0) &
                (self.train_targets[:,2] == 0)
            )
        )[0]
        return start_hes, turn, walk, nothing
    
    def plot_distribution(self):
        """
        plots the number of items
        in each class
        """
        start_hes, turn, walk, nothing = self.get_class_indices()
        # print(f"Start Hestitation: {len(start_hes)}, Turn: {len(turn)}, Walk: {len(walk)}, Others: {len(nothing)}")
        dist = {
            "Start Hestitation FoG": len(start_hes), 
            "Turn FoG": len(turn), 
            "Walk FoG": len(walk),
            "No FoG": len(nothing)
        }
        plt.bar(*zip(*dist.items()))
        plt.show()
    
    def balance(self):
        """
        samples equal number of data points
        from all classes
        """
        start_hes, turn, walk, nothing = self.get_class_indices()
        n = min(
            len(start_hes),
            len(turn),
            len(walk),
            len(nothing)
        )
        final_indexes = self.random_sampling(start_hes, n) + \
                            self.random_sampling(turn, n) + \
                            self.random_sampling(walk, n) + \
                            self.random_sampling(nothing, n)
        self.train_inputs = np.asarray([self.train_inputs[i] for i in final_indexes], dtype=np.float32)
        self.train_targets = np.array([0] * n + [1] * n + [2] * n + [3] * n)
        print(f"Inputs shape: {self.train_inputs.shape}, targets shape: {self.train_targets.shape}")
    
    def random_sampling(self, arr, n):
        """
        randomly selects `n` (unique) items
        from `arr`
        """
        return random.sample(arr.tolist(), n)

## `tdcsfog`
This section pertains to sample creation for the `tdcsfog` series

In [3]:
class TdcsFoG(FoGPrediction):
    def __init__(self, train_path, past, future):
        super().__init__(train_path, past, future)
    
    def create_samples(self, file):
        """
        creates samples from 
        the time-series in `file`
        """
        try:
            series = pd.read_csv(os.path.join(self.train_path, file))
            series['AccV'] = self.sample_normalize(series['AccV'].values)
            series['AccML'] = self.sample_normalize(series['AccML'].values)
            series['AccAP'] = self.sample_normalize(series['AccAP'].values)
            inputs = series[['AccV', 'AccML', 'AccAP']]
            targets = series[['StartHesitation', 'Turn', 'Walking']]
            # convert to numpy
            inputs = inputs.values
            inputs = inputs.astype(np.float32)
            targets = targets.values
            targets = targets.astype(np.float32)
            # print(f"Input shape: {inputs.shape}, Target shape: {targets.shape}")

            input_samples = []
            target_samples = []
            n = len(inputs)
            input_start = 0
            input_main = self.past
            input_end = input_main + self.future
            while input_end < n:
                input_samples.append(inputs[input_start: input_end + 1])
                target_samples.append(targets[input_main])
                input_start += 1
                input_main += 1
                input_end += 1
        except FileNotFoundError: 
            return None, None
        return input_samples, target_samples
    
    def load_all(self):
        """
        reads all files, creates samples
        and loads to `self.series_inputs`
        and `self.series_targets`
        """
        tdcsfog_metadata = pd.read_csv(
            '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/tdcsfog_metadata.csv'
        ).set_index('Id')
        # set subjects for validation set
        val_subjects = ['07285e', '220a17', '54ee6e', '312788', '24a59d', '4bb5d0', '48fd62', '79011a', '7688c1']
        # get train and validation IDs
        train_ids = tdcsfog_metadata[tdcsfog_metadata['Subject'].apply(lambda x: x not in val_subjects)].index.tolist()
        val_ids = tdcsfog_metadata[tdcsfog_metadata['Subject'].apply(lambda x: x in val_subjects)].index.tolist()
        # create all training samples
        for _id in tqdm(train_ids):
            inputs, targets = self.create_samples(_id + ".csv")
            if inputs is None:
                continue
            self.train_inputs += inputs
            self.train_targets += targets
        # convert to numpy
        self.train_inputs = np.array(self.train_inputs, dtype=np.float32)
        self.train_targets = np.array(self.train_targets, dtype=np.uint8)
        print(f"Inputs shape: {self.train_inputs.shape}, targets shape: {self.train_targets.shape}")

In [4]:
tdcs = TdcsFoG("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog", 10, 10)
tdcs.load_all()
tdcs.balance()

100%|██████████| 717/717 [00:31<00:00, 22.66it/s]


Inputs shape: (6058543, 21, 3), targets shape: (6058543, 3)
Inputs shape: (703972, 21, 3), targets shape: (703972,)


### ❗ Shuffle
The samples are all ordered class-wise. They need to be shuffle for proper training

In [5]:
from sklearn.utils import shuffle
X, y = shuffle(tdcs.train_inputs, tdcs.train_targets)

## Building the Model

### Model - LSTM-FCN

Refer [here](https://paperswithcode.com/paper/multivariate-lstm-fcns-for-time-series) for this model's details

In [6]:
X = np.swapaxes(X, 1, 2)

from tensorflow.keras.layers import Conv1D, BatchNormalization, GlobalAveragePooling1D, Permute, Dropout, Flatten, Masking, Reshape, multiply, concatenate
from tensorflow.keras.layers import Dense, LSTM, concatenate, Activation, GRU, SimpleRNN
from tensorflow.keras import Model
from tensorflow.keras import Input

MAX_SEQUENCE_LENGTH = X.shape[2]
FEAT = X.shape[1]
NUM_CELLS = 8
NB_CLASS = 4

def squeeze_excite_block(input):
    ''' Create a squeeze-excite block
    Args:
        input: input tensor
        filters: number of output filters
        k: width factor

    Returns: a keras tensor
    '''
    filters = input.shape[-1] # channel_axis = -1 for TF

    se = GlobalAveragePooling1D()(input)
    se = Reshape((1, filters))(se)
    se = Dense(filters // 16,  activation='relu', kernel_initializer='he_normal', use_bias=False)(se)
    se = Dense(filters, activation='sigmoid', kernel_initializer='he_normal', use_bias=False)(se)
    se = multiply([input, se])
    return se

In [7]:
ip = Input(shape=(FEAT, MAX_SEQUENCE_LENGTH))

x = Masking()(ip)
x = LSTM(8)(x)
x = Dropout(0.8)(x)

_y = Permute((2, 1))(ip)
_y = Conv1D(128, 8, padding='same', kernel_initializer='he_uniform')(_y)
_y = BatchNormalization()(_y)
_y = Activation('relu')(_y)
_y = squeeze_excite_block(_y)

_y = Conv1D(256, 5, padding='same', kernel_initializer='he_uniform')(_y)
_y = BatchNormalization()(_y)
_y = Activation('relu')(_y)
_y = squeeze_excite_block(_y)

_y = Conv1D(128, 3, padding='same', kernel_initializer='he_uniform')(_y)
_y = BatchNormalization()(_y)
_y = Activation('relu')(_y)

_y = GlobalAveragePooling1D()(_y)

x = concatenate([x, _y])

out = Dense(NB_CLASS, activation='softmax')(x)

model = Model(ip, out)
# model.summary()

## Training (with 10-fold Cross Validation)

In [8]:
from sklearn.model_selection import KFold

def cross_validate(my_model, X, y):
    accuracy_all = []

    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    f = 1
    for train_index, test_index in kfold.split(X):
        print("Fold: ", f)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model.compile(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            optimizer=tf.keras.optimizers.Adam(),
            metrics=['sparse_categorical_accuracy']
        )

        history = my_model.fit(
            X_train,
            y_train,
            batch_size=256,
            epochs=20,
            validation_data=(X_test, y_test),
            verbose=1,
        )
        
        accuracy_all += history.history['sparse_categorical_accuracy']
        f += 1
        
    return accuracy_all

In [9]:
result = cross_validate(model, X, y)

Fold:  1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Fold:  2
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Fold:  3
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Fold:  4
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Fold:  5
Epoch 1/20
Epoch 2/20
Epoch 3/2

In [10]:
print(result)

[0.5949265360832214, 0.6477080583572388, 0.6739054918289185, 0.6931487321853638, 0.7097718715667725, 0.7240290641784668, 0.7361981272697449, 0.7477200627326965, 0.7575973868370056, 0.7666602730751038, 0.7747887372970581, 0.7821959257125854, 0.7891122698783875, 0.7953041195869446, 0.8016001582145691, 0.806725025177002, 0.8113148808479309, 0.8166496753692627, 0.8208622932434082, 0.8244987726211548, 0.8233829140663147, 0.8281763195991516, 0.8324710130691528, 0.8351131677627563, 0.8381972908973694, 0.8413965702056885, 0.8442991375923157, 0.847088098526001, 0.8496576547622681, 0.8522619009017944, 0.8539302349090576, 0.8564429879188538, 0.8578934669494629, 0.8602041602134705, 0.8619087934494019, 0.8642842173576355, 0.8647592663764954, 0.8681700825691223, 0.8693775534629822, 0.8709858655929565, 0.8673874735832214, 0.8700122237205505, 0.8718241453170776, 0.8730268478393555, 0.8750802874565125, 0.8768811821937561, 0.8777698278427124, 0.8792724013328552, 0.8806676268577576, 0.8816462159156799, 0

## Testing and Submission


**Note**: Strictly speaking, 2 models need to be built - one for `tdcsfog` and another for `defog`. However, presently, only the model for `tdcsfog` has been built and the same is used to make predictions for the `defog` test series as well.

In [11]:
def create_test_samples(series):
    series['AccV'] = tdcs.sample_normalize(series['AccV'].values)
    series['AccML'] = tdcs.sample_normalize(series['AccML'].values)
    series['AccAP'] = tdcs.sample_normalize(series['AccAP'].values)
    inputs = series[['AccV', 'AccML', 'AccAP']]
    # convert to numpy
    inputs = inputs.values
    inputs = inputs.astype(np.float32)
    
    input_samples = []
    n = len(inputs)
    input_start = 0
    input_main = tdcs.past
    input_end = input_main + tdcs.future
    while input_end < n:
        input_samples.append(inputs[input_start: input_end + 1])
        input_start += 1
        input_main += 1
        input_end += 1
    return input_samples

### Testing for `tdcsfog`

In [12]:
def testing(series_type):
    for file in os.listdir(os.path.join(test_dir, series_type)):
        _id = file[:-4]  # ignore the last 4 chars (.csv)
        series = pd.read_csv(os.path.join(test_dir, series_type, file))
        print(len(series))

        t = 1

        def add_default_row():
            ids.append(_id + '_' + str(t))
            start_hes.append(0)
            turn.append(0)
            walk.append(0)

        # no FoG for the first 10 entries
        for i in range(10):
            add_default_row();
            t += 1

        inputs = create_test_samples(series)
        predictions = model.predict(
            np.swapaxes(inputs, 1, 2)
        )
        for pred in predictions:
            _class = np.argmax(pred)
            add_default_row()
            if _class == 0:
                start_hes[-1] = 1
            elif _class == 1:
                turn[-1] = 1
            elif _class == 2:
                walk[-1] = 1

        # no FoG for the last 10 entries
        for i in range(10):
            add_default_row()
            t += 1

In [13]:
test_dir = "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test"

ids = []
start_hes = []
turn = []
walk = []

testing("tdcsfog")
testing("defog")

4682
281688


### Save Submission File

In [14]:
submission = pd.DataFrame({
    "Id": ids,
    "StartHesitation": start_hes,
    "Turn": turn,
    "Walking": walk
})
submission.to_csv("/kaggle/working/submission.csv", index=False)