In [2]:
import os
from os.path import join, pardir, curdir
import glob
import numpy as np
import pandas as pd
import scipy as sp
from tqdm import tqdm
from matplotlib import pyplot as plt
from joblib import load, dump
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

# Set current directory to "src"
# This runs before the project-local imports that follow (feature_extractor,
# process) — presumably so they resolve from the "src" folder; confirm if the
# package layout changes. The relative data paths used later in the notebook
# (built from `pardir`) are also interpreted from this directory.
os.chdir(join(os.getcwd(), os.pardir, "src"))
print(f"Current working directory: {os.getcwd()}")

from feature_extractor import *
from process import *

Current working directory: /home/fa926284/Documents/Parkinson_FOG_Detection/src


In [3]:
# --- Configuration -----------------------------------------------------------
DATASETS = ['tdcsfog', 'defog']
DATA_PATH = join(pardir, 'data')
PROCESSED_DATA_PATH = join(DATA_PATH, "processed")

RANDOM_SEED = 42

# Create the cache directory (and any missing parent); no error if it exists.
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

# --- File discovery ----------------------------------------------------------
FILE_DIRS = []
TEST_DIRS = []

for dataset in DATASETS:
    # glob.glob() returns matches in arbitrary (filesystem-dependent) order.
    # Sorting makes the seeded split below, and the numbering of the test
    # files, reproducible from one machine/run to the next.
    FILE_DIRS.extend(sorted(glob.glob(join(DATA_PATH, "train", dataset, "*.csv"))))
    TEST_DIRS.extend(sorted(glob.glob(join(DATA_PATH, "test", dataset, "*.csv"))))

# --- Train / validation split ------------------------------------------------
# random_split_list comes from the project modules star-imported above.
# NOTE(review): `split` is presumably the fraction of files assigned to the
# first returned list (train) — confirm against its definition.
TRAIN_DIRS, VAL_DIRS = random_split_list(FILE_DIRS,
                                         random_seed=RANDOM_SEED,
                                         split=0.98)

print(f"Total subjects: {len(FILE_DIRS)}")
print(f"Train subjects: {len(TRAIN_DIRS)}")
print(f"Validation subjects: {len(VAL_DIRS)}")
print(f"Test subjects: {len(TEST_DIRS)}")


Total subjects: 924
Train subjects: 905
Validation subjects: 19
Test subjects: 2


In [4]:
# Accelerometer columns, in the order their feature groups are concatenated.
_ACC_COLUMNS = ('AccAP', 'AccV', 'AccML')


def _select_valid_task_rows(df):
    """Return the rows usable for feature extraction.

    If the frame has both a ``Task`` and a ``Valid`` column, only rows where
    both are True are kept, the two columns are dropped and the index is
    renumbered. Frames without those columns are returned unchanged.
    """
    if 'Task' in df.columns and 'Valid' in df.columns:
        keep = (df['Valid'] == True) & (df['Task'] == True)
        # drop=True: the old index is not needed downstream, so do not add it
        # back as an extra "index" column.
        df = df.loc[keep].drop(['Valid', 'Task'], axis=1).reset_index(drop=True)
    return df


def _windowed_features(df, labels, win_len, slide_step, fs, nperseg):
    """Window each accelerometer column and build the combined feature frame.

    ``labels`` is forwarded as-is to ``get_windowed_data`` (it may be None).
    Returns ``(features, windowed_labels)`` where ``windowed_labels`` is the
    second value returned by ``get_windowed_data`` for the first column.
    Column order of ``features``: all statistical feature groups
    (AccAP, AccV, AccML) followed by all frequency feature groups.
    """
    stat_frames = []
    freq_frames = []
    windowed_labels = None

    for position, column in enumerate(_ACC_COLUMNS):
        windows, column_labels = get_windowed_data(df[column].to_numpy(),
                                                   labels,
                                                   win_len=win_len,
                                                   slide_step=slide_step)
        if position == 0:
            # Every column is windowed with the same labels and parameters, so
            # the labels from the first column are the ones reported.
            windowed_labels = column_labels

        stat_frames.append(get_stat_features(windows, prefix=column))
        freq_frames.append(get_freq_features(
            windows, axis=1, fs=fs, nperseg=nperseg, prefix=column))

    features = pd.concat(stat_frames + freq_frames, axis=1)
    return features, windowed_labels


def get_feature_label(df, win_len=100, slide_step=1, fs=128, nperseg=20):
    """Extract windowed features and per-window labels from one recording.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``AccAP``, ``AccV``, ``AccML``, ``StartHesitation``,
        ``Turn`` and ``Walking``; may contain ``Task`` and ``Valid``.
        The caller's frame is not modified.
    win_len, slide_step :
        Forwarded to ``get_windowed_data``.
    fs, nperseg :
        Forwarded to ``get_freq_features``.
        NOTE(review): the same ``fs`` is applied to every recording; the
        notebook feeds both 'tdcsfog' and 'defog' files through this function
        — confirm both are sampled at this rate.

    Returns
    -------
    (features, label) : (pandas.DataFrame, integer array)
        ``label`` is the windowed label array cast to int.
    """
    df = _select_valid_task_rows(df)

    # Class encoding: 0 = no event, 1 = StartHesitation, 2 = Turn, 3 = Walking.
    # Built as a local Series rather than a new column so the caller's frame
    # is never mutated.
    # NOTE(review): the flags are combined with bitwise OR, so any row with
    # more than one flag set collapses onto 3 — confirm the flags are
    # mutually exclusive in the data.
    labels = ((df['StartHesitation'] * 1)
              | (df['Turn'] * 2)
              | (df['Walking'] * 3)).astype(int).to_numpy()

    features, label = _windowed_features(df, labels,
                                         win_len, slide_step, fs, nperseg)

    label = label.astype(int)

    return features, label


def get_feature(df, win_len=100, slide_step=1, fs=128, nperseg=20):
    """Extract windowed features from one unlabeled recording.

    Same pipeline and parameters as ``get_feature_label`` but no label
    columns are required; ``None`` is passed as the labels argument of
    ``get_windowed_data``.

    Returns
    -------
    pandas.DataFrame
        The combined feature frame.
    """
    df = _select_valid_task_rows(df)
    features, _ = _windowed_features(df, None,
                                     win_len, slide_step, fs, nperseg)
    return features


In [9]:
# Train split: build the features once and cache them with joblib.
# The cache counts as present only when ALL three files exist; a partial
# cache (e.g. an interrupted earlier run) triggers a rebuild instead of a
# FileNotFoundError in the load branch.
# NOTE: joblib.load unpickles its input — only load cache files produced by
# this notebook.
_train_cache = {
    name: join(PROCESSED_DATA_PATH, f"{name}.joblib")
    for name in ("X_train_df", "X_train", "y_train")
}

if not all(os.path.isfile(path) for path in _train_cache.values()):
    print("\n\nProcessing Train Dataset ... ", end='\n\n')

    X_train_df = []
    y_train = []

    for file_dir in tqdm(TRAIN_DIRS):
        df = pd.read_csv(file_dir)
        features, label = get_feature_label(df)
        X_train_df.append(features)
        y_train.append(label)

    # Stack the per-file results into one frame / one label vector.
    X_train_df = pd.concat(X_train_df)
    X_train = X_train_df.to_numpy()
    y_train = np.concatenate(y_train)

    dump(X_train_df, _train_cache["X_train_df"])
    dump(X_train, _train_cache["X_train"])
    dump(y_train, _train_cache["y_train"])

else:
    X_train_df = load(_train_cache["X_train_df"])
    X_train = load(_train_cache["X_train"])
    y_train = load(_train_cache["y_train"])


In [10]:
# Validation split: build the features once and cache them with joblib.
# The cache counts as present only when ALL three files exist; a partial
# cache (e.g. an interrupted earlier run) triggers a rebuild instead of a
# FileNotFoundError in the load branch.
# NOTE: joblib.load unpickles its input — only load cache files produced by
# this notebook.
_val_cache = {
    name: join(PROCESSED_DATA_PATH, f"{name}.joblib")
    for name in ("X_val_df", "X_val", "y_val")
}

if not all(os.path.isfile(path) for path in _val_cache.values()):
    print("\n\nProcessing Validation Dataset ... ", end='\n\n')

    X_val_df = []
    y_val = []

    for file_dir in tqdm(VAL_DIRS):
        df = pd.read_csv(file_dir)
        features, label = get_feature_label(df)
        X_val_df.append(features)
        y_val.append(label)

    # Stack the per-file results into one frame / one label vector.
    X_val_df = pd.concat(X_val_df)
    X_val = X_val_df.to_numpy()
    y_val = np.concatenate(y_val)

    dump(X_val_df, _val_cache["X_val_df"])
    dump(X_val, _val_cache["X_val"])
    dump(y_val, _val_cache["y_val"])

else:
    X_val_df = load(_val_cache["X_val_df"])
    X_val = load(_val_cache["X_val"])
    y_val = load(_val_cache["y_val"])


In [12]:
# Test files are kept separate (one cached frame/array pair per file) rather
# than concatenated. The notebook works with exactly two of them, numbered by
# their position in TEST_DIRS.
# NOTE: joblib.load unpickles its input — only load cache files produced by
# this notebook.
_test_cache = [
    join(PROCESSED_DATA_PATH, f"{stem}{idx}.joblib")
    for idx in (0, 1)
    for stem in ("X_test_df", "X_test")
]

if not all(os.path.isfile(path) for path in _test_cache):
    print("\n\nProcessing Test Dataset ... ", end='\n\n')

    for idx, file_dir in enumerate(TEST_DIRS):
        df = pd.read_csv(file_dir)
        X_test_df = get_feature(df)
        X_test = X_test_df.to_numpy()
        dump(X_test_df, join(PROCESSED_DATA_PATH, f"X_test_df{idx}.joblib"))
        dump(X_test, join(PROCESSED_DATA_PATH, f"X_test{idx}.joblib"))

# Load unconditionally so the per-file variables exist on a cold cache as well
# as a warm one; previously a first run only wrote the files and left
# X_test0 / X_test1 undefined for the cells that follow.
X_test0_df = load(join(PROCESSED_DATA_PATH, "X_test_df0.joblib"))
X_test0 = load(join(PROCESSED_DATA_PATH, "X_test0.joblib"))
X_test1_df = load(join(PROCESSED_DATA_PATH, "X_test_df1.joblib"))
X_test1 = load(join(PROCESSED_DATA_PATH, "X_test1.joblib"))


In [14]:
# Sanity check: one line with the feature-matrix shapes, one with the label
# vector shapes.
for group in ((X_train, X_val, X_test0, X_test1), (y_train, y_val)):
    print(*(array.shape for array in group))

(10928677, 66) (153841, 66) (4583, 66) (281589, 66)
(10928677,) (153841,)
