In [1]:
import os
from os.path import join, pardir, curdir
import glob
import numpy as np
import pandas as pd
import scipy as sp
from tqdm import tqdm
from matplotlib import pyplot as plt
from joblib import load, dump
from sklearn.preprocessing import StandardScaler

# Set current directory to "src"
os.chdir(join(os.getcwd(), os.pardir, "src"))
print(f"Current working directory: {os.getcwd()}")

from feature_extractor import *
from process import *

Current working directory: /home/fa926284/Documents/Parkinson_FOG_Detection/src


In [5]:
DATA_PATH = join(pardir, 'data')
DATASETS = ['tdcsfog', 'defog']
FILE_DIRS = []
TRAIN_DIRS = []
VAL_DIRS = []
TEST_DIRS = []

RANDOM_SEED = 42

for dataset in DATASETS:
    FILE_DIRS.extend(glob.glob(join(DATA_PATH, "train", dataset, "*.csv"))[:])
    TEST_DIRS.extend(glob.glob(join(DATA_PATH, "test", dataset, "*.csv"))[:])
TRAIN_DIRS, VAL_DIRS = random_split_list(FILE_DIRS, 
                                         random_seed=42,
                                         split=0.98)

print(f"Total subjects: {len(FILE_DIRS)}")
print(f"Train subjects: {len(TRAIN_DIRS)}")
print(f"Validation subjects: {len(VAL_DIRS)}")
print(f"Test subjects: {len(TEST_DIRS)}")


Total subjects: 924
Train subjects: 905
Validation subjects: 19
Test subjects: 2


In [3]:
X_df = []
y = []

print(f"\n\nProcessing {dataset} dataset ... ", end='\n\n')

    
for file_dir in tqdm([TRAIN_DIRS[0]]):
    df = pd.read_csv(file_dir)
    
    if 'Task' in df.columns and 'Valid' in df.columns:
        df = df[(df['Valid'] == True) & (df['Task'] == True)]
        df = df.drop(['Valid', 'Task'], axis=1).reset_index()

        df['label'] = (df['StartHesitation'] * 1) | (df['Turn'] * 2) | (df['Walking'] * 3)
    
        acc_ap_win, label = get_windowed_data(df['AccAP'].to_numpy(),
                                              df['label'].to_numpy(),
                                              win_len=100, slide_step=1)
        acc_v_win, _ = get_windowed_data(df['AccV'].to_numpy(),
                                         df['label'].to_numpy(),
                                         win_len=100, slide_step=1)
        acc_ml_win, _ = get_windowed_data(df['AccML'].to_numpy(),
                                          df['label'].to_numpy(),
                                          win_len=100, slide_step=1)
    
        acc_ap_stat = get_stat_features(acc_ap_win, prefix='AccAP')
        acc_v_stat = get_stat_features(acc_v_win, prefix='AccV')
        acc_ml_stat = get_stat_features(acc_ml_win, prefix='AccML')
        acc_ap_freq = get_freq_features(acc_ap_win, axis=1, fs=128, nperseg=20, prefix='AccAP')
        acc_v_freq = get_freq_features(acc_v_win, axis=1, fs=128, nperseg=20, prefix='AccV')
        acc_ml_freq = get_freq_features(acc_ml_win, axis=1, fs=128, nperseg=20, prefix='AccML')

        features = pd.concat([acc_ap_stat,
                              acc_v_stat,
                              acc_ml_stat,
                              acc_ap_freq,
                              acc_v_freq,
                              acc_ml_freq], axis=1)

        X_df.append(features)
        y.append(label)

# X_df = pd.concat(X_df)
# X = X_df.to_numpy()
# y = np.concatenate(y)



Processing tdcsfog dataset ... 



100%|██████████| 833/833 [00:02<00:00, 283.13it/s]




Processing defog dataset ... 



100%|██████████| 91/91 [00:04<00:00, 19.05it/s]


In [4]:
dump(X_df, join(pardir, "data", "processed", "X_df.joblib"))
dump(X, join(pardir, "data", "processed", "X.joblib"))
dump(y, join(pardir, "data", "processed", "y.joblib"))


['../data/processed/y.joblib']

In [5]:
X_df.shape

(222114, 66)

In [6]:
np.unique(y, return_counts=True)


(array([0. , 0.5, 1. , 1.5, 2. , 2.5, 3. ]),
 array([168602,      3,   6156,     13,  41796,      2,   5542]))