In [34]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
import scipy.signal as signal 
import scipy
import datetime
import sys
sys.path.append('../')
from dsmuc.custom import detect_peaks
sns.set(style="darkgrid")


In [35]:
data_path = "../../data/data_PD/PT18/"

interested_cols = [ 'AccX', 'AccY', 'AccZ', 'GyroX','GyroY', 'GyroZ']
window_size = datetime.timedelta(seconds=2)
window_slide = datetime.timedelta(seconds=1)

In [36]:
feature_list = [ 
                        'mean',
                        'min',
                        'max',
                        'range',
                        'entropy_',
                        'var',
                        'kurtosis',
                        'skew',
                        'quantile25',
                        'quantile50',
                        'quantile75',
                        'energy', 
                        'label',
                        'frequency_features',
                        'subject_id']

In [37]:
def mean_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = np.mean(a)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_mean'
    return var
def min_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = np.min(a)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_min'
    return var
def max_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = np.max(a)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_max'
    return var
def range_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = (np.max(a)-np.min(a))
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_range'
    return var
def entropy_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        e =np.histogram(a)[0]
        var_temp = scipy.stats.entropy(e/ np.sum(e))
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_entropy'
    return var
def var_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = np.var(a)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_var'
    return var
def kurtosis_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = scipy.stats.kurtosis(a, fisher=True)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_kurtosis'
    return var
def skewness_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = scipy.stats.skew(a)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_skew'
    return var
def quantile25_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = np.percentile(a,25)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_q25'
    return var
def quantile50_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = np.percentile(a,50)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_q50'
    return var
def quantile75_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = np.percentile(a,75)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_q75'
    return var
def energy_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = np.sum(np.mean(a**2)) 
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_energy'
    return var


In [38]:
def find_nearest(array,value):
    idx = (np.abs(array-value)).argmin()
    return idx
def frequency_features(df):
    list_= []
    index_name = []
    frequency_features_list = ['energy_total','energy_interested','max_total', 'max_interested']
    for col in interested_cols:
        a = df[col].values
        f, psdX = signal.periodogram(a, fs=50, nfft = 256)
        i_low = find_nearest(f,4.0)
        i_high = find_nearest(f,7.0)

        energy_total = np.sum(psdX) 
        energy_interested = np.sum(psdX[i_low : i_high + 1]) 
        max_total = np.max(psdX)
        max_interested = np.max(psdX[i_low : i_high + 1])
        var_temp = [energy_total, energy_interested, max_total, max_interested]
        list_.extend(var_temp)
        index_name.extend([col+'_'+x for x in frequency_features_list])
    var = pd.Series(list_, index=index_name)
    return var
def average_over_axis(df):
    aoa = df[interested_cols].mean(axis = 0)
    aoa.index += '_aoa'
    return aoa
def average_time_elapse(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        mph = a.mean()
        ind = detect_peaks(a, mph = mph, mpd=20, show=False)
        list_.append(np.diff(ind).mean())
    ate = pd.Series(list_, index=interested_cols)
    ate.index += '_ate'
    return ate
def average_peak_freq(df):
    list_f= []
    for col in interested_cols:
        a = df[col].values
        mph = a.mean()
        ind = detect_peaks(a, mph = mph, mpd=20, show=False)
        list_f.append(len(ind)/a.shape[0])
    apf = pd.Series(list_f, index=interested_cols)
    apf.index += '_apf'
    return apf
def rms_func(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        rms_temp = np.sqrt(np.mean(a**2))
        list_.append(rms_temp)
    rms = pd.Series(list_, index=interested_cols)
    rms.index += '_rms'
    return rms
def std_func(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        std_temp = np.std(a)
        list_.append(std_temp)
    std = pd.Series(list_, index=interested_cols)
    std.index += '_std'
    return std
def minmax_func(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        minmax_temp = np.max(a)-np.min(a)
        list_.append(minmax_temp)
    minmax = pd.Series(list_, index=interested_cols)
    minmax.index += '_minmax'
    return minmax
def cor_func(df):
    a = df[interested_cols[:3]].corr()
    b= df[interested_cols[3:]].corr()
    indexes = ['CorAccXAccY','CorAccXAccZ','CorAccYAccZ', 'CorGyroXGyroY','CorGyroXGyroZ','CorGyroYGyroZ']
    Cor = (a['AccX'][1:]).append(a['AccY'][2:]).append((b['GyroX'][1:]).append(b['GyroY'][2:]))
    corr = pd.Series(Cor.values, indexes)
    corr.index += '_corr'
    return corr
def label_(df):
    return pd.Series(df['label'][0], index=['label'])
def subject_id_(df):
    return pd.Series(df['subject_id'][0], index=['subject_id'])
    

In [39]:
def get_all_features(df, index, feature_list ):
    
    feature_func_dict = {
                        'aoa':average_over_axis,
                        'ate': average_time_elapse,
                        'apf':average_peak_freq,
                        'rms':rms_func,
                        'std':std_func,
                        'minimax':minmax_func,
                        'cor':cor_func,
                        'mean':mean_,
                        'min':min_,
                        'max':max_,
                        'range':range_,
                        'entropy_':entropy_,
                        'var':var_,
                        'kurtosis' : kurtosis_,
                        'skew':skewness_,
                        'quantile25':quantile25_,
                        'quantile50':quantile50_,
                        'quantile75':quantile75_,
                        'energy':energy_,
                        'frequency_features':frequency_features,
                        'label':label_,
                        'subject_id':subject_id_
        }


    ser_list = []
    ser_list.append(pd.Series(str(index[0]),index=['start']))
    ser_list.append(pd.Series(str(index[1]),index=['end']))
    for x in feature_list:
        ser_list.append(feature_func_dict[x](df))
    ser = pd.concat(ser_list)
    if type(index)!=str:
        index = str(index)
    ser.name = index
    return ser
    

In [40]:
'''def get_all_features(df, index):
    feature_list = ['subject_id', 'aoa', 'ate', 'apf', 'rms', 'std', 'minmax', 'cor', 'label']
#     feature_func_dict = {'aoa':average_over_axis,
#                         'ate': average_time_elapse,
#                         'apf':average_peak_freq,
#                         'rms':rms_func,
#                         'std':std_func,
#                         'minimax':minmax_func,
#                         'cor':cor_func}
    
    aoa = average_over_axis(df)
    ate = average_time_elapse(df)
    apf = average_peak_freq(df)
    rms = rms_func(df)
    std = std_func(df)
    minmax = minmax_func(df)
    cor = cor_func(df)
    subject_id = pd.Series(df['subject_id'][0], index=['subject_id'])
    label = pd.Series(df['label'][0], index=['label'])
    
    ser_list = [pd.Series(str(index[0]),index=['start']),pd.Series(str(index[1]),index=['end']), subject_id, aoa, ate,apf, rms,std, minmax, cor, label]
    ser = pd.concat(ser_list)
    if type(index)!=str:
        index = str(index)
    ser.name = index
    
    return ser
    '''

"def get_all_features(df, index):\n    feature_list = ['subject_id', 'aoa', 'ate', 'apf', 'rms', 'std', 'minmax', 'cor', 'label']\n#     feature_func_dict = {'aoa':average_over_axis,\n#                         'ate': average_time_elapse,\n#                         'apf':average_peak_freq,\n#                         'rms':rms_func,\n#                         'std':std_func,\n#                         'minimax':minmax_func,\n#                         'cor':cor_func}\n    \n    aoa = average_over_axis(df)\n    ate = average_time_elapse(df)\n    apf = average_peak_freq(df)\n    rms = rms_func(df)\n    std = std_func(df)\n    minmax = minmax_func(df)\n    cor = cor_func(df)\n    subject_id = pd.Series(df['subject_id'][0], index=['subject_id'])\n    label = pd.Series(df['label'][0], index=['label'])\n    \n    ser_list = [pd.Series(str(index[0]),index=['start']),pd.Series(str(index[1]),index=['end']), subject_id, aoa, ate,apf, rms,std, minmax, cor, label]\n    ser = pd.concat(ser_list)\n    

In [41]:
from os import listdir

def find_csv_filenames( path_to_dir, suffix=".csv" ):
    filenames = listdir(path_to_dir)
    return [ filename for filename in filenames if filename.endswith( suffix ) ]

In [42]:

window_size_seconds = 2
window_slide_seconds = 1
min_samples = 20
save_to = '../../data/PT18_preprocessed_extra/'

for file_name in find_csv_filenames(data_path):

    window_size = datetime.timedelta(seconds=window_size_seconds)
    window_slide = datetime.timedelta(seconds=window_slide_seconds)
    try:
    
        df = pd.read_csv(data_path+file_name, index_col=0)

        df['date_time'] = pd.to_datetime(df['time'],unit='ms')
        df = df.set_index(pd.DatetimeIndex(df['date_time']))

        df = df[['accelerometerX', 'accelerometerY', 'accelerometerZ', 'gyroscopeX',
               'gyroscopeY', 'gyroscopeZ','label']]

        df.columns = interested_cols + ['label']

        df['subject_id'] = (file_name.split('.000')[1].split('.')[0])

        df = df.sort_index(ascending = True)


        df = df.sort_index(ascending = True)
        ## Extract Segments


        print('Extracting segments and saving file :', file_name)
        samples_count = []
        DF = pd.DataFrame()

        t = df.index[0]
        end_time = df.index[-1]
        increment = 0
        while(t + datetime.timedelta(seconds=1) < end_time):

            t_end = t + window_size
            sensor_data = df.between_time(t.to_pydatetime().time(), t_end.to_pydatetime().time()
                                               ,include_start=True, include_end=False)
            if sensor_data.shape[0]>= min_samples:
                increment +=1
                ser = get_all_features(sensor_data, index=(t, t_end), feature_list=feature_list)
                DF = DF.append(ser, verify_integrity=True)

            t = t+window_slide

        DF.to_csv(save_to+file_name+'_preprocessed.csv', index=True)
    except Exception:
        print('file: '+file_name+' not readable')
        continue

Extracting segments and saving file : pt18usv.00024.csv


  mask |= (ar1 == a)


Extracting segments and saving file : pt18usv.00013.csv


KeyboardInterrupt: 

In [None]:
print("Finished")

In [None]:
pd.read_cs