In [12]:
# %matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
import datetime
import scipy.signal as signal 
import scipy
import sys
sys.path.insert(0, '../')
import dsmuc
from dsmuc.custom import detect_peaks
sns.set(style="darkgrid")

label_dict = {1:'walking',
             0:'not_walking'}
saveto = "../../data/data_PD/snippets/"
label_folders = next(os.walk(saveto))[1]
interested_cols = [ 'AccX', 'AccY', 'AccZ', 'GyroX','GyroY', 'GyroZ']

In [13]:
label_folders

['not_walking', 'walking']

In [14]:
feature_list = [ 
                        'mean',
                        'min',
                        'max',
                        'range',
                        'entropy_',
                        'var',
                        'kurtosis',
                        'skew',
                        'quantile25',
                        'quantile50',
                        'quantile75',
                        'energy', 
                        'label',
                        'frequency_features']

In [15]:
def mean_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = np.mean(a)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_mean'
    return var
def min_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = np.min(a)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_min'
    return var
def max_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = np.max(a)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_max'
    return var
def range_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = (np.max(a)-np.min(a))
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_range'
    return var
def entropy_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        e =np.histogram(a)[0]
        var_temp = scipy.stats.entropy(e/ np.sum(e))
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_entropy'
    return var
def var_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = np.var(a)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_var'
    return var
def kurtosis_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = scipy.stats.kurtosis(a, fisher=True)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_kurtosis'
    return var
def skewness_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = scipy.stats.skew(a)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_skew'
    return var
def quantile25_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = np.percentile(a,25)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_q25'
    return var
def quantile50_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = np.percentile(a,50)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_q50'
    return var
def quantile75_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = np.percentile(a,75)
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_q75'
    return var
def energy_(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        var_temp = np.sum(np.mean(a**2)) 
        list_.append(var_temp)
    var = pd.Series(list_, index=interested_cols)
    var.index += '_energy'
    return var


In [16]:
def find_nearest(array,value):
    idx = (np.abs(array-value)).argmin()
    return idx
def frequency_features(df):
    list_= []
    index_name = []
    frequency_features_list = ['energy_total','energy_interested','max_total', 'max_interested']
    for col in interested_cols:
        a = df[col].values
        f, psdX = signal.periodogram(a, fs=50, nfft = 256)
        i_low = find_nearest(f,4.0)
        i_high = find_nearest(f,7.0)

        energy_total = np.sum(psdX) 
        energy_interested = np.sum(psdX[i_low : i_high + 1]) 
        max_total = np.max(psdX)
        max_interested = np.max(psdX[i_low : i_high + 1])
        var_temp = [energy_total, energy_interested, max_total, max_interested]
        list_.extend(var_temp)
        index_name.extend([col+'_'+x for x in frequency_features_list])
    var = pd.Series(list_, index=index_name)
    return var
def average_over_axis(df):
    aoa = df[interested_cols].mean(axis = 0)
    aoa.index += '_aoa'
    return aoa
def average_time_elapse(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        mph = a.mean()
        ind = detect_peaks(a, mph = mph, mpd=20, show=False)
        list_.append(np.diff(ind).mean())
    ate = pd.Series(list_, index=interested_cols)
    ate.index += '_ate'
    return ate
def average_peak_freq(df):
    list_f= []
    for col in interested_cols:
        a = df[col].values
        mph = a.mean()
        ind = detect_peaks(a, mph = mph, mpd=20, show=False)
        list_f.append(len(ind)/a.shape[0])
    apf = pd.Series(list_f, index=interested_cols)
    apf.index += '_apf'
    return apf
def rms_func(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        rms_temp = np.sqrt(np.mean(a**2))
        list_.append(rms_temp)
    rms = pd.Series(list_, index=interested_cols)
    rms.index += '_rms'
    return rms
def std_func(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        std_temp = np.std(a)
        list_.append(std_temp)
    std = pd.Series(list_, index=interested_cols)
    std.index += '_std'
    return std
def minmax_func(df):
    list_= []
    for col in interested_cols:
        a = df[col].values
        minmax_temp = np.max(a)-np.min(a)
        list_.append(minmax_temp)
    minmax = pd.Series(list_, index=interested_cols)
    minmax.index += '_minmax'
    return minmax
def cor_func(df):
    a = df[interested_cols[:3]].corr()
    b= df[interested_cols[3:]].corr()
    indexes = ['CorAccXAccY','CorAccXAccZ','CorAccYAccZ', 'CorGyroXGyroY','CorGyroXGyroZ','CorGyroYGyroZ']
    Cor = (a['AccX'][1:]).append(a['AccY'][2:]).append((b['GyroX'][1:]).append(b['GyroY'][2:]))
    corr = pd.Series(Cor.values, indexes)
    corr.index += '_corr'
    return corr
def label_(df):
    return pd.Series(df['label'][0], index=['label'])
    

In [17]:
def get_all_features(df, file, feature_list ):
    
    feature_func_dict = {
                        'aoa':average_over_axis,
                        'ate': average_time_elapse,
                        'apf':average_peak_freq,
                        'rms':rms_func,
                        'std':std_func,
                        'minimax':minmax_func,
                        'cor':cor_func,
                        'mean':mean_,
                        'min':min_,
                        'max':max_,
                        'range':range_,
                        'entropy_':entropy_,
                        'var':var_,
                        'kurtosis' : kurtosis_,
                        'skew':skewness_,
                        'quantile25':quantile25_,
                        'quantile50':quantile50_,
                        'quantile75':quantile75_,
                        'energy':energy_,
                        'frequency_features':frequency_features,
                        'label':label_
        }


    ser_list = []
    for x in feature_list:
        ser_list.append(feature_func_dict[x](df))
    ser = pd.concat(ser_list)
    ser.name = file
    return ser
    

In [18]:
feature_list = [ 
                        'mean',
                        'min',
                        'max',
                        'range',
                        'entropy_',
                        'var',
                        'kurtosis',
                        'skew',
                        'quantile25',
                        'quantile50',
                        'quantile75',
                        'energy', 
                        'label',
                        'frequency_features']
DF = pd.DataFrame()
for fol in label_folders:
    folder = saveto+fol+'/'
    print("Started the process of : ",folder )
    for root,dirs,files in os.walk(folder):
        for file_ in files:
            if file_.endswith(".csv"):
                df_temp = pd.read_csv(folder+file_)
                break
                ser = get_all_features(df_temp, file = fol+file_.split('.')[0], feature_list= feature_list)
                ser = ser.round(4)
                DF = DF.append(ser, verify_integrity=True)
    
    print("Finished the process of : ",folder )


Started the process of :  ../../data/data_PD/snippets/not_walking/
Finished the process of :  ../../data/data_PD/snippets/not_walking/
Started the process of :  ../../data/data_PD/snippets/walking/
Finished the process of :  ../../data/data_PD/snippets/walking/


In [19]:
df_temp

Unnamed: 0,date,AccX,AccY,AccZ,GyroX,GyroY,GyroZ,label
0,2016-06-20 16:45:35.405,-0.692627,0.452148,-0.093262,-121.46300,72.89630,31.92070,1
1,2016-06-20 16:45:35.412,-0.798340,0.172119,-0.250977,-130.39600,74.02440,39.78660,1
2,2016-06-20 16:45:35.420,-0.740479,0.488037,-0.157471,-89.57320,66.82930,50.97560,1
3,2016-06-20 16:45:35.470,-0.738281,0.705078,-0.136475,-9.66463,49.08540,55.12200,1
4,2016-06-20 16:45:35.474,-0.735840,0.920410,-0.139648,5.51829,48.62810,56.85980,1
5,2016-06-20 16:45:35.478,-0.751465,0.913818,-0.174561,-39.11590,45.76220,58.90240,1
6,2016-06-20 16:45:35.489,-0.780762,0.864014,-0.180908,-120.97600,47.92680,62.65240,1
7,2016-06-20 16:45:35.505,-0.794678,0.781006,-0.232178,-149.29900,50.73170,65.73170,1
8,2016-06-20 16:45:35.559,-0.707275,0.600586,-0.092285,-177.95700,54.48170,75.70120,1
9,2016-06-20 16:45:35.565,-0.673828,0.545898,0.008545,-180.88400,59.32930,76.46340,1


In [99]:
DF.to_csv('preprocessed_data2.csv')

In [101]:
DF.head()

Unnamed: 0,AccX_energy,AccX_energy_interested,AccX_energy_total,AccX_entropy,AccX_kurtosis,AccX_max,AccX_max_interested,AccX_max_total,AccX_mean,AccX_min,...,GyroZ_max_total,GyroZ_mean,GyroZ_min,GyroZ_q25,GyroZ_q50,GyroZ_q75,GyroZ_range,GyroZ_skew,GyroZ_var,label
not_walking5824,0.9961,0.0,0.0,1.9273,0.4585,-0.9956,0.0,0.0,-0.998,-1.0002,...,0.0004,-0.7441,-0.8232,-0.7622,-0.7317,-0.7012,0.1829,0.1202,0.002,0.0
not_walking4786,0.0046,0.0001,0.0009,1.9316,-0.1113,-0.0425,0.0,0.0001,-0.0666,-0.1125,...,11.0488,-1.6719,-10.7927,-5.4878,-1.4177,0.9604,22.2256,0.3629,23.0738,0.0
not_walking6716,0.9962,0.0,0.0,2.226,-0.9833,-0.9954,0.0,0.0,-0.9981,-1.0015,...,0.0005,-0.7248,-0.8232,-0.7622,-0.7317,-0.7012,0.1829,-0.1681,0.0018,0.0
not_walking38357,0.1949,0.0,0.0,1.9719,-0.0751,0.4443,0.0,0.0,0.4415,0.4387,...,0.0531,-0.6891,-1.0671,-0.8232,-0.6707,-0.5488,0.6707,-0.1952,0.0266,0.0
not_walking24689,0.3923,0.0001,0.0002,1.9302,0.4248,-0.6084,0.0,0.0,-0.6263,-0.6423,...,0.0752,-0.6185,-2.3476,-1.0061,-0.4878,-0.2744,2.8049,-0.566,0.2659,0.0
