In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import time
import os
import glob
import seaborn as sns
sns.set(style="darkgrid")
import sys
# sys.path.append("../dsmuc/")
import dsmuc.io as io
import dsmuc.preprocessing as pp
import dsmuc.features as ff
import dsmuc.custom as cs
import pywt
from joblib import Parallel, delayed


def dwtdwt(vector):
    """Return the first coefficient array of a single-level Haar wavelet decomposition.

    `vector` is handed to `pywt.wavedec` unchanged, using the 'haar' wavelet
    and `level=1`; only element 0 of the returned coefficient list is kept.
    """
    coefficients = pywt.wavedec(vector, 'haar', level=1)
    first_band = coefficients[0]
    return first_band


### Read raw data 

In [None]:
# Combine all datasets collected using G9 into a single DataFrame (`df_raw`)
# so the sensor data can be processed later on.
#
# A later cell writes `df_raw` to OUT_FOLDER + output_file_name; it can be read
# back with:
#   pd.read_csv(OUT_FOLDER + output_file_name, index_col=0, parse_dates=True)
DATA_SET_FOLDER = '/home/ahmet/notebooks/data/G9_data/new_Dataset/'
OUT_FOLDER = '/home/ahmet/notebooks/data/G9_data/Raw/'
output_file_name = 'combined_raw_stairs.csv'  # name of the combined output file

print('Reading each datasets...')
# NOTE(review): the original comment said the stairs dataset needs one more
# '*' level in this pattern - confirm the depth matches the folder layout.
# Sorting makes the read order (and the progress output) reproducible.
csv_paths = sorted(glob.glob(DATA_SET_FOLDER + '/*/*/*.csv'))
if not csv_paths:
    # Without this guard pd.concat fails with an unhelpful
    # "No objects to concatenate" error.
    raise FileNotFoundError('No .csv recordings found under ' + DATA_SET_FOLDER)

big_list = []
for file_path in csv_paths:
    basename = os.path.basename(file_path)
    print("Reading the file :", basename)
    # The subject id is the part of the file name before the first underscore.
    subject_id = basename.split("_")[0]
    df_data = io.read_g9(file=file_path, subject_id=subject_id)
    big_list.append(df_data)

# One frame for all recordings, ordered by index.
df_raw = pd.concat(big_list).sort_index(ascending=True)


In [None]:
# Display the combined raw frame built in the previous cell as a quick sanity check.
df_raw

In [None]:
# Write first, then report: the message claims the file *was* saved, so it must
# only appear once to_csv has returned without raising.
df_raw.to_csv(OUT_FOLDER +output_file_name, index=True)
print("file saved as :",OUT_FOLDER +output_file_name)

### Label Data

In [None]:
def read_label_file(file_path = None):
    """Read the label spreadsheet and return its timing/label columns.

    Parameters
    ----------
    file_path : str, optional
        Path of the Excel workbook to read. When omitted, the notebook-level
        variable `label_file_path` is used, which is what earlier versions of
        this function always did regardless of the argument.

    Returns
    -------
    pandas.DataFrame
        Columns 'start_time', 'end_time', 'subject_id' and 'label', taken from
        the sheet named "Sheet1". The sheet's 'subject' column is renamed to
        'subject_id'.

    Raises
    ------
    KeyError
        If "Sheet1" lacks any of 'start_time', 'end_time', 'subject', 'label'.
    """
    if file_path is None:
        # Backward compatibility with callers that relied on the global.
        file_path = label_file_path
    # The context manager closes the workbook handle once parsing is done.
    with pd.ExcelFile(file_path) as xl:
        df_sheet = xl.parse("Sheet1")
    df_label = df_sheet[['start_time', 'end_time', 'subject', 'label']]
    return df_label.rename(columns={'subject': 'subject_id'})

In [None]:
# Locations of the combined raw data, the Excel label sheet, and the labeled output.
raw_data_path = '/home/ahmet/notebooks/data/G9_data/Raw/combined_raw_stairs.csv'
label_file_path = '/home/ahmet/notebooks/data/G9_data/new_Dataset/Additional Stairs Dataset/additional_stairs_detail_timestamp.xlsx'
out_file_path = '/home/ahmet/notebooks/data/G9_data/Raw/labeled_stairs.csv'

print('reading raw data ...')
# The first CSV column becomes the (date-parsed) index.
df_raw = pd.read_csv(raw_data_path, parse_dates=True, index_col=0)

print('reading label file in excel ...')
df_label = read_label_file(label_file_path)
# Add datetime versions of the interval bounds as 'start' and 'end'.
df_label = df_label.assign(
    start=pd.to_datetime(df_label['start_time']),
    end=pd.to_datetime(df_label['end_time']),
)

In [None]:
# Attach labels to the raw samples with the project helper, then persist the result.
print('labeling data...')
df_labeled = pp.label_data(df_label=df_label, df_data=df_raw)
print('saving labeled file to the path:{}...'.format(out_file_path))
df_labeled.to_csv(out_file_path)

In [None]:
# List the distinct subject ids present in the labeled data.
np.unique(df_labeled['subject_id'])

### Extract Windows

In [None]:
# Labeled inputs (original and stairs datasets) and the folder that receives the windows.
file_path = "/home/ahmet/notebooks/data/G9_data/Raw/labeled.csv"
file_path_stairs = "/home/ahmet/notebooks/data/G9_data/Raw/labeled_stairs.csv"
saveto = "/home/ahmet/notebooks/data/G9_data/Raw/snippets3sec/"

# Numeric label code -> activity name (codes run from 1 to 6, in this order).
label_dict = dict(enumerate(
    ['walking',
     'walking upstairs',
     'walking downstairs',
     'sitting',
     'standing',
     'lying'],
    start=1,
))

# Accelerometer and gyroscope axes: accX..accZ followed by gyroX..gyroZ.
interested_cols = [sensor + axis for sensor in ('acc', 'gyro') for axis in 'XYZ']

In [None]:
# --- Windowing configuration ---------------------------------------------
min_samples = 20            # a window is kept only if it has MORE rows than this
window_size_seconds = 3
window_slide_seconds = 1
# Subject ids of the stairs dataset are shifted by this amount before merging,
# presumably so they cannot collide with ids of the original dataset.
stairs_subject_id_offset = 100

print('Creating output folders for each label')
# NOTE(review): a falsy return value is treated as "the folders already
# exist" - confirm against cs.create_label_folders.
isSure = cs.create_label_folders(label_dict= label_dict, saveto = saveto)

# Previously a "no" answer only re-printed the question and the extraction ran
# anyway, mixing new windows into the existing folders. Now "no" cancels.
proceed = True
if not isSure:
    if cs.query_yes_no('There are already folders specified.Are you sure to continue to extract segment?'):
        import shutil
        print('Clearing the directory: ', saveto)
        shutil.rmtree(saveto)
        cs.create_label_folders(label_dict= label_dict, saveto = saveto)
    else:
        proceed = False
        print('Extraction cancelled; the existing folders were left untouched.')

window_size = datetime.timedelta(seconds=window_size_seconds)
window_slide = datetime.timedelta(seconds=window_slide_seconds)

if proceed:
    # Merge the original and the stairs datasets into one time-ordered frame.
    df_old = pd.read_csv(file_path, index_col='date', parse_dates=True)
    df_stairs = pd.read_csv(file_path_stairs, index_col='date', parse_dates=True)
    df_stairs['subject_id'] = df_stairs['subject_id'] + stairs_subject_id_offset
    df = pd.concat([df_old, df_stairs]).sort_index(ascending=True)

    ## Extract Segments
    subject_ids = np.unique(df['subject_id']).tolist()
    labels = list(label_dict.keys())

    print('Extracting windows')
    samples_count = []
    for s in subject_ids:
        print('subject: ', s)
        for l in labels:
            print('label: ', label_dict[l])
            df_temp = df[(df['subject_id'] == s) & (df['label'] == l)]
            if df_temp.shape[0] == 0:
                continue
            win_list = pp.ext_windows(df=df_temp,
                                      window_size_seconds=window_size_seconds,
                                      window_slide_seconds=window_slide_seconds)
            # Drop windows that are too sparse to be useful.
            win_list = [win for win in win_list if win.shape[0] > min_samples]

            # One CSV per window: <saveto>/<label name>/<subject>_<n>.csv,
            # numbered from 0 within each (subject, label) pair.
            for i, window in enumerate(win_list):
                window.to_csv(os.path.join(saveto, label_dict[l],
                                           '{}_{}.csv'.format(s, i)))

In [None]:
# Spot-check: read one saved window back from disk, using the 'date' column as a parsed index.
pd.read_csv('/home/ahmet/notebooks/data/G9_data/Raw/snippets3sec/sitting/10_298.csv', index_col='date', parse_dates=True)

## Extract Features

In [7]:
# Folder holding the per-label window CSVs (same location as `saveto` in the
# window-extraction section) and the CSV that receives the extracted features.
raw_windows_path = "/home/ahmet/notebooks/data/G9_data/Raw/snippets3sec/"
processed_file_path = "/home/ahmet/notebooks/data/G9_data/processed_3sec.csv"

In [8]:
# Collect every window CSV one folder level below raw_windows_path.
win_paths = glob.glob('{}*/*.csv'.format(raw_windows_path))
print("Found %d windows" % len(win_paths))

Found 34958 windows


### For test 

In [83]:
def process_file(fp, n_samples=128, window_seconds=3):
    """Load one window CSV and turn it into a fixed-length sample.

    Parameters
    ----------
    fp : str
        Path of a window CSV with a 'date' column (used as index) plus the
        columns accX/accY/accZ, gyroX/gyroY/gyroZ, label and subject_id.
    n_samples : int, optional
        Number of rows the resampled window must have (default 128).
    window_seconds : float, optional
        Window duration; the window is resampled at
        ``n_samples / window_seconds`` via `pp.resample` (default 3).

    Returns
    -------
    tuple or None
        ``(x, y, z)`` where `x` is an array of shape (n_samples, 8) holding
        the six sensor axes followed by the accelerometer norm and the
        gyroscope norm, `y` is the label of the first row and `z` the subject
        id of the first row as an int. ``None`` when resampling fails or the
        resampled window does not contain exactly `n_samples` rows.
    """
    win = pd.read_csv(fp, index_col='date', parse_dates=True)
    try:
        win = pp.resample(win, fs=n_samples / window_seconds)[:n_samples]
    except Exception:
        # Best effort: a window that cannot be resampled is skipped. Only
        # Exception is caught so Ctrl-C / SystemExit still stop the run
        # (the former bare `except:` swallowed them).
        return None
    if win.shape[0] != n_samples:
        return None

    axes = ['accX', 'accY', 'accZ', 'gyroX', 'gyroY', 'gyroZ']
    sensors = win[axes].values.astype(float)
    # Row-wise Euclidean norms, computed in one vectorized call each instead
    # of a per-row apply, and without adding columns to a sliced frame.
    acc_norm = np.linalg.norm(sensors[:, :3], axis=1)
    gyro_norm = np.linalg.norm(sensors[:, 3:], axis=1)
    x = np.column_stack([sensors, acc_norm, gyro_norm])

    # .iloc makes the "first row" lookup explicitly positional; plain [0] on a
    # date-indexed Series relies on a deprecated fallback.
    y = win['label'].iloc[0]
    z = int(win['subject_id'].iloc[0])
    return x, y, z


In [84]:
# Run process_file over every window path through joblib (n_jobs=-1 requests all CPUs, per joblib's docs).
# `a` holds one entry per path: an (x, y, z) tuple, or None where process_file rejected the window.
a = Parallel(n_jobs=-1)(delayed(process_file)(f) for f in win_paths)

In [85]:
# Keep only the windows process_file accepted. A plain list is used on
# purpose: np.array() on this mix of None and (array, label, id) tuples is a
# ragged conversion, which NumPy >= 1.24 rejects, and the shape of the old
# `c != None` mask depended on whether any None happened to be present.
c = [result for result in a if result is not None]

In [100]:
# Each entry of `c` is (x, y, z), with x of shape (128, 8) = (samples, channels).
X = np.stack([x[0] for x in c])            # (n_windows, 128, 8)
y = np.array([x[1] for x in c])
z = np.array([x[2] for x in c])
# Channels-first layout (n_windows, 8, 128). The axes have to be swapped with
# a transpose: the previous reshape((-1, 8, 128)) kept the memory order of the
# (128, 8) windows and therefore interleaved samples and channels.
# NOTE(review): pickles written before this fix contain the old layout.
X = np.ascontiguousarray(X.transpose(0, 2, 1))


In [102]:
import _pickle as cPickle

In [103]:
# Persist the (X, y, z) triple as a single pickle.
with open('/home/ahmet/notebooks/data/G9_data/action_data.pkl', 'wb') as pickle_out:
    cPickle.dump((X, y, z), pickle_out)



In [105]:
# Read the pickle back and report the shape of X as a sanity check.
with open('/home/ahmet/notebooks/data/G9_data/action_data.pkl', 'rb') as pickle_in:
    loaded = cPickle.load(pickle_in)
    X, y, z = loaded
    print(X.shape)

(30489, 8, 128)


In [None]:
%%time
win_list = []
i = 0
for win in map(lambda w: pd.read_csv(w,index_col='date', parse_dates=True), win_paths):
    i += 1 
    win['win_index'] = i
    win_list.append(win)
    if i%200 ==0:
        print(i,"of windows out of ", len(win_paths))

In [None]:
# Stack all loaded windows into one frame; rows stay distinguishable through their 'win_index' column.
df = pd.concat(win_list)

In [None]:
# Show the column list currently configured on the features module.
ff.interested_cols

In [None]:
%%time
ff.interested_cols = [ 'accX', 'accY', 'accZ', 'gyroX','gyroY', 'gyroZ', 'win_index']
feature_list = ['aoa','ate','apf','rms','std','minimax', 'energy','min','max']
preserved_features = ['start','subject_id','label']

df_final = ff.extract_features(df, feature_list=feature_list , preserved_features=preserved_features)
df_final.dropna()
df_final.sort_values('start', inplace=True)



In [None]:
# Preview the first rows of the extracted feature table.
df_final.head()

In [None]:
# Announce the destination, then write the feature table without its index.
print('Saving processed file to the path: {}'.format(processed_file_path))
df_final.to_csv(path_or_buf=processed_file_path, index=False)

## Old Feature Calculation 

In [None]:
# Input folder and output CSV for the older ("data_PD") feature calculation.
# NOTE(review): these rebind raw_windows_path / processed_file_path from the
# "Extract Features" section, so cells run after this point use the new values.
raw_windows_path = "/home/ahmet/notebooks/data/data_PD/snippets/"
processed_file_path = "/home/ahmet/notebooks/data/data_PD/preprocessed_data_all_features.csv"

In [None]:
# Collect every window CSV one folder level below raw_windows_path.
window_pattern = raw_windows_path + '*/*.csv'
win_paths = glob.glob(window_pattern)
print("Found " + str(len(win_paths)) + " windows")

In [None]:
%%time
win_list = []
i = 0
for win in map(lambda w: pd.read_csv(w,index_col='date', parse_dates=True), win_paths):
    i += 1 
    win['win_index'] = i
    win_list.append(win)
    if i%200 ==0:
        print(i,"of windows out of ", len(win_paths))

In [None]:
# Names of all features requested from ff.extract_features in the old pipeline.
# The layout below is only visual grouping; the order is unchanged.
features_all = [
    'aoa', 'ate', 'apf', 'rms', 'std', 'minimax', 'cor',
    'mean', 'min', 'max', 'range', 'entropy', 'var', 'kurtosis', 'skew',
    'quantile25', 'quantile50', 'quantile75',
    'energy', 'frequency_features',
    'acc_norm_mean', 'acc_norm_std',
    'gyro_norm_mean', 'gyro_norm_std',
    'mazilu_power',
    'acc_mean_crossings', 'gyro_mean_crossings',
]

In [None]:
%%time
import dsmuc.features as ff
feature_list = features_all
preserved_features=['start', 'subject_id', 'label']
df_final = pd.DataFrame()
index = 0
for window in win_list:
    df_final = df_final.append(ff.extract_features(window, index=index, feature_list=feature_list ,\
    preserved_features=preserved_features))
    index += 1

In [None]:
# Order the feature rows by their 'start' column.
df_final = df_final.sort_values(by='start')

In [None]:
# Announce the destination, then write the feature table without its index.
print('Saving processed file to the path:' + ' ' + str(processed_file_path))
df_final.to_csv(processed_file_path, index=False)

In [None]:
# Echo the path the feature table was just written to.
processed_file_path

In [None]:
# Preview the first rows of the feature table produced by the old pipeline.
df_final.head()
