In [2]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import os
import glob
import seaborn as sns
sns.set(style="darkgrid")
import sys
sys.path.append("../dsmuc/")
import dsmuc.io as io
import dsmuc.preprocessing as pp
import dsmuc.features as ff
import dsmuc.custom as cs

### Read raw data 

In [15]:
'''
Combine all Datasets collected using G9 and extract necessary Sensor data for 
processing later on it 

method to read output file: pd.read_csv(OUT_FOLDER + 'data.csv',index_col='date', parse_dates=True)
'''
DATA_SET_FOLDER = '/home/ahmet/notebooks/data/G9_data/new_Dataset/'
OUT_FOLDER = '/home/ahmet/notebooks/data/G9_data/Raw/'
output_file_name = 'combined_raw.csv' # for stairs dataset change it to 'combined_raw_stairs.csv'
    

print('Reading each datasets...')
big_list = []
for file_path in glob.glob(DATA_SET_FOLDER + '/*/*/*'+'.csv'):
    basename = os.path.basename(file_path)
    if True:
        print("Reading the file :", basename)
        subject_id = basename.split("_")[0]
        df_data = io.read_g9(file=file_path,subject_id=subject_id)
        big_list.append(df_data)
df_raw = pd.concat(big_list)

df_raw.sort_index(ascending=True, inplace=True)

Reading each datasets...
Reading the file : 8_stairs_L.csv
Extracting interested sensor data...
Reading the file : 13_stairs_L.csv
Extracting interested sensor data...
Reading the file : 11_stairs_L.csv
Extracting interested sensor data...
Reading the file : 10_stairs_L.csv
Extracting interested sensor data...
Reading the file : 12_stairs_L.csv
Extracting interested sensor data...
Reading the file : 19_stairs_L.csv
Extracting interested sensor data...
Reading the file : 3_stairs_L.csv
Extracting interested sensor data...
Reading the file : 18_stairs_L.csv
Extracting interested sensor data...
Reading the file : 9_stairs_L.csv
Extracting interested sensor data...
Reading the file : 15_stairs_L.csv
Extracting interested sensor data...
Reading the file : 17_stairs_L.csv
Extracting interested sensor data...
Reading the file : 14_stairs_L.csv
Extracting interested sensor data...
Reading the file : 4_stairs_L.csv
Extracting interested sensor data...
Reading the file : 1_stairs_L.csv
Extractin

In [16]:
df_raw.head()

Unnamed: 0_level_0,accX,accY,accZ,gyroX,gyroY,gyroZ,subject_id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-02-21 07:05:02.006,-0.531513,-8.121132,-6.0286,0.013316,0.013316,-0.034621,1
2018-02-21 07:05:02.009,-0.531513,-8.121132,-6.0286,0.014381,0.001065,-0.034088,1
2018-02-21 07:05:02.029,-0.545878,-8.236053,-6.019023,0.014381,0.001065,-0.034088,1
2018-02-21 07:05:02.031,-0.545878,-8.236053,-6.019023,-0.005859,0.002663,-0.034088,1
2018-02-21 07:05:02.047,-0.608127,-8.346188,-6.105214,-0.005859,0.002663,-0.034088,1


In [17]:
print("file saved as :",OUT_FOLDER +output_file_name)
df_raw.to_csv(OUT_FOLDER +output_file_name, index=True)

file saved as : /home/ahmet/notebooks/data/G9_data/Raw/combined_raw.csv


### Label Data

In [18]:
def read_label_file(file_path = None):
    xl = pd.ExcelFile(label_file_path)
    df_label = xl.parse("Sheet1")
    df_label = df_label[['start_time', 'end_time','subject', 'label']]
    df_label.columns = ['start_time', 'end_time','subject_id', 'label']
    return df_label

In [19]:
raw_data_path = '/home/ahmet/notebooks/data/G9_data/Raw/combined_raw_stairs.csv'
label_file_path = '/home/ahmet/notebooks/data/G9_data/new_Dataset/Additional Stairs Dataset/additional_stairs_detail_timestamp.xlsx'
out_file_path = '/home/ahmet/notebooks/data/G9_data/Raw/labeled_stairs.csv'
print('reading raw data ...')
df_raw = pd.read_csv(raw_data_path,index_col=0, parse_dates=True)
print('reading label file in excel ...')
df_label = read_label_file(label_file_path)

df_label['start'] = pd.to_datetime(df_label['start_time'])  
df_label['end'] = pd.to_datetime(df_label['end_time']) 

reading raw data ...
reading label file in excel ...


In [20]:
print('labeling data...')
df_labeled = pp.label_data(df_data=df_raw, df_label=df_label)
print('saving labeled file to the path:'+out_file_path +'...')
df_labeled.to_csv(out_file_path)

labeling data...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_temp['label'] = int(label)


saving labeled file to the path:/home/ahmet/notebooks/data/G9_data/Raw/labeled_stairs.csv...


In [21]:
df_labeled['subject_id'].unique()

array([ 1,  2,  3,  4,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20])

### Extract Windows

In [23]:
file_path = "/home/ahmet/notebooks/data/G9_data/Raw/labeled.csv"
file_path_stairs = "/home/ahmet/notebooks/data/G9_data/Raw/labeled_stairs.csv"
saveto = "/home/ahmet/notebooks/data/G9_data/Raw/snippets/"
label_dict = {1:'walking',
             2:'walking upstairs',
             3:'walking downstairs',
             4:'sitting',
             5:'standing',
             6:'laying'}
interested_cols = [ 'accX', 'accY', 'accZ', 'gyroX','gyroY', 'gyroZ']

In [24]:
min_samples = 20
window_size_seconds = 2
window_slide_seconds = 1
print('Creating output folders for each label')
isSure = cs.create_label_folders(label_dict= label_dict, saveto = saveto)
if not isSure:
    if not cs.query_yes_no('There are already folders specified.Are you sure to continue to extract segment?'):
        print('There are already folders specified.Are you sure to continue to extract segment?')
    else:
        import shutil
        print('Clearing the directory: ', saveto)
        shutil.rmtree(saveto)
        cs.create_label_folders(label_dict= label_dict, saveto = saveto)

window_size = datetime.timedelta(seconds=window_size_seconds)
window_slide = datetime.timedelta(seconds=window_slide_seconds)


df_old = pd.read_csv(file_path, index_col='date', parse_dates=True)
df_stairs = pd.read_csv(file_path_stairs, index_col='date', parse_dates=True)
df_stairs['subject_id'] = (df_stairs['subject_id']+100)
df = pd.concat([df_old, df_stairs])
df = df.sort_index(ascending = True)
## Extract Segments

subject_ids = np.unique(df['subject_id']).tolist()
labels = list(label_dict.keys())

print('Extracting windows')
samples_count = []
for s in subject_ids:
    print('subject: ',s)
    for l in labels:
        print('label: ',label_dict[l])
        df_temp = df[np.logical_and(df['subject_id']==s, df['label']==l)]
        if df_temp.shape[0] !=0:
            win_list = pp.ext_windows(df=df_temp,window_size_seconds=window_size_seconds, \
                               window_slide_seconds=window_slide_seconds)
            win_list =[win for win in win_list if win.shape[0]>min_samples ]

            df_ = pd.DataFrame()
            i = 0
            index = str(s)+str(l)
            for window in win_list:
                window.to_csv(saveto+label_dict[l]+'/'+str(s)+"_"+str(i)+'.csv')
                i += 1

Creating output folders for each label
/home/ahmet/notebooks/data/G9_data/Raw/snippets/walking  already exist
/home/ahmet/notebooks/data/G9_data/Raw/snippets/walking upstairs  already exist
/home/ahmet/notebooks/data/G9_data/Raw/snippets/walking downstairs  already exist
/home/ahmet/notebooks/data/G9_data/Raw/snippets/sitting  already exist
/home/ahmet/notebooks/data/G9_data/Raw/snippets/standing  already exist
/home/ahmet/notebooks/data/G9_data/Raw/snippets/laying  already exist
There are already folders specified.Are you sure to continue to extract segment? [Y/n] yes
Clearing the directory:  /home/ahmet/notebooks/data/G9_data/Raw/snippets/
/home/ahmet/notebooks/data/G9_data/Raw/snippets/walking is created
/home/ahmet/notebooks/data/G9_data/Raw/snippets/walking upstairs is created
/home/ahmet/notebooks/data/G9_data/Raw/snippets/walking downstairs is created
/home/ahmet/notebooks/data/G9_data/Raw/snippets/sitting is created
/home/ahmet/notebooks/data/G9_data/Raw/snippets/standing is cr

## Extract Features

In [25]:
raw_windows_path = "/home/ahmet/notebooks/data/G9_data/Raw/snippets/"
processed_file_path = "/home/ahmet/notebooks/data/G9_data/processed.csv"

In [26]:
win_paths = glob.glob(raw_windows_path+'*/*.csv')
print("Found {} windows".format(len(win_paths)))

Found 34519 windows


In [27]:
win_list = [pd.read_csv(win_paths[i], index_col='date', parse_dates=True) for i in range(len(win_paths))]

In [28]:
len(win_list)

34519

In [29]:
feature_list = ['aoa','ate','apf','rms','std','minimax','cor','mean','min','max']
preserved_features = ['start','subject_id','label']
df_final = pd.DataFrame()
index = 0
for window in win_list:
    df_final = df_final.append(ff.extract_features(window, index=index, feature_list=feature_list ,\
    preserved_features=preserved_features))
    index += 1
    if index%200 ==0:
        print(index,"of windows out of ", len(win_list))
    

  list_.append(np.diff(ind).mean())
  ret = ret.dtype.type(ret / rcount)


200 of wondows out of  34519
400 of wondows out of  34519
600 of wondows out of  34519
800 of wondows out of  34519
1000 of wondows out of  34519
1200 of wondows out of  34519
1400 of wondows out of  34519
1600 of wondows out of  34519
1800 of wondows out of  34519
2000 of wondows out of  34519
2200 of wondows out of  34519
2400 of wondows out of  34519
2600 of wondows out of  34519
2800 of wondows out of  34519
3000 of wondows out of  34519
3200 of wondows out of  34519
3400 of wondows out of  34519
3600 of wondows out of  34519
3800 of wondows out of  34519
4000 of wondows out of  34519
4200 of wondows out of  34519
4400 of wondows out of  34519
4600 of wondows out of  34519
4800 of wondows out of  34519
5000 of wondows out of  34519
5200 of wondows out of  34519
5400 of wondows out of  34519
5600 of wondows out of  34519
5800 of wondows out of  34519
6000 of wondows out of  34519
6200 of wondows out of  34519
6400 of wondows out of  34519
6600 of wondows out of  34519
6800 of wondow

In [30]:
df_final.head()

Unnamed: 0,CorAccXAccY_corr,CorAccXAccZ_corr,CorAccYAccZ_corr,CorGyroXGyroY_corr,CorGyroXGyroZ_corr,CorGyroYGyroZ_corr,accX_aoa,accX_apf,accX_ate,accX_max,...,gyroZ_ate,gyroZ_max,gyroZ_mean,gyroZ_min,gyroZ_minmax,gyroZ_rms,gyroZ_std,label,start,subject_id
0,-0.191494,-0.01855,-0.291556,0.72788,0.461031,0.569434,-6.158093,0.043011,24.333333,-6.009446,...,40.0,0.017993,0.000428,-0.018226,0.036219,0.007781,0.007769,5.0,2018-01-16 06:30:12.029,15.0
1,0.157201,0.276847,0.112554,0.674492,-0.504778,-0.2729,-9.687722,0.040816,27.0,-8.925584,...,,0.39007,-0.025655,-0.391834,0.781904,0.198135,0.196467,5.0,2017-12-20 07:21:47.028,11.0
2,0.346551,-0.330945,-0.692002,-0.124647,0.323075,0.331159,-9.611379,0.04,29.0,-9.457097,...,24.333333,0.056933,0.000213,-0.02882,0.085754,0.013421,0.013419,5.0,2018-01-16 04:35:14.010,14.0
3,-0.53168,-0.738291,0.21737,0.868206,0.699076,0.557565,5.266624,0.035294,30.0,5.549759,...,22.333333,0.149237,0.007024,-0.140515,0.289752,0.045776,0.045234,5.0,2018-02-12 07:36:59.083,28.0
4,-0.163335,-0.209158,-0.039511,0.581457,-0.036526,-0.345909,-9.551191,0.038835,30.333333,-9.270349,...,37.5,0.191847,-0.004006,-0.20017,0.392017,0.081773,0.081675,5.0,2017-12-08 09:29:15.016,1.0


In [31]:
df_final.shape

(34519, 63)

In [32]:
print('Saving processed file to the path:',processed_file_path)
df_final.to_csv(processed_file_path, index=False)

Saving processed file to the path: /home/ahmet/notebooks/data/G9_data/processed.csv
