In [2]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import os
import glob
import seaborn as sns
sns.set(style="darkgrid")
import sys
sys.path.append("../dsmuc/")
import dsmuc.io as io
import dsmuc.preprocessing as pp
import dsmuc.features as ff
import dsmuc.custom as cs

### Read raw data 

In [3]:
'''
Combine all Datasets collected using G9 and extract necessary Sensor data for 
processing later on it 

method to read output file: pd.read_csv(OUT_FOLDER + 'data.csv',index_col='date', parse_dates=True)
'''
DATA_SET_FOLDER = '/home/ahmet/notebooks/data/G9_data/Data_Sets/'
OUT_FOLDER = '/home/ahmet/notebooks/data/G9_data/Raw/'
output_file_name = 'combined_raw.csv'
    

if not os.path.exists(OUT_FOLDER):
    os.mkdir(OUT_FOLDER)
    print(OUT_FOLDER, " is created")

data_set_folders  = next(os.walk(DATA_SET_FOLDER))[1]



print('Reading each datasets...')
big_list = []
for data_set in data_set_folders:
    child_folder = DATA_SET_FOLDER + data_set
    for root,dirs,files in os.walk(child_folder):
        for file in files:
            if file.endswith(".csv") and file.split("_")[0]!='12':
                print("Reading the file :", file)
                subject_id = file.split("_")[0]
                df_data = io.read_g9(file_path=child_folder+'/'+ file,subject_id=subject_id)
                big_list.append(df_data)
        df_raw = pd.concat(big_list)

df_raw.sort_index(ascending=True, inplace=True)

Reading each datasets...
Reading the file : 26_no_label_L.csv
Extracting interested sensor data...
Reading the file : 25_no_label_L.csv
Extracting interested sensor data...
Reading the file : 24_no_label_L.csv
Extracting interested sensor data...
Reading the file : 23_no_label_L.csv
Extracting interested sensor data...
Reading the file : 22_no_label_L.csv
Extracting interested sensor data...
Reading the file : 20_no_label_L.csv
Extracting interested sensor data...
Reading the file : 19_no_label_L.csv
Extracting interested sensor data...
Reading the file : 21_no_label_L.csv
Extracting interested sensor data...
Reading the file : 18_no_label_L.csv
Extracting interested sensor data...
Reading the file : 15_no_label_L.csv
Extracting interested sensor data...
Reading the file : 14_no_label_L.csv
Extracting interested sensor data...
Reading the file : 11_no_label_L.csv
Extracting interested sensor data...
Reading the file : 16_no_label_L.csv
Extracting interested sensor data...
Reading the f

In [6]:
df_raw.drop_duplicates(inplace=True) 

In [13]:
df_raw = df_raw[::2]

In [15]:
print("file saved as :",OUT_FOLDER +output_file_name)
df_raw.to_csv(OUT_FOLDER +output_file_name, index=True)

file saved as : /home/ahmet/notebooks/data/G9_data/Raw/combined_raw.csv


### Label Data

In [16]:
def read_label_file(file_path = None):
    xl = pd.ExcelFile(label_file_path)
    df_label = xl.parse("Sheet1")
    df_label = df_label[['start_time', 'end_time','subject', 'label']]
    df_label.columns = ['start_time', 'end_time','subject_id', 'label']
    return df_label

In [17]:
raw_data_path = '/home/ahmet/notebooks/data/G9_data/Raw/combined_raw.csv'
label_file_path = '/home/ahmet/notebooks/data/G9_data/Data_Sets/Routine_with_detail_timestamp.xlsx'
out_file_path = '/home/ahmet/notebooks/data/G9_data/Raw/labeled.csv'
print('reading raw data ...')
df_raw = pd.read_csv(raw_data_path,index_col=0, parse_dates=True)
print('reading label file in excel ...')
df_label = read_label_file(label_file_path)

df_label['start'] = pd.to_datetime(df_label['start_time'])  
df_label['end'] = pd.to_datetime(df_label['end_time']) 

reading raw data ...
reading label file in excel ...


In [18]:
print('labeling data...')
df_labeled = pp.label_data(df_data=df_raw, df_label=df_label)
print('saving labeled file to the path:'+out_file_path +'...')
df_labeled.to_csv(out_file_path)

labeling data...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_temp['label'] = int(df_label['label'][i])


saving labeled file to the path:/home/ahmet/notebooks/data/G9_data/Raw/labeled.csv...


In [19]:
df_labeled.head()

Unnamed: 0_level_0,accX,accY,accZ,gyroX,gyroY,gyroZ,subject_id,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-12-08 04:11:12.002,4.419696,-5.741296,6.071695,-0.004278,0.00491,0.015588,23,1
2017-12-08 04:11:12.013,4.496311,-5.81791,6.114791,-0.004278,0.009704,0.009729,23,1
2017-12-08 04:11:12.035,4.462792,-5.731719,6.100426,0.002113,0.013432,-0.000924,23,1
2017-12-08 04:11:12.064,4.525041,-5.702989,6.119579,0.003179,0.00278,-0.009446,23,1
2017-12-08 04:11:12.074,4.486734,-5.712565,6.138733,0.003711,0.006508,-0.013174,23,1


In [20]:
print('saving labeled file to the path:'+out_file_path )
df_labeled.to_csv(out_file_path)

saving labeled file to the path:/home/ahmet/notebooks/data/G9_data/Raw/labeled.csv


### Extract Windows

In [21]:
file_path = "/home/ahmet/notebooks/data/G9_data/Raw/labeled.csv"
saveto = "/home/ahmet/notebooks/data/G9_data/Raw/snippets/"
label_dict = {1:'walking',
             2:'walking upstairs',
             3:'walking downstairs',
             4:'sitting',
             5:'standing',
             6:'laying'}
interested_cols = [ 'accX', 'accY', 'accZ', 'gyroX','gyroY', 'gyroZ']

In [22]:
min_samples = 20
window_size_seconds = 2
window_slide_seconds = 1
print('Creating output folders for each label')
isSure = cs.create_label_folders(label_dict= label_dict, saveto = saveto)
if not isSure:
    if not cs.query_yes_no('There are already folders specified.Are you sure to continue to extract segment?'):
        print('There are already folders specified.Are you sure to continue to extract segment?')
    else:
        import shutil
        print('Clearing the directory: ', saveto)
        shutil.rmtree(saveto)
        cs.create_label_folders(label_dict= label_dict, saveto = saveto)

window_size = datetime.timedelta(seconds=window_size_seconds)
window_slide = datetime.timedelta(seconds=window_slide_seconds)
df = pd.read_csv(file_path, index_col='date', parse_dates=True)
df = df.sort_index(ascending = True)
## Extract Segments

subject_ids = np.unique(df['subject_id']).tolist()
labels = list(label_dict.keys())

print('Extracting windows')
samples_count = []
for s in subject_ids:
    print('subject: ',s)
    for l in labels:
        print('label: ',label_dict[l])
        df_temp = df[np.logical_and(df['subject_id']==s, df['label']==l)]
        win_list = pp.ext_windows(df=df_temp,window_size_seconds=window_size_seconds, \
                           window_slide_seconds=window_slide_seconds)
        win_list =[win for win in win_list if win.shape[0]>min_samples ]
        
        df_ = pd.DataFrame()
        i = 0
        index = str(s)+str(l)
        for window in win_list:
            window.to_csv(saveto+label_dict[l]+'/'+str(s)+"_"+str(i)+'.csv')
            i += 1

Creating output folders for each label
/home/ahmet/notebooks/data/G9_data/Raw/snippets/walking  already exist
/home/ahmet/notebooks/data/G9_data/Raw/snippets/walking upstairs  already exist
/home/ahmet/notebooks/data/G9_data/Raw/snippets/walking downstairs  already exist
/home/ahmet/notebooks/data/G9_data/Raw/snippets/sitting  already exist
/home/ahmet/notebooks/data/G9_data/Raw/snippets/standing  already exist
/home/ahmet/notebooks/data/G9_data/Raw/snippets/laying  already exist
There are already folders specified.Are you sure to continue to extract segment? [Y/n] y
Clearing the directory:  /home/ahmet/notebooks/data/G9_data/Raw/snippets/
/home/ahmet/notebooks/data/G9_data/Raw/snippets/walking is created
/home/ahmet/notebooks/data/G9_data/Raw/snippets/walking upstairs is created
/home/ahmet/notebooks/data/G9_data/Raw/snippets/walking downstairs is created
/home/ahmet/notebooks/data/G9_data/Raw/snippets/sitting is created
/home/ahmet/notebooks/data/G9_data/Raw/snippets/standing is crea

## Extract Features

In [23]:
raw_windows_path = "/home/ahmet/notebooks/data/G9_data/Raw/snippets/"
processed_file_path = "/home/ahmet/notebooks/data/G9_data/processed.csv"

In [24]:
win_paths = glob.glob(raw_windows_path+'*/*.csv')
print("Found {} windows".format(len(win_paths)))

Found 25027 windows


In [25]:
win_list = [pd.read_csv(win_paths[i], index_col='date', parse_dates=True) for i in range(len(win_paths))]

In [26]:
len(win_list)

25027

In [27]:
feature_list = ['aoa','ate','apf','rms','std','minimax','cor','mean','min','max']
preserved_features = ['start','subject_id','label']
df_final = pd.DataFrame()
index = 0
for window in win_list:
    df_final = df_final.append(ff.extract_features(window, index=index, feature_list=feature_list ,\
    preserved_features=preserved_features))
    index += 1
    

  list_.append(np.diff(ind).mean())
  ret = ret.dtype.type(ret / rcount)


In [28]:
df_final.head()

Unnamed: 0,CorAccXAccY_corr,CorAccXAccZ_corr,CorAccYAccZ_corr,CorGyroXGyroY_corr,CorGyroXGyroZ_corr,CorGyroYGyroZ_corr,accX_aoa,accX_apf,accX_ate,accX_max,...,gyroZ_ate,gyroZ_max,gyroZ_mean,gyroZ_min,gyroZ_minmax,gyroZ_rms,gyroZ_std,label,start,subject_id
0,-0.191494,-0.01855,-0.291556,0.72788,0.461031,0.569434,-6.158093,0.043011,24.333333,-6.009446,...,40.0,0.017993,0.000428,-0.018226,0.036219,0.007781,0.007769,5.0,2018-01-16 06:30:12.029,15.0
1,0.157201,0.276847,0.112554,0.674492,-0.504778,-0.2729,-9.687722,0.040816,27.0,-8.925584,...,,0.39007,-0.025655,-0.391834,0.781904,0.198135,0.196467,5.0,2017-12-20 07:21:47.028,11.0
2,0.342479,-0.331996,-0.690158,-0.127805,0.323512,0.324708,-9.611954,0.04,29.0,-9.457097,...,29.333333,0.056933,0.000133,-0.02882,0.085754,0.013429,0.013428,5.0,2018-01-16 04:35:14.010,14.0
3,-0.204384,-0.193564,-0.034288,0.587818,-0.047086,-0.344811,-9.551749,0.038835,30.333333,-9.270349,...,37.5,0.191847,-0.004833,-0.20017,0.392017,0.083018,0.082878,5.0,2017-12-08 09:29:15.016,1.0
4,-0.067963,0.216787,0.847867,0.622559,-0.474301,-0.378248,-5.000725,0.034091,23.5,-4.831499,...,30.5,0.057916,-0.001733,-0.048611,0.106526,0.018821,0.018741,5.0,2018-01-16 04:04:24.018,13.0


In [29]:
df_final.shape

(25027, 63)

In [31]:
print('Saving processed file to the path:',processed_file_path)
df_final.to_csv(processed_file_path, index=False)

Saving processed file to the path: /home/ahmet/notebooks/data/G9_data/processed.csv
