# Data Prep for Activity Recognition (Timeseries) Model

In [1]:
from pathlib import Path
import pandas as pd

In [2]:
ls

activity_labels.txt  features_info.txt  RawData/    Test/
data_prep.ipynb      features.txt       README.txt  Train/


In [3]:
# Collect the files in RawData whose names contain an underscore and end in .txt.
# Glob order is filesystem-dependent, so the preview below may list them in any order.
files = list(Path('RawData').glob('*_*.txt'))
files[:5]

[PosixPath('RawData/gyro_exp13_user07.txt'),
 PosixPath('RawData/acc_exp32_user16.txt'),
 PosixPath('RawData/gyro_exp31_user15.txt'),
 PosixPath('RawData/gyro_exp38_user19.txt'),
 PosixPath('RawData/gyro_exp26_user13.txt')]

In [4]:
# Build one metadata row per file by parsing `<sensor>_exp<NN>_user<NN>.txt`.
#
# The pattern is a raw string (so `\d` is a regex class rather than an invalid
# string escape), the dot before `txt` is escaped, and it is matched against the
# bare file name so the result does not depend on the OS path separator.
# Named groups become the column names directly. The extracted values are strings.
meta = (
    pd.Series([path.name for path in files], dtype='object')
    .str.extract(r'(?P<sensor>.+)_exp(?P<experiment>\d+)_user(?P<user>\d+)\.txt$')
    .assign(path=files)  # keep the original Path objects alongside the parsed fields
)

meta.head()

Unnamed: 0,sensor,experiment,user,path
0,gyro,13,7,RawData/gyro_exp13_user07.txt
1,acc,32,16,RawData/acc_exp32_user16.txt
2,gyro,31,15,RawData/gyro_exp31_user15.txt
3,gyro,38,19,RawData/gyro_exp38_user19.txt
4,gyro,26,13,RawData/gyro_exp26_user13.txt


In [5]:
# Read every recording listed in `meta`. Each file is space-separated with no header;
# reset_index() exposes the per-file row number as an `index` column, which acts as
# the sample position inside that recording.
frames = []
for rec in meta.itertuples(index=False):
    frame = pd.read_csv(rec.path, sep=' ', header=None)
    frame = frame.assign(sensor=rec.sensor, experiment=rec.experiment, user=rec.user)
    frames.append(frame.reset_index())
data = pd.concat(frames)

# Go long, prefix each channel number with its sensor name (e.g. `acc_0`), then pivot
# back to wide so every (index, experiment, user) row carries all sensor channels.
tidy = data.melt(id_vars=['index', 'experiment', 'user', 'sensor'])
tidy['variable'] = tidy['sensor'] + '_' + tidy['variable'].astype(str)
data = tidy.pivot(
    index=['index', 'experiment', 'user'],
    columns='variable',
    values='value',
).reset_index()

data.head(5)

variable,index,experiment,user,acc_0,acc_1,acc_2,gyro_0,gyro_1,gyro_2
0,0,1,1,0.918056,-0.1125,0.509722,-0.054978,-0.069639,-0.030849
1,0,2,1,0.443056,0.0375,0.888889,-0.036957,0.044593,-0.015272
2,0,3,2,0.413889,-0.015278,0.922222,-0.078802,-0.135918,-0.042455
3,0,4,2,0.295833,0.041667,0.965278,0.007941,0.076664,0.050702
4,0,5,3,0.5,-0.006944,0.856944,0.054978,-0.083383,0.044899


In [6]:
# Load the labelled segments; the file has no header, so name the positional columns.
label_columns = ['experiment', 'user', 'activity_index', 'start', 'end']
activity = pd.read_csv('RawData/labels.txt', sep=' ', header=None)
activity = activity.rename(columns=lambda position: label_columns[position])

# Stack `start` and `end` into a single `index` column, giving two marker rows per
# segment; the `variable` column that says which boundary a row came from is discarded.
activity = (
    activity
    .melt(id_vars=['experiment', 'user', 'activity_index'], value_name='index')
    .drop(columns='variable')
)

activity.head()

Unnamed: 0,experiment,user,activity_index,index
0,1,1,5,250
1,1,1,7,1233
2,1,1,4,1393
3,1,1,8,2195
4,1,1,5,2360


In [7]:
# Parse the id -> name lookup: one whitespace-separated pair per non-blank line.
with open('activity_labels.txt', 'r') as file:
    label_rows = [line.split() for line in file if line.strip()]

labels = pd.DataFrame(label_rows, columns=['activity_index', 'activity'])
# The ids are read as text; cast them to int.
labels = labels.astype({'activity_index': int})

labels

Unnamed: 0,activity_index,activity
0,1,WALKING
1,2,WALKING_UPSTAIRS
2,3,WALKING_DOWNSTAIRS
3,4,SITTING
4,5,STANDING
5,6,LAYING
6,7,STAND_TO_SIT
7,8,SIT_TO_STAND
8,9,SIT_TO_LIE
9,10,LIE_TO_SIT


In [8]:
# Swap the numeric activity id for its name. This is an inner join, so marker rows
# whose `activity_index` has no entry in `labels` are dropped.
activity = (
    activity
    .merge(labels, on='activity_index')
    .drop(columns='activity_index')
)
activity.head()

Unnamed: 0,experiment,user,index,activity
0,1,1,250,STANDING
1,1,1,2360,STANDING
2,2,1,251,STANDING
3,2,1,2378,STANDING
4,3,2,298,STANDING


In [9]:
# Cast the key columns to int so they match `activity`, then attach the marker rows.
# The left join keeps every sensor sample; samples that are not a segment boundary get
# a missing `activity`.
data = data.astype({'experiment': int, 'user': int})
data = data.merge(activity, how='left', on=['index', 'experiment', 'user'])

In [10]:
# Order rows by recording and then by sample position; the group-wise forward/backward
# fills that follow rely on this ordering.
data = data.sort_values(['user', 'experiment', 'index'])

In [11]:
# Spread each segment's label from its start/end marker rows to the samples in between.
# Rows must already be sorted by sample position within each (user, experiment).
grp = data.groupby(['user', 'experiment'])
fwd_act = grp.activity.ffill()
bwd_act = grp.activity.bfill()

# Comparing the forward and backward fills is not enough on its own: when two
# consecutive segments carry the same activity, the unlabelled gap between them also
# satisfies fwd == bwd and would be labelled by mistake.
# Within a recording the markers alternate start, end, start, end, ... so a row lies
# inside a segment only if it is a marker itself or an odd number of markers have been
# seen up to and including it.
# NOTE(review): assumes segments of one recording do not overlap or share a boundary
# sample — confirm against labels.txt.
is_marker = data['activity'].notna()
markers_seen = (
    is_marker.astype(int)
    .groupby([data['user'], data['experiment']])
    .cumsum()
)
in_segment = is_marker | (markers_seen % 2 == 1)

# Rows outside any segment stay missing.
data['activity'] = fwd_act.where((fwd_act == bwd_act) & in_segment)

In [12]:
# Turn the sample position into a timestamp: position / 50 seconds after a fixed
# reference date (presumably a 50 Hz sampling rate — TODO confirm against the dataset
# README). The raw position column is not needed afterwards.
sample_offset = pd.to_timedelta(data['index'] / 50, unit='s')
data['Time'] = pd.Timestamp('2015-07-29') + sample_offset
data = data.drop(columns='index')

In [13]:
# Every sensor channel must be fully populated. The check covers all six channels;
# the earlier column list repeated `acc_2`/`gyro_2` and never looked at `acc_1`/`gyro_1`.
sensor_columns = ['acc_0', 'acc_1', 'acc_2', 'gyro_0', 'gyro_1', 'gyro_2']
assert not data[sensor_columns].isna().any().any(), 'missing sensor readings'

In [14]:
# Preview the first five rows of the assembled frame.
data.head(5)

Unnamed: 0,experiment,user,acc_0,acc_1,acc_2,gyro_0,gyro_1,gyro_2,activity,Time
0,1,1,0.918056,-0.1125,0.509722,-0.054978,-0.069639,-0.030849,,2015-07-29 00:00:00.000
61,1,1,0.911111,-0.093056,0.5375,-0.012523,0.019242,-0.038485,,2015-07-29 00:00:00.020
122,1,1,0.881944,-0.086111,0.513889,-0.023518,0.276417,0.006414,,2015-07-29 00:00:00.040
183,1,1,0.881944,-0.086111,0.513889,-0.093462,0.367741,0.001222,,2015-07-29 00:00:00.060
244,1,1,0.879167,-0.1,0.505556,-0.124311,0.47678,-0.022907,,2015-07-29 00:00:00.080


In [15]:
# (rows, columns) of the final frame.
data.shape

(1122772, 10)

In [16]:
# Write the prepared dataset to disk; index=False leaves the row index out of the file.
data.to_csv('activity_recognition.csv', index=False)

In [17]:
# Seconds between the reference date and the latest timestamp in the dataset.
reference_start = pd.Timestamp('2015-07-29')
(data['Time'].max() - reference_start) / pd.Timedelta('1s')

641.76