# Analysis of UCI ML Human Activity Recognition Using Smartphones Dataset
_economy_, 18 Jan 2018

Dataset description found here:
http://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.names

In [3]:
import pandas as pd
import numpy as np

import keras

# Import datasets

### Feature List

In [56]:
# Parse the feature names from features.txt, one name per line.
# The name is the last space-separated token of each line; "()" and commas
# are then removed from it. Removing commas fuses multi-argument names,
# e.g. the `angle(...)` features end up as `angle(tBodyAccMeangravity)`.
# NOTE(review): the resulting names are not checked for uniqueness — confirm
# before selecting columns by name.
with open('uci_har_dataset/features.txt', 'r') as f:
    features = [
        line.strip().split(' ')[-1].replace('()', '').replace(',', '')
        for line in f
    ]

In [57]:
# Peek at the first ten cleaned feature names
features[:10]

['tBodyAcc-mean-X',
 'tBodyAcc-mean-Y',
 'tBodyAcc-mean-Z',
 'tBodyAcc-std-X',
 'tBodyAcc-std-Y',
 'tBodyAcc-std-Z',
 'tBodyAcc-mad-X',
 'tBodyAcc-mad-Y',
 'tBodyAcc-mad-Z',
 'tBodyAcc-max-X']

In [36]:
# Number of parsed feature names
len(features)

561

### IDs of participants

In [30]:
# Load the participant ID of every training row (one integer per line)
with open('uci_har_dataset/train/subject_train.txt', 'r') as f:
    train_id = pd.Series([int(line.strip()) for line in f])

In [32]:
# Rows per participant ID in the training split, most frequent first
train_id.value_counts()

25    409
21    408
26    392
30    383
28    382
27    376
23    372
17    368
16    366
19    360
1     347
29    344
3     341
15    328
6     325
14    323
22    321
11    316
7     308
5     302
8     281
dtype: int64

### Activity labels
- 1 = Walking (flat)
- 2 = Walking (up stairs)
- 3 = Walking (down stairs)
- 4 = Sitting
- 5 = Standing
- 6 = Laying

In [68]:
# Load the activity label of every training row (one integer per line)
with open('uci_har_dataset/train/y_train.txt', 'r') as f:
    train_tags = pd.Series([int(line.strip()) for line in f])

In [69]:
# Peek at the first ten training activity labels
train_tags[:10]

0    5
1    5
2    5
3    5
4    5
5    5
6    5
7    5
8    5
9    5
dtype: int64

### Load feature vector, add participant IDs, activity tags and feature names
NB: the training set was converted to comma-separated form (saved as `X_train_a.txt`) by running `sed 's/ \{1,\}/,/g'` on the command line.

This causes the first column to be empty (due to leading space), so we drop it immediately

In [58]:
# Read the comma-separated copy of the training feature vectors (no header row).
# Column 0 is the empty field produced by each line's leading separator, so it
# is dropped straight away.
train_df = (
    pd.read_csv('./uci_har_dataset/train/X_train_a.txt', sep=",", header=None)
    .drop(labels=0, axis='columns')
)

In [59]:
# (rows, columns) of the training features after dropping the empty first column
train_df.shape

(7352, 561)

In [60]:
# Label the feature columns with the names parsed from features.txt
# (one name per column, in file order)
train_df.columns = features

In [71]:
# Attach the participant ID and the activity tag to every feature row.
# Assigning a Series to a DataFrame column aligns on index labels, not on
# position, so a row-count mismatch would silently introduce NaNs or drop
# values instead of raising. Check the sizes before assigning.
assert len(train_id) == len(train_df), 'subject_train.txt row count does not match the training features'
assert len(train_tags) == len(train_df), 'y_train.txt row count does not match the training features'
train_df['id'] = train_id
train_df['tag'] = train_tags

In [72]:
# Preview the first five training rows, now including the `id` and `tag` columns
train_df.head()

Unnamed: 0,tBodyAcc-mean-X,tBodyAcc-mean-Y,tBodyAcc-mean-Z,tBodyAcc-std-X,tBodyAcc-std-Y,tBodyAcc-std-Z,tBodyAcc-mad-X,tBodyAcc-mad-Y,tBodyAcc-mad-Z,tBodyAcc-max-X,...,fBodyBodyGyroJerkMag-kurtosis,angle(tBodyAccMeangravity),angle(tBodyAccJerkMean)gravityMean),angle(tBodyGyroMeangravityMean),angle(tBodyGyroJerkMeangravityMean),angle(XgravityMean),angle(YgravityMean),angle(ZgravityMean),id,tag
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,1,5
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,1,5
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,1,5
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,1,5
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,1,5


## Test data

### Participant IDs

In [73]:
# Load the participant ID of every test row (one integer per line)
with open('uci_har_dataset/test/subject_test.txt', 'r') as f:
    test_id = pd.Series([int(line.strip()) for line in f])

In [74]:
# Rows per participant ID in the test split, most frequent first
test_id.value_counts()

24    381
18    364
20    354
13    327
12    320
4     317
2     302
10    294
9     288
dtype: int64

### Activity labels

In [75]:
# Load the activity label of every test row (one integer per line)
with open('./uci_har_dataset/test/y_test.txt', 'r') as f:
    test_tags = pd.Series([int(line.strip()) for line in f])

In [76]:
# Peek at the first ten test activity labels
test_tags[:10]

0    5
1    5
2    5
3    5
4    5
5    5
6    5
7    5
8    5
9    5
dtype: int64

### Feature vector for test set, combined with tags and participant IDs

In [79]:
# Read the comma-separated copy of the test feature vectors (no header row)
# and discard column 0 immediately after loading.
test_df = (
    pd.read_csv('./uci_har_dataset/test/X_test_a.txt', sep=',', header=None)
    .drop(labels=0, axis='columns')
)

In [80]:
# (rows, columns) of the test features after dropping the first column
test_df.shape

(2947, 561)

In [81]:
# Name the feature columns, then attach the participant ID and activity tag
# to every test row.
# Assigning a Series to a DataFrame column aligns on index labels, not on
# position, so a row-count mismatch would silently introduce NaNs or drop
# values instead of raising. Check the sizes before mutating the frame.
assert len(test_id) == len(test_df), 'subject_test.txt row count does not match the test features'
assert len(test_tags) == len(test_df), 'y_test.txt row count does not match the test features'
test_df.columns = features
test_df['id'] = test_id
test_df['tag'] = test_tags

In [82]:
# Preview the first five test rows, now including the `id` and `tag` columns
test_df.head()

Unnamed: 0,tBodyAcc-mean-X,tBodyAcc-mean-Y,tBodyAcc-mean-Z,tBodyAcc-std-X,tBodyAcc-std-Y,tBodyAcc-std-Z,tBodyAcc-mad-X,tBodyAcc-mad-Y,tBodyAcc-mad-Z,tBodyAcc-max-X,...,fBodyBodyGyroJerkMag-kurtosis,angle(tBodyAccMeangravity),angle(tBodyAccJerkMean)gravityMean),angle(tBodyGyroMeangravityMean),angle(tBodyGyroJerkMeangravityMean),angle(XgravityMean),angle(YgravityMean),angle(ZgravityMean),id,tag
0,0.257178,-0.023285,-0.014654,-0.938404,-0.920091,-0.667683,-0.952501,-0.925249,-0.674302,-0.894088,...,-0.705974,0.006462,0.16292,-0.825886,0.271151,-0.720009,0.276801,-0.057978,2,5
1,0.286027,-0.013163,-0.119083,-0.975415,-0.967458,-0.944958,-0.986799,-0.968401,-0.945823,-0.894088,...,-0.594944,-0.083495,0.0175,-0.434375,0.920593,-0.698091,0.281343,-0.083898,2,5
2,0.275485,-0.02605,-0.118152,-0.993819,-0.969926,-0.962748,-0.994403,-0.970735,-0.963483,-0.93926,...,-0.640736,-0.034956,0.202302,0.064103,0.145068,-0.702771,0.280083,-0.079346,2,5
3,0.270298,-0.032614,-0.11752,-0.994743,-0.973268,-0.967091,-0.995274,-0.974471,-0.968897,-0.93861,...,-0.736124,-0.017067,0.154438,0.340134,0.296407,-0.698954,0.284114,-0.077108,2,5
4,0.274833,-0.027848,-0.129527,-0.993852,-0.967445,-0.978295,-0.994111,-0.965953,-0.977346,-0.93861,...,-0.846595,-0.002223,-0.040046,0.736715,-0.118545,-0.692245,0.290722,-0.073857,2,5
