# Analysis of UCI ML Human Activity Recognition Using Smartphones Dataset
_economy_, 18 Jan 2018

Dataset description found here:
http://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.names

In [84]:
import pandas as pd
import numpy as np

import keras
import keras.layers as L
import keras.models as M

# Import datasets

### Feature List

In [56]:
with open('uci_har_dataset/features.txt', 'r') as f:
    features = [ 
        x
        .strip()
        .replace('()', '')
        .replace(',', '')
        .split(' ')[-1]
        for x in f.readlines() 
    ]

In [85]:
features

['tBodyAcc-mean-X',
 'tBodyAcc-mean-Y',
 'tBodyAcc-mean-Z',
 'tBodyAcc-std-X',
 'tBodyAcc-std-Y',
 'tBodyAcc-std-Z',
 'tBodyAcc-mad-X',
 'tBodyAcc-mad-Y',
 'tBodyAcc-mad-Z',
 'tBodyAcc-max-X',
 'tBodyAcc-max-Y',
 'tBodyAcc-max-Z',
 'tBodyAcc-min-X',
 'tBodyAcc-min-Y',
 'tBodyAcc-min-Z',
 'tBodyAcc-sma',
 'tBodyAcc-energy-X',
 'tBodyAcc-energy-Y',
 'tBodyAcc-energy-Z',
 'tBodyAcc-iqr-X',
 'tBodyAcc-iqr-Y',
 'tBodyAcc-iqr-Z',
 'tBodyAcc-entropy-X',
 'tBodyAcc-entropy-Y',
 'tBodyAcc-entropy-Z',
 'tBodyAcc-arCoeff-X1',
 'tBodyAcc-arCoeff-X2',
 'tBodyAcc-arCoeff-X3',
 'tBodyAcc-arCoeff-X4',
 'tBodyAcc-arCoeff-Y1',
 'tBodyAcc-arCoeff-Y2',
 'tBodyAcc-arCoeff-Y3',
 'tBodyAcc-arCoeff-Y4',
 'tBodyAcc-arCoeff-Z1',
 'tBodyAcc-arCoeff-Z2',
 'tBodyAcc-arCoeff-Z3',
 'tBodyAcc-arCoeff-Z4',
 'tBodyAcc-correlation-XY',
 'tBodyAcc-correlation-XZ',
 'tBodyAcc-correlation-YZ',
 'tGravityAcc-mean-X',
 'tGravityAcc-mean-Y',
 'tGravityAcc-mean-Z',
 'tGravityAcc-std-X',
 'tGravityAcc-std-Y',
 'tGravityAcc-std-

In [36]:
len(features)

561

### IDs of participants

In [30]:
with open('uci_har_dataset/train/subject_train.txt', 'r') as f:
    train_id = pd.Series([int(x.strip()) for x in f.readlines()])

In [32]:
train_id.value_counts()

25    409
21    408
26    392
30    383
28    382
27    376
23    372
17    368
16    366
19    360
1     347
29    344
3     341
15    328
6     325
14    323
22    321
11    316
7     308
5     302
8     281
dtype: int64

### Activity labels
- 1 = Walking (flat)
- 2 = Walking (up stairs)
- 3 = Walking (down stairs)
- 4 = Sitting
- 5 = Standing
- 6 = Laying

In [68]:
with open('uci_har_dataset/train/y_train.txt', 'r') as f:
    train_tags = pd.Series([ int(x.strip()) for x in f.readlines() ])

In [69]:
train_tags[0:10]

0    5
1    5
2    5
3    5
4    5
5    5
6    5
7    5
8    5
9    5
dtype: int64

### Load feature vector 
NB: training set was altered to be comma-separated using `sed 's/ \{1,\}/,/g` on command line

This causes the first column to be empty (due to leading space), so we drop it immediately

In [90]:
train_df = pd.read_csv('./uci_har_dataset/train/X_train_a.txt', sep=",", header=None).drop(0, axis=1)

In [91]:
train_df.shape

(7352, 561)

In [92]:
train_df.columns = features

In [93]:
train_df.head()

Unnamed: 0,tBodyAcc-mean-X,tBodyAcc-mean-Y,tBodyAcc-mean-Z,tBodyAcc-std-X,tBodyAcc-std-Y,tBodyAcc-std-Z,tBodyAcc-mad-X,tBodyAcc-mad-Y,tBodyAcc-mad-Z,tBodyAcc-max-X,...,fBodyBodyGyroJerkMag-meanFreq,fBodyBodyGyroJerkMag-skewness,fBodyBodyGyroJerkMag-kurtosis,angle(tBodyAccMeangravity),angle(tBodyAccJerkMean)gravityMean),angle(tBodyGyroMeangravityMean),angle(tBodyGyroJerkMeangravityMean),angle(XgravityMean),angle(YgravityMean),angle(ZgravityMean)
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.074323,-0.298676,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,0.158075,-0.595051,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,0.414503,-0.390748,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,0.404573,-0.11729,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,0.087753,-0.351471,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892


In [94]:
train = train_df.values

### Transform output vector into one-hot shape -> (6,)

In [157]:
def one_hot(x):
    a = np.zeros(shape=(6,))
    a[x-1] = 1.0
    
    return a

In [158]:
y_train = np.array([ one_hot(x) for x in train_tags.values ])

In [159]:
y_train[0:10]

array([[ 0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.]])

## Test data

### Participant IDs

In [73]:
with open('uci_har_dataset/test/subject_test.txt', 'r') as f:
    test_id = pd.Series([int(x.strip()) for x in f.readlines()])

In [74]:
test_id.value_counts()

24    381
18    364
20    354
13    327
12    320
4     317
2     302
10    294
9     288
dtype: int64

### Activity labels

In [75]:
with open('./uci_har_dataset/test/y_test.txt', 'r') as f:
    test_tags = pd.Series([ int(x.strip()) for x in f.readlines() ])

In [76]:
test_tags[0:10]

0    5
1    5
2    5
3    5
4    5
5    5
6    5
7    5
8    5
9    5
dtype: int64

### Feature vector for test set

In [95]:
test_df = pd.read_csv('./uci_har_dataset/test/X_test_a.txt', sep=',', header=None).drop(0, axis=1)

In [96]:
test_df.shape

(2947, 561)

In [97]:
test_df.columns = features

In [98]:
test_df.head()

Unnamed: 0,tBodyAcc-mean-X,tBodyAcc-mean-Y,tBodyAcc-mean-Z,tBodyAcc-std-X,tBodyAcc-std-Y,tBodyAcc-std-Z,tBodyAcc-mad-X,tBodyAcc-mad-Y,tBodyAcc-mad-Z,tBodyAcc-max-X,...,fBodyBodyGyroJerkMag-meanFreq,fBodyBodyGyroJerkMag-skewness,fBodyBodyGyroJerkMag-kurtosis,angle(tBodyAccMeangravity),angle(tBodyAccJerkMean)gravityMean),angle(tBodyGyroMeangravityMean),angle(tBodyGyroJerkMeangravityMean),angle(XgravityMean),angle(YgravityMean),angle(ZgravityMean)
0,0.257178,-0.023285,-0.014654,-0.938404,-0.920091,-0.667683,-0.952501,-0.925249,-0.674302,-0.894088,...,0.071645,-0.33037,-0.705974,0.006462,0.16292,-0.825886,0.271151,-0.720009,0.276801,-0.057978
1,0.286027,-0.013163,-0.119083,-0.975415,-0.967458,-0.944958,-0.986799,-0.968401,-0.945823,-0.894088,...,-0.401189,-0.121845,-0.594944,-0.083495,0.0175,-0.434375,0.920593,-0.698091,0.281343,-0.083898
2,0.275485,-0.02605,-0.118152,-0.993819,-0.969926,-0.962748,-0.994403,-0.970735,-0.963483,-0.93926,...,0.062891,-0.190422,-0.640736,-0.034956,0.202302,0.064103,0.145068,-0.702771,0.280083,-0.079346
3,0.270298,-0.032614,-0.11752,-0.994743,-0.973268,-0.967091,-0.995274,-0.974471,-0.968897,-0.93861,...,0.116695,-0.344418,-0.736124,-0.017067,0.154438,0.340134,0.296407,-0.698954,0.284114,-0.077108
4,0.274833,-0.027848,-0.129527,-0.993852,-0.967445,-0.978295,-0.994111,-0.965953,-0.977346,-0.93861,...,-0.121711,-0.534685,-0.846595,-0.002223,-0.040046,0.736715,-0.118545,-0.692245,0.290722,-0.073857


In [99]:
test = test_df.values

### Transform output to one-hot

In [160]:
y_test = np.array([ one_hot(x) for x in test_tags.values ])

# Classification Model (2-layer, 50-20-6)

In [161]:
train.shape

(7352, 561)

In [162]:
y_train.shape

(7352, 6)

In [192]:
model = M.Sequential()
model.add(L.Dense(50, input_dim=561, activation='relu'))
model.add(L.Dense(20, activation='relu'))
model.add(L.Dense(6, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

INFO (theano.gof.compilelock): Refreshing lock /Users/economy/.theano/compiledir_Darwin-17.3.0-x86_64-i386-64bit-i386-3.6.2-64/lock_dir/lock
INFO:theano.gof.compilelock:Refreshing lock /Users/economy/.theano/compiledir_Darwin-17.3.0-x86_64-i386-64bit-i386-3.6.2-64/lock_dir/lock


In [193]:
model.fit(x=train, y=y_train, batch_size=20, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x12c1d5e10>

# Evaluation

In [194]:
y_hat = pd.DataFrame(model.predict(test), columns=[1,2,3,4,5,6])

In [200]:
y_hat.sample?

In [202]:
## Sample some, turn ugly floats into 0,1
y_hat.sample(10, random_state=12345).applymap(lambda x: round(x))

Unnamed: 0,1,2,3,4,5,6
2673,1,0,0,0,0,0
193,0,0,0,1,0,0
188,0,0,0,1,0,0
544,1,0,0,0,0,0
1063,0,0,0,0,1,0
2595,0,0,0,0,1,0
2315,0,0,0,0,0,1
2693,1,0,0,0,0,0
522,0,0,0,0,0,1
2174,0,1,0,0,0,0


In [204]:
## real values
pd.DataFrame(y_test, columns=[1,2,3,4,5,6]).sample(10, random_state=12345).applymap(lambda x: int(x))

Unnamed: 0,1,2,3,4,5,6
2673,1,0,0,0,0,0
193,0,0,0,1,0,0
188,0,0,0,1,0,0
544,1,0,0,0,0,0
1063,0,0,0,0,1,0
2595,0,0,0,0,1,0
2315,0,0,0,0,0,1
2693,1,0,0,0,0,0
522,0,0,0,0,0,1
2174,0,1,0,0,0,0


In [197]:
scores = model.evaluate(test, y_test)
print("\n\nBinary Crossentropy Loss: {:.2%}, Accuracy: {:.2%}".format(scores[0], scores[1]))

  32/2947 [..............................] - ETA: 0s

Binary Crossentropy Loss: 6.74%, Accuracy: 98.20%
