# Going deep into Human Activity Recognition

**Elia Bonetto, Filippo Rigotto.**

Department of Information Engineering, University of Padova, Italy.

Human Data Analytics, a.y. 2018/2019

## Part 1 - Data preprocessing

In [0]:
# Colab-only setup: mount Google Drive and move into the project folder, so that the
# relative 'dataset/', 'images/' and 'output/' paths used by the cells below resolve.
from IPython.display import Image, clear_output
import os
from google.colab import drive
drive.mount('/content/drive/')
clear_output() # clear whatever the mount step printed
os.chdir("/content/drive/My Drive/hda-project")
#!ls

In [0]:
!pip install transforms3d
clear_output()

from collections import Counter
import json
import logging
import math
import random

from transforms3d.axangles import axangle2mat

import h5py
import numpy as np
import scipy as sp
import scipy.io

import pandas as pd
pd.set_option('display.precision',3)
pd.set_option('display.float_format', '{:0.3f}'.format)

from imblearn.over_sampling import ADASYN 
from sklearn.model_selection import train_test_split
import skimage

import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['figure.figsize'] = (16,10)
mpl.rcParams['axes.grid'] = True

## Loading

Let's start from the [original datasets](https://www.dlr.de/kn/desktopdefault.aspx/tabid-8500/14564_read-36508/) provided as MATLAB `mat` files and process data items to have the final dataset all the network models will work on.

In [0]:
# MATLAB files to load, given relative to the current working directory.
# They are read in this order by the loading loop.
datasets = [
    'dataset/ARS_DLR_DataSet.mat',
    'dataset/ARS_DLR_DataSet_V2.mat',
    'dataset/ARS_DLR_Benchmark_Data_Set.mat'
]

Classes  to detect are reduced from the original 17 down to 8.

Furthermore, transitions will not be detected, so the final number of used labels is 7.

In [0]:
# Activity names used as dictionary keys when encoding; the position of a name
# in this list is its numeric code in the full 17-class scheme.
labels = (
    ['RUNNING', 'WALKING', 'JUMPING', 'STNDING', 'SITTING', 'XLYINGX', 'FALLING']
    + ['WALKUPS', 'WALKDWS']
    + ['JUMPVRT', 'JUMPFWD', 'JUMPBCK']
    + ['TRANSUP', 'TRANSDW', 'TRNSACC', 'TRNSDCC', 'TRANSIT']
)

# full map for all 17 classes: name -> code
map_encode = dict(zip(labels, range(len(labels))))

# map to squeeze down to 8 classes: 17-class code -> 8-class code
map_encode_8 = {code: code for code in range(7)}           # first seven classes are untouched
map_encode_8.update({7: 1, 8: 1})                          # walking up and downstairs = walking
map_encode_8.update({9: 2, 10: 2, 11: 2})                  # jumping in place, forward and backward = jumping
map_encode_8.update({code: 7 for code in range(12, 17)})   # all transitions

# useful for plots: 8-class code -> readable name
map_decode_8 = dict(enumerate([
    'running',
    'walking',
    'jumping',
    'standing',
    'sitting',
    'lying',
    'falling',
    'transition',
]))

The dataset provides the list of labels ("activities") for each segment of tracked data, along with index bounds (start and stop) for each item of the list.

Outside these ranges, data is considered to be marked as transitions between classes.

This structure is flattened into a single list.

**TODO** write about holes and mislabeling errors in the report 

In [0]:
def flatten_labels(labels, bounds):
    """Expand (labels, bounds) into one label per sample.

    `bounds` holds interleaved 1-based start/stop indices, one pair per entry of
    `labels`. The result has `bounds[-1]` entries; every sample not covered by a
    segment keeps the 'TRANSIT' code. When a segment ends after the next one
    starts, a 'Time error' message is printed and the next segment is made to
    begin where the current one stops.
    """
    # starts are shifted to 0-based (numbering in the file starts from 1);
    # stops are left as they are, so each range is [begin, end)
    begins = bounds[0::2] - 1
    ends = bounds[1::2]

    flat = np.full(bounds[-1], map_encode['TRANSIT'], dtype=np.uint8)
    last = len(ends) - 1
    for idx, code in enumerate(labels):
        overlaps_next = idx < last and ends[idx] > begins[idx + 1] + 1
        if overlaps_next:
            # advance next start, mislabeling error
            print('Time error: {} > {}'.format(ends[idx], begins[idx + 1]), end=' ')
            begins[idx + 1] = ends[idx]
        flat[begins[idx]:ends[idx]] = code
    return flat

The datasets contain IMU measurements expressed in the sensor frame, and also provide the attitude (direction cosine) matrix needed to express the measurements w.r.t. the body frame.

In [0]:
def convert_body_frame(imu_data, attitude_matrix):
    """Express one IMU row in the body frame.

    `imu_data` is a 10-element row: entry 0 is carried over unchanged, entries
    1-3, 4-6 and 7-9 are the acc, gyro and mag triplets. `attitude_matrix` is the
    matching 10-element row whose entries 1-9 are reshaped to 3x3 and transposed
    to obtain the matrix applied to each triplet. Returns a new array; the input
    row is not modified.
    """
    rotation = attitude_matrix[1:].reshape(3, 3).T

    converted = imu_data.copy()
    for first in (1, 4, 7):  # acc, gyro, mag
        triplet = slice(first, first + 3)
        converted[triplet] = np.dot(rotation, imu_data[triplet].T)
    return converted

For each test in a dataset, extract the relevant data and then flatten the labels to a single long list.

The rest of the processing is postponed to operate on the full dataset instead of working on single tests.

In [0]:
def process_single_test(dataset, key):
    """Extract measurements and per-sample labels for one test of a loaded dataset.

    Parameters
    ----------
    dataset : mapping returned by `scipy.io.loadmat`
    key : name of the test inside `dataset`

    Returns
    -------
    imu_data : 2-D array, one row per sample (column 0 is the time)
    attitude_matrices : 2-D array aligned row by row with `imu_data`
    activities_flat : 1-D array with one 17-class label code per sample
    info_dict : JSON-serialisable summary (test name, readable activity list and
        the (code, start, stop) triplets as stored in the file)

    Raises
    ------
    AssertionError if the two measurement arrays are not aligned in time, or if
    the labels and their bounds are inconsistent with each other or with the data.
    """
    imu_data, attitude_matrices, activities, activities_bounds = dataset[key][0]

    # throwing away useless nested arrays
    activities = np.array([ act[0] for act in activities[0] ])
    activities_bounds = activities_bounds[0]

    # integrity checks on time and length
    # (the previous `assert([...])` tested the truthiness of a non-empty list,
    #  which is always True, so the time columns were never actually compared)
    assert len(imu_data) == len(attitude_matrices), \
        'IMU data and attitude matrices differ in length for {}'.format(key)
    assert np.array_equal(imu_data[:, 0], attitude_matrices[:, 0]), \
        'IMU data and attitude matrices are not aligned in time for {}'.format(key)
    assert(len(activities_bounds) == 2*len(activities))

    # change labels to int numbers
    activities = np.array([ map_encode[a] for a in activities ])

    # get a single array of labels instead of labels + bounds
    activities_flat = flatten_labels(activities, activities_bounds)
    assert(len(imu_data) == len(activities_flat))

    info_dict = {
        'name': key,
        'act_list': [ map_decode_8[map_encode_8[i]] for i in activities ],
        'act': [ (int(a),int(i),int(f)) for a,i,f in zip(activities, activities_bounds[0::2], activities_bounds[1::2]) ] }

    return imu_data, attitude_matrices, activities_flat, info_dict

Numpy arrays are used to store all the collected data from every test in each dataset.

For some relevant tests (those with a good mix of activities) plots of the (normalized) magnitude of IMU vectors are saved.

In [0]:
# One chunk per test is collected here and concatenated once after the loops;
# this avoids re-copying the whole accumulated array on every np.append call.
imu_chunks = []
attitude_chunks = []
activity_chunks = []
info_list = [] # just for keeping info about single tests

tests_to_plot = [
    'ARS_Cristina_Test_JmpFall_Sensor_Right',
    'ARS_Hanno_Test_JmpFall_Sensor_Right',
    'ARS_Maria_Test_JmpFall_Sensor_Left',
    'ARS_Paula_Benchmark_Sensor_Left',
    'ARS_Sinja_Benchmark_Sensor_Left',
    'ARS_Emil_Benchmark_Sensor_Right'
]

# 8-colour palette shared by every figure. The scatter calls pin vmin/vmax so that
# class k is always drawn with colour k: without them matplotlib rescales the colour
# range to the labels present in each test, and the same class changes colour
# from one figure to the next.
cmap8 = mpl.cm.get_cmap('tab10', 8)
color_range = dict(cmap=cmap8, vmin=-0.5, vmax=7.5)

# make sure the output folder for the figures exists
images_dir = os.path.join('.', 'images')
os.makedirs(images_dir, exist_ok=True)

num_loaded = 0 # samples accumulated so far; gives the global start/end of each test

# loop datasets
for ds in datasets:
    dataset = scipy.io.loadmat(ds)
    keys = [ k for k in dataset if '__' not in k ] # skip loadmat's '__header__'-like entries
    # loop keys=tests
    for test in keys:
        print('Loading {}:'.format(test).ljust(52,' '), end='')
        imu, mat, act, info = process_single_test(dataset, test)
        imu_chunks.append(imu)
        attitude_chunks.append(mat)
        activity_chunks.append(np.ravel(act))

        # position of this test inside the concatenated arrays (both ends included)
        info['start'] = int(num_loaded)
        num_loaded += imu.shape[0]
        info['end'] = int(num_loaded - 1)
        info_list.append(info)
        print('{} elements'.format(imu.shape[0]).rjust(15,' '), end='')

        if test in tests_to_plot:
            print('\tSaving magnitude plots...', end='')
            x = range(imu.shape[0])
            acc = np.linalg.norm(imu[:,1:4],  axis=1) # vector magnitude
            gyr = np.linalg.norm(imu[:,4:7],  axis=1)
            mag = np.linalg.norm(imu[:,7:10], axis=1)
            acc /= acc.max() # normalization
            gyr /= gyr.max()
            mag /= mag.max()

            act2 = [ map_encode_8[a] for a in act ]
            fig, (pa,pg,pm) = plt.subplots(3, 1, sharex=True)
            # no title: will go in figure's caption
            pa.scatter(x, acc, s=1, c=act2, **color_range)
            pg.scatter(x, gyr, s=1, c=act2, **color_range)
            pm.scatter(x, mag, s=1, c=act2, **color_range)
            pa.set_ylabel('Accelerometer')
            pg.set_ylabel('Gyroscope')
            pm.set_ylabel('Magnetometer')
            pm.set_xlabel('Time')

            fig.tight_layout()
            fname = os.path.join(images_dir, test)
            fig.savefig(fname+'.png')
            fig.savefig(fname+'.pdf', format='pdf')
            plt.close(fig) # close this figure explicitly, not whichever is current
        print()

# single concatenation; the empty seeds keep shape and dtype defined even if nothing was loaded
imu_sensor = np.concatenate([np.empty((0,10))] + imu_chunks, axis=0)
attitude_mat = np.concatenate([np.empty((0,10))] + attitude_chunks, axis=0)
activities = np.concatenate([np.empty(0, dtype=np.uint8)] + activity_chunks)
del imu_chunks, attitude_chunks, activity_chunks

assert(imu_sensor.shape==attitude_mat.shape)
num_data = imu_sensor.shape[0]

# save info file for reference
with open('dataset/info.json','w') as info_file:
    json.dump(info_list, info_file, indent=2)

clear_output()
print('Read {} records'.format(num_data))
print('IMU data shape:   {}'.format(imu_sensor.shape))
print('Attitudes shape:  {}'.format(attitude_mat.shape))
print('Activities shape: {}'.format(activities.shape))

Save label's color palette, useful in report.

In [0]:
# Export the 8-colour label palette: as floats, as 0-255 integers and as a striped image.
cm = mpl.cm.get_cmap('tab10',8)

colors = np.array(cm.colors)
colors255 = np.round(colors[:, :-1] * 255).astype(np.uint8) # no alpha channel

for path, table, fmt in (
    ("dataset/colors.csv",    colors,    '%.5f'),
    ("dataset/colors255.csv", colors255, '%d'),
):
    np.savetxt(path, table, delimiter=',', fmt=fmt)

# one row of 8 colours, blown up to 500 rows and 300 columns per colour
indices = np.arange(8).reshape(1,8)
arr = np.repeat(np.repeat(colors255[indices], 500, axis=0), 300, axis=1)

skimage.io.imsave('dataset/colors.jpg', arr)

Save this checkpoint, even if it will not be used.

In [0]:
#with h5py.File('dataset/ARS-raw.h5','w') as h5f:
#    h5f.create_dataset('imu_sensor', data=imu_sensor)
#    h5f.create_dataset('attitudes',  data=attitude_mat)
#    h5f.create_dataset('activities', data=activities)

In [0]:
# optional reload if messing up below
#with h5py.File('dataset/ARS-raw.h5','r') as h5f:
#    imu_sensor = h5f['imu_sensor'][:]
#    attitude_mat = h5f['attitudes'][:]
#    activities = h5f['activities'][:]

Performing some checks:

- Displaying minutes for each activity

In [0]:
# Minutes recorded for each of the original labels, plus a total row.
act, cnt = np.unique(activities, return_counts=True)
cnt = cnt / 100 / 60 # 100 Hz, 60 seconds

act = np.append([labels[a].lower() for a in act], 'total')
cnt = np.append(cnt, sum(cnt))

rows = np.array([act, np.round(cnt)]).T
df = pd.DataFrame(rows, columns=['Activity','Time (min)'])
df.index = ['' for _ in range(len(df))] # blank index
display(df)

#TODO other if needed


## More processing

Starting from the whole dataset:
- Reduce the number of tracked activities to 8
- Removal of items labelled as transitions
- Conversion of measurements to the body frame
- Remove time columns (in both measurements and attitude matrixes)

In [0]:
# remap activities from the 17-class codes to the 8-class codes
# WARNING: run this cell only once per load. map_encode_8 sends 7 -> 1, so a second
# run would turn the already remapped transitions (code 7) into walking (code 1).
print('Old labels: {}'.format(np.unique(activities)))
activities = np.array(list(map(map_encode_8.__getitem__, activities)))
print('New labels: {}'.format(np.unique(activities)))

In [0]:
# save times as latex table for report
act, cnt = np.unique(activities, return_counts=True)
cnt = cnt / 100 / 60 # samples -> minutes, as in the earlier summary cell

act = np.append([map_decode_8[a] for a in act], 'total')
cnt = np.append(cnt, sum(cnt))

rows = np.array([act, np.round(cnt)]).T
df = pd.DataFrame(rows, columns=['Activity','Time (min)'])
df.index = ['' for _ in range(len(df))]

with open('output/act-times.tex','w') as tfile:
    latex_table = df.to_latex(index=False)
    tfile.write(latex_table)

In [0]:
# remove transitions
transit_label = map_encode_8[map_encode['TRANSIT']]
is_transit = activities == transit_label
transit_number = sum(is_transit)
print('Transit label is {}, found {} elements'.format(transit_label, transit_number))

num_data = len(imu_sensor)
print('Old num data: {}'.format(num_data))

# the same row mask is applied to all three aligned arrays
keep = ~is_transit
imu_sensor, attitude_mat, activities = imu_sensor[keep], attitude_mat[keep], activities[keep]

for filtered in (imu_sensor, attitude_mat, activities):
    assert(filtered.shape[0] == num_data-transit_number)

num_data = len(imu_sensor)
print('New num data: {}'.format(num_data))

num_labels = len(np.unique(activities))
print('New num labels: {}'.format(num_labels))

In [0]:
# convert from sensor frame to body frame, one row at a time
# (every row of imu_body is overwritten, so no initial copy of the data is needed)
imu_body = np.empty_like(imu_sensor)
for i in range(len(imu_sensor)):
    imu = imu_sensor[i]
    imu_body[i] = convert_body_frame(imu, attitude_mat[i])

In [0]:
# remove time column from data
# The arrays are built with 10 columns (time + 9 values). Each slice is guarded on
# that width so that running this cell a second time is a no-op instead of silently
# dropping the first measurement column.
RAW_COLUMNS = 10
if imu_sensor.shape[1] == RAW_COLUMNS:
    imu_sensor = imu_sensor[:,1:]
if imu_body.shape[1] == RAW_COLUMNS:
    imu_body = imu_body[:,1:]
if attitude_mat.shape[1] == RAW_COLUMNS:
    attitude_mat = attitude_mat[:,1:]

Performing some checks:

- Displaying minutes for each activity

In [0]:
# Minutes recorded for each remaining class, plus a total row.
act, cnt = np.unique(activities, return_counts=True)
cnt = cnt / 100 / 60 # samples -> minutes, as in the earlier summary cells

act = np.append([map_decode_8[a] for a in act], 'total')
cnt = np.append(cnt, sum(cnt))

rows = np.array([act, np.round(cnt)]).T
df = pd.DataFrame(rows, columns=['Activity','Time (min)'])
df.index = ['' for _ in range(len(df))]
display(df)

#TODO other if needed

Pre-compute and save a train/test split of the dataset.

`random_state` is the seed of the PRNG, the percentage is set using `SPLIT_TEST_PERC` variable.


In [0]:
SPLIT_TEST_PERC = 0.3

# sensor-frame samples
X_train, X_test, Y_train, Y_test = train_test_split(
    imu_sensor, activities,
    test_size=SPLIT_TEST_PERC, random_state=1, stratify=activities,
)
for caption, size in (
    ("X_train shape: ", X_train.shape),
    ("Y_train shape: ", len(Y_train)),
    ("X_test shape:  ", X_test.shape),
    ("Y_test shape:  ", len(Y_test)),
):
    print(caption + str(size))

# body-frame samples, split with the same seed and stratification
Xb_train, Xb_test, Yb_train, Yb_test = train_test_split(
    imu_body, activities,
    test_size=SPLIT_TEST_PERC, random_state=1, stratify=activities,
)
for caption, size in (
    ("Xb_train shape: ", Xb_train.shape),
    ("Yb_train shape: ", len(Yb_train)),
    ("Xb_test shape:  ", Xb_test.shape),
    ("Yb_test shape:  ", len(Yb_test)),
):
    print(caption + str(size))

In [0]:
#with h5py.File('dataset/ARS.h5','w') as h5f:
#    h5f.create_dataset('imu_sensor', data=imu_sensor)
#    h5f.create_dataset('imu_body', data=imu_body)
#    h5f.create_dataset('attitudes', data=attitude_mat)
#    h5f.create_dataset('activities', data=activities)

# one file per reference frame, each holding the four arrays of its split
for path, splits in (
    ('dataset/ARS-train-test-sensor.h5',
     {'X_train': X_train, 'X_test': X_test, 'Y_train': Y_train, 'Y_test': Y_test}),
    ('dataset/ARS-train-test-body.h5',
     {'X_train': Xb_train, 'X_test': Xb_test, 'Y_train': Yb_train, 'Y_test': Yb_test}),
):
    with h5py.File(path,'w') as h5f:
        for key, split in splits.items():
            h5f.create_dataset(key, data=split)

## Framing

Group by activity and organize data in overlapping windows (overlapping ratio is regulated by `stride_len`). 

Save the resulting dataset and a corresponding split.

In [0]:
window_len = 128
stride_len = round(window_len / 2)

def framing_padding(data, labels=None):
    """Cut `data` into overlapping windows, separately for every class.

    Parameters
    ----------
    data : array with one row per sample.
    labels : optional 1-D array with one label per row of `data`. When omitted,
        the notebook-level `activities` array is used, as before.

    Returns
    -------
    x : array of windows, each `window_len` rows of `data`
    y : array with the label of each window

    Notes
    -----
    For each class, windows start every `stride_len` samples for as long as the
    start index is below `len(class_data) - window_len`. A trailing partial
    window is therefore dropped, and so is a last window that would end exactly
    on the final sample of the class.

    Earlier versions zero-padded each class before windowing, but the loop bound
    stopped before any window reached the padding, so it never showed up in the
    output. It has been removed: the windows are unchanged and the per-class copy
    is avoided. (With the padding, non-float input was promoted to float; the
    result now keeps the dtype of `data`.)

    NOTE(review): samples are grouped by class over the whole array, so a window
    may join samples coming from different recordings — confirm this is intended.
    """
    if labels is None:
        labels = activities
    labels = np.asarray(labels)

    x, y = [], []
    for activity in np.unique(labels):
        tmp = data[labels == activity]
        for i in range(0, len(tmp)-window_len, stride_len):
            x.append(tmp[i:i+window_len])
            y.append(activity)

    x = np.array(x)
    y = np.array(y)
    assert(x.shape[0] == len(y))
    return x,y

# window both representations; the label arrays must come out identical
imu_sensor_framed, activities_sensor_framed = framing_padding(imu_sensor)
imu_body_framed, activities_body_framed = framing_padding(imu_body)

assert(np.array_equal(activities_sensor_framed, activities_body_framed))

for caption, size in (
    ("IMU_sensor shape: ", imu_sensor_framed.shape),
    ("IMU body shape:   ", imu_body_framed.shape),
    ("Activities shape: ", len(activities_sensor_framed)),
):
    print(caption + str(size))

In [0]:
# sensor-frame windows
X_train, X_test, Y_train, Y_test = train_test_split(
    imu_sensor_framed, activities_sensor_framed,
    test_size=SPLIT_TEST_PERC, random_state=1, stratify=activities_sensor_framed,
)
for caption, size in (
    ("X_train shape: ", X_train.shape),
    ("Y_train shape: ", len(Y_train)),
    ("X_test shape:  ", X_test.shape),
    ("Y_test shape:  ", len(Y_test)),
):
    print(caption + str(size))

# body-frame windows, split with the same seed and stratification
Xb_train, Xb_test, Yb_train, Yb_test = train_test_split(
    imu_body_framed, activities_body_framed,
    test_size=SPLIT_TEST_PERC, random_state=1, stratify=activities_body_framed,
)
for caption, size in (
    ("Xb_train shape: ", Xb_train.shape),
    ("Yb_train shape: ", len(Yb_train)),
    ("Xb_test shape:  ", Xb_test.shape),
    ("Yb_test shape:  ", len(Yb_test)),
):
    print(caption + str(size))

In [0]:
#with h5py.File('dataset/ARS-framed.h5','w') as h5f:
#    h5f.create_dataset('imu_sensor', data=imu_sensor_framed)
#    h5f.create_dataset('imu_body',   data=imu_body_framed)
#    h5f.create_dataset('activities', data=activities_sensor_framed)
    #h5f.create_dataset('activities_body', data=activities_body_framed) # useless duplicate of prev item

# one file per reference frame, each holding the four arrays of its framed split
for path, splits in (
    ('dataset/ARS-train-test-sensor-framed.h5',
     {'X_train': X_train, 'X_test': X_test, 'Y_train': Y_train, 'Y_test': Y_test}),
    ('dataset/ARS-train-test-body-framed.h5',
     {'X_train': Xb_train, 'X_test': Xb_test, 'Y_train': Yb_train, 'Y_test': Yb_test}),
):
    with h5py.File(path,'w') as h5f:
        for key, split in splits.items():
            h5f.create_dataset(key, data=split)

In [0]:
# optional reload if messing up below
#with h5py.File('dataset/ARS-framed.h5','r') as h5f:
#    imu_sensor_framed = h5f['imu_sensor'][:]
#    imu_body_framed = h5f['imu_body'][:]
#    activities_sensor_framed = h5f['activities'][:]
#    activities_body_framed = activities_sensor_framed.copy()

## Normalization

Data normalization is performed using training set's mean and std.

Improves accuracy results: to be applied upon latest splits.

In [0]:
# Standardise every channel (last axis) in place, using mean and std computed on
# the training windows only; the matching test windows reuse the same statistics.
# The sensor-frame pair is processed first, then the body-frame pair.
for train_set, test_set in ((X_train, X_test), (Xb_train, Xb_test)):
    for i in range(train_set.shape[-1]):
        tmp_train = train_set[:,:,i].flatten()
        tmp_test = test_set[:,:,i].flatten()
        mean = np.mean(tmp_train)
        std = np.std(tmp_train)
        train_set[:,:,i] = ((tmp_train - mean)/std).reshape(train_set.shape[:2])
        test_set[:,:,i] = ((tmp_test - mean)/std).reshape(test_set.shape[:2])

In [0]:
# one file per reference frame, each holding the four arrays of its normalised split
for path, splits in (
    ('dataset/ARS-train-test-sensor-framed-norm.h5',
     {'X_train': X_train, 'X_test': X_test, 'Y_train': Y_train, 'Y_test': Y_test}),
    ('dataset/ARS-train-test-body-framed-norm.h5',
     {'X_train': Xb_train, 'X_test': Xb_test, 'Y_train': Yb_train, 'Y_test': Yb_test}),
):
    with h5py.File(path,'w') as h5f:
        for key, split in splits.items():
            h5f.create_dataset(key, data=split)

Checks: display mean and std for each channel of the training set (should be near 0 and 1).

In [0]:
# per-channel mean and std of the training windows: sensor frame, blank line, body frame
for position, checked in enumerate((X_train, Xb_train)):
    if position:
        print()
    for i in range(checked.shape[-1]):
        tmp_train = checked[:,:,i].flatten()
        print(f'{np.mean(tmp_train)}    \t{np.std(tmp_train)}')

## Augmentation

Data augmentation is performed using the Adaptive Synthetic (ADASYN) sampling algorithm, since this dataset is not balanced.

More information on the algorithm can be found in the paper [(link)](https://ieeexplore.ieee.org/document/4633969): 
 
    Haibo He, Yang Bai, Edwardo A. Garcia, and Shutao Li. “ADASYN: Adaptive synthetic sampling approach for imbalanced learning,” In IEEE International Joint Conference on Neural Networks, pp. 1322-1328, 2008.

We use the implementation included in the [imbalanced-learn](https://imbalanced-learn.readthedocs.io) Python package.


In [0]:
# ADASYN accepts 2d arrays, so each (window, channel) frame is flattened into a single row
imu_sensor_2d = imu_sensor_framed.reshape((imu_sensor_framed.shape[0], imu_sensor_framed.shape[2]*imu_sensor_framed.shape[1]))
imu_body_2d = imu_body_framed.reshape((imu_body_framed.shape[0], imu_body_framed.shape[2]*imu_body_framed.shape[1]))

# shapes before > after flattening
print(imu_sensor_framed.shape, end=' > ')
print(imu_sensor_2d.shape)
print(imu_body_framed.shape, end=' > ')
print(imu_body_2d.shape)

# to have comparable results to manual augmentation written below, try to keep proportions between classes
# (the string below is a pasted copy of the manual augmentation's printout, kept for reference)
"""
Class 0: 1409 entries, 2552 values to add, totalling 3961.
Class 1: 6762 entries, OK
Class 2: 709 entries, 3252 values to add, totalling 3961.
Class 3: 11319 entries, OK
Class 4: 5518 entries, OK
Class 5: 2644 entries, 1317 values to add, totalling 3961.
Class 6: 184 entries, 3777 values to add, totalling 3961.
"""

# Requested number of windows per class after resampling.
# NOTE(review): these numbers are hard-coded from the counts listed above; they
# presumably need updating whenever the framing (and so the class counts) changes.
sample_num_dict = {
    0: 3961,
    1: 6762,  # not augmented
    2: 3961,
    3: 11319, # not augmented
    4: 5518,  # not augmented
    5: 3961,
    6: 3961
}

# NOTE(review): resampling is applied to the whole framed dataset, and the train/test
# split is taken afterwards on the resampled arrays, so generated windows end up in
# the test split as well — confirm this is acceptable for the reported results.
ada = ADASYN(random_state=8, n_jobs=2, sampling_strategy=sample_num_dict)
imu_sensor_framed_aug, activities_sensor_framed_aug = ada.fit_resample(imu_sensor_2d, activities_sensor_framed)
print(f'Resampled composition: {Counter(activities_sensor_framed_aug)}')

# same settings and seed for the body-frame data
ada = ADASYN(random_state=8, n_jobs=2, sampling_strategy=sample_num_dict)
imu_body_framed_aug, activities_body_framed_aug = ada.fit_resample(imu_body_2d, activities_body_framed)
print(f'Resampled composition: {Counter(activities_body_framed_aug)}')

# back to three dimensions arrays
imu_sensor_framed_aug = imu_sensor_framed_aug.reshape((-1,imu_sensor_framed.shape[1],imu_sensor_framed.shape[2]))
imu_body_framed_aug = imu_body_framed_aug.reshape((-1,imu_body_framed.shape[1],imu_body_framed.shape[2]))

print(imu_sensor_framed_aug.shape)
print(imu_body_framed_aug.shape)

In [0]:
# split the ADASYN-augmented windows; this replaces the X*/Y* splits of the previous section
X_train, X_test, Y_train, Y_test = train_test_split(
    imu_sensor_framed_aug,
    activities_sensor_framed_aug,
    test_size=SPLIT_TEST_PERC,
    random_state=1,
    stratify=activities_sensor_framed_aug,
)
Xb_train, Xb_test, Yb_train, Yb_test = train_test_split(
    imu_body_framed_aug,
    activities_body_framed_aug,
    test_size=SPLIT_TEST_PERC,
    random_state=1,
    stratify=activities_body_framed_aug,
)

In [0]:
name = 'dataset/ARS-train-test-{}-framed-aug.h5'

# '{}' is filled with the reference frame; each file holds the four arrays of its split
for frame, splits in (
    ('sensor', {'X_train': X_train, 'X_test': X_test, 'Y_train': Y_train, 'Y_test': Y_test}),
    ('body',   {'X_train': Xb_train, 'X_test': Xb_test, 'Y_train': Yb_train, 'Y_test': Yb_test}),
):
    with h5py.File(name.format(frame),'w') as h5f:
        for key, split in splits.items():
            h5f.create_dataset(key, data=split)

## Manual Augmentation

Instead of relying on an algorithm to augment data, this can be performed manually:

- By applying a rotation of a random angle against a random axis (_not_ applied to classes _sitting_ and _lying_, which gave worse performance), 
- By permutation of data in a window.

One or both operations may be carried out.

To enable and configure the feature set the global switches below accordingly (or skip executing all next cells).

In [0]:
# Switches for the manual augmentation cells below.
augment = True      # master switch: every manual-augmentation cell is wrapped in `if augment:`
rotation  = True    # passed to augmenting(); also adds '-rot' to the output file names
permutation = True  # passed to augmenting(); also adds '-per' to the output file names

In [0]:
def rotate_random(data):
    """Rotate consecutive 3-element groups of a 1-D vector by one random rotation.

    A single axis (uniform in [-1, 1) per component) and a single angle (uniform
    in [-pi, pi)) are drawn from numpy's global generator on every call; the
    resulting matrix is applied to each group `data[i:i+3]`. Returns a new array,
    `data` is left untouched.
    """
    # axis and angle of the rotation
    axis = np.random.uniform(low=-1, high=1, size = 3)
    angle = np.random.uniform(low=-np.pi, high=np.pi)
        
    res = data.copy()
    for i in range(0,len(data),3):
        # the triplet is multiplied on the left of the rotation matrix (row vector x matrix)
        res[i:i+3] = np.matmul(data[i:i+3], axangle2mat(axis,angle))
    return res

def rotate(values):
    """Apply rotate_random to every element values[i, j], modifying `values` in place.

    NOTE(review): rotate_random draws a new axis and angle on each call, so every
    row j of every window i receives its own independent rotation — confirm this
    is intended rather than one rotation per window.
    """
    for i in range(values.shape[0]):
        for j in range(values.shape[1]):
            values[i,j] = rotate_random(values[i,j])
    return values

def chunks(l, n):
    """Yield consecutive slices of `l`, each holding at most `n` items."""
    yield from (l[start:start + n] for start in range(0, len(l), n))
        
def permute(values):
    """Return `values` with its groups of 8 consecutive items in shuffled order.

    Grouping and shuffling happen along the first axis only, and the order inside
    each group is kept. For a 128-row window this means 16 blocks of 8 rows; for a
    batch of windows it is whole windows that get reordered. Uses the global
    `random` generator.
    """
    pieces = list(chunks(values, 8))  #8*16=128
    random.shuffle(pieces)
    merged = []
    for piece in pieces:
        merged.extend(piece)
    return np.array(merged)
   
def augmenting(x, y, rotation=True, permutation=True):
    """Build extra windows for the under-represented classes.

    Every class with fewer than `int(0.35 * size_of_largest_class)` windows is
    topped up to that number. The extra windows are drawn with replacement from
    the class itself and then optionally rotated and/or permuted.

    Parameters
    ----------
    x : array of windows, first axis indexes the windows
    y : labels, one per window
    rotation : apply `rotate` to the drawn windows (skipped for labels 4 and 5)
    permutation : apply `permute` to each drawn window separately

    Returns
    -------
    (x_add, y_add) : the windows to append and their labels. When no class needs
    augmenting, `x_add` is empty but keeps the window shape of `x`, so it can
    still be concatenated with it.
    """
    # Seed both generators to have the same choices for sensor and body-related
    # datasets: numpy drives the sampling and the rotations, while permute()
    # shuffles with the `random` module, which was previously left unseeded.
    np.random.seed(8)
    random.seed(8)
    augm_factor = 0.35

    x = np.asarray(x)
    y = np.asarray(y)
    classes, count = np.unique(y, return_counts=True)
    count = [int(c) for c in count]
    if not count:
        return np.empty((0,) + x.shape[1:], dtype=x.dtype), np.empty(0, dtype=y.dtype)
    most_repr = int(max(count)*augm_factor)

    x_add = []
    y_add = []
    # iterate over the actual label values: indexing classes by position only
    # works when the labels happen to be exactly 0..n-1
    for activity, available in zip(classes, count):
        augment_number = most_repr - available
        if augment_number > 0:
            print(f"Class {activity}: {available} entries, {augment_number} values to add, totalling {available+augment_number}.")
            values = x[y == activity][np.random.choice(range(available),size=augment_number)]
            if rotation and activity not in [4,5]: # exclude sitting, lying from random rotations
                values = rotate(values)
            if permutation:
                # permute inside every window; calling permute on the whole batch
                # would only reorder the windows and leave their content untouched
                values = np.array([permute(window) for window in values])
            x_add.extend(values)
            y_add.extend([activity.item()]*values.shape[0])
        else:
            print(f"Class {activity}: {available} entries, OK")

    if not x_add:
        return np.empty((0,) + x.shape[1:], dtype=x.dtype), np.empty(0, dtype=y.dtype)
    return np.array(x_add), np.array(y_add)

if augment:
    # get data to add
    imu_sensor_toadd, activities_sensor_toadd = augmenting(imu_sensor_framed, activities_sensor_framed, rotation, permutation)
    print()
    imu_body_toadd, activities_body_toadd = augmenting(imu_body_framed, activities_body_framed, rotation, permutation)

    # add to existing data (this replaces the *_aug arrays produced by the ADASYN section)
    imu_sensor_framed_aug = np.concatenate([imu_sensor_framed, imu_sensor_toadd])
    imu_body_framed_aug = np.concatenate([imu_body_framed, imu_body_toadd])
    activities_sensor_framed_aug = np.concatenate([activities_sensor_framed, activities_sensor_toadd])
    activities_body_framed_aug = np.concatenate([activities_body_framed, activities_body_toadd])

    # every augmented array must be exactly original + added
    for merged, base, extra in (
        (imu_sensor_framed_aug, imu_sensor_framed, imu_sensor_toadd),
        (imu_body_framed_aug, imu_body_framed, imu_body_toadd),
        (activities_sensor_framed_aug, activities_sensor_framed, activities_sensor_toadd),
        (activities_body_framed_aug, activities_body_framed, activities_body_toadd),
    ):
        assert(len(merged) == len(base) + len(extra))
    
    for composition in (activities_sensor_framed_aug, activities_body_framed_aug):
        print(f'Resampled composition: {Counter(composition)}')

In [0]:
if augment:
    # split the manually augmented windows; the M suffix keeps them apart from the ADASYN splits
    X_trainM, X_testM, Y_trainM, Y_testM = train_test_split(
        imu_sensor_framed_aug,
        activities_sensor_framed_aug,
        test_size=SPLIT_TEST_PERC,
        random_state=1,
        stratify=activities_sensor_framed_aug,
    )
    Xb_trainM, Xb_testM, Yb_trainM, Yb_testM = train_test_split(
        imu_body_framed_aug,
        activities_body_framed_aug,
        test_size=SPLIT_TEST_PERC,
        random_state=1,
        stratify=activities_body_framed_aug,
    )

In [0]:
if augment:
    # the file name records which manual operations were enabled
    suffix = ('-rot' if rotation else '') + ('-per' if permutation else '')
    name = 'dataset/ARS-train-test-{}-framed-aug' + suffix + '.h5'
    
    for frame, splits in (
        ('sensor', {'X_train': X_trainM, 'X_test': X_testM, 'Y_train': Y_trainM, 'Y_test': Y_testM}),
        ('body',   {'X_train': Xb_trainM, 'X_test': Xb_testM, 'Y_train': Yb_trainM, 'Y_test': Yb_testM}),
    ):
        with h5py.File(name.format(frame),'w') as h5f:
            for key, split in splits.items():
                h5f.create_dataset(key, data=split)

## Normalization

Reapplied to augmented data in the same way it was done before.

In [0]:
# Standardise every channel (last axis) in place, using mean and std computed on
# the training windows only; the matching test windows reuse the same statistics.

# ADASYN part
pairs = [(X_train, X_test), (Xb_train, Xb_test)]

# Manual part
if augment:
    pairs += [(X_trainM, X_testM), (Xb_trainM, Xb_testM)]

for train_set, test_set in pairs:
    for i in range(train_set.shape[-1]):
        tmp_train = train_set[:,:,i].flatten()
        tmp_test = test_set[:,:,i].flatten()
        mean = np.mean(tmp_train)
        std = np.std(tmp_train)
        train_set[:,:,i] = ((tmp_train - mean)/std).reshape(train_set.shape[:2])
        test_set[:,:,i] = ((tmp_test - mean)/std).reshape(test_set.shape[:2])

In [0]:
# ADASYN part
name = 'dataset/ARS-train-test-{}-framed-aug-norm.h5'

for frame, splits in (
    ('sensor', {'X_train': X_train, 'X_test': X_test, 'Y_train': Y_train, 'Y_test': Y_test}),
    ('body',   {'X_train': Xb_train, 'X_test': Xb_test, 'Y_train': Yb_train, 'Y_test': Yb_test}),
):
    with h5py.File(name.format(frame),'w') as h5f:
        for key, split in splits.items():
            h5f.create_dataset(key, data=split)

# Manual part
if augment:
    # the file name records which manual operations were enabled
    suffix = ('-rot' if rotation else '') + ('-per' if permutation else '')
    name = 'dataset/ARS-train-test-{}-framed-aug' + suffix + '-norm.h5'

    for frame, splits in (
        ('sensor', {'X_train': X_trainM, 'X_test': X_testM, 'Y_train': Y_trainM, 'Y_test': Y_testM}),
        ('body',   {'X_train': Xb_trainM, 'X_test': Xb_testM, 'Y_train': Yb_trainM, 'Y_test': Yb_testM}),
    ):
        with h5py.File(name.format(frame),'w') as h5f:
            for key, split in splits.items():
                h5f.create_dataset(key, data=split)

Checks

In [0]:
def _print_channel_stats(split):
    """Print mean and std of every channel (last axis) of `split`, one line per channel."""
    for i in range(split.shape[-1]):
        tmp_train = split[:,:,i].flatten()
        print(f'{np.mean(tmp_train)}    \t{np.std(tmp_train)}')

# ADASYN splits: sensor frame, blank line, body frame
_print_channel_stats(X_train)
print()
_print_channel_stats(Xb_train)

# manual splits, same layout
if augment:
    print('\n')
    _print_channel_stats(X_trainM)
    print()
    _print_channel_stats(Xb_trainM)