In [62]:
import numpy as np
from unrar import rarfile
import zipfile
import io
from contextlib import closing
import requests
import tempfile
import os
import re
import h5py
import scipy.io as sio

# Helper functions

# Helper function, adapted from
# https://gist.github.com/alimanfoo/c5977e87111abe8127453b21204c1065
def find_runs(x):
    """Find runs of consecutive items in an array."""

    # ensure array
    x = np.asanyarray(x)
    if x.ndim != 1:
        raise ValueError('only 1D array supported')
    n = x.shape[0]

    # handle empty array
    if n == 0:
        return np.array([]), np.array([]), np.array([])

    else:
        # find run starts
        loc_run_start = np.empty(n, dtype=bool)
        loc_run_start[0] = True
        np.not_equal(x[:-1], x[1:], out=loc_run_start[1:])
        run_starts = np.nonzero(loc_run_start)[0]

        # find run values
        run_values = x[loc_run_start]

        # find run lengths
        run_lengths = np.diff(np.append(run_starts, n))

        return run_values, run_starts, run_lengths

def split_longer_runs(old_labels, old_starts, old_lengths, min_split_length=30):
    new_labels, new_starts, new_lengths = [], [], []

    for i in range(len(old_labels)):
        sequence_split_count = old_lengths[i] // min_split_length

        if sequence_split_count == 0:
            new_labels.append(old_labels[i])
            new_starts.append(old_starts[i])
            new_lengths.append(old_lengths[i])
            continue

        sequence_split_length = old_lengths[i] // sequence_split_count

        for j in range(sequence_split_count):
            new_labels.append(old_labels[i])
            new_starts.append(old_starts[i] + j * sequence_split_length)
            new_lengths.append(sequence_split_length)

    return np.array(new_labels, dtype=np.int64), \
            np.array(new_starts, dtype=np.int64), \
            np.array(new_lengths, dtype=np.int64)


# Load uWave dataset

Dataset is available at http://zhen-wang.appspot.com/rice/files/uwave/uWaveGestureLibrary.zip
Redistribution for this repository was explicitly allowed by
the authors.

In [16]:
DATA_URL = 'http://zhen-wang.appspot.com/rice/files/uwave/uWaveGestureLibrary.zip'
r = requests.get(DATA_URL)
files_in_zip = {}
with closing(r), zipfile.ZipFile(io.BytesIO(r.content)) as archive:
    for member in archive.infolist():
        files_in_zip[member.filename] = archive.read(member)

class_ids = list(range(8))
class_labels = ['diagonal', 'square', 'right', 'left',
                'up', 'down', 'clockwise', 'counter clockwise']
# person * day * class * attempt
instance_count = 8 * 7 * 8 * 10
# maximal length of gesture in dataset - rest is zero padded
max_length = 315

with h5py.File('uwave.h5', 'a') as h5f:
    for key in ['x', 'y', 'meta']:
        if key in h5f:
            del h5f[key]
    xs = h5f.create_dataset('x', (instance_count, max_length, 3), dtype='float64')
    xs.attrs['d1'] = 'instance'
    xs.attrs['d2'] = 'time'
    xs.attrs['d3'] = 'channel'
    xs.attrs['channels'] = ['acc1-x', 'acc1-y', 'acc1-z']

    ys = h5f.create_dataset('y', (instance_count, ), dtype=np.dtype([('class', np.int8)]))
    ys.attrs['d1'] = 'instance'
    ys.attrs['classes'] = class_ids
    ys.attrs['labels'] = class_labels


    metas = h5f.create_dataset('meta', (instance_count, ),
                               dtype=np.dtype([('user', np.int8), ('day', np.int8), ('attempt', np.int8), ('length', np.int32)]))
    metas.attrs['d1'] = 'instance'

    txt_regex = re.compile('[A-Z]_\w*_\w*(\d+)-(\d+).txt')
    count = 0
 
    for user_id in range(1, 9):
        for day_id in range(1, 8):
            rar_content = files_in_zip[f'U{user_id} ({day_id}).rar']

            with tempfile.NamedTemporaryFile() as f:
                f.write(rar_content)
                f.flush()
                with rarfile.RarFile(f.name) as rf:
                    rf.testrar()
                    for f in rf.infolist():
                        match = txt_regex.match(f.filename)
                        if match:
                            target_class, attempt = match.group(1), match.group(2)
                            with rf.open(f.filename) as txt_file:
                                instance_data = np.loadtxt(txt_file).reshape(-1, 3)
                                padded_data = np.pad(instance_data, ((0, max_length - instance_data.shape[0]), (0,0)), 'constant')
                                xs[count] = padded_data
                                ys[count] = int(target_class) - 1
                                metas[count] = (user_id, day_id, attempt, instance_data.shape[0])

                            count += 1
    ys.attrs['classes'] = np.unique(ys).astype(np.int8)
# Basic sanity check
assert count == instance_count            

# Load Skoda dataset

Dataset is available at http://har-dataset.org/lib/exe/fetch.php?media=wiki:dataset:skodaminicp:skodaminicp_2015_08.zip

After downloading, it should be unzipped into the `original` directory in the project root.

Be aware of the fact, that we cut off the activities to be of length 1163 (this cuts
one outlier activity of length 1713).

In [61]:
skoda_mat = sio.loadmat('../original/SkodaMiniCP_2015_08/dataset_cp_2007_12.mat')

class_ids = list(range(10))
class_labels = ['write notepad', 'open hood', 'close hood',
                'check gaps front', 'open left front door',
                'close left front door', 'close both left door',
                'check trunk gaps', 'open and close trunk',
                'check steering wheel']
instance_count = 4867
max_length = 177

with h5py.File('skoda.h5', 'a') as h5f:
    for key in ['x', 'y', 'meta']:
        if key in h5f:
            del h5f[key]
    xs = h5f.create_dataset('x', (instance_count, max_length, 60), dtype='float64')
    xs.attrs['d1'] = 'instance'
    xs.attrs['d2'] = 'time'
    xs.attrs['d3'] = 'channel'
    xs.attrs['channels'] = sum(
        [
            [f'acc{i}-x', f'acc{i}-y', f'acc{i}-z']
            for i in range(1, 21)
        ], [])

    ys = h5f.create_dataset('y', (instance_count, ), dtype=np.dtype([('class', np.int8)]))
    ys.attrs['d1'] = 'instance'
    ys.attrs['classes'] = class_ids
    ys.attrs['labels'] = class_labels

    metas = h5f.create_dataset('meta', (instance_count, ), dtype=np.dtype([('attempt', np.int8), ('length', np.int32)]))
    metas.attrs['d1'] = 'instance'
    
    rlengths = []
    rrlengths = []
    
    for axis, axis_data in enumerate(skoda_mat['dataset_left'][0]):
        cum_offset = 0
        for target_class, target_class_data in enumerate(axis_data[0]):
            for instance, instance_data in enumerate(target_class_data[0][:70]):
                instance_data = instance_data[0]
                labels, starts, lengths = split_longer_runs([target_class], [0], [instance_data.shape[0]], min_split_length=100)
                
                for offset in range(len(labels)):
                    split_instance_data = instance_data[starts[offset]: starts[offset] + lengths[offset]]
                    padded_data = np.pad(split_instance_data, ((0, max_length - split_instance_data.shape[0])), 'constant')
                                                                
                    xs[cum_offset + offset, :, axis] = padded_data
                    ys[cum_offset + offset] = target_class
                    metas[cum_offset + offset] = (instance, split_instance_data.shape[0])
                rlengths.append(len(labels))

                cum_offset += len(labels)
        assert cum_offset == instance_count
        print(f'Axis {axis} processed.')

        
    for axis, axis_data in enumerate(skoda_mat['dataset_right'][0]):
        cum_offset = 0
        for target_class, target_class_data in enumerate(axis_data[0]):
            for instance, instance_data in enumerate(target_class_data[0][:70]):
                otherside_length = skoda_mat['dataset_left'][0][axis][0][target_class][0][instance][0].shape[0]
                instance_data = instance_data[0][:otherside_length]
                # Cut and pad to same length as other side
                instance_data = np.pad(instance_data, ((0, otherside_length - instance_data.shape[0])), 'constant')
                
                labels, starts, lengths = split_longer_runs([target_class], [0], [instance_data.shape[0]], min_split_length=100)
                
                for offset in range(len(labels)):
                    split_instance_data = instance_data[starts[offset]: starts[offset] + lengths[offset]]
                    padded_data = np.pad(split_instance_data, ((0, max_length - split_instance_data.shape[0])), 'constant')
                                                                
                    xs[cum_offset + offset, :, 30 + axis] = padded_data

                cum_offset += len(labels)
                rrlengths.append(len(labels))
        assert cum_offset == instance_count
        print(f'Axis {30 + axis} processed.')
    ys.attrs['classes'] = np.unique(ys).astype(np.int8)


Axis 0 processed.
Axis 1 processed.
Axis 2 processed.
Axis 3 processed.
Axis 4 processed.
Axis 5 processed.
Axis 6 processed.
Axis 7 processed.
Axis 8 processed.
Axis 9 processed.
Axis 10 processed.
Axis 11 processed.
Axis 12 processed.
Axis 13 processed.
Axis 14 processed.
Axis 15 processed.
Axis 16 processed.
Axis 17 processed.
Axis 18 processed.
Axis 19 processed.
Axis 20 processed.
Axis 21 processed.
Axis 22 processed.
Axis 23 processed.
Axis 24 processed.
Axis 25 processed.
Axis 26 processed.
Axis 27 processed.
Axis 28 processed.
Axis 29 processed.
Axis 30 processed.
Axis 31 processed.
Axis 32 processed.
Axis 33 processed.
Axis 34 processed.
Axis 35 processed.
Axis 36 processed.
Axis 37 processed.
Axis 38 processed.
Axis 39 processed.
Axis 40 processed.
Axis 41 processed.
Axis 42 processed.
Axis 43 processed.
Axis 44 processed.
Axis 45 processed.
Axis 46 processed.
Axis 47 processed.
Axis 48 processed.
Axis 49 processed.
Axis 50 processed.
Axis 51 processed.
Axis 52 processed.
Axi

# Load Opportunity dataset

Dataset is available at http://opportunity-project.eu/challengeDownload.html

After downloading, it should be unzipped into the `original` directory in the project root.

We use the channels suggested by the original documentation (2-37, 38-46, 51-59, 64-72, 77-85, 90-98, 103-134)
and use the middle level gesture labels as target classes.

Meta contains information about the subject and run.

Beware that the data contains nan values as well.

In [18]:
opportunity_data_dir = '../original/OpportunityUCIDataset/dataset'

label_translation = {
    0: 0,
    406516: 1, 406517: 2, 404516: 3, 404517: 4,
    406520: 5, 404520: 6, 406505: 7, 404505: 8, 406519: 9, 404519: 10,
    406511: 11, 404511: 12, 406508: 13, 404508: 14,
    408512: 15, 407521: 16, 405506: 17
}
class_ids = list(range(18))
class_labels = ['null', 'open door 1', 'open door 2', 'close door 1',
                'close door 2', 'open fridge', 'close fridge',
                'open dishwasher', 'close dishwasher',
                'open drawer 1', 'close drawer 1',
                'open drawer 2', 'close drawer 2',
                'open drawer 3', 'close drawer 3',
                'clean table', 'drink from cup', 'toggle switch']
instance_count = 27530
max_length = 60

channels = []
for sensor in ['RKN^', 'HIP', 'LUA^', 'RUA_', 'LH', 'BACK', 'RKN_', 'RWR', 'RUA^', 'LUA_', 'LWR', 'RH']:
    for axis in ['accX', 'accY', 'accZ']:
        channels.append(f'Accelerometer {sensor} {axis}')

for sensor in ['BACK', 'RUA', 'RLA', 'LUA', 'LLA']:
    for axis in ['accX', 'accY', 'accZ', 'gyroX', 'gyroY', 'gyroZ', 'magneticX', 'magneticY', 'magneticZ']:
        channels.append(f'InertialMeasurementUnit {sensor} {axis}')

for sensor in ['L-SHOE', 'R-SHOE']:
    for axis in ['EuX', 'EuY', 'EuZ', 'Nav_Ax', 'Nav_Ay', 'Nav_Az', 'Body_Ax', 'Body_Ay', 'Body_Az', 'AngVelBodyFrameX', 'AngVelBodyFrameY', 'AngVelBodyFrameZ', 'AngVelNavFrameX', 'AngVelNavFrameY', 'AngVelNavFrameZ', 'Compass']:
        channels.append(f'InertialMeasurementUnit {sensor} {axis}')

with h5py.File('opportunity.h5', 'a') as h5f:
    for key in ['x', 'y', 'meta']:
        if key in h5f:
            del h5f[key]
    xs = h5f.create_dataset('x', (instance_count, max_length, 113), dtype='float64')
    xs.attrs['d1'] = 'instance'
    xs.attrs['d2'] = 'time'
    xs.attrs['d3'] = 'channel'
    xs.attrs['channels'] = channels

    ys = h5f.create_dataset('y', (instance_count, ), dtype=np.dtype([('class', np.int8)]))
    ys.attrs['d1'] = 'instance'
    ys.attrs['classes'] = class_ids
    ys.attrs['labels'] = class_labels

    metas = h5f.create_dataset('meta', (instance_count, ), dtype=np.dtype([('subject', np.int8), ('length', np.int32), ('run', 'S10')]))
    metas.attrs['d1'] = 'instance'
    count = 0
    for file in os.listdir(opportunity_data_dir):
        if not file.endswith('.dat'):
            continue
        print(f'Processing : {file}')
        data = np.loadtxt(os.path.join(opportunity_data_dir, file))
        subject_id = int(file[1]) - 1
        run_id = file[3:-4]
        input_channels = np.concatenate([data[:, 1:37], data[:,37:46],
                              data[:,50:59], data[:,63:72], data[:,76:85],
                              data[:,89:98], data[:,102:134] ], axis=1)
        labels, starts, lengths = split_longer_runs(*find_runs(data[:, 249]))
        for offset in range(len(labels)):
            instance_data = input_channels[starts[offset]: starts[offset] + lengths[offset]]
            padded_data = np.pad(instance_data, ((0, max_length - instance_data.shape[0]), (0,0)), 'constant')
            xs[count + offset] = padded_data
            ys[count + offset] = label_translation[labels[offset]]
            metas[count + offset] = (subject_id, lengths[offset], run_id)

        count += len(labels)

assert count == instance_count

Processing : S1-ADL4.dat
Processing : S1-Drill.dat
Processing : S1-ADL5.dat
Processing : S1-ADL1.dat
Processing : S1-ADL2.dat
Processing : S1-ADL3.dat
Processing : S2-ADL2.dat
Processing : S3-ADL2.dat
Processing : S3-ADL3.dat
Processing : S2-ADL3.dat
Processing : S3-ADL1.dat
Processing : S2-ADL1.dat
Processing : S3-Drill.dat
Processing : S3-ADL4.dat
Processing : S2-ADL4.dat
Processing : S2-ADL5.dat
Processing : S3-ADL5.dat
Processing : S4-ADL4.dat
Processing : S4-ADL5.dat
Processing : S2-Drill.dat
Processing : S4-ADL2.dat
Processing : S4-ADL3.dat
Processing : S4-ADL1.dat
Processing : S4-Drill.dat


# Load PAMAP2 dataset

Dataset is available at https://archive.ics.uci.edu/ml/datasets/PAMAP2+Physical+Activity+Monitoring

After downloading, it should be unzipped into the `original` directory in the project root.

We use the channels for the heart rate, the 16g range acceleration, gyroscope data
and magnetometer data, as noted in the original documentation,
from both the Protocol and Optional runs.

Meta contains information about the subject.

Beware that the data contains nan values as well.

In [5]:
pamap_data_dir = '../original/PAMAP2_Dataset/Protocol'
pamap_data_dir_2 = '../original/PAMAP2_Dataset/Optional'
label_translation = {
    1:0, 2:1, 3:2, 4:3, 5:4, 6:5, 7:6, 9:7, 10:8,
    11:9, 12:10, 13:11, 16:12, 17:13, 18:14, 19:15, 20:16, 24:17,
}
class_ids = list(range(18))
class_labels = ['lying', 'sitting', 'standing', 'walking', 'running', 'cycling',
                'nordic walking', 'watching tv', 'computer work', 'car driving',
                'ascending stairs', 'descending stairs', 'vacuum cleaning', 'ironing',
                'folding laundry', 'house cleaning', 'playing soccer', 'rope jumping']

instance_count = 27185
max_length = 128

channels = ['heartrate']
for sensor in range(1, 4):
    for modality in ['acc', 'gyro', 'magnet']:
        for axis in ['x', 'y', 'z']:
            channels.append(f'{modality}{sensor}-{axis}')

with h5py.File('pamap2.h5', 'a') as h5f:
    for key in ['x', 'y', 'meta']:
        if key in h5f:
            del h5f[key]
    xs = h5f.create_dataset('x', (instance_count, max_length, len(channels)), dtype='float64')
    xs.attrs['d1'] = 'instance'
    xs.attrs['d2'] = 'time'
    xs.attrs['d3'] = 'channel'
    xs.attrs['channels'] = channels

    ys = h5f.create_dataset('y', (instance_count, ), dtype=np.dtype([('class', np.int8)]))
    ys.attrs['d1'] = 'instance'
    ys.attrs['classes'] = class_ids
    ys.attrs['labels'] = class_labels

    metas = h5f.create_dataset('meta', (instance_count, ), dtype=np.dtype([('subject', np.int8), ('length', np.int32)]))
    metas.attrs['d1'] = 'instance'
    count = 0
    files = os.listdir(pamap_data_dir) + os.listdir(pamap_data_dir_2)
    paths = [os.path.join(pamap_data_dir, file) for file in os.listdir(pamap_data_dir)] \
            + [os.path.join(pamap_data_dir_2, file) for file in os.listdir(pamap_data_dir_2)]
    for file, path in zip(files, paths):
        if not file.endswith('.dat'):
            continue
        print(f'Processing : {file}')
        data = np.loadtxt(path)
        subject_id = int(file[7:-4]) - 101
        input_channels = np.concatenate([data[:, 2:3],
                              data[:,4:7], data[:,10:16],
                              data[:,21:24], data[:,27:33],
                              data[:,38:41], data[:,44:50]], axis=1)
        labels, starts, lengths = split_longer_runs(*find_runs(data[:, 1]), 100)
        print('Found segments of classes:', np.unique(labels))
        inc_count = 0
        for offset in range(len(labels)):
            if labels[offset] == 0:
                continue  # Skip the transient activity data

            instance_data = input_channels[starts[offset]: starts[offset] + lengths[offset]]
            padded_data = np.pad(instance_data, ((0, max_length - instance_data.shape[0]), (0,0)), 'constant')
            xs[count + inc_count] = padded_data
            ys[count + inc_count] = label_translation[labels[offset]]
            metas[count + inc_count] = (subject_id, lengths[offset])
            inc_count += 1
        count += inc_count

assert count == instance_count


Processing : subject108.dat
Found segments of classes: [ 0  1  2  3  4  5  6  7 12 13 16 17 24]
Processing : subject109.dat
Found segments of classes: [ 0 24]
Processing : subject107.dat
Found segments of classes: [ 0  1  2  3  4  5  6  7 12 13 16 17]
Processing : subject106.dat
Found segments of classes: [ 0  1  2  3  4  5  6  7 12 13 16 17 24]
Processing : subject104.dat
Found segments of classes: [ 0  1  2  3  4  5  6  7 12 13 16 17]
Processing : subject105.dat
Found segments of classes: [ 0  1  2  3  4  5  6  7 12 13 16 17 24]
Processing : subject101.dat
Found segments of classes: [ 0  1  2  3  4  5  6  7 12 13 16 17 24]
Processing : subject102.dat
Found segments of classes: [ 0  1  2  3  4  5  6  7 12 13 16 17 24]
Processing : subject103.dat
Found segments of classes: [ 0  1  2  3  4 12 13 16 17]
Processing : subject108.dat
Found segments of classes: [ 0 10 18 19 20]
Processing : subject109.dat
Found segments of classes: [ 0 10 18 19 20]
Processing : subject106.dat
Found segments 

# Load datasets preprocessed by Jordao et al.

These preprocessed datasets are available at
https://github.com/arturjordao/WearableSensorData

The git repository should be cloned into the `original` directory inside the project root.

Data is read from the numpy format and re-saved into hdf5.
- target class is translated from one-hot encoding to integers
- we use the FNOW directory (fully non overlapping window), to comply with the rest
  of our datasets

The original work also contains pre-calculated folds used for reproduction,
however these folds are not passed on to our data.


In [63]:
wsd_data_dir = '../original/WearableSensorData/data/FNOW'

for file in os.listdir(wsd_data_dir):
    if not file.endswith('.npz'):
        continue
    print(f'Processing : {file}')
    data = np.load(os.path.join(wsd_data_dir, file))
    in_X = data['X']
    in_y = data['y']

    instance_count = in_X.shape[0]
    max_length = in_X.shape[2]
    channel_count = in_X.shape[3]

    with h5py.File(f'{file[:-4].lower()}.h5', 'a') as h5f:
        for key in ['x', 'y', 'meta']:
            if key in h5f:
                del h5f[key]
        xs = h5f.create_dataset('x', (instance_count, max_length, channel_count),
                                dtype='float64',
                                data=in_X[:, 0, :, :])
        xs.attrs['d1'] = 'instance'
        xs.attrs['d2'] = 'time'
        xs.attrs['d3'] = 'channel'
        
        t = in_y.argmax(axis=1)
        print(instance_count)
        ys = h5f.create_dataset('y', (instance_count, ), dtype=np.dtype([('class', np.int8)]))
        ys[:] = in_y.argmax(axis=1)[:]
        ys.attrs['d1'] = 'instance'
        ys.attrs['classes'] = np.unique(ys).astype(np.int8)

        metas = h5f.create_dataset('meta', (instance_count, ),
                                   dtype=np.dtype([('length', np.int32)]))
        metas[:] = [max_length] * instance_count
        metas.attrs['d1'] = 'instance'
        
        if file == 'MHEALTH.npz':
            xs.attrs['channels'] = ['acc1-x', 'acc1-y', 'acc1-z', 'ecg1', 'ecg2',
                                   'acc2-x', 'acc2-y', 'acc2-z', 'gyro2-x', 'gyro2-y', 'gyro2-z', 'magnet2-x', 'magnet2-y', 'magnet2-z',
                                   'acc3-x', 'acc3-y', 'acc3-z', 'gyro3-x', 'gyro3-y', 'gyro3-z', 'magnet3-x', 'magnet3-y', 'magnet3-z']
            ys.attrs['labels'] = ['standing still', 'sitting and relaxing', 'lying down', 'walking', 'climbing stairs',
                                 'waist bends forward', 'frontal elevation of arms', 'knees bending', 'cycling', 'jogging',
                                 'running', 'jump front & back']

Processing : USCHAD.npz
5038
Processing : UTD-MHAD1_1s.npz
2048
Processing : WISDM.npz
10516
Processing : WHARF.npz
2146
Processing : UTD-MHAD2_1s.npz
616
Processing : MHEALTH.npz
1335
