In [1]:
import torch
from datasets import load_dataset
# Load the named split and switch its output format so that indexing returns torch tensors.
dataset = (
    load_dataset(
        'alexshengzhili/Accel2ActivityCrawl',
        split='capture24_30hz_w10_o0_unfileterd_rawlabel',
    )
    .with_format("torch")
)
max_length = 300  # rows kept per example along dim 0 (presumably 30 Hz x 10 s, going by the split name -- confirm)

def process_single_example(example):
    """Trim or zero-pad ``example['x']`` to exactly ``max_length`` rows.

    Args:
        example: Mapping holding an ``'x'`` tensor; its first dimension is the
            one that gets normalised.

    Returns:
        The same ``example`` object, with ``'x'`` replaced by a tensor whose
        first dimension equals ``max_length``. Longer inputs are truncated,
        shorter ones are padded with zeros at the end, and inputs that already
        have ``max_length`` rows are stored back unchanged.
    """
    x_value = example['x']
    n_rows = x_value.shape[0]

    if n_rows > max_length:
        x_value = x_value[:max_length]
    elif n_rows < max_length:
        # Derive the padding from x_value itself so it inherits dtype, device and
        # trailing dimensions, rather than assuming float32 data with 3 columns.
        padding = x_value.new_zeros((max_length - n_rows, *x_value.shape[1:]))
        x_value = torch.cat([x_value, padding], dim=0)
    example['x'] = x_value
    return example

# Run process_single_example over every example, spread across 30 worker processes.
formated_dataset = dataset.map(
    process_single_example,
    num_proc=30,
)


Found cached dataset parquet (/home/ubuntu/.cache/huggingface/datasets/alexshengzhili___parquet/alexshengzhili--Accel2ActivityCrawl-bccdf31cd551dca3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/alexshengzhili___parquet/alexshengzhili--Accel2ActivityCrawl-bccdf31cd551dca3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-832bd0094fdcc361_*_of_00030.arrow


In [17]:
# Keep only the examples whose 'y' label does not contain 'sleeping'.
non_sleep_filer = formated_dataset.filter(
    lambda row: 'sleeping' not in row['y'],
    num_proc=50,
)

Filter (num_proc=50):   0%|          | 0/1372784 [00:00<?, ? examples/s]

In [16]:
# Number of examples left after the sleep filter.
# NOTE(review): execution counts are out of order here (this cell is In [16], the
# filter that defines non_sleep_filer is In [17]), so the recorded output may be
# stale -- re-run after the filter to confirm.
len(non_sleep_filer)

340076

In [20]:
# Pull out the 'y' column so the label values can be counted.
label_list = non_sleep_filer['y']

In [21]:
from collections import Counter
# Tally how often each raw label occurs; unlabeled rows appear under the '' key.
Counter(label_list)

Counter({'': 463249,
         'home activity;miscellaneous;walking;17150 walking household without observable loads;MET 2.0': 22226,
         'home activity;household chores;preparing meals/cooking/washing dishes;5035 kitchen activity general cooking/washing/dishes/cleaning up;MET 3.3': 39634,
         'home activity;eating;13030 eating sitting alone or with someone;MET 1.5': 11692,
         'home activity;miscellaneous;standing;9070 standing reading or without observable/identifiable activities;MET 1.8': 3251,
         'home activity;self care;13020 dressing/undressing;MET 2.5': 290,
         'transportation;walking;17270 walking as the single means to work or class (not from);MET 3.5': 3131,
         'occupation;interruption;11791 walking on job in office or lab area;MET 2.0': 5176,
         'occupation;interruption;sitting;9055 sitting using a mobile phone/smartphone/tablet or talking on the phone/computer (skype meeting etc.);MET 1.5': 970,
         'occupation;office and administr

In [23]:
# Drop unlabeled examples: keep rows whose 'y' label is longer than one character.
# NOTE(review): `> 1` also discards one-character labels; `> 0` would be the exact
# "non-empty" test -- confirm which is intended.
non_empty = non_sleep_filer.filter(
    lambda row: len(row['y']) > 1,
    num_proc=50,
)

Filter (num_proc=50):   0%|          | 0/1032708 [00:00<?, ? examples/s]

In [26]:
# Persist the filtered dataset to a local directory, written by 40 processes.
non_empty.save_to_disk(
    'capture24_30hz_w10_o0_unfileterd_rawlabel_non_empty_non_sleep',
    num_proc=40,
)

Flattening the indices (num_proc=40):   0%|          | 0/569459 [00:00<?, ? examples/s]

Saving the dataset (0/40 shards):   0%|          | 0/569459 [00:00<?, ? examples/s]

In [27]:

import pandas as pd

# CSV that maps raw annotation strings to label schemes.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
ANNOLABELFILE = '/home/ubuntu/ssl-wearables/data/capture24/annotation-label-dictionary.csv'
# Column of the lookup table that supplies the converted label.
LABEL = 'label:Walmsley2020'
# Index the table by its 'annotation' column so rows can be looked up by annotation string.
annolabel = pd.read_csv(ANNOLABELFILE, index_col='annotation')
def apply_annotation_conversion(example):
    '''Translate the example's raw annotation via the ``annolabel`` table.

    The row of ``annolabel`` indexed by ``example['y']`` is looked up and its
    ``LABEL`` column is stored under ``example['converted_y']``. The same
    (mutated) example is returned.
    '''
    raw_annotation = example['y']
    converted = annolabel.loc[raw_annotation, LABEL]
    example['converted_y'] = converted
    return example

# Add the 'converted_y' column to every example, using 50 worker processes.
converted_y = non_empty.map(
    apply_annotation_conversion,
    num_proc=50,
)

Map (num_proc=50):   0%|          | 0/569459 [00:00<?, ? examples/s]

In [28]:
# Class balance of the converted labels.
Counter(converted_y['converted_y'])

Counter({'light': 165245, 'sedentary': 360547, 'moderate-vigorous': 43667})

In [29]:
# Persist the dataset with the added 'converted_y' column, written by 40 processes.
converted_y.save_to_disk(
    'capture24_30hz_w10_o0_unfileterd_converted_y_non_empty_non_sleep',
    num_proc=40,
)

Saving the dataset (0/40 shards):   0%|          | 0/569459 [00:00<?, ? examples/s]

In [30]:
# Show the dataset summary (feature names and row count).
converted_y

Dataset({
    features: ['x', 'pid', 'y', 'converted_y'],
    num_rows: 569459
})