# KuHar Views generator

In [1]:
from pathlib import Path
from typing import List
import hashlib
import pandas as pd

from librep.datasets.har.kuhar import (
    RawKuHar,
    RawKuHarIterator,
    KuHarDatasetGenerator
)
from librep.utils.dataset import PandasDatasetsIO

%matplotlib inline

In [2]:
# Directory handed to RawKuHar as the dataset location.
dataset_dir = Path("../data/raw/KuHar/1.Raw_time_domian_data")
# Root under which every generated view is saved (each view uses its own sub-directory).
output_dir = Path("../data/processed/KuHar/")
# Split sizes forwarded to create_datasets() as train_size / validation_size / test_size.
train_size = 0.7
validation_size = 0.1
test_size = 0.2
# Forwarded to create_datasets() as ensure_distinct_users_per_dataset.
ensure_distinct_users = True
# Forwarded to create_datasets() as balance_samples.
balance_samples = True
# Forwarded to every create_datasets() call as seed.
seed = 0

In [3]:
def add_standard_activity_code(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a "standard activity code" column.

    The new column is derived from the "activity code" column through a fixed
    lookup table. Activity codes that have no standard equivalent are mapped
    to -1; codes that are absent from the table are treated the same way
    instead of producing NaN (which would also turn the column into floats).

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing an "activity code" column. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with the same rows and columns as ``df`` plus the
        int64 column "standard activity code".

    Raises
    ------
    KeyError
        If ``df`` has no "activity code" column.
    """
    labels_map = {
        # activity code: standard activity code (-1 = no standard equivalent)
        0: 1,
        1: 0,
        2: -1,
        3: -1,
        4: -1,
        5: -1,
        6: -1,
        7: -1,
        8: -1,
        9: -1,
        10: -1,
        11: 2,
        12: -1,
        13: -1,
        14: 5,
        15: 3,
        16: 4,
        17: -1,
    }

    # Series.map yields NaN for keys missing from the dict; fold those into
    # the same -1 sentinel the table uses and keep an integer dtype.
    standard_codes = (
        df["activity code"].map(labels_map).fillna(-1).astype("int64")
    )

    # Work on a copy so the caller's frame is left untouched and re-running
    # a cell cannot silently alter a frame produced by an earlier cell.
    result = df.copy()
    result["standard activity code"] = standard_codes
    return result


In [4]:
# Open the dataset located at dataset_dir. download=False is passed, so
# presumably the files must already be on disk — TODO confirm against RawKuHar.
kuhar_dataset = RawKuHar(dataset_dir, download=False)
# Iterator over the dataset with no activity filter passed.
iterator = RawKuHarIterator(kuhar_dataset)
# Bare expression: shows the iterator's repr as the cell output.
iterator

Kuhar Iterator: users=89, activities=18

## Raw data

In [5]:
# Raw view: 300-sample windows, no overlap, no resampling.
kuhar_generator = KuHarDatasetGenerator(
    iterator, time_window=300, window_overlap=0, resampler=False, fs=None
)
train, validation, test = kuhar_generator.create_datasets(
    train_size=train_size,
    validation_size=validation_size,
    test_size=test_size,
    ensure_distinct_users_per_dataset=ensure_distinct_users,
    balance_samples=balance_samples,
    seed=seed
)

# Attach the standard activity code column to every split.
train, validation, test = [
    add_standard_activity_code(subset) for subset in (train, validation, test)
]

# SHA-1 fingerprint of each split (train, validation, test, in that order)
# so that two runs can be compared at a glance.
for subset in (train, validation, test):
    print(hashlib.sha1(pd.util.hash_pandas_object(subset).values).hexdigest())

output_path = output_dir / "balanced"

# The percentages are formatted from the configured split sizes so the saved
# description cannot drift from the values actually passed to create_datasets().
# NOTE(review): the balancing and distinct-user sentences still assume
# balance_samples and ensure_distinct_users are both True.
description = f"""# Balanced KuHar Dataset

This view contains train, validation and test subsets in the following proportions:
- Train: {train_size:.0%} of samples
- Validation: {validation_size:.0%} of samples
- Test: {test_size:.0%} of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples. 

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.
"""

PandasDatasetsIO(output_path).save(train, validation, test, description=description)
print(f"Dataset saved to {output_path} directory.")

Generating full df over KuHar View: 1945it [01:25, 22.85it/s]


dbb153e329a602fc5b142f25d6c5da00a35f48d0
29a040e169f546975f86585ff1eac437418d611b
6e55b90894d73b2f6fc8ff851ff94dc3cb611c65
Dataset saved to ../data/processed/KuHar/balanced directory.


## Normalized data

In [6]:
# Normalized view: resampling enabled with fs=20, 60-sample windows, no overlap.
kuhar_generator = KuHarDatasetGenerator(
    iterator, time_window=60, window_overlap=0, resampler=True, fs=20
)
train, validation, test = kuhar_generator.create_datasets(
    train_size=train_size,
    validation_size=validation_size,
    test_size=test_size,
    ensure_distinct_users_per_dataset=ensure_distinct_users,
    balance_samples=balance_samples,
    seed=seed
)

# Attach the standard activity code column to every split.
train, validation, test = [
    add_standard_activity_code(subset) for subset in (train, validation, test)
]

# SHA-1 fingerprint of each split (train, validation, test, in that order)
# so that two runs can be compared at a glance.
for subset in (train, validation, test):
    print(hashlib.sha1(pd.util.hash_pandas_object(subset).values).hexdigest())

output_path = output_dir / "balanced_normalized"

# The percentages are formatted from the configured split sizes so the saved
# description cannot drift from the values actually passed to create_datasets().
# NOTE(review): the balancing and distinct-user sentences still assume
# balance_samples and ensure_distinct_users are both True.
description = f"""# Balanced Normalized KuHar Dataset

This view contains train, validation and test subsets in the following proportions:
- Train: {train_size:.0%} of samples
- Validation: {validation_size:.0%} of samples
- Test: {test_size:.0%} of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples. 
The samples were resampled to 20Hz before splitting into windows.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.
"""

PandasDatasetsIO(output_path).save(train, validation, test, description=description)
print(f"Dataset saved to {output_path} directory.")

Generating full df over KuHar View: 1945it [00:50, 38.78it/s]


df773b945663a680d6b09d594775e36a5851cf53
11168f662309f1d6842e0ef13b766b332ed1fc6e
570c2e519e95389c40720df1ce5b5204ba39a845
Dataset saved to ../data/processed/KuHar/balanced_normalized directory.


# Selected Data

Here we include only the activities that have a standard activity code (i.e. are not mapped to -1): 0, 1, 11, 14, 15 and 16.

In [7]:
# Activity codes kept for the "selected" views; these are the codes that
# add_standard_activity_code maps to a value other than -1.
selected_activities = [0, 1, 11, 14, 15, 16]
# Rebinds `iterator`: the cells below this point build their views from this
# activity-filtered iterator instead of the unfiltered one created earlier.
iterator = RawKuHarIterator(kuhar_dataset, activities=selected_activities)
# Bare expression: shows the iterator's repr as the cell output.
iterator

Kuhar Iterator: users=89, activities=6

## Raw

In [8]:
# Selected raw view: 300-sample windows, no overlap, no resampling.
kuhar_generator = KuHarDatasetGenerator(
    iterator, time_window=300, window_overlap=0, resampler=False, fs=None
)
train, validation, test = kuhar_generator.create_datasets(
    train_size=train_size,
    validation_size=validation_size,
    test_size=test_size,
    ensure_distinct_users_per_dataset=ensure_distinct_users,
    balance_samples=balance_samples,
    seed=seed
)

# Attach the standard activity code column to every split.
train, validation, test = [
    add_standard_activity_code(subset) for subset in (train, validation, test)
]

# SHA-1 fingerprint of each split (train, validation, test, in that order)
# so that two runs can be compared at a glance.
for subset in (train, validation, test):
    print(hashlib.sha1(pd.util.hash_pandas_object(subset).values).hexdigest())

output_path = output_dir / "selected_balanced"

# The percentages are formatted from the configured split sizes so the saved
# description cannot drift from the values actually passed to create_datasets().
# NOTE(review): the balancing and distinct-user sentences still assume
# balance_samples and ensure_distinct_users are both True.
description = f"""# Selected Balanced KuHar Dataset

This view contains train, validation and test subsets in the following proportions:
- Train: {train_size:.0%} of samples
- Validation: {validation_size:.0%} of samples
- Test: {test_size:.0%} of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples. 
Only standard activities were selected, that is, activities 0, 1, 11, 14, 15 and 16.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.
"""

PandasDatasetsIO(output_path).save(train, validation, test, description=description)
print(f"Dataset saved to {output_path} directory.")

Generating full df over KuHar View: 625it [00:28, 22.06it/s]


24d9a14deb7c24a736d052aa6349adfab061dac6
70dd040328487caaef43a1ca5772133c8c68a18a
01bc6661dedeed42dbfa1bfa455b33cbeab27ceb
Dataset saved to ../data/processed/KuHar/selected_balanced directory.


## Normalized

In [9]:
# Selected normalized view: resampling enabled with fs=20, 60-sample windows,
# no overlap.
kuhar_generator = KuHarDatasetGenerator(
    iterator, time_window=60, window_overlap=0, resampler=True, fs=20
)
train, validation, test = kuhar_generator.create_datasets(
    train_size=train_size,
    validation_size=validation_size,
    test_size=test_size,
    ensure_distinct_users_per_dataset=ensure_distinct_users,
    balance_samples=balance_samples,
    seed=seed
)

# Attach the standard activity code column to every split.
train, validation, test = [
    add_standard_activity_code(subset) for subset in (train, validation, test)
]

# SHA-1 fingerprint of each split (train, validation, test, in that order)
# so that two runs can be compared at a glance.
for subset in (train, validation, test):
    print(hashlib.sha1(pd.util.hash_pandas_object(subset).values).hexdigest())

output_path = output_dir / "selected_balanced_normalized"

# The title names the *selected* view: the previous text reused the title of
# the unfiltered "balanced_normalized" view, making the two saved descriptions
# indistinguishable by heading.
# The percentages are formatted from the configured split sizes so the saved
# description cannot drift from the values actually passed to create_datasets().
# NOTE(review): the balancing and distinct-user sentences still assume
# balance_samples and ensure_distinct_users are both True.
description = f"""# Selected Balanced Normalized KuHar Dataset

This view contains train, validation and test subsets in the following proportions:
- Train: {train_size:.0%} of samples
- Validation: {validation_size:.0%} of samples
- Test: {test_size:.0%} of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples. 
The samples were resampled to 20Hz before splitting into windows. Only standard activities were selected, that is, activities 0, 1, 11, 14, 15 and 16.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.
"""

PandasDatasetsIO(output_path).save(train, validation, test, description=description)
print(f"Dataset saved to {output_path} directory.")

Generating full df over KuHar View: 625it [00:16, 38.45it/s]


c7a474c9b374606034d4f7787301d78daf8f2d5c
f89cc1a8be4ee1598cfa04f8b94f931efddc55b9
e12611033c2c99739a8ef1a3071f1a885ed70196
Dataset saved to ../data/processed/KuHar/selected_balanced_normalized directory.
