In [1]:
from pathlib import Path
from typing import List
import hashlib
import pandas as pd

from librep.datasets.har.kuhar import (
    RawKuHar,
    RawKuHarIterator,
    KuHarDatasetGenerator
)
from librep.utils.dataset import PandasDatasetsIO

%matplotlib inline

In [2]:
# Locate the raw time-domain KuHar files and wrap them in a RawKuHar dataset object.
# NOTE(review): the "domian" spelling is presumably the folder name as distributed —
# verify against the files on disk before "correcting" it.
dataset_dir = Path("./data/KuHar/1.Raw_time_domian_data")
# download=False presumably expects the files to already be present locally — TODO confirm.
kuhar_dataset = RawKuHar(dataset_dir, download=False)
kuhar_dataset

KuHar Dataset at: 'data/KuHar/1.Raw_time_domian_data'

In [3]:
# Build an iterator over the raw dataset; its repr reports the number of users and
# activities it found, which is a quick sanity check that the data loaded.
iterator = RawKuHarIterator(kuhar_dataset)
iterator

Kuhar Iterator: users=89, activities=18

In [4]:
# Generator for the "raw" variant: windows of 300 with no overlap between windows.
# fs=None presumably keeps the recordings at their native sampling rate — TODO confirm.
kuhar_generator = KuHarDatasetGenerator(iterator, time_window=300, window_overlap=0, fs=None)
kuhar_generator

Dataset generator: time_window=300, overlap=0

In [5]:
# Split the raw windows 70% / 10% / 20% into train / validation / test, with a fixed seed.
# The two flags presumably keep each user inside a single split and equalise the number
# of samples per activity — confirm against KuHarDatasetGenerator.create_datasets.
train_raw, validation_raw, test_raw = kuhar_generator.create_datasets(
    train_size=0.7,
    validation_size=0.1,
    test_size=0.2,
    ensure_distinct_users_per_dataset=True,
    balance_samples=True,
    seed=0
)

Generating full df over KuHar View: 1945it [01:23, 23.21it/s]


In [6]:
# Fingerprint each raw split (train, validation, test, in that order): hash every row
# with pandas, then fold the per-row hashes into a single SHA-1 hex digest.
for split in (train_raw, validation_raw, test_raw):
    row_hashes = pd.util.hash_pandas_object(split).values
    print(hashlib.sha1(row_hashes).hexdigest())

# Reference digests for train / validation / test:
# ca61c922fc72ef34f12bb19c954f63e20d552e1c
# 53190325648181a44390dd332815ff82d67b6e66
# 788444769fed9b4a2910739f680a33634a5715dd

ca61c922fc72ef34f12bb19c954f63e20d552e1c
53190325648181a44390dd332815ff82d67b6e66
788444769fed9b4a2910739f680a33634a5715dd


In [7]:
# Display the raw training split: per-sample sensor columns followed by metadata columns
# (start/end times, activity code, length, serial, index, user).
train_raw

Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-299,accel-start-time,gyro-start-time,accel-end-time,gyro-end-time,activity code,length,serial,index,user
0,-0.056118,0.034403,0.052704,0.070734,0.020224,-0.048252,-0.033161,-0.006543,-0.001562,0.022189,...,-0.005646,30.379,30.331,33.433,33.352,0,300,6,3000,1040
1,-0.019538,-0.016915,0.021001,0.055937,0.036128,0.004878,-0.032916,-0.044168,-0.048170,-0.035626,...,-0.005636,35.360,35.349,38.380,38.371,0,300,1,3300,1025
2,0.078851,0.067761,0.042445,-0.016207,-0.060515,-0.052389,-0.039572,-0.020855,-0.020164,0.006835,...,0.000831,0.006,0.009,2.995,2.997,0,300,1,0,1010
3,-0.067950,0.001450,0.095617,0.070418,-0.008559,-0.001449,-0.013325,-0.036775,-0.043285,-0.014290,...,0.001721,3.045,3.034,6.067,6.057,0,300,1,300,1058
4,-0.030760,-0.005518,0.005185,0.029851,0.029403,0.007791,0.007751,-0.005227,-0.019164,-0.015232,...,0.011505,0.001,0.001,2.957,2.956,0,300,1,0,1015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3163,-2.313400,-2.892900,-2.673300,-2.809500,-2.480700,-2.364800,-1.480300,-0.130750,0.720060,1.141000,...,-0.073736,51.553,51.552,54.575,54.577,17,300,2,5100,1088
3164,12.728562,14.670412,14.441504,12.774478,10.833324,9.047502,7.200125,5.398930,4.047943,3.127044,...,-0.104629,45.494,45.493,48.516,48.515,17,300,2,4500,1086
3165,-4.490470,-4.377375,-3.154591,-1.199597,0.701107,2.356681,3.743246,4.541451,4.296527,3.414813,...,-0.268679,6.070,6.071,9.091,9.092,17,300,1,600,1087
3166,-13.566983,-12.306684,-10.653790,-9.129092,-7.738182,-6.136394,-4.105975,-1.504293,1.902500,5.816815,...,0.955308,54.593,54.594,57.617,57.616,17,300,2,5400,1084


In [8]:
# Generator for the resampled variant: resampler=True with fs=20 and windows of 60.
# NOTE(review): "normalized" here appears to mean "resampled to fs" rather than
# amplitude-normalised — confirm against KuHarDatasetGenerator.
kuhar_generator_normalized = KuHarDatasetGenerator(iterator, time_window=60, window_overlap=0, resampler=True, fs=20)

kuhar_generator_normalized

Dataset generator: time_window=60, overlap=0

In [9]:
# Same 70% / 10% / 20% split arguments and seed as the raw split, applied to the
# resampled generator so the two sets of splits can be compared afterwards.
train_normalized, validation_normalized, test_normalized = kuhar_generator_normalized.create_datasets(
    train_size=0.7,
    validation_size=0.1,
    test_size=0.2,
    ensure_distinct_users_per_dataset=True,
    balance_samples=True,
    seed=0
)

Generating full df over KuHar View: 1945it [00:48, 40.49it/s]


In [10]:
# Fingerprint each resampled split: per-row pandas hashes folded into one SHA-1 hex digest.
print(hashlib.sha1(pd.util.hash_pandas_object(train_normalized).values).hexdigest())
print(hashlib.sha1(pd.util.hash_pandas_object(validation_normalized).values).hexdigest())
print(hashlib.sha1(pd.util.hash_pandas_object(test_normalized).values).hexdigest())

# Reference digests for train / validation / test (presumably from an earlier run).
# NOTE(review): these do NOT match the digests in this cell's saved output, so either
# the references are stale or the resampled generation is not reproducible —
# TODO confirm which and update accordingly.
# 6326718d4c104d7de03f740dc8f981b62347e8c7
# 9d5ecd055c4a17225e672624d984697af91241e3
# eeb24af9d8b6dbcdb4db8b2f2153f0302d258b26

6c7210b7f49c48c08eddb5853e59221aebdd0868
8d61265a5c0426423c36cb8842f3c912d39a0cca
55e959d09b58db9d5c59bd88d6f59d50d7a97425


In [11]:
# Compare (rows, columns) of the two training splits. The row counts should agree; the
# column counts presumably differ only because of the window length (300 vs 60 values
# per sensor channel) — TODO confirm.
train_raw.shape, train_normalized.shape

((3168, 1809), (3168, 369))

In [41]:
import numpy as np

def compare_metadata(dataset_normal, dataset_resampled, columns, index_a_fraction=1, index_b_fraction=1):
    """Report whether two datasets agree on every listed metadata column.

    Columns are compared element-wise with ``==``. The column named ``'index'``
    is special-cased: each side is first divided by its own fraction
    (``index_a_fraction`` for ``dataset_normal``, ``index_b_fraction`` for
    ``dataset_resampled``), so indices expressed at different scales can be
    compared on a common one.

    Args:
        dataset_normal: First dataset, indexable by column name.
        dataset_resampled: Second dataset, indexable by column name.
        columns: Iterable of column names to compare.
        index_a_fraction: Divisor applied to ``dataset_normal['index']``.
        index_b_fraction: Divisor applied to ``dataset_resampled['index']``.

    Returns:
        True if every column matches in every position (also when ``columns``
        is empty), False as soon as one column differs.
    """

    def _column_values(frame, name, fraction):
        # Only the 'index' column is rescaled; every other column is used as stored.
        values = frame[name]
        return values / fraction if name == 'index' else values

    # all() stops at the first mismatching column, so later columns are not evaluated.
    return all(
        np.all(
            _column_values(dataset_normal, name, index_a_fraction)
            == _column_values(dataset_resampled, name, index_b_fraction)
        )
        for name in columns
    )


In [44]:
# Check that the raw and resampled training splits carry the same activity code in
# every row position.
compare_metadata(train_raw, train_normalized, ['activity code'])

True

In [54]:
# Count the training rows whose `user` differs between the raw and resampled splits.
# Summing the boolean mask gives the count directly; there is no need to materialise
# and sort the mismatching positions just to take their length.
# NOTE(review): a non-zero count here, while the activity codes match row for row,
# suggests the two generators pick different samples within an activity despite
# sharing a seed — TODO confirm whether that is intended.
int((train_raw["user"] != train_normalized["user"]).sum())

474

In [47]:
# Show the user id of each row in the raw training split, for comparison with the
# resampled split.
train_raw["user"]

0       1040
1       1025
2       1010
3       1058
4       1015
        ... 
3163    1088
3164    1086
3165    1087
3166    1084
3167    1086
Name: user, Length: 3168, dtype: int64

In [48]:
# Show the user id of each row in the resampled training split, for comparison with
# the raw split.
train_normalized["user"]

0       1040
1       1025
2       1010
3       1058
4       1015
        ... 
3163    1083
3164    1087
3165    1086
3166    1086
3167    1086
Name: user, Length: 3168, dtype: int64