# ExtraSensory Dataset Wrapper 
- Download the [features and labels zip file (215MB)](http://extrasensory.ucsd.edu/data/primary_data_files/ExtraSensory.per_uuid_features_labels.zip).
- Download the [watch-accelerometer measurements zip file (800MB)](http://extrasensory.ucsd.edu/data/raw_measurements/ExtraSensory.raw_measurements.watch_acc.zip).
- Unzip both archives and set their directory paths in `time_series_path` and `label_path` below.
- The rest of the notebook parses the **labels** and the **watch accelerometer** data.

In [17]:
import os
import pandas as pd
import numpy as np
import scipy

In [None]:
# Root directory of the raw watch-accelerometer recordings. Each entry in it is
# later used as a per-user sub-directory name (see es_wrapper).
time_series_path = "/data/har/raw/ExtraSensory/watch_acc"
# Directory that receives one resampled "<user id>.csv" file per user.
processed_path = "/data/har/processed/ExtraSensory"
# Entries of time_series_path, treated as user ids. The processing loop indexes
# this list by position.
# NOTE(review): os.listdir returns entries in arbitrary order, so positional
# indices are presumably not stable across machines - confirm before relying on them.
ts_with_id = os.listdir(time_series_path)

In [3]:
# Directory holding the per-user label files; es_wrapper reads them as
# gzip-compressed CSV.
label_path = "/data/har/raw/ExtraSensory/features_labels"
# Label file names. The part before the first "." is compared with the user id
# to find a user's label file.
label_with_id = os.listdir(label_path)

In [None]:
# First user id in the listing.
# NOTE(review): the functions below take their own curr_id argument, so this
# module-level value looks unused by the cells in view - confirm.
curr_id = ts_with_id[0]
# Target sampling rate passed to df_resample, in samples per timestamp unit.
# Raw timestamps are divided by 1000 before resampling - presumably ms -> s,
# which would make this 10 Hz.
frequency = 10

In [5]:
# Coarse posture/activity classes -> the raw label names folded into each.
# A class is marked 1 for a recording when any of its raw labels equals 1.
#
# NOTE(review): each name is looked up as the column "label:" + name, and
# names that are not columns of the label file are silently skipped. Confirm
# that every name below exists in the label files; the dataset may use
# prefixed names for some of them (e.g. FIX_walking, OR_standing).
label_mapping = {
    "sedentary_sitting_other": [
        "SITTING",
        "COMPUTER_WORK",
        "READING_A_BOOK",
    ],
    "sedentary_sitting_transport": [
        "DRIVE_-_I_M_THE_DRIVER",
        "DRIVE_-_I_M_A_PASSENGER",
        "IN_A_CAR",
    ],
    "sedentary_lying": [
        "LYING_DOWN",
        "SLEEPING",
    ],
    "upright_standing": [
        "STANDING_IN_PLACE",
        "WASHING_DISHES",
        "COOKING",
    ],
    "upright_stepping": [
        "STANDING_AND_MOVING",
        "WALKING",
        "SHOPPING",
        "OUTSIDE",
        "RUNNING",
        "STAIRS_-_GOING_UP",
        "STAIRS_-_GOING_DOWN",
    ],
}

In [19]:
from scipy import interpolate
def df_resample(df, base_time_stamp, frequency, duration=20):
    """Resample one tri-axial accelerometer recording onto a uniform time grid.

    Each axis is linearly interpolated, and linearly extrapolated outside the
    recorded time range, at ``duration * frequency`` evenly spaced instants.
    The instants are bin centres: sample ``k`` lies at
    ``base_time_stamp + k / frequency + 1 / (2 * frequency)``.

    Args:
        df: DataFrame with the columns ``timestamp``, ``acc_x``, ``acc_y`` and
            ``acc_z``. ``timestamp`` must be on the same scale as
            ``base_time_stamp``.
        base_time_stamp: Start of the output window.
        frequency: Output sampling rate, in samples per unit of ``timestamp``.
        duration: Length of the output window, in units of ``timestamp``.
            Defaults to 20, the window length that used to be hard-coded.

    Returns:
        DataFrame with the columns ``time``, ``acc_x``, ``acc_y`` and
        ``acc_z`` holding ``duration * frequency`` rows.

    Raises:
        ValueError: Propagated from ``interp1d`` when the recording cannot be
            interpolated (for example when it has too few samples).
    """
    time = np.asarray(df["timestamp"])

    n_samples = int(round(duration * frequency))
    # Offset by half a sampling period so every output sample sits in the
    # middle of its bin instead of on the bin's leading edge.
    new_time_stamp = base_time_stamp + np.arange(n_samples) / frequency + 1 / frequency / 2

    new_data = {"time": new_time_stamp}
    for axis in ("acc_x", "acc_y", "acc_z"):
        # "extrapolate" keeps grid points that fall before the first or after
        # the last recorded sample from raising.
        interp = interpolate.interp1d(time, np.asarray(df[axis]), fill_value="extrapolate")
        new_data[axis] = interp(new_time_stamp)
    return pd.DataFrame(data=new_data)

In [20]:
def single_ts_sample_wrapper(time_id, curr_id, time_series_path, label_df, label_mapping, frequency):
    """Load one recording of a user, resample it and attach its class flags.

    Args:
        time_id: Position of the recording in ``os.listdir`` of the user's
            directory.
        curr_id: User id, used as the sub-directory name under
            ``time_series_path``.
        time_series_path: Root directory of the raw recordings.
        label_df: Label table of the user. It must have a ``timestamp`` column
            and may have ``label:<NAME>`` columns.
        label_mapping: Dict mapping an output class name to the list of raw
            label names that switch it on.
        frequency: Target sampling rate forwarded to ``df_resample``.

    Returns:
        The DataFrame produced by ``df_resample`` with one extra 0/1 column
        per key of ``label_mapping``, constant over the whole recording.

    Raises:
        IndexError: If no row of ``label_df`` carries the recording's
            timestamp.
    """
    user_dir = time_series_path + "/" + curr_id
    file_name = os.listdir(user_dir)[time_id]

    # The file name starts with the recording's start timestamp, which is also
    # the key of its row in the label table.
    base_time_stamp = int(file_name.split(".")[0])
    is_this_recording = label_df['timestamp'] == base_time_stamp
    label_row = label_df.loc[is_this_recording, 'timestamp'].index.tolist()[0]

    # A class is on when any of its raw labels that exist as a column is 1.
    data_dist = {
        group: int(any(
            ("label:" + raw_name) in label_df.columns
            and label_df["label:" + raw_name][label_row] == 1
            for raw_name in raw_names
        ))
        for group, raw_names in label_mapping.items()
    }

    samples = pd.read_csv(user_dir + "/" + file_name, header=None, sep=" ")
    samples.columns = ["timestamp", "acc_x", "acc_y", "acc_z"]
    # Turn the per-file offsets into absolute time on the label table's scale.
    samples["timestamp"] = np.asarray(samples["timestamp"]) / 1000 + base_time_stamp

    resampled = df_resample(samples, base_time_stamp, frequency)

    for group, flag in data_dist.items():
        resampled[group] = flag
    return resampled

In [21]:
def es_wrapper(idx, label_with_id, label_mapping, frequency):
    """Build the resampled, labelled time series of one user.

    The user id is ``ts_with_id[idx]``. Its label file is the entry of
    ``label_with_id`` whose name, up to the first ".", equals that id. Every
    recording in the user's directory under ``time_series_path`` is processed
    with ``single_ts_sample_wrapper`` and the results are concatenated.

    Args:
        idx: Position of the user in the module-level ``ts_with_id`` list.
        label_with_id: Names of the label files inside ``label_path``.
        label_mapping: Dict mapping an output class name to the list of raw
            label names that switch it on.
        frequency: Target sampling rate of the resampled series.

    Returns:
        One DataFrame holding all resampled recordings of the user, with a
        fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If no label file matches the user id. Previously
            the first label file was used silently in that case, which could
            attach another user's labels.
        ValueError: Propagated from ``pd.concat`` when the user's directory
            holds no recordings.
    """
    curr_id = ts_with_id[idx]

    label_file_name = next(
        (name for name in label_with_id if name.split(".")[0] == curr_id),
        None,
    )
    if label_file_name is None:
        raise FileNotFoundError(
            "no label file found for user " + curr_id + " in " + label_path
        )
    label_df = pd.read_csv(label_path + "/" + label_file_name, compression="gzip")

    # single_ts_sample_wrapper addresses recordings by their position in the
    # directory listing, so only the number of entries is needed here.
    n_recordings = len(os.listdir(time_series_path + "/" + curr_id))
    recordings = [
        single_ts_sample_wrapper(i, curr_id, time_series_path, label_df, label_mapping, frequency)
        for i in range(n_recordings)
    ]
    return pd.concat(recordings, ignore_index=True)

In [None]:
from tqdm import tqdm

# Create the output directory up front, so a missing directory fails
# immediately rather than after the first user has been fully processed.
os.makedirs(processed_path, exist_ok=True)

# NOTE(review): the start index 40 and the skipped user id are hard-coded;
# presumably this resumes an interrupted run and skips a problematic user -
# confirm before re-running from scratch.
for idx in tqdm(range(40, len(ts_with_id))):
    user_id = ts_with_id[idx]
    if user_id == "A5A30F76-581E-4757-97A2-957553A2C6AA":
        continue
    print(user_id)
    ts_df = es_wrapper(idx, label_with_id, label_mapping, frequency)
    # The row index is written too; it comes back as "Unnamed: 0" when the
    # file is read again.
    ts_df.to_csv(processed_path + "/" + user_id + ".csv")

  0%|          | 0/16 [00:00<?, ?it/s]

9759096F-1119-4E19-A0AD-6F16989C7E1C


  6%|▋         | 1/16 [01:22<20:30, 82.04s/it]

5EF64122-B513-46AE-BCF1-E62AAC285D2C


 12%|█▎        | 2/16 [01:41<10:32, 45.15s/it]

3600D531-0C55-44A7-AE95-A7A38519464E


 19%|█▉        | 3/16 [02:02<07:23, 34.11s/it]

E65577C1-8D5D-4F70-AF23-B3ADB9D3DBA3


 25%|██▌       | 4/16 [02:22<05:40, 28.42s/it]

00EABED2-271D-49D8-B599-1D4A09240601


 31%|███▏      | 5/16 [02:24<03:30, 19.14s/it]

81536B0A-8DBF-4D8A-AC24-9543E2E4C8E0


 38%|███▊      | 6/16 [03:00<04:07, 24.73s/it]

CCAF77F0-FABB-4F2F-9E24-D56AD0C5A82F


 44%|████▍     | 7/16 [03:56<05:15, 35.08s/it]

1538C99F-BA1E-4EFB-A949-6C7C47701B20


 50%|█████     | 8/16 [03:58<03:15, 24.38s/it]

7CE37510-56D0-4120-A1CF-0E23351428D2


 56%|█████▋    | 9/16 [05:17<04:50, 41.47s/it]

BEF6C611-50DA-4971-A040-87FB979F3FC1


 62%|██████▎   | 10/16 [05:47<03:48, 38.08s/it]

B7F9D634-263E-4A97-87F9-6FFB4DDCB36C


 69%|██████▉   | 11/16 [07:35<04:57, 59.41s/it]

F50235E0-DD67-4F2A-B00B-1F31ADA998B9


 75%|███████▌  | 12/16 [07:45<02:57, 44.32s/it]

A5CDF89D-02A2-4EC1-89F8-F534FDABDD96


 81%|████████▏ | 13/16 [08:17<02:01, 40.57s/it]

4E98F91F-4654-42EF-B908-A3389443F2E7


 88%|████████▊ | 14/16 [08:36<01:08, 34.06s/it]

11B5EC4D-4133-4289-B475-4E737182A406


 94%|█████████▍| 15/16 [09:54<00:47, 47.36s/it]

83CF687B-7CEC-434B-9FE8-00C3D5799BE6


100%|██████████| 16/16 [11:17<00:00, 42.32s/it]


In [None]:
# Sanity check: load the first processed file in the listing and print its
# first rows, its column names and its shape.
ts_files = os.listdir(processed_path)
df = pd.read_csv(processed_path + "/" + ts_files[0])
for summary in (df.head(), df.columns, df.shape):
    print(summary)


   Unnamed: 0          time      acc_x       acc_y        acc_z  \
0           0  1.449740e+09 -80.000000  120.000000   992.799943   
1           1  1.449740e+09 -80.399971  119.600029  1016.399971   
2           2  1.449740e+09 -83.599981  104.000000  1019.599981   
3           3  1.449740e+09 -80.000000  112.000000  1016.000000   
4           4  1.449740e+09 -84.400019  111.199962  1016.000000   

   sedentary_sitting_other  sedentary_sitting_transport  sedentary_lying  \
0                        1                            0                0   
1                        1                            0                0   
2                        1                            0                0   
3                        1                            0                0   
4                        1                            0                0   

   upright_standing  upright_stepping  
0                 0                 0  
1                 0                 0  
2                 0 

In [36]:
# Verify the resampling rate: the gap between consecutive rows should be
# 1 / frequency. The first row has no predecessor, so its gap is NaN.
df["time_shift"] = df["time"].diff()
gaps = df["time_shift"]
gaps.head()  # shown only when it is the last expression of a cell

most_occurrence = gaps.mode().iloc[0]
mid_value = gaps.median()

for caption, value in (("Most Occurrence:", most_occurrence), ("Mid Value:", mid_value)):
    print(caption, value)
# The median gap is used to estimate the effective sampling rate.
print("frequency:", 1 / mid_value, "Hz")

Most Occurrence: 0.09999990463256836
Mid Value: 0.09999990463256836
frequency: 10.000009536752259 Hz
