In [30]:
import numpy as np
import pandas as pd
from tqdm import tqdm as tqdm
import zarr
from matplotlib import pyplot as plt

In [2]:
def get_ds_infos():
    """
    Read the file includes data subject information.
    
    Data Columns:
    0: code [1-24]
    1: weight [kg]
    2: height [cm]
    3: age [years]
    4: gender [0:Female, 1:Male]
    
    Returns:
        A pandas DataFrame that contains inforamtion about data subjects' attributes 
    """ 

    dss = pd.read_csv("../data/MotionSense/data_subjects_info.csv")
    print("[INFO] -- Data subjects' information is imported.")
    
    return dss

def set_data_types(data_types=["userAcceleration"]):
    """
    Select the sensors and the mode to shape the final dataset.
    
    Args:
        data_types: A list of sensor data type from this list: [attitude, gravity, rotationRate, userAcceleration] 

    Returns:
        It returns a list of columns to use for creating time-series from files.
    """
    dt_list = []
    for t in data_types:
        if t != "attitude":
            dt_list.append([t+".x",t+".y",t+".z"])
        else:
            dt_list.append([t+".roll", t+".pitch", t+".yaw"])

    return dt_list


def creat_time_series(dt_list, act_labels, trial_codes, mode="mag", labeled=True):
    """
    Args:
        dt_list: A list of columns that shows the type of data we want.
        act_labels: list of activites
        trial_codes: list of trials
        mode: It can be "raw" which means you want raw data
        for every dimention of each data type,
        [attitude(roll, pitch, yaw); gravity(x, y, z); rotationRate(x, y, z); userAcceleration(x,y,z)].
        or it can be "mag" which means you only want the magnitude for each data type: (x^2+y^2+z^2)^(1/2)
        labeled: True, if we want a labeld dataset. False, if we only want sensor values.

    Returns:
        It returns a time-series of sensor data.
    
    """
    num_data_cols = len(dt_list) if mode == "mag" else len(dt_list*3)

    if labeled:
        dataset = np.zeros((0,num_data_cols+7)) # "7" --> [act, code, weight, height, age, gender, trial] 
    else:
        dataset = np.zeros((0,num_data_cols))
        
    ds_list = get_ds_infos()
    
    print("[INFO] -- Creating Time-Series")
    for sub_id in ds_list["code"]:
        for act_id, act in enumerate(act_labels):
            for trial in trial_codes[act_id]:
                fname = '../data/MotionSense/A_DeviceMotion_data/'+act+'_'+str(trial)+'/sub_'+str(int(sub_id))+'.csv'
                raw_data = pd.read_csv(fname)
                raw_data = raw_data.drop(['Unnamed: 0'], axis=1)
                vals = np.zeros((len(raw_data), num_data_cols))
                for x_id, axes in enumerate(dt_list):
                    if mode == "mag":
                        vals[:,x_id] = (raw_data[axes]**2).sum(axis=1)**0.5        
                    else:
                        vals[:,x_id*3:(x_id+1)*3] = raw_data[axes].values
                    vals = vals[:,:num_data_cols]
                if labeled:
                    lbls = np.array([[act_id,
                            sub_id-1,
                            ds_list["weight"][sub_id-1],
                            ds_list["height"][sub_id-1],
                            ds_list["age"][sub_id-1],
                            ds_list["gender"][sub_id-1],
                            trial          
                           ]]*len(raw_data))
                    vals = np.concatenate((vals, lbls), axis=1)
                dataset = np.append(dataset,vals, axis=0)
    cols = []
    for axes in dt_list:
        if mode == "raw":
            cols += axes
        else:
            cols += [str(axes[0][:-2])]
            
    if labeled:
        cols += ["act", "id", "weight", "height", "age", "gender", "trial"]
    
    dataset = pd.DataFrame(data=dataset, columns=cols)
    return dataset
#________________________________


ACT_LABELS = ["dws","ups", "wlk", "jog", "std", "sit"]
TRIAL_CODES = {
    ACT_LABELS[0]:[1,2,11],
    ACT_LABELS[1]:[3,4,12],
    ACT_LABELS[2]:[7,8,15],
    ACT_LABELS[3]:[9,16],
    ACT_LABELS[4]:[6,14],
    ACT_LABELS[5]:[5,13]
}

## Here we set parameter to build labeld time-series from dataset of "(A)DeviceMotion_data"
## attitude(roll, pitch, yaw); gravity(x, y, z); rotationRate(x, y, z); userAcceleration(x,y,z)
sdt = ["attitude", "gravity", "rotationRate", "userAcceleration"]
print("[INFO] -- Selected sensor data types: "+str(sdt))    
act_labels = ACT_LABELS [:]
print("[INFO] -- Selected activites: "+str(act_labels))    
trial_codes = [TRIAL_CODES[act] for act in act_labels]
dt_list = set_data_types(sdt)
dataset = creat_time_series(dt_list, act_labels, trial_codes, mode="raw", labeled=True)
print("[INFO] -- Shape of time-Series dataset:"+str(dataset.shape))    

[INFO] -- Selected sensor data types: ['attitude', 'gravity', 'rotationRate', 'userAcceleration']
[INFO] -- Selected activites: ['dws', 'ups', 'wlk', 'jog', 'std', 'sit']
[INFO] -- Data subjects' information is imported.
[INFO] -- Creating Time-Series
[INFO] -- Shape of time-Series dataset:(1412865, 19)


In [3]:
seconds = [0]
for i in tqdm(range(1, len(dataset))):
    if dataset.iloc[i]["trial"] != dataset.iloc[i-1]["trial"]:
        seconds.append(0)
    else:
        seconds.append(seconds[-1]+.02)
seconds = np.array(seconds)
dataset["time"] = seconds

  0%|          | 0/1412864 [00:00<?, ?it/s]

100%|██████████| 1412864/1412864 [01:02<00:00, 22777.72it/s]


In [4]:
dataset

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,act,id,weight,height,age,gender,trial,time
0,1.528132,-0.733896,0.696372,0.741895,0.669768,-0.031672,0.316738,0.778180,1.082764,0.294894,-0.184493,0.377542,0.0,0.0,102.0,188.0,46.0,1.0,1.0,0.00
1,1.527992,-0.716987,0.677762,0.753099,0.657116,-0.032255,0.842032,0.424446,0.643574,0.219405,0.035846,0.114866,0.0,0.0,102.0,188.0,46.0,1.0,1.0,0.02
2,1.527765,-0.706999,0.670951,0.759611,0.649555,-0.032707,-0.138143,-0.040741,0.343563,0.010714,0.134701,-0.167808,0.0,0.0,102.0,188.0,46.0,1.0,1.0,0.04
3,1.516768,-0.704678,0.675735,0.760709,0.647788,-0.041140,-0.025005,-1.048717,0.035860,-0.008389,0.136788,0.094958,0.0,0.0,102.0,188.0,46.0,1.0,1.0,0.06
4,1.493941,-0.703918,0.672994,0.760062,0.647210,-0.058530,0.114253,-0.912890,0.047341,0.199441,0.353996,-0.044299,0.0,0.0,102.0,188.0,46.0,1.0,1.0,0.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412860,-2.368280,-0.381075,-0.089417,-0.648402,0.371919,0.664267,-0.001312,-0.011512,0.001284,0.000789,-0.005937,-0.004355,5.0,23.0,74.0,173.0,18.0,0.0,13.0,67.62
1412861,-2.368752,-0.381071,-0.089478,-0.648090,0.371915,0.664574,-0.000293,-0.022169,0.001305,-0.000409,-0.000608,0.000098,5.0,23.0,74.0,173.0,18.0,0.0,13.0,67.64
1412862,-2.369130,-0.381163,-0.089465,-0.647814,0.372000,0.664795,0.007208,-0.012616,0.003482,-0.000486,0.000711,0.002045,5.0,23.0,74.0,173.0,18.0,0.0,13.0,67.66
1412863,-2.369372,-0.381295,-0.089468,-0.647619,0.372123,0.664916,0.006180,-0.003029,0.004531,0.000311,-0.003395,0.004746,5.0,23.0,74.0,173.0,18.0,0.0,13.0,67.68


In [26]:
## Process the data further into a format that can be used by the model
## We will be using a sliding window approach to generate the data
## The window size will be 124 samples (2.48 seconds) with a 50% overlap

# data will be produced on a trial by trial basis, and X will be a numpy array of shape (n_samples, 128, 12) and y will be a numpy array of shape (n_samples, 3)
# The columns of X will be [attitude(roll, pitch, yaw); gravity(x, y, z); rotationRate(x, y, z); userAcceleration(x,y,z)], and the columns of Y will be [act, weight, gender]

X = np.empty((0, 128, 12))
y = np.empty((0, 4))

for id in tqdm(dataset['id'].unique()):
    for act in dataset['act'].unique():
        for trial in dataset['trial'].unique():
            temp = dataset[(dataset['id'] == id) & (dataset['act'] == act) & (dataset['trial'] == trial)]
            if len(temp) > 0:
                act = temp['act'].iloc[0]
                weight = temp['weight'].iloc[0]
                gender = temp['gender'].iloc[0]
                for i in range(0, len(temp), 32):
                    if i+128 > len(temp):
                        data = temp.iloc[i:, :12].values
                        data = np.concatenate((data, np.zeros((128-len(data), 12))))
                        X = np.concatenate((X, data.reshape(1, 128, 12)))
                        y = np.concatenate((y, np.array([act, weight, gender, id]).reshape(1, 4)))
        

                    else:
                        X = np.concatenate((X, temp.iloc[i:i+128, :12].values.reshape(1, 128, 12)))
                        y = np.concatenate((y, np.array([act, weight, gender, id]).reshape(1, 4)))

100%|██████████| 24/24 [55:43<00:00, 139.32s/it]


In [31]:
## save the data into zarr format
zarr.save('../data/MotionSense/all_data.zarr', X)
zarr.save('../data/MotionSense/all_labels.zarr', y)

In [37]:
#split the data into train, validation, and test sets based off of y[:, 3] id
ids = np.unique(y[:, 3])

# 75% train, 25% test
train_ids = ids[:int(.75*len(ids))]
test_ids = ids[int(.75*len(ids)):]

print(f"number of train ids: {len(train_ids)}, number of test ids: {len(test_ids)}")

# split the data into train and test sets
train_idx = np.where(np.isin(y[:, 3], train_ids))[0]
test_idx = np.where(np.isin(y[:, 3], test_ids))[0]

print(f"number of train samples: {len(train_idx)}, number of test samples: {len(test_idx)}")

# split the data into train and test sets
X_train = X[train_idx]
y_train = y[train_idx]

X_test = X[test_idx]
y_test = y[test_idx]

# save the data into zarr format
zarr.save('../data/MotionSense/train_data.zarr', X_train)
zarr.save('../data/MotionSense/train_labels.zarr', y_train)

zarr.save('../data/MotionSense/test_data.zarr', X_test)
zarr.save('../data/MotionSense/test_labels.zarr', y_test)

number of train ids: 18, number of test ids: 6
number of train samples: 33102, number of test samples: 11213
