In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from scipy import interpolate

# Data Processing

In [2]:
columns = ["timestamp", "activity", "heartrate", ]
columns += [f"hand-{i}" for i in range(1,18)]
columns += [f"chest-{i}" for i in range(1,18)]
columns += [f"ankle-{i}" for i in range(1,18)]

In [3]:
dataframes = []
for subject in range(101, 110):
    df = pd.read_table(f'./PAMAP2_Dataset/Protocol/subject{subject}.dat', sep='\s+')
    df.columns = columns
    dataframes.append(df)
data = pd.concat(dataframes, ignore_index=True)
print("Data shape:", data.shape)

Data shape: (2872524, 54)


In [4]:
data.head()

Unnamed: 0,timestamp,activity,heartrate,hand-1,hand-2,hand-3,hand-4,hand-5,hand-6,hand-7,...,ankle-8,ankle-9,ankle-10,ankle-11,ankle-12,ankle-13,ankle-14,ankle-15,ankle-16,ankle-17
0,8.39,0,,30.0,2.18837,8.5656,3.66179,2.39494,8.55081,3.64207,...,-0.006577,-0.004638,0.000368,-59.8479,-38.8919,-58.5253,1.0,0.0,0.0,0.0
1,8.4,0,,30.0,2.37357,8.60107,3.54898,2.30514,8.53644,3.7328,...,0.003014,0.000148,0.022495,-60.7361,-39.4138,-58.3999,1.0,0.0,0.0,0.0
2,8.41,0,,30.0,2.07473,8.52853,3.66021,2.33528,8.53622,3.73277,...,0.003175,-0.020301,0.011275,-60.4091,-38.7635,-58.3956,1.0,0.0,0.0,0.0
3,8.42,0,,30.0,2.22936,8.83122,3.7,2.23055,8.59741,3.76295,...,0.012698,-0.014303,-0.002823,-61.5199,-39.3879,-58.2694,1.0,0.0,0.0,0.0
4,8.43,0,,30.0,2.29959,8.82929,3.5471,2.26132,8.65762,3.77788,...,-0.006089,-0.016024,0.00105,-60.2954,-38.8778,-58.3977,1.0,0.0,0.0,0.0


Here is the breakdown of each of the 17 parameters of the IMU data (hand, chest and ankle)
+ 1 temperature (°C)
+ 2-4 3D-acceleration data (ms-2),  scale: ±16g, resolution: 13-bit
+ 5-7 3D-acceleration data (ms-2),  scale: ±6g, resolution: 13-bit*
+ 8-10 3D-gyroscope data (rad/s)
+ 11-13 3D-magnetometer data (μT)
+ 14-17 orientation (invalid in this data collection)

Out of these, temperature is unnecessary. The second accelerometer (5-7) data is imprecise, we use just the first one.
Orientation (14-17) is invalid for this data. We remove this from our dataset. Finally, order is all that matters, timestep value is irrelevant. We drop that row too.

In [5]:
data.columns

Index(['timestamp', 'activity', 'heartrate', 'hand-1', 'hand-2', 'hand-3',
       'hand-4', 'hand-5', 'hand-6', 'hand-7', 'hand-8', 'hand-9', 'hand-10',
       'hand-11', 'hand-12', 'hand-13', 'hand-14', 'hand-15', 'hand-16',
       'hand-17', 'chest-1', 'chest-2', 'chest-3', 'chest-4', 'chest-5',
       'chest-6', 'chest-7', 'chest-8', 'chest-9', 'chest-10', 'chest-11',
       'chest-12', 'chest-13', 'chest-14', 'chest-15', 'chest-16', 'chest-17',
       'ankle-1', 'ankle-2', 'ankle-3', 'ankle-4', 'ankle-5', 'ankle-6',
       'ankle-7', 'ankle-8', 'ankle-9', 'ankle-10', 'ankle-11', 'ankle-12',
       'ankle-13', 'ankle-14', 'ankle-15', 'ankle-16', 'ankle-17'],
      dtype='object')

In [6]:
data.drop(columns=['timestamp']+[ f"{place}-{number}" for place in ["hand", "chest", "ankle"] for number in [1, 5, 6, 7, 14, 15, 16, 17]], inplace=True)

In [7]:
data.head()

Unnamed: 0,activity,heartrate,hand-2,hand-3,hand-4,hand-8,hand-9,hand-10,hand-11,hand-12,...,chest-13,ankle-2,ankle-3,ankle-4,ankle-8,ankle-9,ankle-10,ankle-11,ankle-12,ankle-13
0,0,,2.18837,8.5656,3.66179,-0.024413,0.047759,0.006474,14.8991,-69.2224,...,43.1768,9.6937,-1.57902,-0.215687,-0.006577,-0.004638,0.000368,-59.8479,-38.8919,-58.5253
1,0,,2.37357,8.60107,3.54898,-0.057976,0.032574,-0.006988,14.242,-69.5197,...,43.7782,9.58944,-1.73276,0.092914,0.003014,0.000148,0.022495,-60.7361,-39.4138,-58.3999
2,0,,2.07473,8.52853,3.66021,-0.002352,0.03281,-0.003747,14.8908,-69.5439,...,43.167,9.58814,-1.7704,0.054545,0.003175,-0.020301,0.011275,-60.4091,-38.7635,-58.3956
3,0,,2.22936,8.83122,3.7,0.012269,0.018305,-0.053325,15.5612,-68.8196,...,43.6453,9.69771,-1.65625,-0.060809,0.012698,-0.014303,-0.002823,-61.5199,-39.3879,-58.2694
4,0,,2.29959,8.82929,3.5471,0.003238,0.01226,-0.054474,15.4565,-68.818,...,43.4065,9.69551,-1.6556,-0.138014,-0.006089,-0.016024,0.00105,-60.2954,-38.8778,-58.3977


In [8]:
def interpolate_linear(data_pd):
    # Sensor polling rates are not the same. Interpolating NaN values.
    for key, value in data_pd.items():
        if value.isna().any():
            nan_values = value.isna()
            numeric_values = ~nan_values
            data_pd.loc[nan_values, key] = np.interp(np.flatnonzero(nan_values), np.flatnonzero(numeric_values), value[numeric_values])

def remove_heart_noise(data_np):
    data_heartfixed = data_np.copy()
    hrate = data_heartfixed[:, 2]
    nan_indices = np.argwhere(~np.isnan(hrate)).flatten()
    nan_indices.sort()
    for idx in range(0, len(nan_indices)-1, 8):
        vals = [ nan_indices[idx+i]for i in range(8) if idx+i < len(nan_indices) ]
        avg = np.mean(hrate[vals])
        hrate[vals] = np.nan
        hrate[vals[len(vals)//2]] = avg
    return data_heartfixed

def interpolate_cubic(data_np):
    data_clean_np = np.zeros_like(data_np)
    data_clean_np[:, 0:2] = data_np[:, 0:2]
    for idx, values in enumerate(data_np[:, 2:].T):
        timestamps = data_np[:, 0]
        timestamps_notnan = timestamps[~np.isnan(values)]
        values_notnan = values[~np.isnan(values)]
        values = interpolate.CubicSpline(timestamps_notnan, values_notnan)(timestamps)
        data_clean_np[:, 2+idx] = values.T
    return data_clean_np

In [9]:
interpolate_linear(data)

In [10]:
X = data.loc[:, data.columns != 'activity'].to_numpy()
y = data['activity'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

print(f"{X.shape = }", f"{y.shape = }", sep="\n")
print()
print(f"{X_train.shape = }", f"{X_test.shape = }", f"{y_train.shape = }", f"{y_test.shape = }", sep="\n")

X.shape = (2872524, 28)
y.shape = (2872524,)

X_train.shape = (2441645, 28)
X_test.shape = (430879, 28)
y_train.shape = (2441645,)
y_test.shape = (430879,)


In [11]:
np.savez("./processed_data/linear_interpolation.npz", X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)