In [1]:
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d

# Heart-rate preprocessing: load -> remove outliers -> z-score -> resample to
# 2 Hz -> zero-pad (or truncate) to a fixed 10 h window.

# Read the data.
# NOTE(review): this cell used to load HeartRate.csv and then immediately
# overwrite `df` with RestingHeartRate.csv, so only the latter was ever
# processed. The dead read is removed; change `source_csv` if the raw
# HeartRate.csv export was the intended input.
source_csv = '../data/xml_export/RestingHeartRate.csv'
df = pd.read_csv(source_csv, low_memory=False)

# Rename column value to heartrate_value
df = df.rename(columns={'value': 'heartrate_value'})

# Step 1: Filter out anomalous values (more than 5 standard deviations from the mean).
mean_hr = df['heartrate_value'].mean()
std_hr = df['heartrate_value'].std()
threshold = 5 * std_hr
# `between` is inclusive on both ends, matching the original >= / <= test.
# `.copy()` detaches the result so the column assignments below write to a
# real frame instead of a view of the unfiltered data (SettingWithCopy).
df = df.loc[df['heartrate_value'].between(mean_hr - threshold, mean_hr + threshold)].copy()
if df.empty:
    raise ValueError('No heart-rate samples left after outlier filtering')

# Step 2: Normalize the heart rate time-series (z-score over the filtered samples).
df['normalized_hr'] = (df['heartrate_value'] - df['heartrate_value'].mean()) / df['heartrate_value'].std()

# Step 3: Resample the time-series using linear interpolation to a 2 Hz sampling rate (0.5-second intervals)
date_columns = ['creationDate', 'startDate', 'endDate']
for column in date_columns:
    # utc=True: timestamps carrying different UTC offsets (e.g. across DST
    # changes) otherwise parse to an object column that cannot be resampled.
    df[column] = pd.to_datetime(df[column], utc=True)

df = df.set_index('startDate')

sample_period = pd.Timedelta('500ms')

# Only numeric columns can be linearly interpolated; averaging per timestamp
# also sorts the index and collapses duplicate timestamps, which would
# otherwise make the reindex below fail.
numeric_df = df.select_dtypes(include='number').groupby(level=0).mean()

# Interpolate on the union of the observed timestamps and the 2 Hz grid, then
# keep only the grid points. A plain `resample('500ms').interpolate()` first
# discards every sample that does not fall exactly on the grid.
grid = pd.date_range(
    start=numeric_df.index.min().ceil(sample_period),
    end=numeric_df.index.max().floor(sample_period),
    freq=sample_period,
    name=numeric_df.index.name,
)
resampled_df = (
    numeric_df
    .reindex(numeric_df.index.union(grid))
    .interpolate(method='time')
    .reindex(grid)
)
if resampled_df.empty:
    raise ValueError('Recording is too short to yield any 2 Hz sample')

# Step 4: Pad with zeros to reach 72,000 elements (10 h at 2 Hz)
target_length = 10 * 60 * 60 * 2  # 72,000 samples
pad_length = max(target_length - len(resampled_df), 0)
# Padding timestamps continue the 2 Hz grid right after the last real sample.
padding = pd.DataFrame(
    {'normalized_hr': np.zeros(pad_length)},
    index=pd.date_range(
        start=resampled_df.index[-1] + sample_period,
        periods=pad_length,
        freq=sample_period,
    ),
)
if pad_length:
    padded_df = pd.concat([resampled_df, padding])
else:
    # NOTE(review): a series longer than the target is cut to its first 10 h
    # so the output size stays constant — confirm truncation is the desired
    # policy for long recordings.
    padded_df = resampled_df.iloc[:target_length]

In [None]:
# Show the zero-padded frame built in the previous cell; as the last
# expression of the cell it is rendered by the notebook.
padded_df

79831832

"The heart rate in this effort was extracted from ECG. First we normalize the ECG signal and perform R-wave detection using a Pan-Tompkins based algorithm [29]. The time differences between consecutive R locations are the inter beat interval (IBI) time series. The IBI time-series is then filtered by removing anomalous values >5 standard deviations that are caused by missed or spurious peak detection. The IHR is calculated as simply the reciprocal of the IBI values. This heart rate time-series is independently normalized for each night by subtracting the mean and dividing by the standard deviation of the night. Finally the time-series is resampled with linear interpolation to a 2 Hz sampling rate and padded with zeros to a constant size corresponding to 10 h or 72,000 elements."