In [25]:
import pandas as pd
from datetime import datetime
import numpy as np

import os
from tqdm import tqdm

### Sleep

In [9]:
def load_sleep_data(participant_path):
    participant_id = participant_path.split('/')[-1].split()[0]
    sleep_file = f'{participant_path}/{participant_id}_sleep.csv'
    sleep_periods_file = f'{participant_path}/{participant_id}_sleep-periods.csv'
    
    sleep_data = pd.read_csv(sleep_file)
    sleep_periods_data = pd.read_csv(sleep_periods_file)
    return sleep_data, sleep_periods_data

def process_bedtimes(row):
    try:
        bedtime_start = datetime.fromisoformat(row['bedtime_start'])
        bedtime_end = datetime.fromisoformat(row['bedtime_end'])
        sleep_duration_minutes = (bedtime_end - bedtime_start).total_seconds() / 60
    except:
        bedtime_start, bedtime_end, sleep_duration_minutes = None, None, None
    return bedtime_start, bedtime_end, sleep_duration_minutes

def transform_hypnogram_to_minute(hypnogram):
    return ''.join([stage * 5 for stage in hypnogram])

def process_sleep_data(sleep_data, sleep_periods_data):
    # Merge sleep data and sleep periods data
    combined_sleep_data = pd.merge(
        sleep_data[['summary_date', 'efficiency', 'participant_uid']],
        sleep_periods_data[['day', 'bedtime_start', 'bedtime_end', 'total_sleep_duration', 'participant_uid']],
        left_on=['summary_date', 'participant_uid'], 
        right_on=['day', 'participant_uid'],
        how='outer'  # Ensures all days are included, even those without sleep period data
    )

    # Process bedtimes and calculate sleep duration
    combined_sleep_data[['bedtime_start_dt', 'bedtime_end_dt', 'sleep_duration_minutes']] = combined_sleep_data.apply(
        process_bedtimes, axis=1, result_type="expand"
    )

    # Aggregate data by day, ensuring days without sleep periods are still included
    aggregated_sleep_data = combined_sleep_data.groupby(['participant_uid', 'summary_date']).agg({
        'bedtime_start_dt': 'min',  # Earliest bedtime, NaN if no data
        'bedtime_end_dt': 'max',    # Latest wake time, NaN if no data
        'sleep_duration_minutes': 'sum',  # Total sleep duration, 0 if no data
        'efficiency': 'mean'        # Average efficiency, NaN if no data
    }).reset_index()

    return aggregated_sleep_data

def calculate_sri(sleep_data):
    sleep_data['minute_by_minute_hypnogram'] = sleep_data['hypnogram_5min'].apply(transform_hypnogram_to_minute)
    unique_dates = sorted(sleep_data['summary_date'].unique())
    num_days = len(unique_dates)
    num_minutes = 24 * 60
    sleep_matrix = np.zeros((num_days, num_minutes), dtype=int)
    date_to_index = {date: idx for idx, date in enumerate(unique_dates)}
    for _, row in sleep_data.iterrows():
        day_index = date_to_index[row['summary_date']]
        hypnogram = row['minute_by_minute_hypnogram']
        for minute in range(min(num_minutes, len(hypnogram))):
            sleep_matrix[day_index, minute] = 1 if hypnogram[minute] != '0' else 0

    sri_scores = np.zeros(num_days)
    for i in range(1, num_days):
        similarity = 1 - np.sum(np.abs(sleep_matrix[i] - sleep_matrix[i - 1])) / num_minutes
        sri_scores[i] = similarity
    # Create a DataFrame for SRI scores
    sri_data = pd.DataFrame({'summary_date': unique_dates, 'sri_score': sri_scores})
    return sri_data


### Heart Rate

In [30]:
# Function to load heart rate data
def load_heart_data(participant_path):
    participant_id = participant_path.split('/')[-1].split()[0]
    daily_hr_file = f'{participant_path}/{participant_id}_daily-hr.csv'
    hr_hrv_file = f'{participant_path}/{participant_id}_hr-hrv.csv'

    daily_hr_data = pd.read_csv(daily_hr_file)
    hr_hrv_data = pd.read_csv(hr_hrv_file)
    return daily_hr_data, hr_hrv_data

# Sample Entropy Function
def sampen(L, m, r):
    N = len(L)
    B = 0.0
    A = 0.0
    xmi = np.array([L[i: i + m] for i in range(N - m)])
    xmj = np.array([L[i: i + m] for i in range(N - m + 1)])
    B = np.sum([np.sum(np.abs(xmii - xmj).max(axis=1) <= r) - 1 for xmii in xmi])
    m += 1
    xm = np.array([L[i: i + m] for i in range(N - m + 1)])
    A = np.sum([np.sum(np.abs(xmi - xm).max(axis=1) <= r) - 1 for xmi in xm])
    return -np.log(A / B) if B != 0 else np.nan

# Function to process daily heart rate data
def process_daily_hr_data(daily_hr_data):
    # Convert 'timestamp' column to datetime, handling errors
    daily_hr_data['timestamp'] = pd.to_datetime(daily_hr_data['timestamp'], errors='coerce')
    # Drop rows where the timestamp couldn't be converted
    daily_hr_data = daily_hr_data.dropna(subset=['timestamp'])
    daily_hr_data['date'] = pd.to_datetime(daily_hr_data['timestamp']).dt.date
    aggregated_hr_data = daily_hr_data.groupby('date').agg({
        'bpm': ['mean', 'std', 'min', 'max', lambda x: sampen(x, 2, 0.2 * np.std(x))]
    }).reset_index()
    aggregated_hr_data.columns = ['date', 'avg_hr', 'std_hr', 'min_hr', 'max_hr', 'sampen_hr']
    return aggregated_hr_data

# Function to process heart rate variability (HRV) data
def process_hr_hrv_data(hr_hrv_data):
    # Convert 'timestamp' column to datetime, handling errors
    hr_hrv_data['timestamp'] = pd.to_datetime(hr_hrv_data['timestamp'], utc=True)
    # Drop rows where the timestamp couldn't be converted
    hr_hrv_data = hr_hrv_data.dropna(subset=['timestamp'])
    hr_hrv_data['date'] = pd.to_datetime(hr_hrv_data['timestamp']).dt.date
    aggregated_hrv_data = hr_hrv_data.groupby('date').agg({
        '5-min hrv': ['mean', 'std', 'min', 'max']  # Adjust column names as needed
    }).reset_index()
    aggregated_hrv_data.columns = ['date', 'avg_hrv', 'std_hrv', 'min_hrv', 'max_hrv']
    return aggregated_hrv_data


### Activities

In [12]:
def load_activity_data(participant_path):
    participant_id = participant_path.split('/')[-1].split()[0]

    activity_file = f'{participant_path}/{participant_id}_activity.csv'
    activity_data = pd.read_csv(activity_file)
    return activity_data

def process_activity_data(activity_data):
    # Assuming 'summary_date' column exists and is in a standard format
    activity_data['date'] = pd.to_datetime(activity_data['summary_date']).dt.date

    # Select and potentially rename columns based on the metrics you need
    # Example: ['summary_date', 'total_steps', 'total_calories', 'score_stay_active', ...]
    processed_activity_data = activity_data[['date', 'steps', 'cal_total', 'score_stay_active', 'score_move_every_hour']]
    # Add more columns as needed

    return processed_activity_data

In [47]:
sleep_features.date

0     2023-06-08
1     2023-06-09
2     2023-06-10
3     2023-06-11
4     2023-06-12
5     2023-06-14
6     2023-06-17
7     2023-06-18
8     2023-06-19
9     2023-06-20
10    2023-06-21
11    2023-06-22
12    2023-06-23
13    2023-06-24
14    2023-06-25
15    2023-06-27
16    2023-06-28
17    2023-06-29
18    2023-06-30
19    2023-07-03
Name: date, dtype: object

In [55]:
def format_date_string(date_column):
    return pd.to_datetime(date_column).dt.strftime('%Y-%m-%d')

# Processing Loop for Multiple Participants
# participant_directories = ['../Studies/BIN Complete Data Sets/BIN01 Data/'] 
file_path = "../Studies/BIN Complete Data Sets/"
participant_directories = [os.path.join(file_path, dir) for dir in os.listdir(file_path) if dir.endswith("Data")]

# Initialize an empty DataFrame for the combined data
all_data_combined = pd.DataFrame()

for participant_dir in tqdm(participant_directories):
    # Extract participant ID from directory name
    participant_id = participant_dir.split('/')[-1].split()[0]

    ### Sleep Data Processing
    sleep_data, sleep_periods_data = load_sleep_data(participant_dir)
    processed_sleep_data = process_sleep_data(sleep_data, sleep_periods_data)
    sri_data = calculate_sri(sleep_data)
    # Ensure consistent date column name and add participant_id
    sleep_features = processed_sleep_data.rename(columns={'summary_date': 'date'})
    sleep_features.drop(columns=["participant_uid"], inplace=True)
    sri_data = sri_data.rename(columns={'summary_date': 'date'})
    sleep_features['date'] = format_date_string(sleep_features['date'])
    sri_data['date'] = format_date_string(sri_data['date'])
    sleep_features = pd.merge(sleep_features, sri_data, on=['date'], how='outer')
    sleep_features['participant_id'] = participant_id

    ### Heart Rate Data Processing
    daily_hr_data, hr_hrv_data = load_heart_data(participant_dir)
    processed_hr_data = process_daily_hr_data(daily_hr_data)
    processed_hrv_data = process_hr_hrv_data(hr_hrv_data)
    # Add participant_id and merge heart data
    heart_data = processed_hr_data
    processed_hr_data['date'] = format_date_string(processed_hr_data['date'])
    processed_hrv_data['date'] = format_date_string(processed_hrv_data['date'])
    heart_data = pd.merge(heart_data, processed_hrv_data, on=['date'], how='outer')
    heart_data['participant_id'] = participant_id

    ### Activity Data Processing
    activity_data = load_activity_data(participant_dir)
    processed_activity_data = process_activity_data(activity_data)
    processed_activity_data['date'] = format_date_string(processed_activity_data['date'])
    processed_activity_data['participant_id'] = participant_id

    # Merge all features for the participant
    combined_data_participant = pd.merge(sleep_features, heart_data, on=['participant_id', 'date'], how='outer')
    combined_data_participant = pd.merge(combined_data_participant, processed_activity_data, on=['participant_id', 'date'], how='outer')

    # Concatenate with the overall dataset
    all_data_combined = pd.concat([all_data_combined, combined_data_participant])

# Reset index of the final DataFrame
all_data_combined.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [61]:
all_data_combined

Unnamed: 0,date,bedtime_start_dt,bedtime_end_dt,sleep_duration_minutes,efficiency,sri_score,participant_id,avg_hr,std_hr,min_hr,max_hr,sampen_hr,avg_hrv,std_hrv,min_hrv,max_hrv,steps,cal_total,score_stay_active,score_move_every_hour
0,2023-06-08,2023-06-07 23:23:33-05:00,2023-06-08 06:33:33-05:00,430.0,93.0,0.000000,BIN01,90.285714,16.214271,42.0,138.0,1.331806,12.298851,3.974087,0.0,24.0,6509.0,2182.0,69.0,95.0
1,2023-06-09,2023-06-08 21:15:28-05:00,2023-06-09 16:37:59-05:00,446.0,93.0,0.968750,BIN01,92.118483,13.299488,41.0,131.0,1.389414,14.649351,6.542907,0.0,39.0,10324.0,2427.0,56.0,100.0
2,2023-06-10,2023-06-10 00:21:30-05:00,2023-06-10 04:50:30-05:00,269.0,93.0,0.920139,BIN01,100.902174,16.567174,45.0,145.0,1.233330,9.777778,2.415880,0.0,16.0,2399.0,1909.0,98.0,95.0
3,2023-06-11,2023-06-11 01:31:32-05:00,2023-06-11 10:05:32-05:00,514.0,91.0,0.829861,BIN01,96.481132,8.716958,85.0,117.0,2.209495,12.359223,4.292845,0.0,26.0,3532.0,1988.0,68.0,100.0
4,2023-06-12,2023-06-11 22:21:29-05:00,2023-06-12 06:16:29-05:00,475.0,55.0,0.972222,BIN01,98.267606,10.103658,84.0,131.0,1.472472,13.593750,9.850010,0.0,42.0,6315.0,2223.0,50.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
779,2023-08-13,,,,,,BIN23,76.359606,8.093130,69.0,128.0,1.247793,,,,,1892.0,2401.0,61.0,60.0
780,2023-08-15,,,,,,BIN23,88.757764,10.690402,68.0,119.0,1.873549,,,,,13980.0,3332.0,83.0,100.0
781,2023-08-21,,,,,,BIN23,84.790244,13.805366,70.0,140.0,1.278354,,,,,4155.0,2468.0,100.0,100.0
782,2023-09-05,,,,,,BIN23,81.903553,13.400268,68.0,148.0,1.362764,,,,,5349.0,2620.0,67.0,95.0


In [62]:
all_data_combined.to_csv("../Studies/oura_feats_bin01_23.csv", index=False)

PermissionError: [Errno 13] Permission denied: '../Studies/oura_feats_bin01_23.csv'

In [60]:
pd.read_csv("../Studies/oura_feats_bin01_23.csv")

Unnamed: 0,date,bedtime_start_dt,bedtime_end_dt,sleep_duration_minutes,efficiency,sri_score,participant_id,avg_hr,std_hr,min_hr,max_hr,sampen_hr,avg_hrv,std_hrv,min_hrv,max_hrv,steps,cal_total,score_stay_active,score_move_every_hour
0,2023-06-08,2023-06-07 23:23:33-05:00,2023-06-08 06:33:33-05:00,430.0,93.0,0.000000,BIN01,90.285714,16.214271,42.0,138.0,1.331806,12.298851,3.974087,0.0,24.0,6509.0,2182.0,69.0,95.0
1,2023-06-09,2023-06-08 21:15:28-05:00,2023-06-09 16:37:59-05:00,446.0,93.0,0.968750,BIN01,92.118483,13.299488,41.0,131.0,1.389414,14.649351,6.542907,0.0,39.0,10324.0,2427.0,56.0,100.0
2,2023-06-10,2023-06-10 00:21:30-05:00,2023-06-10 04:50:30-05:00,269.0,93.0,0.920139,BIN01,100.902174,16.567174,45.0,145.0,1.233330,9.777778,2.415880,0.0,16.0,2399.0,1909.0,98.0,95.0
3,2023-06-11,2023-06-11 01:31:32-05:00,2023-06-11 10:05:32-05:00,514.0,91.0,0.829861,BIN01,96.481132,8.716958,85.0,117.0,2.209495,12.359223,4.292845,0.0,26.0,3532.0,1988.0,68.0,100.0
4,2023-06-12,2023-06-11 22:21:29-05:00,2023-06-12 06:16:29-05:00,475.0,55.0,0.972222,BIN01,98.267606,10.103658,84.0,131.0,1.472472,13.593750,9.850010,0.0,42.0,6315.0,2223.0,50.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
779,2023-08-13,,,,,,BIN23,76.359606,8.093130,69.0,128.0,1.247793,,,,,1892.0,2401.0,61.0,60.0
780,2023-08-15,,,,,,BIN23,88.757764,10.690402,68.0,119.0,1.873549,,,,,13980.0,3332.0,83.0,100.0
781,2023-08-21,,,,,,BIN23,84.790244,13.805366,70.0,140.0,1.278354,,,,,4155.0,2468.0,100.0,100.0
782,2023-09-05,,,,,,BIN23,81.903553,13.400268,68.0,148.0,1.362764,,,,,5349.0,2620.0,67.0,95.0


In [25]:
heart_data

Unnamed: 0,date,avg_hr,std_hr,min_hr,max_hr,sampen_hr,avg_hrv,std_hrv,min_hrv,max_hrv
0,2023-06-06,90.220339,17.071925,48.0,133.0,1.504077,,,,
1,2023-06-07,91.34375,14.416119,46.0,132.0,1.19673,13.375,5.853875,0.0,19.0
2,2023-06-08,90.285714,16.214271,42.0,138.0,1.331806,14.0,6.079059,6.0,39.0
3,2023-06-09,92.118483,13.299488,41.0,131.0,1.389414,11.886364,2.870993,0.0,18.0
4,2023-06-10,100.902174,16.567174,45.0,145.0,1.23333,9.777778,2.41588,0.0,16.0
5,2023-06-11,96.481132,8.716958,85.0,117.0,2.209495,13.642276,5.723306,0.0,32.0
6,2023-06-12,98.267606,10.103658,84.0,131.0,1.472472,11.842105,9.677538,0.0,42.0
7,2023-06-13,102.508571,13.429196,85.0,145.0,1.493808,12.642857,3.342204,7.0,16.0
8,2023-06-14,101.559441,11.511551,85.0,141.0,1.763589,12.35,3.72844,0.0,19.0
9,2023-06-15,99.462264,11.089398,83.0,131.0,1.58412,,,,


In [28]:
processed_activity_data

Unnamed: 0,date,steps,cal_total,score_stay_active,score_move_every_hour
0,2023-06-06,3154,1975,96,100
1,2023-06-07,6340,2160,58,100
2,2023-06-08,6509,2182,69,95
3,2023-06-09,10324,2427,56,100
4,2023-06-10,2399,1909,98,95
5,2023-06-11,3532,1988,68,100
6,2023-06-12,6315,2223,50,100
7,2023-06-13,6721,2217,58,100
8,2023-06-14,3813,1978,73,100
9,2023-06-15,4494,2085,66,100
