In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from datetime import datetime

# Load and parse the Apple Watch XML export.
# `root` is the document's root element; later cells query it for
# <Workout> and <Record> elements.
tree = ET.parse('source_data/clara/apple_watch.xml')
root = tree.getroot()

In [2]:

# Step 1: collect every <Workout> element directly under the export root.
# NOTE(review): the raw `duration` attribute is divided by 60 to fill
# `duration_min`, i.e. it is treated as seconds - confirm the unit used by
# the export before relying on this column.
workouts = []
for node in root.findall('Workout'):
    attrs = node.attrib
    raw_duration = attrs.get('duration')
    raw_energy = attrs.get('totalEnergyBurned')

    row = {
        'type': attrs.get('workoutActivityType'),
        'start': pd.to_datetime(attrs.get('startDate')),
        'end': pd.to_datetime(attrs.get('endDate')),
        'duration_min': None,
        'calories': None,
    }
    # Missing or empty attributes stay None instead of raising in float().
    if raw_duration:
        row['duration_min'] = float(raw_duration) / 60
    if raw_energy:
        row['calories'] = float(raw_energy)
    workouts.append(row)

# One row per workout, in document order.
df_workouts = pd.DataFrame(workouts)
print("Available workouts:")
print(df_workouts)

Available workouts:
                                                type  \
0                       HKWorkoutActivityTypeWalking   
1                       HKWorkoutActivityTypeWalking   
2   HKWorkoutActivityTypeTraditionalStrengthTraining   
3   HKWorkoutActivityTypeTraditionalStrengthTraining   
4   HKWorkoutActivityTypeTraditionalStrengthTraining   
5                       HKWorkoutActivityTypeWalking   
6                       HKWorkoutActivityTypeCycling   
7                       HKWorkoutActivityTypeCycling   
8                       HKWorkoutActivityTypeRunning   
9                       HKWorkoutActivityTypeRunning   
10                      HKWorkoutActivityTypeRunning   
11                      HKWorkoutActivityTypeRunning   
12                      HKWorkoutActivityTypeRunning   
13                      HKWorkoutActivityTypeRunning   
14                      HKWorkoutActivityTypeRunning   
15                      HKWorkoutActivityTypeRunning   
16                      HKWo

In [3]:
# Pick the three sessions of interest by their row position in df_workouts.
# NOTE(review): the positions 52, 53 and 55 are hard-coded; they will point at
# different workouts if the export gains or loses entries.
comedy_workout, documentary_workout, horror_workout = (
    df_workouts.iloc[position] for position in (52, 53, 55)
)

In [4]:
# Flatten the attributes of every <Record> element (at any depth under root)
# into one table, then list the distinct record types it contains.
record_list = [element.attrib for element in root.iter('Record')]
record_data = pd.DataFrame(record_list)
record_data['type'].unique()

array(['HKQuantityTypeIdentifierHeight',
       'HKQuantityTypeIdentifierBodyMass',
       'HKQuantityTypeIdentifierHeartRate',
       'HKQuantityTypeIdentifierRespiratoryRate',
       'HKQuantityTypeIdentifierStepCount',
       'HKQuantityTypeIdentifierDistanceWalkingRunning',
       'HKQuantityTypeIdentifierBasalEnergyBurned',
       'HKQuantityTypeIdentifierActiveEnergyBurned',
       'HKQuantityTypeIdentifierFlightsClimbed',
       'HKQuantityTypeIdentifierDietaryFatTotal',
       'HKQuantityTypeIdentifierDietaryFatPolyunsaturated',
       'HKQuantityTypeIdentifierDietaryFatMonounsaturated',
       'HKQuantityTypeIdentifierDietaryFatSaturated',
       'HKQuantityTypeIdentifierDietaryCholesterol',
       'HKQuantityTypeIdentifierDietarySodium',
       'HKQuantityTypeIdentifierDietaryCarbohydrates',
       'HKQuantityTypeIdentifierDietaryFiber',
       'HKQuantityTypeIdentifierDietarySugar',
       'HKQuantityTypeIdentifierDietaryEnergyConsumed',
       'HKQuantityTypeIdentifierDieta

In [5]:
def extract_health_data(start, end, xml_root=None):
    """Return selected health metrics recorded between ``start`` and ``end``.

    Scans the direct ``Record`` children of the XML root, keeps the record
    types listed in ``target_types``, and pivots them so that each metric
    becomes its own column.

    Parameters
    ----------
    start, end : timestamp-like
        Inclusive bounds compared against each record's ``startDate``.
    xml_root : xml.etree.ElementTree.Element, optional
        Element whose ``Record`` children are scanned. When omitted, the
        notebook-level ``root`` is used, so existing two-argument calls
        behave as before.

    Returns
    -------
    pandas.DataFrame
        A ``time`` column plus one column per metric that has data in the
        window. Several readings of the same metric at the same ``time`` are
        averaged. If no record of a wanted type exists at all, an empty frame
        with only a ``time`` column is returned.
    """
    target_types = {
        'HKQuantityTypeIdentifierHeartRate': 'heart_rate',
        'HKQuantityTypeIdentifierRespiratoryRate': 'respiratory_rate',
        'HKQuantityTypeIdentifierBasalEnergyBurned': 'basal_energy_burned',
        'HKQuantityTypeIdentifierActiveEnergyBurned': 'active_energy_burned',
        'HKQuantityTypeIdentifierVO2Max': 'vo2_max',
        'HKQuantityTypeIdentifierHeadphoneAudioExposure': 'headphone_audio_exposure',
        'HKQuantityTypeIdentifierPhysicalEffort': 'physical_effort',
        'HKCategoryTypeIdentifierAudioExposureEvent': 'audio_exposure_event',
        'HKQuantityTypeIdentifierHeartRateVariabilitySDNN': 'hrv_sdnn'
    }

    # Fall back to the notebook global only when no root is supplied, so the
    # function can also be run against any other parsed document.
    source = root if xml_root is None else xml_root

    records = []
    for record in source.findall('Record'):
        metric = target_types.get(record.attrib.get('type'))
        if metric is None:
            continue  # not one of the metrics we track

        try:
            value = float(record.attrib.get('value'))
        except (TypeError, ValueError):
            value = np.nan  # missing or non-numeric values become NaN

        records.append({
            'time': pd.to_datetime(record.attrib.get('startDate')),
            'type': metric,
            'value': value
        })

    # Without this guard an empty list yields a column-less frame and the
    # df['time'] lookup below raises KeyError.
    if not records:
        return pd.DataFrame(columns=['time'])

    df = pd.DataFrame(records)

    # Keep only readings inside the inclusive [start, end] window.
    df = df[(df['time'] >= start) & (df['time'] <= end)]

    # Pivot: one column per metric, averaging duplicates at the same time.
    df_pivoted = df.pivot_table(index='time', columns='type', values='value', aggfunc='mean').reset_index()

    return df_pivoted


In [6]:
# Extract the health metrics recorded during each of the three sessions,
# using every workout's own start/end as the time window.
comedy_data, documentary_data, horror_data = (
    extract_health_data(session['start'], session['end'])
    for session in (comedy_workout, documentary_workout, horror_workout)
)

In [7]:
# Row count of each extracted frame, one per line (comedy, documentary, horror).
for frame in (comedy_data, documentary_data, horror_data):
    print(len(frame))

0
4
3


In [52]:
# Column index of each extracted frame (comedy, documentary, horror), to
# check that all three sessions expose the same metrics.
for frame in (comedy_data, documentary_data, horror_data):
    print(frame.columns)

Index(['time', 'active_energy_burned', 'basal_energy_burned', 'heart_rate',
       'physical_effort'],
      dtype='object', name='type')
Index(['time', 'active_energy_burned', 'basal_energy_burned', 'heart_rate',
       'physical_effort'],
      dtype='object', name='type')
Index(['time', 'active_energy_burned', 'basal_energy_burned', 'heart_rate',
       'physical_effort'],
      dtype='object', name='type')


In [53]:
# Display the extracted comedy-session frame (bare last expression -> cell output).
comedy_data

type,time,active_energy_burned,basal_energy_burned,heart_rate,physical_effort
0,2025-06-05 16:25:55+02:00,,,62.0,
1,2025-06-05 16:25:56+02:00,0.077,0.066,,
2,2025-06-05 16:25:59+02:00,0.077,0.066,60.0,
3,2025-06-05 16:26:01+02:00,0.077,0.066,,
4,2025-06-05 16:26:04+02:00,0.077,0.066,,
...,...,...,...,...,...
956,2025-06-05 16:56:36+02:00,0.077,0.066,,
957,2025-06-05 16:56:37+02:00,,,62.0,
958,2025-06-05 16:56:38+02:00,0.077,0.066,,
959,2025-06-05 16:56:41+02:00,0.123,0.266,,1.7


In [54]:
def aggregate_data(df):
    """Average all readings that fall within the same whole second.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``time`` column. It is converted with ``pd.to_datetime``
        and stripped of timezone information via ``tz_localize(None)``.
        The frame passed in is left untouched.

    Returns
    -------
    pandas.DataFrame
        One row per distinct second, in a ``timestamp`` column, with the
        remaining columns averaged by ``groupby(...).mean()``.
    """
    # Work on a copy so the caller's frame keeps its original `time` values
    # and does not silently gain a `timestamp` column.
    working = df.copy()
    working['time'] = pd.to_datetime(working['time']).dt.tz_localize(None)

    # Truncate (floor) each timestamp to the start of its second.
    working['timestamp'] = working['time'].dt.floor('1s')

    # Group by the floored time and average
    return working.groupby('timestamp').mean().reset_index()

In [55]:
# Collapse each session's readings to one row per second.
comedy_data_agg, documentary_data_agg, horror_data_agg = map(
    aggregate_data, (comedy_data, documentary_data, horror_data)
)

In [68]:
# `timestamp` now identifies each row, so the averaged `time` column is dropped.
comedy_data_final = comedy_data_agg.drop(columns='time')
documentary_data_final = documentary_data_agg.drop(columns='time')
horror_data_final = horror_data_agg.drop(columns='time')

In [71]:
# Display the per-second horror-session frame that will be exported below.
horror_data_final

type,timestamp,active_energy_burned,basal_energy_burned,heart_rate,physical_effort
0,2025-06-05 17:58:54,0.077,0.066,56.0,
1,2025-06-05 17:58:56,0.077,0.066,,
2,2025-06-05 17:58:59,0.077,0.066,,
3,2025-06-05 17:59:00,,,54.0,
4,2025-06-05 17:59:01,0.077,0.066,,
...,...,...,...,...,...
960,2025-06-05 18:30:04,0.077,0.066,,
961,2025-06-05 18:30:06,0.077,0.066,52.0,
962,2025-06-05 18:30:09,0.077,0.066,,
963,2025-06-05 18:30:10,,,53.0,


In [72]:
# Write each session's final frame to its CSV file, without the row index.
exports = {
    '../data_collection/generated_data/comedy_aw.csv': comedy_data_final,
    '../data_collection/generated_data/documentary_aw.csv': documentary_data_final,
    '../data_collection/generated_data/horror_aw.csv': horror_data_final,
}
for csv_path, frame in exports.items():
    frame.to_csv(csv_path, index=False)