In [32]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from datetime import datetime

# Load and parse the XML file.
# The path is relative, so it is resolved against the kernel's working directory.
# `root` is the document's top-level element; later cells search it for
# `Workout` and `Record` elements.
tree = ET.parse('kenji_apple_watch.xml')
root = tree.getroot()

In [33]:

# Step 1: Extract all workouts

# Factors that turn a workout `duration` into minutes, keyed by the lower-cased
# `durationUnit` attribute stored next to it on the <Workout> element.
_MINUTES_PER_DURATION_UNIT = {'s': 1 / 60, 'sec': 1 / 60, 'min': 1.0, 'hr': 60.0}


def _duration_in_minutes(raw_duration, raw_unit):
    """Return a workout duration expressed in minutes, or None when it is missing.

    Parameters
    ----------
    raw_duration : str or None
        The `duration` attribute exactly as read from the XML.
    raw_unit : str or None
        The `durationUnit` attribute exactly as read from the XML.

    Raises
    ------
    ValueError
        If `raw_unit` is present but is not one of the known units, so a
        mislabelled duration never slips silently into `duration_min`.
    """
    if not raw_duration:
        return None
    if raw_unit is None:
        # No unit recorded: keep the historical assumption that the value is
        # in seconds.
        return float(raw_duration) / 60
    unit = raw_unit.strip().lower()
    if unit not in _MINUTES_PER_DURATION_UNIT:
        raise ValueError(f"Unsupported workout durationUnit: {raw_unit!r}")
    return float(raw_duration) * _MINUTES_PER_DURATION_UNIT[unit]


workouts = []
# Only direct <Workout> children of the root element are considered.
for workout in root.findall('Workout'):
    attrs = workout.attrib
    total_energy = attrs.get('totalEnergyBurned')

    workouts.append({
        'type': attrs.get('workoutActivityType'),
        'start': pd.to_datetime(attrs.get('startDate')),
        'end': pd.to_datetime(attrs.get('endDate')),
        # Respect the unit recorded next to the duration instead of always
        # dividing by 60.
        'duration_min': _duration_in_minutes(attrs.get('duration'), attrs.get('durationUnit')),
        'calories': float(total_energy) if total_energy else None
    })

# Turn workouts into a DataFrame
df_workouts = pd.DataFrame(workouts)
print("Available workouts:")
print(df_workouts)

Available workouts:
                                                type  \
0                       HKWorkoutActivityTypeWalking   
1                       HKWorkoutActivityTypeWalking   
2   HKWorkoutActivityTypeTraditionalStrengthTraining   
3   HKWorkoutActivityTypeTraditionalStrengthTraining   
4   HKWorkoutActivityTypeTraditionalStrengthTraining   
5                       HKWorkoutActivityTypeWalking   
6                       HKWorkoutActivityTypeCycling   
7                       HKWorkoutActivityTypeCycling   
8                       HKWorkoutActivityTypeRunning   
9                       HKWorkoutActivityTypeRunning   
10                      HKWorkoutActivityTypeRunning   
11                      HKWorkoutActivityTypeRunning   
12                      HKWorkoutActivityTypeRunning   
13                      HKWorkoutActivityTypeRunning   
14                      HKWorkoutActivityTypeRunning   
15                      HKWorkoutActivityTypeRunning   
16                      HKWo

In [34]:
# Pick the three sessions of interest by their row position in df_workouts
# (comedy, documentary, horror -> rows 47, 48, 49).
comedy_workout, documentary_workout, horror_workout = (
    df_workouts.iloc[position] for position in (47, 48, 49)
)

In [35]:
# Gather the attribute mapping of every <Record> element found anywhere below
# the root, tabulate them, and show which record types are present.
record_list = list(map(lambda node: node.attrib, root.iter('Record')))
record_data = pd.DataFrame(record_list)
record_data['type'].unique()

array(['HKQuantityTypeIdentifierHeight',
       'HKQuantityTypeIdentifierBodyMass',
       'HKQuantityTypeIdentifierHeartRate',
       'HKQuantityTypeIdentifierRespiratoryRate',
       'HKQuantityTypeIdentifierStepCount',
       'HKQuantityTypeIdentifierDistanceWalkingRunning',
       'HKQuantityTypeIdentifierBasalEnergyBurned',
       'HKQuantityTypeIdentifierActiveEnergyBurned',
       'HKQuantityTypeIdentifierFlightsClimbed',
       'HKQuantityTypeIdentifierDietaryFatTotal',
       'HKQuantityTypeIdentifierDietaryFatPolyunsaturated',
       'HKQuantityTypeIdentifierDietaryFatMonounsaturated',
       'HKQuantityTypeIdentifierDietaryFatSaturated',
       'HKQuantityTypeIdentifierDietaryCholesterol',
       'HKQuantityTypeIdentifierDietarySodium',
       'HKQuantityTypeIdentifierDietaryCarbohydrates',
       'HKQuantityTypeIdentifierDietaryFiber',
       'HKQuantityTypeIdentifierDietarySugar',
       'HKQuantityTypeIdentifierDietaryEnergyConsumed',
       'HKQuantityTypeIdentifierDieta

In [36]:
def extract_health_data(start, end, source_root=None):
    """Collect selected health metrics recorded between `start` and `end`.

    Scans the direct `Record` children of an Apple Health export root element,
    keeps the record types listed in `target_types`, and returns them in wide
    form: one row per distinct record start time and one column per metric.
    Samples that share a timestamp and a metric are averaged. Metrics with no
    numeric sample inside the window do not get a column, so the set of
    columns can differ from one window to another.

    Parameters
    ----------
    start, end : timestamp-like
        Inclusive bounds compared against each record's parsed `startDate`.
    source_root : xml.etree.ElementTree.Element, optional
        Element whose `Record` children are scanned. When omitted, the
        notebook-level `root` is used, which keeps existing two-argument
        calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        A `time` column followed by one column per metric present in the
        window. If no record matches (either no tracked type exists in the
        document or none falls inside the window) an empty frame holding only
        the `time` column is returned.
    """
    target_types = {
        'HKQuantityTypeIdentifierHeartRate': 'heart_rate',
        'HKQuantityTypeIdentifierRespiratoryRate': 'respiratory_rate',
        'HKQuantityTypeIdentifierBasalEnergyBurned': 'basal_energy_burned',
        'HKQuantityTypeIdentifierActiveEnergyBurned': 'active_energy_burned',
        'HKQuantityTypeIdentifierVO2Max': 'vo2_max',
        'HKQuantityTypeIdentifierEnvironmentalAudioExposure': 'env_audio_exposure',
        'HKQuantityTypeIdentifierHeadphoneAudioExposure': 'headphone_audio_exposure',
        'HKQuantityTypeIdentifierPhysicalEffort': 'physical_effort',
        'HKCategoryTypeIdentifierAudioExposureEvent': 'audio_exposure_event',
        'HKQuantityTypeIdentifierHeartRateVariabilitySDNN': 'hrv_sdnn'
    }

    def _empty_result():
        # Same layout a populated result has, minus the metric columns.
        return pd.DataFrame(columns=pd.Index(['time'], name='type'))

    # Fall back to the notebook-level tree only when the caller gave none.
    xml_root = root if source_root is None else source_root

    records = []

    for record in xml_root.findall('Record'):
        record_type = record.attrib.get('type')
        if record_type in target_types:
            time = pd.to_datetime(record.attrib.get('startDate'))

            value_raw = record.attrib.get('value')
            try:
                value = float(value_raw)
            except (TypeError, ValueError):
                value = np.nan  # Use NaN for non-numeric values

            records.append({
                'time': time,
                'type': target_types[record_type],
                'value': value
            })

    # Without this guard an empty list yields a frame with no 'time' column
    # and the filter below raises KeyError.
    if not records:
        return _empty_result()

    df = pd.DataFrame(records)

    # Filter by time (both bounds inclusive)
    df = df[(df['time'] >= start) & (df['time'] <= end)]
    if df.empty:
        return _empty_result()

    # Pivot: one column per type
    df_pivoted = df.pivot_table(index='time', columns='type', values='value', aggfunc='mean').reset_index()

    return df_pivoted


In [37]:
# Pull the health metrics recorded during each of the three sessions, using
# the workout's own start/end as the time window.
comedy_data, documentary_data, horror_data = (
    extract_health_data(session['start'], session['end'])
    for session in (comedy_workout, documentary_workout, horror_workout)
)

In [38]:
# Row count (distinct timestamps) per session: comedy, documentary, horror.
for session_frame in (comedy_data, documentary_data, horror_data):
    print(len(session_frame))

961
963
965


In [41]:
# Show which metric columns each session ended up with
# (comedy, documentary, horror).
for session_frame in (comedy_data, documentary_data, horror_data):
    print(session_frame.columns)

Index(['time', 'active_energy_burned', 'basal_energy_burned',
       'env_audio_exposure', 'heart_rate', 'physical_effort'],
      dtype='object', name='type')
Index(['time', 'active_energy_burned', 'basal_energy_burned',
       'env_audio_exposure', 'heart_rate', 'physical_effort'],
      dtype='object', name='type')
Index(['time', 'active_energy_burned', 'basal_energy_burned', 'heart_rate',
       'physical_effort'],
      dtype='object', name='type')


In [48]:
# Persist the comedy session metrics without the positional index.
comedy_data.to_csv(path_or_buf='../data_collection/comedy_aw.csv', index=False)

In [49]:
# Persist the documentary session metrics without the positional index.
documentary_data.to_csv(path_or_buf='../data_collection/documentary_aw.csv', index=False)

In [51]:
# Persist the horror session metrics without the positional index.
# Per the column listing printed earlier, this frame has no
# 'env_audio_exposure' column, so this CSV has one column fewer than the
# other two.
horror_data.to_csv(path_or_buf='../data_collection/horror_aw.csv', index=False)