# iPhone Health Data: XML to CSV

### Imports

In [386]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np

In [387]:
# Parse the Apple Health export into an element tree.
# The path is handed to ElementTree directly so the parser opens the file
# itself and honours the encoding declared in the XML prolog, instead of
# decoding the text with the platform's default encoding first.
data = ET.parse('data/export.xml')
root = data.getroot()


## Easy Transfer XML -> Pandas DF -> CSV

- Larger files take longer to convert, but loading them into pandas makes selecting and sorting columns easier.
- Rule of thumb: roughly 1 minute of processing per 1 GB of export data.

In [388]:
# Flatten every element after the root's first two children into one row:
# the element's tag name followed by all of its XML attributes.
# NOTE(review): the first two children are presumably export metadata rather
# than health records -- confirm against the export file.
record_rows = [
    dict([('tag', element.tag), *element.attrib.items()])
    for element in root[2:]
]

records_df = pd.DataFrame(record_rows)
records_df.to_csv('data/health_data.csv', index=False)

### Separate CSV just for workout data

In [389]:
# Collect the attributes of every <Workout> element into its own frame.
# Rows are built straight from the attributes: the previous version added a
# 'tag' column and then dropped it, which raised KeyError when the export held
# no workouts (an empty frame has no 'tag' column to drop).
workout_df = pd.DataFrame(
    [dict(child.attrib) for child in root[2:] if child.tag == 'Workout']
)
workout_df.to_csv('data/workouts.csv', index=False)

### Separate CSV just for activity data

In [390]:
# Collect the attributes of every <ActivitySummary> element into its own frame.
# Rows are built straight from the attributes: the previous version added a
# 'tag' column and then dropped it, which raised KeyError when the export held
# no activity summaries (an empty frame has no 'tag' column to drop).
activity_df = pd.DataFrame(
    [dict(child.attrib) for child in root[2:] if child.tag == 'ActivitySummary']
)
activity_df.to_csv('data/activity.csv', index=False)

### Get Specific Records from XML

In [391]:
# Number of records per record type, most frequent first.
records_df['type'].value_counts()

type
HKQuantityTypeIdentifierActiveEnergyBurned                879244
HKQuantityTypeIdentifierHeartRate                         517280
HKQuantityTypeIdentifierBasalEnergyBurned                 353209
HKQuantityTypeIdentifierDistanceWalkingRunning            191020
HKQuantityTypeIdentifierStepCount                         135602
HKQuantityTypeIdentifierPhysicalEffort                     97673
HKQuantityTypeIdentifierAppleStandTime                     43341
HKQuantityTypeIdentifierRespiratoryRate                    34431
HKCategoryTypeIdentifierAppleStandHour                     33923
HKQuantityTypeIdentifierAppleExerciseTime                  31436
HKQuantityTypeIdentifierEnvironmentalAudioExposure         30908
HKQuantityTypeIdentifierOxygenSaturation                   23691
HKQuantityTypeIdentifierWalkingSpeed                       21674
HKQuantityTypeIdentifierWalkingStepLength                  21674
HKQuantityTypeIdentifierWalkingDoubleSupportPercentage     19813
HKCategoryTypeIdenti

In [392]:
# XML attributes to keep for the per-type analysis below.
selected_columns = [
    'type',
    'value',
    'sum',
    'unit',
    'startDate',
    'endDate',
]

# Record types of interest; later cells pick one of these by position.
selected_types = [
    'HKQuantityTypeIdentifierActiveEnergyBurned',
    'HKQuantityTypeIdentifierBasalEnergyBurned',
    'HKQuantityTypeIdentifierDistanceWalkingRunning',
    'HKQuantityTypeIdentifierFlightsClimbed',
]

In [393]:
# Build a per-record frame for one record type, plus a per-day total from it.
# selected_types[2] is 'HKQuantityTypeIdentifierDistanceWalkingRunning'.
selected_type = selected_types[2]
timestamp_format = '%Y-%m-%d %H:%M:%S %z'

selected_df = pd.DataFrame(
    [child.attrib for child in root.findall(f'.//*[@type="{selected_type}"]')],
    columns=selected_columns,
)

# exact=False allows the format to match part of the string, so the date-only
# format picks up the leading calendar date of the full timestamp.
selected_df['date'] = pd.to_datetime(selected_df['startDate'], format='%Y-%m-%d', exact=False)
selected_df['startDT'] = pd.to_datetime(selected_df['startDate'], format=timestamp_format, exact=False)
selected_df['endDT'] = pd.to_datetime(selected_df['endDate'], format=timestamp_format, exact=False)

# Where a record carries a 'sum' attribute, it replaces 'value'.
has_sum = selected_df['sum'].notna()
selected_df.loc[has_sum, 'value'] = selected_df.loc[has_sum, 'sum']

selected_df = (
    selected_df
    .drop(columns=['sum', 'startDate', 'endDate'])
    .sort_values(by='startDT')
    .reset_index(drop=True)
)

# Gap between the end of each record and the start of the next one.
# Columns are addressed by name rather than by negative position, so the
# calculation survives column reordering. The last record has no successor:
# it gets a zero Timedelta so the column stays a timedelta dtype (appending
# the integer 0 mixed an int into the timedelta values).
time_to_next = selected_df['startDT'].shift(-1) - selected_df['endDT']
time_to_next.iloc[-1:] = pd.Timedelta(0)
selected_df['time_to_next'] = time_to_next

selected_df['value'] = selected_df['value'].astype(np.float64)

# Daily totals keyed by the calendar date of each record's start.
estimate_df = (
    selected_df
    .groupby('date')['value']
    .sum()
    .reset_index()
    .rename(columns={'value': 'estimate'})
)
# Scalars broadcast to every row; this also works when there are no rows,
# where indexing the first 'type' value would raise.
estimate_df['type'] = selected_type
estimate_df['km'] = 0.0


In [394]:
# Preview the first dozen records, ordered by start time.
selected_df.head(n=12)

Unnamed: 0,type,value,unit,date,startDT,endDT,time_to_next
0,HKQuantityTypeIdentifierDistanceWalkingRunning,0.11278,km,2019-04-26,2019-04-26 23:23:26+02:00,2019-04-26 23:25:13+02:00,0 days 00:16:54
1,HKQuantityTypeIdentifierDistanceWalkingRunning,0.01994,km,2019-04-26,2019-04-26 23:42:07+02:00,2019-04-26 23:46:30+02:00,0 days 01:03:47
2,HKQuantityTypeIdentifierDistanceWalkingRunning,0.04286,km,2019-04-27,2019-04-27 00:50:17+02:00,2019-04-27 00:53:36+02:00,0 days 00:10:09
3,HKQuantityTypeIdentifierDistanceWalkingRunning,0.01617,km,2019-04-27,2019-04-27 01:03:45+02:00,2019-04-27 01:03:53+02:00,0 days 01:07:00
4,HKQuantityTypeIdentifierDistanceWalkingRunning,0.03522,km,2019-04-27,2019-04-27 02:10:53+02:00,2019-04-27 02:12:14+02:00,0 days 00:10:48
5,HKQuantityTypeIdentifierDistanceWalkingRunning,0.09129,km,2019-04-27,2019-04-27 02:23:02+02:00,2019-04-27 02:27:02+02:00,0 days 00:06:34
6,HKQuantityTypeIdentifierDistanceWalkingRunning,0.0513,km,2019-04-27,2019-04-27 02:33:36+02:00,2019-04-27 02:42:52+02:00,0 days 00:02:49
7,HKQuantityTypeIdentifierDistanceWalkingRunning,0.03913,km,2019-04-27,2019-04-27 02:45:41+02:00,2019-04-27 02:48:22+02:00,0 days 00:14:13
8,HKQuantityTypeIdentifierDistanceWalkingRunning,0.05394,km,2019-04-27,2019-04-27 03:02:35+02:00,2019-04-27 03:04:05+02:00,0 days 04:55:20
9,HKQuantityTypeIdentifierDistanceWalkingRunning,0.05323,km,2019-04-27,2019-04-27 07:59:25+02:00,2019-04-27 08:01:56+02:00,0 days 00:13:38


In [395]:
# Preview the first five per-day totals.
estimate_df.head(n=5)

Unnamed: 0,date,estimate,type,km
0,2019-04-26,0.13272,HKQuantityTypeIdentifierDistanceWalkingRunning,0.0
1,2019-04-27,0.52632,HKQuantityTypeIdentifierDistanceWalkingRunning,0.0
2,2019-04-28,1.91316,HKQuantityTypeIdentifierDistanceWalkingRunning,0.0
3,2019-04-29,3.551862,HKQuantityTypeIdentifierDistanceWalkingRunning,0.0
4,2019-04-30,5.939268,HKQuantityTypeIdentifierDistanceWalkingRunning,0.0


In [398]:
# State for the day-window accumulation sketched in the comments below:
# a working copy of the daily estimates, the row of that copy currently being
# filled, and a window that starts at the first record and ends one day later.
sum_df = estimate_df.copy()
sum_idx = 0

dt_start = selected_df.at[0, 'startDT']
dt_end = dt_start + np.timedelta64(1, 'D')

# for idx, row in selected_df.iloc[:1000].iterrows():
    # if next gap of movement more than 8 hours,
    # and the dt + that gap is after ~ 24 hours from the beginning of this day:
        # add today's mvmt to sum_df
        # go to next day on sum_df for the next row

    # if it has been more than 16 hours since the start of the day,
    # but there is no gap of more than 8 hours:
        # add today's mvmt to sum_df
        # keep seeking for next gap of more than 3 hours

    # if row['time_to_next'] < np.timedelta64(8, 'h') and row['startDT'] < dt_end:
    #     sum_df.loc[sum_idx, 'km'] = sum_df.loc[sum_idx, 'km'] + row['value']
    # elif row['time_to_next'] < np.timedelta64(5, 'h') and row['startDT'] > dt_end:
    #     print('a: ', row['time_to_next'], dt_end - row['startDT'])
    # else:
    #     print(row['time_to_next'], dt_end - row['startDT'])
    #     print(dt_start)
    #     sum_idx += 1
    #     dt_start = selected_df.loc[idx+1, 'startDT']
    #     dt_end = dt_start + np.timedelta64(1, 'D')
    #     print(dt_start, '\n')


In [399]:
# Preview the first five rows of the working copy of the daily estimates.
sum_df.head(n=5)

Unnamed: 0,date,estimate,type,km
0,2019-04-26,0.13272,HKQuantityTypeIdentifierDistanceWalkingRunning,0.0
1,2019-04-27,0.52632,HKQuantityTypeIdentifierDistanceWalkingRunning,0.0
2,2019-04-28,1.91316,HKQuantityTypeIdentifierDistanceWalkingRunning,0.0
3,2019-04-29,3.551862,HKQuantityTypeIdentifierDistanceWalkingRunning,0.0
4,2019-04-30,5.939268,HKQuantityTypeIdentifierDistanceWalkingRunning,0.0
