# Extracting Apple Sleep Data

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
from datetime import datetime, timedelta

# Edit these two settings for your own data:
#   file_path     - the exported XML file that is parsed for sleep records
#   csv_file_name - the CSV file the per-night sleep summary is written to
file_path = "CB_data/export.xml"
csv_file_name = "cheryl_sleep_data.csv"

In [2]:
def extract_sleep_records(file_path):
    """Read the sleep-analysis records out of an exported XML file.

    Parameters
    ----------
    file_path : str or file object
        Anything ``xml.etree.ElementTree.parse`` accepts as its source.

    Returns
    -------
    pandas.DataFrame
        One row per ``Record`` element directly under the root whose ``type``
        attribute is ``HKCategoryTypeIdentifierSleepAnalysis``. The columns
        ``startDate``, ``endDate`` and ``value`` hold the raw attribute strings
        (``None`` where an attribute is absent). All three columns are present
        even when no record matches.
    """
    columns = ["startDate", "endDate", "value"]
    root = ET.parse(file_path).getroot()

    # The attribute predicate lets ElementTree do the type filtering; like the
    # plain "Record" path, it only looks at direct children of the root.
    sleep_records = [
        {name: record.get(name) for name in columns}
        for record in root.iterfind(
            "Record[@type='HKCategoryTypeIdentifierSleepAnalysis']"
        )
    ]

    # Passing the column list keeps the schema stable for an empty result, so
    # downstream lookups such as df["startDate"] do not fail with KeyError.
    return pd.DataFrame(sleep_records, columns=columns)

In [3]:
# Load every sleep-analysis record from the export and preview the first rows.
sleep_df = extract_sleep_records(file_path)
sleep_df.head(n=5)

Unnamed: 0,startDate,endDate,value
0,2022-12-18 23:59:55 -0500,2022-12-19 00:01:51 -0500,HKCategoryValueSleepAnalysisInBed
1,2022-12-19 00:02:09 -0500,2022-12-19 00:02:11 -0500,HKCategoryValueSleepAnalysisInBed
2,2022-12-19 00:02:50 -0500,2022-12-19 00:07:38 -0500,HKCategoryValueSleepAnalysisInBed
3,2022-12-19 00:07:53 -0500,2022-12-19 00:08:55 -0500,HKCategoryValueSleepAnalysisInBed
4,2022-12-19 00:08:57 -0500,2022-12-19 00:08:58 -0500,HKCategoryValueSleepAnalysisInBed


In [4]:
# Process the records: total the sleep hours recorded for each night

def process_sleep_data(df):
    """Aggregate raw sleep records into total recorded hours per night.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per record with the columns ``startDate`` and ``endDate``
        (anything ``pd.to_datetime`` can parse) and ``value``. The frame is
        not modified.

    Returns
    -------
    pandas.DataFrame
        One row per night, ordered by ``Date``, with the columns ``Date``
        (a ``datetime.date``) and ``total_sleep_hours`` (float). Empty, but
        with both columns, when there are no records to aggregate.

    Notes
    -----
    Only rows whose ``value`` is ``HKCategoryValueSleepAnalysisAwake`` are
    dropped; every other value is summed.
    NOTE(review): if the data holds overlapping in-bed and asleep records for
    the same interval, the total would count that time twice - confirm.
    """
    summary_columns = ["Date", "total_sleep_hours"]
    if df.empty:
        return pd.DataFrame(columns=summary_columns)

    # Filter into a copy first so the timestamp conversion below never writes
    # into the caller's frame.
    records = df.loc[df["value"] != "HKCategoryValueSleepAnalysisAwake"].copy()
    if records.empty:
        return pd.DataFrame(columns=summary_columns)

    records["startDate"] = pd.to_datetime(records["startDate"])
    records["endDate"] = pd.to_datetime(records["endDate"])

    # Assign a night date with a noon cutoff:
    # - a record starting before noon belongs to the previous day's night,
    # - a record starting at or after noon belongs to that same day's night.
    # Shifting the start back 12 hours and taking the date does exactly that.
    records["Date"] = (records["startDate"] - pd.Timedelta(hours=12)).dt.date

    # Length of each record in hours.
    records["duration_hours"] = (
        records["endDate"] - records["startDate"]
    ).dt.total_seconds() / 3600

    # Every record of a night shares one Date, so a grouped sum merges all the
    # separate sessions of that night into a single total.
    sleep_summary = (
        records.groupby("Date", sort=True)["duration_hours"]
        .sum()
        .rename("total_sleep_hours")
        .reset_index()
    )

    return sleep_summary

In [5]:
# Build the per-night summary and save it to the configured CSV file.
sleep_summary_df = process_sleep_data(sleep_df)
sleep_summary_df.to_csv(csv_file_name)

# Keep the preview as the cell's last expression; a notebook only renders the
# value of the final expression, so a head() call above to_csv shows nothing.
sleep_summary_df.head()