# Extracting Apple Sleep Data

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
from datetime import datetime, timedelta

# User configuration: edit these two values to match your own data.
# Path to the Apple Health XML export that will be parsed.
file_path = "CB_data/export.xml"
# Name of the CSV file the nightly sleep summary is written to.
csv_file_name = "cheryl_sleep_data.csv"

In [2]:
def extract_sleep_records(file_path):
    """Load the sleep-analysis records from an Apple Health XML export.

    Only ``Record`` elements that are direct children of the XML root and
    whose ``type`` attribute equals ``HKCategoryTypeIdentifierSleepAnalysis``
    are kept; ``Record`` elements nested deeper in the tree are ignored.

    Args:
        file_path: Path of the XML file to parse.

    Returns:
        A DataFrame with the columns ``startDate``, ``endDate`` and ``value``
        holding the raw attribute values (``None`` when an attribute is
        missing), one row per matching record in document order. The three
        columns are present even when no record matches, so callers can
        index them without a KeyError.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    columns = ["startDate", "endDate", "value"]

    # NOTE: ET.parse loads the whole document into memory.
    root = ET.parse(file_path).getroot()

    rows = [
        [record.get(name) for name in columns]
        for record in root.findall("Record")
        if record.get("type") == "HKCategoryTypeIdentifierSleepAnalysis"
    ]

    # Passing the column names explicitly keeps the schema stable for an
    # export that contains no sleep records at all.
    return pd.DataFrame(rows, columns=columns)

In [3]:
# Read the sleep records from the configured export and preview the first rows.
sleep_df = extract_sleep_records(file_path=file_path)
sleep_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'CB_data/export.xml'

In [None]:
def process_sleep_data(df, night_start_hour=20):
    """Collapse raw sleep records into one summary row per night.

    Records whose ``value`` is ``HKCategoryValueSleepAnalysisAwake`` are
    dropped; every other record is kept. Each remaining record is assigned
    to a "night" identified by a calendar date:

    * a record starting at or after ``night_start_hour`` belongs to the
      following calendar day (the morning it leads into);
    * any other record belongs to the calendar day on which it starts.

    The caller's DataFrame is not modified.

    Args:
        df: DataFrame with the columns ``startDate``, ``endDate`` (anything
            ``pd.to_datetime`` accepts) and ``value``.
        night_start_hour: Hour of day (0-23) from which a record counts
            towards the next day's night. Defaults to 20 (8 PM).

    Returns:
        A DataFrame sorted by ``Date`` with the columns ``Date``
        (``datetime.date``), ``total_sleep_hours`` (sum of the record
        durations, so overlapping records are counted more than once),
        ``sleep_start`` (earliest start) and ``sleep_end`` (latest end).
        When there is nothing to summarise, an empty DataFrame with those
        same columns is returned.
    """
    summary_columns = ["Date", "total_sleep_hours", "sleep_start", "sleep_end"]

    # An export without sleep records may yield a frame with no rows (and
    # possibly no columns); there is nothing to group in that case.
    if df.empty:
        return pd.DataFrame(columns=summary_columns)

    # assign() builds a new frame, so the timestamp conversion never leaks
    # back into the caller's DataFrame.
    records = df.assign(
        startDate=pd.to_datetime(df["startDate"]),
        endDate=pd.to_datetime(df["endDate"]),
    )

    # Exclude awake periods.
    records = records[records["value"] != "HKCategoryValueSleepAnalysisAwake"]
    if records.empty:
        return pd.DataFrame(columns=summary_columns)

    records = records.sort_values("startDate").reset_index(drop=True)

    # Late-evening starts are attributed to the next morning's date.
    starts_late = records["startDate"].dt.hour >= night_start_hour
    records["Date"] = [
        day + timedelta(days=1) if late else day
        for day, late in zip(records["startDate"].dt.date, starts_late)
    ]

    records["duration_hours"] = (
        records["endDate"] - records["startDate"]
    ).dt.total_seconds() / 3600

    # Merge all sessions that belong to the same night.
    sleep_summary = (
        records.groupby("Date", sort=True)
        .agg(
            total_sleep_hours=("duration_hours", "sum"),
            sleep_start=("startDate", "min"),
            sleep_end=("endDate", "max"),
        )
        .reset_index()
    )

    return sleep_summary


In [4]:
# Build the per-night summary from the raw sleep records.
sleep_summary_df = process_sleep_data(sleep_df)
# process_sleep_data does not produce an "Unnamed: 0" column, so an
# unconditional drop raises KeyError; errors="ignore" removes the column
# only if it happens to be present.
sleep_summary_df = sleep_summary_df.drop(columns=["Unnamed: 0"], errors="ignore")
sleep_summary_df.head()

NameError: name 'process_sleep_data' is not defined

In [None]:
# Save the nightly summary to disk; with to_csv's defaults the DataFrame
# index is written as the first (unnamed) column.
sleep_summary_df.to_csv(path_or_buf=csv_file_name)