# Imports

In [129]:
import xml.etree.ElementTree as ET
import pandas as pd
from datetime import datetime
import os
import csv 
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

## Parsing XML Files into Data Frames

In [203]:
def parse_xml(file_path):
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    # Initialize dictionaries to hold data
    glucose_data = []
    bolus_data = []
    basal_data = []
    meal_data = []
    finger_stick_data = []
    temp_basal_data = []
    sleep_data = []
    work_data = []
    stressors_data = []
    hypo_event_data = []
    illness_data = []
    exercise_data = []
    basis_heart_rate_data = []
    basis_gsr_data = []
    basis_skin_temp_data = []
    basis_air_temp_data = []
    basis_steps_data = []
    basis_sleep_data = []
    
    # Extract glucose levels
    for event in root.findall('.//glucose_level/event'):
        ts = event.attrib['ts']
        value = float(event.attrib['value'])
        glucose_data.append({'timestamp': ts, 
                             'glucose': value})
    
    # Extract bolus data
    for event in root.findall('.//bolus/event'):
        ts_begin = event.attrib['ts_begin']
        ts_end = event.attrib['ts_end']
        type = event.attrib['type']
        dose = float(event.attrib['dose'])
        carb_input = float(event.attrib['bwz_carb_input'])
        bolus_data.append({'ts_begin': ts_begin,
                           'ts_end': ts_end, 
                           'type': type,
                           'dose': dose, 
                           'carb_input': carb_input})
    
    # Extract basal data
    for event in root.findall('.//basal/event'):
        ts = event.attrib['ts']
        value = float(event.attrib['value'])
        basal_data.append({'timestamp': ts,
                           'basal_rate': value})
    
    # Extract meal data
    for event in root.findall('.//meal/event'):
        ts = event.attrib['ts']
        meal_type = event.attrib['type']
        carbs = float(event.attrib['carbs'])
        meal_data.append({'timestamp': ts,
                          'meal_type': meal_type,
                          'carbs': carbs})
    
    # Extract finger stick data
    for event in root.findall('.//finger_stick/event'):
        ts = event.attrib['ts']
        value = float(event.attrib['value'])
        finger_stick_data.append({'timestamp': ts, 
                                  'finger_stick': value})
    
    # Extract temp basal data
    for event in root.findall('.//temp_basal/event'):
        ts_begin = event.attrib['ts_begin']
        ts_end = event.attrib['ts_end']
        value = float(event.attrib['value'])
        temp_basal_data.append({'ts_begin': ts_begin,
                                'ts_end': ts_end,
                                'temp_basal_rate': value})
    
    # Extract sleep data
    for event in root.findall('.//sleep/event'):
        ts_begin = event.attrib['ts_begin']
        ts_end = event.attrib['ts_end']
        quality = int(event.attrib['quality'])
        sleep_data.append({'ts_begin': ts_begin,
                           'ts_end': ts_end,
                           'quality': quality})
    
    # Extract work data
    for event in root.findall('.//work/event'):
        ts_begin = event.attrib['ts_begin']
        ts_end = event.attrib['ts_end']
        intensity = int(event.attrib['intensity'])
        work_data.append({'ts_begin': ts_begin, 
                          'ts_end': ts_end,
                          'intensity': intensity})
    
    # Extract stressors data
    for event in root.findall('.//stressors/event'):
        ts = event.attrib['ts']
        stress_type = event.attrib['type']
        description = event.attrib['description']
        stressors_data.append({'timestamp': ts, 
                               'type': stress_type,
                               'description': description})
    
    # Extract hypo event data
    for event in root.findall('.//hypo_event/event'):
        ts = event.attrib['ts']
        symptom_name = event.find('symptom').attrib['name']
        hypo_event_data.append({'timestamp': ts, 
                                'symptom_name': symptom_name})
    
    # Extract illness data
    for event in root.findall('.//illness/event'):
        ts_begin = event.attrib['ts_begin']
        ts_end = event.attrib['ts_end']
        illness_type = event.attrib['type']
        description = event.attrib['description']
        illness_data.append({'ts_begin': ts_begin,
                             'ts_end': ts_end,
                             'type': illness_type, 
                             'description': description})
    
    # Extract exercise data
    for event in root.findall('.//exercise/event'):
        ts = event.attrib['ts']
        intensity = int(event.attrib['intensity'])
        exercise_type = event.attrib['type']
        duration = int(event.attrib['duration'])
        competitive = event.attrib['competitive']
        exercise_data.append({'timestamp': ts, 
                              'intensity': intensity, 
                              'type': exercise_type,
                              'duration': duration, 
                              'competitive': competitive})
    
    # Extract basis heart rate data
    for event in root.findall('.//basis_heart_rate/event'):
        ts = event.attrib['ts']
        value = int(event.attrib['value'])
        basis_heart_rate_data.append({'timestamp': ts,
                                      'heart_rate': value})
    
    # Extract basis GSR data
    for event in root.findall('.//basis_gsr/event'):
        ts = event.attrib['ts']
        value = float(event.attrib['value'])
        basis_gsr_data.append({'timestamp': ts, 
                               'gsr': value})
    
    # Extract basis skin temperature data
    for event in root.findall('.//basis_skin_temperature/event'):
        ts = event.attrib['ts']
        value = float(event.attrib['value'])
        basis_skin_temp_data.append({'timestamp': ts, 
                                     'skin_temp': value})
    
    # Extract basis air temperature data
    for event in root.findall('.//basis_air_temperature/event'):
        ts = event.attrib['ts']
        value = float(event.attrib['value'])
        basis_air_temp_data.append({'timestamp': ts,
                                    'air_temp': value})
    
    # Extract basis steps data
    for event in root.findall('.//basis_steps/event'):
        ts = event.attrib['ts']
        value = int(event.attrib['value'])
        basis_steps_data.append({'timestamp': ts,
                                 'steps': value})
    
    # Extract basis sleep data
    for event in root.findall('.//basis_sleep/event'):
        tbegin = event.attrib['tbegin']
        tend = event.attrib['tend']
        quality = int(event.attrib['quality'])
        sleep_type = event.attrib['type']
        basis_sleep_data.append({'tbegin': tbegin, 
                                 'tend': tend,
                                 'quality': quality,
                                 'type': sleep_type})
    
    # Convert to pandas DataFrames
    df_glucose = pd.DataFrame(glucose_data)
    df_bolus = pd.DataFrame(bolus_data)
    df_basal = pd.DataFrame(basal_data)
    df_meal = pd.DataFrame(meal_data)
    df_finger_stick = pd.DataFrame(finger_stick_data)
    df_temp_basal = pd.DataFrame(temp_basal_data)
    df_sleep = pd.DataFrame(sleep_data)
    df_work = pd.DataFrame(work_data)
    df_stressors = pd.DataFrame(stressors_data)
    df_hypo_event = pd.DataFrame(hypo_event_data)
    df_illness = pd.DataFrame(illness_data)
    df_exercise = pd.DataFrame(exercise_data)
    df_basis_heart_rate = pd.DataFrame(basis_heart_rate_data)
    df_basis_gsr = pd.DataFrame(basis_gsr_data)
    df_basis_skin_temp = pd.DataFrame(basis_skin_temp_data)
    df_basis_air_temp = pd.DataFrame(basis_air_temp_data)
    df_basis_steps = pd.DataFrame(basis_steps_data)
    df_basis_sleep = pd.DataFrame(basis_sleep_data)
    
    # Return DataFrames directly
    return df_glucose, df_bolus, df_basal, df_meal, df_finger_stick, df_temp_basal, df_sleep, df_work,
    df_stressors, df_hypo_event, df_illness, df_exercise, df_basis_heart_rate, df_basis_gsr, 
    df_basis_skin_temp, df_basis_air_temp, df_basis_steps, df_basis_sleep

In [178]:
# Path to your XML file
file_path = 'path-to-file.xml'

# Parse the XML and extract data into DataFrames
train_data = parse_xml(file_path)

# Unpack the DataFrames
df_glucose, df_bolus, df_basal, df_meal, df_finger_stick, df_temp_basal, df_sleep, df_work,
df_stressors, df_hypo_event, df_illness, df_exercise, df_basis_heart_rate, df_basis_gsr, 
df_basis_skin_temp, df_basis_air_temp, df_basis_steps, df_basis_sleep = train_data

## Data Cleaning 

#### 1. Convert timestamp to datetime

In [179]:
# List of tuples containing the dataframe and column name
datetime_columns = [
    (df_glucose, 'timestamp'),
    (df_bolus, 'ts_begin'),
    (df_bolus, 'ts_end'),
    (df_basal, 'timestamp'),
    (df_meal, 'timestamp'),
    (df_finger_stick, 'timestamp'),
    (df_temp_basal, 'ts_begin'),
    (df_temp_basal, 'ts_end'),
    (df_sleep, 'ts_begin'),
    (df_sleep, 'ts_end'),
    (df_work, 'ts_begin'),
    (df_work, 'ts_end'),
    (df_stressors, 'timestamp'),
    (df_hypo_event, 'timestamp'),
    (df_illness, 'ts_begin'),
    (df_illness, 'ts_end'),
    (df_exercise, 'timestamp'),
    (df_basis_heart_rate, 'timestamp'),
    (df_basis_gsr, 'timestamp'),
    (df_basis_skin_temp, 'timestamp'),
    (df_basis_air_temp, 'timestamp'),
    (df_basis_steps, 'timestamp'),
    (df_basis_sleep, 'tbegin'),
    (df_basis_sleep, 'tend')
]

# Loop through each dataframe and column pair to convert to datetime
for df, col in datetime_columns:
    if col in df.columns:  # Check if the column exists in the dataframe
        df[col] = pd.to_datetime(df[col], dayfirst=True, errors='coerce')


#### 2. Check for null/missing values, and remove useless columns 

In [180]:
df_stressors = df_stressors.drop(['type', 'description'], axis=1)
df_hypo_event = df_hypo_event.drop('symptom_name', axis=1)
df_illness = df_illness.drop(['ts_end', 'type', 'description'], axis=1)
df_exercise = df_exercise.drop(['type', 'competitive'], axis=1)

#### 2.2 CGM data missing values handling.
- interpolate if gap < 3hrs
- fill with Nan if gap > 3hrs

In [185]:
# Set timestamp as the index
df_glucose.set_index('timestamp', inplace=True)

# Calculate the time difference between consecutive timestamps
df_glucose['diff'] = df_glucose.index.to_series().diff()

# Identify gaps longer than 3 hours
# We need to handle cases where `diff` is NaT (e.g., for the first row)
gaps = df_glucose[df_glucose['diff'].notna() & (df_glucose['diff'] > pd.Timedelta('3H'))]

# Initialize lists to store new timestamps for both smaller and larger gaps
new_timestamps = []

# Generate new timestamps for gaps 3 hours or shorter
for idx in df_glucose.index[1:]:
    prev_time = df_glucose.index[df_glucose.index.get_loc(idx) - 1]
    gap_duration = idx - prev_time
    
    if gap_duration <= pd.Timedelta('3H'):
        # Generate timestamps for smaller gaps
        new_times = pd.date_range(start=prev_time + pd.Timedelta('5T'), end=idx, freq='5T')
        new_timestamps.extend(new_times)
    else:
        # Generate timestamps for larger gaps but don't fill them (NaN values will be preserved)
        new_times = pd.date_range(start=prev_time + pd.Timedelta('5T'), end=idx, freq='5T')
        new_timestamps.extend(new_times)

# Remove duplicates and sort all timestamps
new_timestamps = sorted(set(new_timestamps))

# Reindex the DataFrame to include all new timestamps
all_timestamps = sorted(set(df_glucose.index) | set(new_timestamps))
df_glucose = df_glucose.reindex(all_timestamps)

# Interpolate missing glucose values linearly for gaps 3 hours or shorter
df_glucose['glucose'] = df_glucose['glucose'].interpolate(method='linear')

# Round glucose values to one decimal place
df_glucose['glucose'] = df_glucose['glucose'].round(0)

# Drop the diff column
df_glucose.drop(columns='diff', inplace=True)
df_glucose.reset_index(inplace=True)

#### 3. Create unified dataframe 

##### 3.1 Merge temp basal and basal - override basal with temp basal

In [186]:
# Combine basal and temp basal rates = override the basal value with temp value

# Step 1: Sort DataFrames
df_basal.sort_values('timestamp', inplace=True)
df_temp_basal.sort_values('ts_begin', inplace=True)

# Step 2: Create a combined DataFrame
df_comb_basal = df_basal.copy()

# Step 3: Insert temp basal start and end times into the combined DataFrame
for _, row in df_temp_basal.iterrows():
    # Insert start time
    df_comb_basal = pd.concat([
        df_comb_basal,
        pd.DataFrame({'timestamp': [row['ts_begin']], 'basal_rate': [row['temp_basal_rate']]})
    ]).sort_values('timestamp').reset_index(drop=True)
    
    # Find the original basal rate before the temp basal period
    if row['ts_end'] < df_basal['timestamp'].max():
        previous_basal_value = df_basal[df_basal['timestamp'] <= row['ts_end']].iloc[-1]['basal_rate']
    else:
        previous_basal_value = df_basal[df_basal['timestamp'] <= row['ts_end']].iloc[-1]['basal_rate']
    
    # Insert end time with the original basal rate
    df_comb_basal = pd.concat([
        df_comb_basal,
        pd.DataFrame({'timestamp': [row['ts_end']], 'basal_rate': [previous_basal_value]})
    ]).sort_values('timestamp').reset_index(drop=True)

# Step 4: Interpolate missing values if necessary (optional, based on your needs)
df_comb_basal.set_index('timestamp', inplace=True)
df_comb_basal['basal_rate'].interpolate(method='linear', inplace=True)  # dont need to if checked for null before and handled 
df_comb_basal.reset_index(inplace=True)

##### 3.2 Merge with glucose - resample basal to 5 mins

In [187]:
# Merge glucose and basal variables = resample to 5 minutes interval to match CGM data

# Step 1: Convert timestamps to index for each dataframe
df_glucose.set_index('timestamp', inplace=True)
df_comb_basal.set_index('timestamp', inplace=True)

# Step 2: Merge the dataframes, using the closest earlier basal value for each glucose timestamp
df_glucose_basal = pd.merge_asof(df_glucose, df_comb_basal, on='timestamp', direction='backward')

# Step 3: Reset index if necessary
df_glucose_basal.reset_index(drop=True, inplace=True)

#### 3.3 Merge with sleep - aggregate by date and assign to all 

#### Check and fix if sleep times are reversed

In [188]:
# swap the timestamps where ts_begin is after ts_end
df_sleep.loc[df_sleep['ts_begin'] > df_sleep['ts_end'], ['ts_begin', 'ts_end']] = \
    df_sleep.loc[df_sleep['ts_begin'] > df_sleep['ts_end'], ['ts_end', 'ts_begin']].values

#### Calculate sleep duration and aggregate by date

In [189]:
# Caculate sleep duration in hours
df_sleep['sleep_duration_hours'] = (df_sleep['ts_end'] - df_sleep['ts_begin']).dt.total_seconds() / 3600
df_sleep['sleep_duration_hours'] = df_sleep['sleep_duration_hours'].round(1)

# Extract date and aggregate sleep duration by date
df_sleep['date'] = df_sleep['ts_begin'].dt.date
df_sleep_aggregated = df_sleep.groupby('date')['sleep_duration_hours'].sum().reset_index()

# Concatenate the two dataframes
df_combined = pd.DataFrame()

# Add a date column to the combined dataframe
df_glucose_basal['date'] = df_glucose_basal['timestamp'].dt.date

# Merge the aggregated sleep data with the combined dataframe
df_combined = pd.merge(df_glucose_basal, df_sleep_aggregated, on='date', how='right')

# Drop the intermediate date column if no longer needed
df_combined.drop(columns='date', inplace=True)

##### 4.4 Merge with bolus 

In [192]:
# Rename ts_begin to timestamp to match with df_glucose_basal
df_bolus['bolus_duration_mins'] = (df_bolus['ts_end'] - df_bolus['ts_begin']).dt.total_seconds() / 60.0

# Drop ts_end since we have duration now
df_bolus.drop(columns=['ts_end'], inplace=True)

# Rename to match format of combined dataframe
df_bolus.rename(columns={'ts_begin': 'timestamp', 'type':'bolus_type', 'dose' : 'bolus_dose'}, inplace=True)

# Concatenate the two dataframes
df_combined = pd.concat([df_combined, df_bolus], ignore_index=True)

# Sort by timestamp to maintain chronological order
df_combined.sort_values(by='timestamp', inplace=True)

# Reset the index
df_combined.reset_index(drop=True, inplace=True)

##### 4.4 Merge with meal

In [193]:
# Merge meal data with the combined DataFrame
df_combined = pd.concat([df_combined, df_meal[['timestamp', 'meal_type', 'carbs']]], sort=False)

# Sort by timestamp to maintain chronological order
df_combined.sort_values(by='timestamp', inplace=True)

# Optional: reset index if needed
df_combined.reset_index(drop=True, inplace=True)

##### 4.4 Merge with exercise

In [194]:
df_exercise.rename(columns={'intensity': 'exercise_intensity', 'duration':'exercise_duration_mins'}, inplace=True)

# Merge exercise data with the combined DataFrame
df_combined = pd.concat([df_combined, df_exercise[['timestamp', 'exercise_intensity', 'exercise_duration_mins']]], sort=False)

# Sort by timestamp to maintain chronological order
df_combined.sort_values(by='timestamp', inplace=True)

# Optional: reset index if needed
df_combined.reset_index(drop=True, inplace=True)

#### 5. Forward/backward fill continous values + create flags for events

In [195]:
df_combined[['glucose', 'basal_rate', 'sleep_duration_hours']] = df_combined[['glucose', 'basal_rate', 'sleep_duration_hours']].ffill().bfill()
df_combined['bolus_flag'] = df_combined['bolus_dose'].notnull().astype(int)
df_combined['meal_flag'] = df_combined['carbs'].notnull().astype(int)
df_combined['exercise_flag'] = df_combined['exercise_duration_mins'].notnull().astype(int)
df_combined['bolus_dose'].fillna(0, inplace=True)
df_combined['carbs'].fillna(0, inplace=True)
df_combined['exercise_duration_mins'].fillna(0, inplace=True)
df_combined['bolus_duration_mins'].fillna(0, inplace=True)

#### 6. Extract hour from timestamp

In [196]:
df_combined['hour_of_day'] = df_combined['timestamp'].dt.hour # feature engineering   

#### 7. Examine dataframe and remove unnecessary features 

In [199]:
df_combined = df_combined.drop(columns=['bolus_type', 'exercise_intensity' ,'meal_type', 'carb_input'])

In [202]:
# Define a dictionary to rename your columns
new_column_names = {
    'timestamp': 'Timestamp',
    'glucose': 'Glucose (mg/dL)',
    'basal_rate': 'Basal Rate (U/h)',
    'sleep_duration_hours': 'Sleep Duration (Hrs)',
    'bolus_dose': 'Bolus Dose (U)',
    'bolus_duration_mins': 'Bolus Duration (Mins)',
    'carbs': 'Carbs (Grams)',
    'exercise_duration_mins': 'Exercise Duration (Mins)',
    'bolus_flag': 'Bolus Flag',
    'meal_flag': 'Meal Flag',
    'exercise_flag': 'Exercise Flag',
    'hour_of_day': 'Hour Of Day'
}

# Renaming the columns in the DataFrame
df_combined.rename(columns=new_column_names, inplace=True)

# Display the updated DataFrame with the new column names
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12408 entries, 0 to 12407
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Timestamp                 12408 non-null  datetime64[ns]
 1   Glucose (mg/dL)           12408 non-null  float64       
 2   Basal Rate (U/h)          12408 non-null  float64       
 3   Sleep Duration (Hrs)      12408 non-null  float64       
 4   Bolus Dose (U)            12408 non-null  float64       
 5   Bolus Duration (Mins)     12408 non-null  float64       
 6   Carbs (Grams)             12408 non-null  float64       
 7   Exercise Duration (Mins)  12408 non-null  float64       
 8   Bolus Flag                12408 non-null  int64         
 9   Meal Flag                 12408 non-null  int64         
 10  Exercise Flag             12408 non-null  int64         
 11  Hour Of Day               12408 non-null  int32         
dtypes: datetime64[ns](

#### 8. Save as csv file

In [651]:
# Save DataFrame to a CSV file
df_combined.to_csv('path-to-save-file.csv', index=False)