In [7]:
import os
import pandas as pd
import numpy as np
import pytz
import datetime as dt

def _fix_doubling(df: pd.DataFrame, col: str, date_col: str = "date", start_date: str = "2017-09-27", end_date: str = "2018-06-12") -> pd.DataFrame:
    """Fixes doubling happening in a dataframe for a specific column"""
    print(f"Fixing doubling for {col}.")
    
    start_date = pd.to_datetime(start_date).date()
    end_date = pd.to_datetime(end_date).date()

    date_range_mask = (df[date_col] <= end_date) & (df[date_col] >= start_date)
    df.loc[date_range_mask, col] = df.loc[date_range_mask, col] / 2

    return df


def _is_daylight_savings(date, timezone: str = "US/Eastern") -> int:
    """Checks if a given date is in daylight savings time for a specific timezone."""
    # date = date.date()
    tz = pytz.timezone(timezone)
    dt = tz.localize(datetime.combine(date, datetime.min.time()), is_dst=None)
    return int(dt.dst() != timedelta(0))


def _process_interval_data(csv_df, interval_minutes=5):
    def round_time_to_nearest_interval(time):
        minutes = (time.hour * 60) + time.minute
        rounded_minutes = round(minutes / interval_minutes) * interval_minutes
        return dt.time(hour=(rounded_minutes // 60) % 24, minute=rounded_minutes % 60)

    def add_12_hours_to_time(time_obj):
        datetime_obj = dt.datetime.combine(dt.date(1, 1, 1), time_obj)
        datetime_obj += dt.timedelta(hours=12)
        return datetime_obj.time()
    
    df = csv_df.copy()
    
    df = df[["startDate", "endDate", "value"]]
    df['startDate'] = pd.to_datetime(df['startDate']) - pd.Timedelta(hours=12)
    df['date'] = df['startDate'].dt.date
    df['time'] = df['startDate'].dt.time.map(round_time_to_nearest_interval)
    df_grouped = df.groupby(['date', 'time'])['value'].mean().reset_index()
    df_pivot = df_grouped.pivot_table(index='date', columns='time', values='value').reset_index()

    df_pivot.set_index('date', inplace=True)
    time_start = dt.time(hour=9) #9PM Start
    time_end = dt.time(hour=21) # 9AM End
    df_filtered = df_pivot.loc[:, (df_pivot.columns >= time_start) & (df_pivot.columns <= time_end)]
    df_filtered = df_filtered.rename(columns=add_12_hours_to_time)
    df_filtered.iloc[:, 1:] = df_filtered.iloc[:, 1:].interpolate(axis=1).ffill(axis=1).bfill(axis=1)
    df_filtered = df_filtered.reset_index()

    #final_df = unique_dates.merge(df_filtered, on="date", how="left")
    
    df_filtered.columns = [str(name).replace(':', '_') for name in df_filtered.columns]
    
    return df_filtered

def _calculate_night_hours(orig_df):
    df = orig_df.copy()
    
    df['startDate'] = pd.to_datetime(df['startDate']).dt.tz_localize(None)
    df['endDate'] = pd.to_datetime(df['endDate']).dt.tz_localize(None)
    
    df = df.sort_values(by=['startDate', 'endDate'])
    
    # Get the date range in the dataframe
    min_date = df['startDate'].min().date()
    max_date = df['endDate'].max().date()

    # Initialize an empty list to store the results
    results = []

    # Loop through each date in the range
    if pd.notnull(min_date) and pd.notnull(max_date):

        for date in pd.date_range(min_date, max_date):
            # startSleep time boundaries - Based on analysis of train_detailed
            start_day = pd.Timestamp.combine(date, pd.Timestamp('21:30:00').time())
            end_day = pd.Timestamp.combine(date + pd.DateOffset(1), pd.Timestamp('02:30:00').time())
            # endSleep time boundaries - Based on analysis of train_detailed
            start_night = pd.Timestamp.combine(date + pd.DateOffset(1), pd.Timestamp('05:30:00').time())
            end_night = pd.Timestamp.combine(date + pd.DateOffset(1), pd.Timestamp('10:30:00').time())
            # print(f'date {date}, start sleep {start_day}, end sleep {end_day}, start awake {start_night}, end awake {end_night}')

            # Filter the dataframe for start of sleep
            mask_startSleep_startDate = (df['startDate'] >= start_day) & (df['startDate'] <= end_day)
            filtered_startSleep_startDate = df[mask_startSleep_startDate]

            mask_startSleep_endDate = (df['endDate'] >= start_day) & (df['endDate'] <= end_day)
            filtered_startSleep_endDate = df[mask_startSleep_endDate]

            # Filter the dataframe for end of sleep
            mask_endSleep_startDate = (df['startDate'] >= start_night) & (df['startDate'] <= end_night)
            filtered_endSleep_startDate = df[mask_endSleep_startDate]

            mask_endSleep_endDate = (df['endDate'] >= start_night) & (df['endDate'] <= end_night)
            filtered_endSleep_endDate = df[mask_endSleep_endDate]

            # Append the results to the list
            results.append({
                'date': date,
                'startSleep_min_startDate': filtered_startSleep_startDate['startDate'].min(),
                'startSleep_max_startDate': filtered_startSleep_startDate['startDate'].max() ,
                'startSleep_min_endDate': filtered_startSleep_endDate['endDate'].min(),
                'startSleep_max_endDate': filtered_startSleep_endDate['endDate'].max(), 
                'endSleep_min_startDate': filtered_endSleep_startDate['startDate'].min(),
                'endSleep_max_startDate': filtered_endSleep_startDate['startDate'].max(),
                'endSleep_min_endDate': filtered_endSleep_endDate['endDate'].min(),
                'endSleep_max_endDate': filtered_endSleep_endDate['endDate'].max(),
            })

        # Convert the results to a dataframe and return
        result_df = pd.DataFrame(results)

        start_sleep_columns = [
            "startSleep_min_startDate",
            "startSleep_max_startDate",
            "startSleep_min_endDate",
            "startSleep_max_endDate",
        ]   

        end_sleep_columns = [
            "endSleep_min_startDate",
            "endSleep_max_startDate",
            "endSleep_min_endDate",
            "endSleep_max_endDate",
        ]

        # Clips (calculated from last 3 months)
        avg_sleep = 6.78525641025641
        max_sleep = 7.734282511512526
        min_sleep = 5.836230309000293

        for i, start_col in enumerate(start_sleep_columns):
            for j, end_col in enumerate(end_sleep_columns):
                result_df[f"diff_{i}_{j}"] = (result_df[end_col] - result_df[start_col]).dt.total_seconds() / 3600
                result_df[f"diff_{i}_{j}"] = ((avg_sleep + result_df[f"diff_{i}_{j}"])/2).clip(lower=min_sleep, upper=max_sleep)

        for col in start_sleep_columns:
            result_df[f"{col}_hr"] = result_df[col].dt.hour + result_df[col].dt.minute / 60 + result_df[col].dt.second / 3600
            result_df[f"{col}_hr"] = result_df[f"{col}_hr"].apply(lambda x: x + 24 if x < 12 else x) # If the hour is less than 12, add 24 to it

        for col in end_sleep_columns:
            result_df[f"{col}_hr"] = result_df[col].dt.hour + result_df[col].dt.minute / 60 + result_df[col].dt.second / 3600

        result_df = result_df.drop(columns = ['startSleep_min_startDate', 'startSleep_max_startDate', 'startSleep_min_endDate', 'startSleep_max_endDate', 'endSleep_min_startDate', 'endSleep_max_startDate', 'endSleep_min_endDate', 'endSleep_max_endDate']).reset_index(drop=True)
    
    else:
        result_df = pd.DataFrame(columns=['date', 'diff_0_0', 'diff_0_1', 'diff_0_2', 'diff_0_3', 'diff_1_0', 'diff_1_1', 'diff_1_2', 'diff_1_3', 'diff_2_0', 'diff_2_1', 'diff_2_2', 'diff_2_3', 'diff_3_0', 'diff_3_1', 'diff_3_2', 'diff_3_3', 'startSleep_min_startDate_hr', 'startSleep_max_startDate_hr', 'startSleep_min_endDate_hr', 'startSleep_max_endDate_hr', 'endSleep_min_startDate_hr', 'endSleep_max_startDate_hr', 'endSleep_min_endDate_hr', 'endSleep_max_endDate_hr'])
    
    return result_df
        
def _create_xml_features(path: str) -> pd.DataFrame:
    """Build one row of features per day from a single exported-record CSV.

    The file name without extension (``base_name``) names the main aggregate
    column and prefixes most derived columns. Steps, in order:

    1. Read the CSV and convert ``startDate`` / ``endDate`` to US/Eastern.
    2. Assign each record to a ``date`` by shifting ``startDate`` back 12 hours,
       so records before noon count towards the previous calendar day.
    3. Aggregate per ``date``: min/max of both timestamps, the sum of the value
       column (mean for ``BodyMassIndex``), and gap statistics between
       consecutive records.
    4. For HeartRate / OxygenSaturation / RespiratoryRate, add the total duration
       of records whose value is below a fixed threshold (``slp_<name>_est``).
    5. If every ``value`` is numeric, merge the time-bucket columns from
       ``_process_interval_data``.
    6. Merge the sleep-window features from ``_calculate_night_hours`` (computed
       on low-value records only for the three signals above).
    7. Halve ``base_name`` inside the default window of ``_fix_doubling`` and
       drop the four raw timestamp aggregate columns.

    Args:
        path: Path to the CSV file. Progress messages are printed.

    Returns:
        One row per ``date`` (``datetime.date``) with the feature columns above.
    """
    print(f"Featurizing {path}")
    csv_df = pd.read_csv(path, low_memory=False)
    # Everything before the first "." of the file name, e.g. "HeartRate" for "HeartRate.csv".
    base_name = os.path.basename(path).split(".")[0]
    

    # Workout files aggregate "totalEnergyBurned"; every other file aggregates "value".
    value = "totalEnergyBurned" if base_name == "Workout" else "value"
    agg_func = "mean" if base_name == "BodyMassIndex" else "sum"
    
    # NOTE(review): tz_convert is applied directly, so this presumably expects timestamps
    # that parse as timezone-aware — confirm against the export format.
    csv_df["startDate"] = pd.to_datetime(csv_df["startDate"]).dt.tz_convert("US/Eastern")
    csv_df["endDate"] = pd.to_datetime(csv_df["endDate"]).dt.tz_convert("US/Eastern")
    # Shift by 12 hours so one "date" spans noon to noon (an evening plus the next morning).
    csv_df["date"] = (pd.to_datetime(csv_df["startDate"])- pd.to_timedelta('12:00:00')).dt.date
    csv_df["time"] = pd.to_datetime(csv_df["startDate"]).dt.time
    
    csv_df = csv_df.sort_values(by=['startDate', 'endDate'])
    
    # Gap in hours between the end of this record and the start of the next one (NaN on the last row).
    csv_df["hours_between"] = (csv_df["startDate"].shift(-1) - csv_df["endDate"]).dt.total_seconds() / 3600
    # True when subtracting 12 hours keeps the same calendar day, i.e. the record starts at noon or later.
    # NOTE(review): records between midnight and noon are therefore flagged False — confirm this is intended.
    csv_df['is_night'] = (csv_df['startDate'] - pd.Timedelta(hours=12)).dt.date == csv_df['startDate'].dt.date
    
    groupby_agg = {
        "startDate": ["max", "min"],
        "endDate": ["max", "min"],
        f"{value}": agg_func,
        "hours_between" : "max"
    }

    df = csv_df.groupby("date").agg(groupby_agg).reset_index()
    # Flatten the two-level column index to "<column>_<agg>" ("date" keeps its plain name).
    df.columns = ["_".join(tup).rstrip("_") for tup in df.columns.values]

    df = df.rename(columns={f"{value}_{agg_func}": base_name})
    df = df.rename(columns={"hours_between_max": "slp_"+base_name+"_max_hrs_between"})
    
    # Sum hours between if is_night is True 
    def sum_night_hours(group):
        return group.loc[group['is_night'], 'hours_between'].sum()

    # `.values` assigns positionally: this relies on the groupby below yielding the dates
    # in the same order as the aggregation that built `df`.
    df[f"slp_{base_name}_sum_hrs_between"] = csv_df.groupby("date").apply(sum_night_hours).values
    
    # Count hours between if is_night is True (sleep interruptions)
    def count_night_hours(group):
        return group.loc[group['is_night'], 'hours_between'].count()
    
    df[f"slp_{base_name}_count_hrs_between"] = csv_df.groupby("date").apply(count_night_hours).values
    
    # Duration of each record in hours; summed per day over the records whose value is
    # below (or above) a signal-specific threshold.
    csv_df["hours_inbetween"] = (csv_df["endDate"] - csv_df["startDate"]).dt.total_seconds() / 3600
    def custom_features(group, value, operator):
        # `value` here is the numeric threshold (it shadows the outer column-name variable).
        if operator == "<":
            return group.loc[group['value'] < value, 'hours_inbetween'].sum()
        elif operator == ">":
            return group.loc[group['value'] > value, 'hours_inbetween'].sum()
        else:
            raise ValueError("Invalid operator. Only '<' or '>' are allowed.")
    
    # NOTE(review): the thresholds below are unexplained constants — document their origin.
    if base_name == "HeartRate":
        df[f"slp_{base_name}_est"] = csv_df.groupby("date").apply(lambda group: custom_features(group, 52.71599196743369, "<")).values # slp_HeartRate_est
    elif base_name == "OxygenSaturation":
        df[f"slp_{base_name}_est"] = csv_df.groupby("date").apply(lambda group: custom_features(group, 0.9695523020888221, "<")).values # slp_OxygenSaturation_est
    elif base_name == "RespiratoryRate":
        df[f"slp_{base_name}_est"] = csv_df.groupby("date").apply(lambda group: custom_features(group, 17.292111591847622, "<")).values # slp_RespiratoryRate_est
    else:
        pass
    
    for time_col in ["startDate_max", "startDate_min", "endDate_max", "endDate_min"]:
        # Hour of day (integer) of each per-day timestamp extreme
        col_prefix = f"{base_name}_{time_col}_"
        df[col_prefix + "hr"] = df[time_col].dt.hour
        
    
    df["date"] = pd.to_datetime(df["date"]).dt.date
    
    # Check if csv_df "value" is numeric, and if so, calculate interval data
    # NOTE(review): "value" is hard-coded here although Workout files aggregate
    # "totalEnergyBurned" — confirm such files also carry a "value" column.
    if pd.to_numeric(csv_df["value"], errors='coerce').notnull().all():

        # Time intervals
        intervals = _process_interval_data(csv_df)
        intervals = intervals.add_prefix(f"slp_{base_name}_")
        # add_prefix also renamed the join key; restore it.
        intervals = intervals.rename(columns={f"slp_{base_name}_date": "date"})
        intervals["date"] = pd.to_datetime(intervals["date"]).dt.date
        df = df.merge(intervals, how="left", on = "date")


    # Night Hours: for these signals keep only low-value records before locating the sleep windows.
    # NOTE(review): these cut-offs differ slightly from the slp_<name>_est thresholds above — confirm.
    if base_name == "RespiratoryRate":
        csv_df = csv_df[csv_df["value"] < 18.0]
    elif base_name == "OxygenSaturation":
        csv_df = csv_df[csv_df["value"] < 0.97]
    elif base_name == "HeartRate":
        csv_df = csv_df[csv_df["value"] < 51.0]
    else:
        pass
    
    night_hours_df = _calculate_night_hours(csv_df)
    night_hours_df = night_hours_df.add_prefix(f"slp_{base_name}_")
    night_hours_df = night_hours_df.rename(columns={f"slp_{base_name}_date": "date"})
    night_hours_df["date"] = pd.to_datetime(night_hours_df["date"]).dt.date
    df = df.merge(night_hours_df, how="left", on = "date")


    # Halves only the `base_name` aggregate, inside _fix_doubling's default date window.
    df = _fix_doubling(df, base_name)
    df = df.drop(columns=["startDate_max", "startDate_min", "endDate_max", "endDate_min"])
    
    # Drop non-numeric columns
    # df = df[df.columns[df.columns.isin(['date']) | df.dtypes.isin(['number'])]]

    return df

In [18]:
def create_xml_data(path = "./data/xml_export", xml_files_names = ["RespiratoryRate"]):
    """Featurize exported CSV files and merge them onto a daily calendar.

    Args:
        path: Directory containing the ``<name>.csv`` files.
        xml_files_names: Export names without the ``.csv`` extension. The list
            is only read, never modified.

    Returns:
        Frame with one ``date`` (``datetime.date``) row per day from 2015-01-01
        to 2023-12-31, outer-merged with the features of every requested file,
        so dates outside that range that appear in a file are kept as well.

    Note:
        Files whose name contains "Workout" are handled by ``_workout_features``,
        which is expected to be defined elsewhere in the notebook; every other
        file goes through ``_create_xml_features``.
    """
    print("Creating XML data")

    csv_paths = [os.path.join(path, f"{xml_file}.csv") for xml_file in xml_files_names]

    # Create DataFrame with date column from 1/1/2015 to 12/31/2023
    xml_data = pd.DataFrame({"date": pd.date_range(start="1/1/2015", end="12/31/2023", freq="D")})
    xml_data["date"] = xml_data["date"].dt.date

    # Parse each xml file output and merge with the calendar
    for csv_path in csv_paths:
        # Decide on the FILE NAME only: testing the full path would send every file
        # to the workout featurizer whenever the directory name contains "Workout".
        is_workout = "Workout" in os.path.basename(csv_path)
        xml = _workout_features(csv_path) if is_workout else _create_xml_features(csv_path)

        xml["date"] = pd.to_datetime(xml["date"]).dt.date

        xml_data = pd.merge(xml_data, xml, on="date", how="outer")

    return xml_data

In [34]:
#test.slp_RespiratoryRate_diff_0_3.isna().value_counts()

# Earliest date (`.min()`) on which slp_RespiratoryRate_diff_0_3 is not NaN.
# NOTE(review): requires `test` from the `test = create_xml_data()` cell, which sits
# further down the notebook — run that cell first (or move this one below it).
test[test.slp_RespiratoryRate_diff_0_3.notna()].date.min()


datetime.date(2021, 10, 25)

In [19]:
# Build the XML feature table with the defaults: RespiratoryRate.csv under ./data/xml_export.
test = create_xml_data()

Creating XML data
Featurizing ./data/xml_export/RespiratoryRate.csv
Fixing doubling for RespiratoryRate.


In [29]:
# Load the training table and turn `date` into plain datetime.date objects.
df = pd.read_csv("./data/train.csv")
df["date"] = pd.to_datetime(df["date"]).dt.date

# Keep rows from 2015-06-01 onwards (hard-coded here; the commented line is the class-based variant).
#df = df[df["date"] >= pd.to_datetime(self.start_date)].reset_index(drop=True)
df = df[df["date"] >= pd.to_datetime("2015-06-01").date()].reset_index(drop=True)


df = df.sort_values(by="date")
# Halve sleep_hours inside _fix_doubling's default date window (modifies df in place).
df = _fix_doubling(df, "sleep_hours")

start_date, end_date = df["date"].min(), df["date"].max()
print(f"Start date: {start_date}, End date: {end_date}")

# Build a gap-free daily calendar over the observed range so missing days become NaN rows.
date_range = pd.date_range(start=start_date, end=end_date, freq="D")
date_range = pd.DataFrame({"date": date_range})
date_range["date"] = date_range["date"].dt.date

df = date_range.merge(df, on="date", how="left")
print(f"Missing days: {df.sleep_hours.isna().sum()}")

# Requires `test` from the `test = create_xml_data()` cell.
df = df.merge(test, on="date", how="left") #Add XML data


Fixing doubling for sleep_hours.
Start date: 2015-06-08, End date: 2021-12-31
Missing days: 87


In [32]:
# Latest date present in the merged training frame.
df.date.max()

datetime.date(2021, 12, 31)