In [None]:
import pandas as pd
import numpy as np
import os

df = pd.read_csv('./data/train.csv')
sub = pd.read_csv('./data/sample_submission.csv')

# Make sure sorted by date
df = df.sort_values(by='date')
sub = sub.sort_values(by='date')

# Format date as date
df["date"] = pd.to_datetime(df["date"]).dt.date
sub["date"] = pd.to_datetime(sub["date"]).dt.date

In [None]:
df.head()

In [None]:
# This is going to be a common problem to fix, but tracking is doubled 9/27/2017 and 6/12/2018

def fix_double_tracking(df, col, date_col="date"):
    df.loc[(df[date_col] <= pd.to_datetime("2018-06-12").date()) & (df[date_col] >= pd.to_datetime("2017-09-27").date()), col] = \
    df.loc[(df[date_col] <= pd.to_datetime("2018-06-12").date()) & (df[date_col] >= pd.to_datetime("2017-09-27").date()), col] / 2
    return df


In [None]:
# sleep_hours between 9/27/2017 and 6/12/2018 is doubled and needs to be divided by 2
df = fix_double_tracking(df, "sleep_hours")

In [None]:
df.sleep_hours.hist()

In [None]:
# Fill in missing dates (for time series)

start_date = df["date"].min()
print(f"Start date: {start_date}")

end_date = df["date"].max()
print(f"End date: {end_date}")

date_range = pd.date_range(start=start_date, end=end_date, freq="D")
date_range = pd.DataFrame({"date": date_range})
date_range["date"] = date_range["date"].dt.date

df = date_range.merge(df, on="date", how="left")
print('missing days: ', df.sleep_hours.isna().sum())

In [None]:
# forward fill the missing values/targets
df["sleep_hours"] = df["sleep_hours"].fillna(method="ffill")

In [None]:
#plot df_sleep.sleep_hours as dots by df_sleep.date
df.plot.scatter(x='date', y='sleep_hours')

In [None]:
# Median of sleep_hours
med = df.sleep_hours.median()
print(f"Median sleep hours: {med}")


In [None]:
sub_median = sub.copy()
sub_median["sleep_hours"] = med
sub_median.to_csv("./submissions/submission_median.csv", index=False)
sub_median.head()

# kaggle competitions submit -c kaggle-pog-series-s01e04 -f ./submissions/submission_median.csv -m "Trying just all median"
# Score : 0.67685

In [None]:
# Median of sleep_hours
mean = df.sleep_hours.mean()
print(f"Median sleep hours: {mean}")

In [None]:
sub_mean = sub.copy()
sub_mean["sleep_hours"] = mean
sub_mean.to_csv("./submissions/submission_mean.csv", index=False)
sub_mean.head()

# kaggle competitions submit -c kaggle-pog-series-s01e04 -f ./submissions/submission_mean.csv -m "Trying just all mean"
# Score: 0.67396

In [None]:
import pytz
from datetime import datetime, timedelta
from pandas.tseries.holiday import USFederalHolidayCalendar
### Time series Data ###

df["day_of_week"] = pd.to_datetime(df["date"]).dt.dayofweek # Day of the week
df["month"] = pd.to_datetime(df["date"]).dt.month # Month
df["is_weekend"] = df["day_of_week"].apply(lambda x: 1 if x >= 5 else 0) # is_weekend

cal = USFederalHolidayCalendar()
holidays = cal.holidays(start="2014-01-01", end="2023-12-31") 
df["is_holiday"] = df["date"].apply(lambda x: 1 if x in holidays else 0) # is_holiday
df["is_workday"] = 1-(df["is_weekend"] + df["is_holiday"]) # is_workday

In [None]:
df.head()

In [None]:
# Create submissions by median by DOW
sub_median_group = sub.copy()
med = df.groupby("day_of_week")["sleep_hours"].median()
sub_median_group["day_of_week"] = pd.to_datetime(sub_median_group["date"]).dt.dayofweek
sub_median_group["sleep_hours"] = sub_median_group["day_of_week"].map(med)
sub_median_group = sub_median_group.drop("day_of_week", axis=1)
sub_median_group.to_csv("./submissions/submission_median_group.csv", index=False)
sub_median_group.head()

# kaggle competitions submit -c kaggle-pog-series-s01e04 -f ./submissions/submission_median_group.csv -m "Trying all median by DOW"
# Score: 0.67406

In [None]:
# Create submissions by median by DOW
sub_mean_group = sub.copy()
mean = df.groupby("day_of_week")["sleep_hours"].mean()
sub_mean_group["day_of_week"] = pd.to_datetime(sub_mean_group["date"]).dt.dayofweek
sub_mean_group["sleep_hours"] = sub_mean_group["day_of_week"].map(mean)
sub_mean_group = sub_mean_group.drop("day_of_week", axis=1)
sub_mean_group.to_csv("./submissions/submission_mean_group.csv", index=False)
sub_mean_group.head()

# kaggle competitions submit -c kaggle-pog-series-s01e04 -f ./submissions/submission_mean_group.csv -m "Trying all mean by DOW"
# Score: ________________

In [None]:
# Submission by median by DOW and and is_workday
sub_median_group_workday = sub.copy()
med = df.groupby(["day_of_week", "is_workday"])["sleep_hours"].median()
sub_median_group_workday["day_of_week"] = pd.to_datetime(sub_median_group_workday["date"]).dt.dayofweek
sub_median_group_workday["is_workday"] = 1-(sub_median_group_workday["day_of_week"].apply(lambda x: 1 if x >= 5 else 0) + sub_median_group_workday["date"].apply(lambda x: 1 if x in holidays else 0))
sub_median_group_workday["sleep_hours"] = sub_median_group_workday[["day_of_week", "is_workday"]].apply(lambda x: med[x[0]][x[1]], axis=1)
sub_median_group_workday = sub_median_group_workday.drop(["day_of_week", "is_workday"], axis=1)
sub_median_group_workday.to_csv("./submissions/submission_median_group_workday.csv", index=False)
sub_median_group_workday.head()
# kaggle competitions submit -c kaggle-pog-series-s01e04 -f ./submissions/submission_median_group_workday.csv -m "Trying all median by DOW and is_workday"
# Score: __________________

In [None]:
# Submission by mean by DOW and and is_workday
sub_mean_group_workday = sub.copy()
mean = df.groupby(["day_of_week", "is_workday"])["sleep_hours"].mean()
sub_mean_group_workday["day_of_week"] = pd.to_datetime(sub_mean_group_workday["date"]).dt.dayofweek
sub_mean_group_workday["is_workday"] = 1-(sub_mean_group_workday["day_of_week"].apply(lambda x: 1 if x >= 5 else 0) + sub_mean_group_workday["date"].apply(lambda x: 1 if x in holidays else 0))
sub_mean_group_workday["sleep_hours"] = sub_mean_group_workday[["day_of_week", "is_workday"]].apply(lambda x: mean[x[0]][x[1]], axis=1)
sub_mean_group_workday = sub_mean_group_workday.drop(["day_of_week", "is_workday"], axis=1)
sub_mean_group_workday.to_csv("./submissions/submission_mean_group_workday.csv", index=False)
sub_mean_group_workday.head()
# kaggle competitions submit -c kaggle-pog-series-s01e04 -f ./submissions/submission_mean_group_workday.csv -m "Trying all mean by DOW and is_workday"
# Score: ________________

In [None]:
from glob import glob

csv_files = glob("./data/xml_export/*.csv")

# Keep what files have ~80% of the dates in the training data
unique_dates = set(df["date"])

match_threshold = 0.75 # Threshold for matching

def intersection_ratio(set1, set2):
    return len(set1 & set2) / len(set1)

matching_csvs = []
for csv_file in csv_files:
    csv_df = pd.read_csv(csv_file)  
    try: 
        csv_df["date"] = pd.to_datetime(csv_df["startDate"]).dt.date
    except:
        csv_df["date"] = pd.to_datetime(csv_df["dateComponents"]).dt.date # Some use dateComponents instead of startDate

    unique_creation_dates = set(csv_df)

    ratio = intersection_ratio(unique_dates, unique_creation_dates)

    if ratio >= match_threshold:
        matching_csvs.append(csv_file)

print("CSV files with at least 75% of the 'dates' from the original DataFrame:")
for matching_csv in matching_csvs:
    print(matching_csv)

In [None]:

csv_df.loc[csv_df["date"] >= pd.to_datetime("2021-01-01").date()]

In [None]:
list = ["test1", "test2", "test3"]


In [None]:
# Datasets worth exploring
basal_energy_burned = pd.read_csv('./data/xml_export/BasalEnergyBurned.csv', low_memory=False)
body_mass = pd.read_csv('./data/xml_export/BodyMass.csv', low_memory=False)
flights_climbed = pd.read_csv('./data/xml_export/FlightsClimbed.csv', low_memory=False)
step_count = pd.read_csv('./data/xml_export/StepCount.csv', low_memory=False)
body_mass_index = pd.read_csv('./data/xml_export/BodyMassIndex.csv', low_memory=False)
distance_walking_running = pd.read_csv('./data/xml_export/DistanceWalkingRunning.csv', low_memory=False)


In [None]:

def parse_xml_output(path):
    # Import the csv
    csv_df = pd.read_csv(path, low_memory=False)
    base_name = os.path.basename(path).split(".")[0]
    
    # BodyMassIndex we want to avg not sum
    if base_name == "BodyMassIndex":
        agg_func = "mean"
    else:
        agg_func = "sum"
    
    # Convert startDate and endDate columns to datetime objects
    csv_df["startDate"] = pd.to_datetime(csv_df["startDate"]).dt.tz_convert ("US/Eastern")
    csv_df["endDate"] = pd.to_datetime(csv_df["endDate"]).dt.tz_convert ("US/Eastern")
    # Create date & time column
    csv_df["date"] = pd.to_datetime(csv_df["startDate"]).dt.date
    csv_df["time"] = pd.to_datetime(csv_df["startDate"]).dt.time
    
    # Group by date and perform aggregations
    csv_df = csv_df.groupby("date").agg(
        max_start_time=pd.NamedAgg(column="startDate", aggfunc="max"),
        min_start_time=pd.NamedAgg(column="startDate", aggfunc="min"),
        max_end_time=pd.NamedAgg(column="endDate", aggfunc="max"),
        min_end_time=pd.NamedAgg(column="endDate", aggfunc="min"),
        value_sum=pd.NamedAgg(column="value", aggfunc=agg_func)
    ).reset_index()
    
    # Check if the dates match between min_start_time and max_end_time
    csv_df["dates_match"] = csv_df.apply(
        lambda row: row["min_start_time"].date() == row["max_end_time"].date(), axis=1)
    csv_df = csv_df.rename(columns={"value_sum": base_name})
    
    # Assert if dates match in all rows
    # assert csv_df["dates_match"].all(), "Dates do not match in some rows"
    
    # Trigonomic Hours
    csv_df[base_name+"_mx_st_hr_sin"] = np.sin(2 * np.pi * csv_df["max_start_time"].dt.hour / 24)
    csv_df[base_name+"_mx_st_hr_cos"] = np.cos(2 * np.pi * csv_df["max_start_time"].dt.hour / 24)
    csv_df[base_name+"_mn_st_hr_sin"] = np.sin(2 * np.pi * csv_df["min_start_time"].dt.hour / 24)
    csv_df[base_name+"_mn_st_hr_cos"] = np.cos(2 * np.pi * csv_df["min_start_time"].dt.hour / 24)
    csv_df[base_name+"_mx_et_hr_sin"] = np.sin(2 * np.pi * csv_df["max_end_time"].dt.hour / 24)
    csv_df[base_name+"_mx_et_hr_cos"] = np.cos(2 * np.pi * csv_df["max_end_time"].dt.hour / 24)
    csv_df[base_name+"_mn_et_hr_sin"] = np.sin(2 * np.pi * csv_df["min_end_time"].dt.hour / 24)
    csv_df[base_name+"_mn_et_hr_cos"] = np.cos(2 * np.pi * csv_df["min_end_time"].dt.hour / 24) 
    
    #! HIGHLY ILLEGAL FEATURE??? (Step Count basically calculates sleep time)
    # Hours between startDate and next startDate (lag = -1)
    csv_df[base_name+"_hours_between"] = (csv_df["min_start_time"].shift(-1) - csv_df["max_start_time"]).dt.total_seconds() / 3600
    
    #Fix double tracking
    csv_df = fix_double_tracking(csv_df, base_name)
    
    # Drop unnecessary columns
    # csv_df = csv_df.drop(columns=["max_start_time", "min_start_time", "max_end_time", "min_end_time", "dates_match"])

    return csv_df

## Basal Energy Burned

In [None]:
basal_energy_burned.head()

In [None]:
# Value is the only useful feature
# Group by date and sum (set alias to basal_energy_burned)

# NOTE: Summing assumes duplicate tracking is not occuring between devices

basal_energy_burned = parse_xml_output("./data/xml_export/BasalEnergyBurned.csv")
basal_energy_burned.head()

In [None]:
# Check distribution
basal_energy_burned.BasalEnergyBurned.hist()

In [None]:
basal_energy_burned.plot.scatter(x='date', y='BasalEnergyBurned')

Looks like Rob got a better tracker or became more serious about racking in 2021

# Body Mass

In [None]:
body_mass.head()

In [None]:
body_mass["date"] = pd.to_datetime(body_mass["startDate"]).dt.date
body_mass = body_mass.groupby("date").sum().reset_index()
body_mass = body_mass.rename(columns={"value": "body_mass"})
body_mass = fix_double_tracking(body_mass, "body_mass")
body_mass.head()

In [None]:
# Check distribution
body_mass.body_mass.hist()

In [None]:
body_mass.plot.scatter(x='date', y='body_mass')

## Flights Climbed

In [None]:
flights_climbed.head()

In [None]:
flights_climbed["date"] = pd.to_datetime(flights_climbed["startDate"]).dt.date
flights_climbed = flights_climbed.groupby("date").sum().reset_index()
flights_climbed = flights_climbed.rename(columns={"value": "flights_climbed"})
flights_climbed = fix_double_tracking(flights_climbed, "flights_climbed")
flights_climbed.head()

In [None]:
# Check distribution
flights_climbed.flights_climbed.hist()

# Definitely lognormal

In [None]:
flights_climbed.plot.scatter(x='date', y='flights_climbed')

Very weird data. Will have to explore further. Possible duplication?

## Step Count

In [None]:
step_count.head()

In [None]:
step_count["date"] = pd.to_datetime(step_count["startDate"]).dt.date
step_count = step_count.groupby("date").sum().reset_index()
step_count = step_count.rename(columns={"value": "step_count"})
step_count = fix_double_tracking(step_count, "step_count")
step_count.head()

In [None]:
# Check distribution
step_count.step_count.hist()

#kind of lognormal but not quite

In [None]:
step_count.plot.scatter(x='date', y='step_count')

## Body Mass Index
This is a slow changing variable


In [None]:
body_mass_index.head()

In [None]:
body_mass_index["date"] = pd.to_datetime(body_mass_index["startDate"]).dt.date
body_mass_index = body_mass_index.groupby("date").mean(numeric_only =True).reset_index() #! MEAN here
body_mass_index = body_mass_index.rename(columns={"value": "body_mass_index"})
body_mass_index = body_mass_index.drop(columns=["sourceVersion", "device"], axis=1)
body_mass_index.head()

In [None]:
# Check distribution
body_mass_index.body_mass_index.hist()

In [None]:
body_mass_index.plot.scatter(x='date', y='body_mass_index')

Not measured frequently in 2012-2015

## Distance Walking/Running

In [None]:
distance_walking_running.head()

In [None]:
distance_walking_running = parse_xml_output("./data/xml_export/DistanceWalkingRunning.csv")
distance_walking_running.head()

In [None]:
# Check distribution
distance_walking_running.DistanceWalkingRunning.hist()

# Lognormal

In [None]:
distance_walking_running.plot.scatter(x='date', y='DistanceWalkingRunning')

In [None]:
# Extract the hour and minute information from the datetime column and convert it to minutes
distance_walking_running['time_minutes'] = distance_walking_running['min_start_time'].dt.hour * 60 + distance_walking_running['min_start_time'].dt.minute

# Define the bin size (in minutes)
bin_size = 60  # 1-hour bins

# Create the bins using numpy
bins = np.arange(0, 24*60 + bin_size, bin_size)

# Plot the histogram using seaborn's histplot with time bins
plt.figure(figsize=(10, 6))
sns.histplot(data=distance_walking_running, x='time_minutes', bins=bins, kde=True)
plt.xticks(range(0, 24*60+1, bin_size), [f'{i:02d}:00' for i in range(0, 24+1, bin_size//60)], rotation=45)
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.title('Frequency of Time')
plt.show()

In [None]:
distance_walking_running['time'].dt.seconds

# Feature Engineering

In [None]:
# Combine all data into one DataFrame

for d in [basal_energy_burned, body_mass, flights_climbed, step_count, body_mass_index, distance_walking_running]:
    df = df.merge(d, on="date", how="left")

# Time series data so use ffill
df = df.fillna(method="ffill")
# Note: also ffills sleep_hours 

df.head()

In [None]:
# Interactions
df["distance_per_step"] = df["distance"] / df["step_count"] # To account for jumping, hiking, etc.
df["calorie_per_step"] = df["basal_energy_burned"] / df["step_count"] # To account for intensity of exercise
d["calorie_per_distance"] = df["basal_energy_burned"] / df["distance"] # Gym days vs. Outdoor days

df.head()

In [None]:
# Time series predictions to come later
# Try a column for every sleep_hours for the past week


# YData Profiling


In [None]:
from helper import POG4_Dataset

data = POG4_Dataset()

#print(df.columns.to_list())

In [None]:
data.features

In [None]:

df["sleep_h"].value_counts(normalize=True, dropna=False).iloc[0] > threshold]


In [None]:
from ydata_profiling import ProfileReport

profile_main = ProfileReport(df, title="POG4 Profiling Report", explorative=True)
profile_main.to_file("data_profile_main.html")

profile_ts = ProfileReport(df, tsmode=True, sortby="date", title="Time-Series EDA", explorative=True)
profile_ts.to_file("data_profile_ts.html")

In [None]:
# Merge Submissions test

from glob import glob

files = glob("./submissions/*.csv")

sub_all = pd.concat([pd.read_csv(f) for f in files], axis=0, ignore_index=True)
sub_all = sub_all.groupby("date")["sleep_hours"].mean().reset_index()
sub_all.to_csv("./submissions/submission_combined.csv", index=False)
sub_all.head()


In [None]:
from data import POG4_Dataset

data = POG4_Dataset()
#data.create_lags()


In [None]:
data.X.DistanceWalkingRunning_nhours_avg_avg.value_counts()

In [None]:
for i in data.X.columns:
    print(i)

In [None]:
print('nas', data.X.HeartRate_sleep_hours.isna().sum())
print(len(data.X))
data.X.HeartRate_sleep_hours.value_counts()

In [None]:
data.X.HeartRate_sleep_hours.hist()

In [None]:
import pandas as pd
print(pd.value_counts(data.X.dtypes))


In [None]:
import pandas as pd
pd.concat([data.X_test.reset_index(drop=True), data.y_test.reset_index(drop=True)], axis=1).isna().sum().sum()

In [None]:
1915/2508

In [None]:
import wandb
import pandas as pd

api = wandb.Api()
runs = api.runs("sgobat/pog4_xgb_classifier")
runs_2 = api.runs("sgobat/pog4_xgb")
# Initialize a list to store feature importances from all runs
feature_importances_list = []

# Iterate through the runs and fetch feature importances
for run in runs:
    # Fetch logged feature importances from the current run
    feature_importances = {
        key.replace("feature/", ""): value
        for key, value in run.summary.items()
        if key.startswith("feature/")
    }
    
    # Append the fetched feature importances to the list
    feature_importances_list.append(feature_importances)


for run in runs_2:
    # Fetch logged feature importances from the current run
    feature_importances = {
        key.replace("feature/", ""): value
        for key, value in run.summary.items()
        if key.startswith("feature/")
    }
    
    # Append the fetched feature importances to the list
    feature_importances_list.append(feature_importances)

# Create a DataFrame from the list of feature importances
df = pd.DataFrame(feature_importances_list)

# Calculate the mean feature importances across all runs
mean_feature_importances = df.sum().sort_values(ascending=False)

In [None]:
with pd.option_context("display.max_rows", 1000):
    display(mean_feature_importances)

In [None]:
feat_imp_threshold = mean_feature_importances.median() #- 1*mean_feature_importances.std()
print(feat_imp_threshold)

for i in mean_feature_importances.index:
    if mean_feature_importances[i] > feat_imp_threshold:
        print(i)

In [None]:
 # Get first doy_mean where is_tuesday == 1
df.loc[df["is_tuesday"] == 1, "doy_mean"]


In [None]:
[
"AppleStandTime_hrs_btween",
"BodyMassIndex_hrs_btween",
"OxygenSaturation_hrs_btween",
"is_weekend",
"appleExerciseTime",
"AppleStandTime",
"AppleStandHour_hrs_btween",
"VO2Max",
"AppleExerciseTime_hrs_btween",
"dow_median",
"appleStandHours",
"AppleStandTime_night_hours",
"AppleExerciseTime",
"day_of_week",
"avg_endDate_max_sin",
"max_startDate_min_hr",
"activeEnergyBurned",
"avg_startDate_min_sin",
"avg_startDate_min_hr",
"calorie_per_step",
"DistanceWalkingRunning_night_hours",
"month_sin",
"HeadphoneAudioExposure",
"avg_startDate_min_cos",
"max_endDate_min_hr",
"FlightsClimbed_hrs_btween",
"avg_endDate_min_sin",
"avg_endDate_max_cos",
"calorie_per_distance",
"HeartRateVariabilitySDNN_hrs_btween",
"DistanceWalkingRunning_hrs_btween",
"min_endDate_max_hr",
"StepCount_night_hours",
"FlightsClimbed_night_hours",
"distance_per_step",
"day_of_year",
"max_startDate_max_hr",
"min_endDate_min_hr",
"min_startDate_min_hr",
"StepCount_hrs_btween",
"BodyMassIndex",
"VO2Max_hrs_btween",
"doy_sin",
"month",
"month_cos",
"avg_startDate_max_sin",
"DistanceWalkingRunning",
"HeadphoneAudioExposure_hrs_btween",
"AppleExerciseTime_night_hours",
]

In [None]:
import pandas as pd
apple_stand_time = pd.read_csv("./data/xml_export/AppleStandTime.csv", low_memory=False)

In [None]:
df = apple_stand_time.copy()

def calculate_night_hours(df):
    df['startDate'] = pd.to_datetime(df['startDate'])
    df['endDate'] = pd.to_datetime(df['endDate'])

    # Define the time range for night time
    start_hour = 20  # 8 PM
    end_hour = 8  # 8 AM

    df = df[((df['startDate'].dt.hour >= start_hour) & (df['startDate'].dt.hour < 24)) |
                ((df['endDate'].dt.hour >= 0) & (df['endDate'].dt.hour < end_hour))]

    df['adjusted_startDate'] = (df['startDate'] - pd.to_timedelta('12:00:00')).dt.date # Subtract 12 hours from startDate
    df = df.groupby(['adjusted_startDate', 'sourceName']).agg(startDate=('startDate', 'min'), endDate=('endDate', 'max'), totalValue=('value', 'sum')).reset_index()
    df["night_hours"] = (df['endDate'] - df['startDate']).dt.total_seconds() / 3600

    return df

df = calculate_night_hours(df)
df.head(10)

In [None]:
df.night_hours.hist()

In [None]:
#See where sourcename is Rob’s Apple Watch
df[df['sourceName'] == "Rob's Apple Watch"]

In [None]:
import pandas as pd
df = apple_stand_time.copy()

# Assuming your dataframe is named df, and has columns 'startDate' and 'endDate'
# Ensure the columns are in datetime format
df['startDate'] = pd.to_datetime(df['startDate'])
df['endDate'] = pd.to_datetime(df['endDate'])

# Define the time range for night time
start_hour = 20  # 8 PM
end_hour = 8  # 8 AM

# Filter the dataframe based on the specified time range
df_night = df[((df['startDate'].dt.hour >= start_hour) & (df['startDate'].dt.hour < 24)) |
              ((df['endDate'].dt.hour >= 0) & (df['endDate'].dt.hour < end_hour))]



# Group by the date and find min startDate and max endDate
df_night['startDate_date'] = df_night['startDate'].dt.date
df_night['endDate_date'] = df_night['endDate'].dt.date
df_night_grouped = df_night.groupby(['startDate_date', 'endDate_date']).agg({'startDate': 'min', 'endDate': 'max'}).reset_index()

# Hours beteween start and enddate
df_night_grouped['hrs_btween'] = (df_night_grouped['endDate'] - df_night_grouped['startDate']).dt.total_seconds() / 3600

# Display the resul
df_night_grouped.head(10)



In [None]:
from data import POG4_Dataset
data = POG4_Dataset()

In [None]:
data.train_test_split()
data.preprocess_data()

In [None]:
data.X_test

In [None]:
0.8*579


In [None]:
import pandas as pd

d = pd.concat([data.y,data.X], axis=1)
d.to_csv("./regression.csv", index=False)
print(d)

In [None]:
 for i in data.X.columns:
     print(i)


In [None]:
data.X["AppleStandTime_night_hours"].plot()

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

import numpy as np

import xgboost as xgb

# Mean all columns that end in night_hours
X = data.X[1000:].filter(like="night_hours")
y = data.y[1000:]


model = xgb.XGBRegressor(gpu_id=0, tree_method="gpu_hist", random_state=42)

# Set up the cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# Scaler
scaler = StandardScaler()

# Imputer 
imputer = SimpleImputer(strategy="mean")

pipeline = Pipeline(steps=[("imputer", imputer), ("scaler", scaler), ("model", model)])

# Perform cross-validation and calculate metrics
cv_scores = cross_val_score(model, X, y, cv=tscv, scoring="neg_mean_squared_error")
rmse_scores = np.sqrt(-cv_scores)
avg_rmse = np.mean(rmse_scores)
print(f"Average RMSE: {avg_rmse:.2f}")

### Stair Ascent/Descent

In [None]:
ascent = pd.read_csv("./data/xml_export/StairAscentSpeed.csv", low_memory=False)
ascent["type"] = "ascent"
descent = pd.read_csv("./data/xml_export/StairDescentSpeed.csv", low_memory=False)
descent["type"] = "descent"

stairs = pd.concat([ascent, descent], axis=0)
stairs.head()

In [None]:
df = stairs[stairs["value"] >= 1] # Optional?

df['startDate'] = pd.to_datetime(df['startDate']).dt.tz_localize(None)
df['endDate'] = pd.to_datetime(df['endDate']).dt.tz_localize(None)

df = df.sort_values(by=['startDate', 'endDate'])

# Get the date range in the dataframe
min_date = df['startDate'].min().date()
max_date = df['endDate'].max().date()

# Initialize an empty list to store the results
results = []

# Loop through each date in the range
for date in pd.date_range(min_date, max_date):
    # startSleep time boundaries - Based on analysis of train_detailed
    start_day = pd.Timestamp.combine(date, pd.Timestamp('21:30:00').time())
    end_day = pd.Timestamp.combine(date + pd.DateOffset(1), pd.Timestamp('01:30:00').time())
    
    # endSleep time boundaries - Based on analysis of train_detailed
    start_night = pd.Timestamp.combine(date + pd.DateOffset(1), pd.Timestamp('06:30:00').time())
    end_night = pd.Timestamp.combine(date + pd.DateOffset(1), pd.Timestamp('9:30:00').time())

    # Filter the dataframe for max_endDate
    mask_endDate = ((df['endDate'] >= start_day)) & ((df['endDate'] <= end_day))
    filtered_df_endDate = df[mask_endDate]

    # Filter the dataframe for min_startDate
    mask_startDate = ((df['startDate'] >= start_night)) & ((df['startDate'] <= end_night))
    filtered_df_startDate = df[mask_startDate]

    # Find max_endDate and min_startDate
    min_endDate = filtered_df_endDate['endDate'].min() # if not filtered_df_endDate.empty else pd.to_datetime(start_day)
    max_endDate = filtered_df_endDate['endDate'].max() # if not filtered_df_endDate.empty else pd.to_datetime(end_day)
    min_startDate = filtered_df_startDate['startDate'].min() # if not filtered_df_startDate.empty else pd.to_datetime(start_night)
    max_startDate = filtered_df_startDate['startDate'].max() # if not filtered_df_startDate.empty else pd.to_datetime(end_night)

    # Append the results to the list
    results.append({
        'date': date,
        'min_endDate': min_endDate, # Min Possible Start Sleeping
        'max_endDate': max_endDate, # Max Possible Start Sleeping
        'min_startDate': min_startDate, # Min Possible End Sleeping
        'max_startDate': max_startDate # Max Possible End Sleeping
    })

# Convert the results to a dataframe and return
result_df = pd.DataFrame(results)

# Time Differences in hours # Attempt to manually calculate sleep time - doesn't work, but still useful
result_df["nhours_min_min"] = (result_df["min_startDate"] - result_df["min_endDate"]).dt.total_seconds() / 3600
result_df["nhours_min_max"] = (result_df["min_startDate"] - result_df["max_endDate"]).dt.total_seconds() / 3600
result_df["nhours_max_min"] = (result_df["max_startDate"] - result_df["min_endDate"]).dt.total_seconds() / 3600
result_df["nhours_max_max"] = (result_df["max_startDate"] - result_df["max_endDate"]).dt.total_seconds() / 3600

# Hours
result_df["min_endDate_hr"] = result_df["min_endDate"].dt.hour
result_df["max_endDate_hr"] = result_df["max_endDate"].dt.hour
result_df["min_startDate_hr"] = result_df["min_startDate"].dt.hour
result_df["max_startDate_hr"] = result_df["max_startDate"].dt.hour

In [None]:
result_df.nhours_min_max.isna().value_counts()

In [None]:
import pandas as pd

# Read the CSV file
df = pd.read_csv("./data/train_detailed.csv", low_memory=False)

# Remove rows where "value" is HKCategoryValueSleepAnalysisInBed
df = df[df["value"] != "HKCategoryValueSleepAnalysisInBed"]

# Convert startDate and endDate columns to datetime objects
df["startDate"] = pd.to_datetime(df["startDate"])
df["endDate"] = pd.to_datetime(df["endDate"])

# Create adjusted_start_date column by subtracting 12 hours from startDate
df['adjusted_start_date'] = (df['startDate'] - pd.DateOffset(hours=12)).dt.date

# Save all unique adjusted_start_date values
unique_dates = pd.DataFrame(df["adjusted_start_date"].unique(), columns=["adjusted_start_date"])

# Filter rows with startDate hours >= 22 or endDate hours <= 10
df = df[(df["startDate"].dt.hour >= 22) | (df["endDate"].dt.hour <= 10)]

# Group by adjusted_start_date and get the min startDate and max endDate
df = df.groupby("adjusted_start_date").agg({"startDate": "min", "endDate": "max"}).reset_index()

# Convert startDate and endDate to hours since midnight
df["startDate"] = df["startDate"].dt.hour + df["startDate"].dt.minute / 60 + df["startDate"].dt.second / 3600
df["endDate"] = df["endDate"].dt.hour + df["endDate"].dt.minute / 60 + df["endDate"].dt.second / 3600

# If startDate is less than 12, add 24 hours
df.loc[df["startDate"] < 12, "startDate"] += 24

# Merge the results with the unique_dates DataFrame
final_df = unique_dates.merge(df, on="adjusted_start_date", how="left").fillna(method="ffill").fillna(method="bfill")



In [None]:
final_df.endDate.hist()

In [None]:
final_df.startDate.hist()

In [2]:
from data import POG4_Dataset

data = POG4_Dataset()

INFO - Creating XML data
INFO - Creating activity data
INFO - Missing days: 87
INFO - Featurizing time series data
INFO - Creating interactions...


In [3]:
import pandas as pd

train = data.train[(data.train['date'] >= pd.to_datetime('2018-09-25').date()) & (data.train['date'] <= pd.to_datetime('2021-11-30').date())]

# Using cross-validation so concat the train and test sets
X = train.drop(['sleep_hours', 'date'], axis=1)
y = train.sleep_hours.fillna(method="ffill")

In [4]:
X

Unnamed: 0,slp_DistanceWalkingRunning_hrs_max_max,slp_FlightsClimbed_max_hrs_between,day_of_week,slp_StepCount_hrs_max_max,distance_per_step,hr_02:40:00,is_workday,slp_DistanceWalkingRunning_hrs_min_max,steps_23:35:00,hr_08:15:00,...,hr_03:25:00,cal_08:15:00,slp_AppleStandTime_max_hrs_between,slp_StepCount_hrs_max_min,day_of_year,hr_06:55:00,hr_06:45:00,slp_StepCount_hrs_min_max,doy_cos,steps_08:10:00
1205,9.384167,13.061389,1,9.384167,0.000471,,True,6.580833,20.0,,...,,0.033635,,10.654722,268,,,6.580833,-0.098820,87.0
1206,,3.772778,2,,0.000472,,True,,229.0,,...,,0.020456,,,269,,,,-0.081676,44.0
1207,9.441944,5.228889,3,9.441944,0.000581,,True,8.096667,33.0,,...,,0.083612,,9.441944,270,,,8.096667,-0.064508,86.0
1208,9.108333,8.928611,4,9.108333,0.000728,,True,8.785278,79.0,,...,,0.023569,,10.687500,271,,,8.785278,-0.047321,54.0
1209,9.242222,2.377778,5,9.242222,0.000579,,False,8.530000,58.0,,...,,15.055000,,9.242222,272,,,8.530000,-0.030120,23368.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2363,7.714722,10.569167,4,7.714722,0.000884,53.0000,True,6.845833,101.0,43.000,...,53.0,0.017026,5.416667,10.195000,330,49.0,49.0,6.845833,0.823923,37.0
2364,8.337500,8.499722,5,8.290000,0.000776,61.0000,False,5.452222,75.0,54.000,...,71.0,0.006369,5.833333,10.867500,331,58.0,59.0,5.452222,0.833556,12.0
2365,7.911944,10.818611,6,7.953056,0.001030,69.5000,False,5.784444,133.0,58.000,...,56.0,0.022134,6.250000,10.292222,332,63.0,59.0,5.784444,0.842942,49.0
2366,8.448333,8.799167,0,8.448333,0.000873,60.0000,True,6.228889,73.0,51.000,...,60.0,0.004065,5.750000,9.887500,333,52.0,50.0,6.228889,0.852078,46.0


In [5]:
import xgboost as xgb

xgb.XGBRegressor().fit(X, y)

In [None]:
print(len(data.X))
print(len(data.sleep_times))

In [4]:
import pandas as pd
import datetime as dt

def process_heart_rate_data(filename, interval_minutes):
    df = pd.read_csv(filename, low_memory=False)

    # Keep only startDate, endDate, and value columns
    df = df[["startDate", "endDate", "value"]]

    df['startDate'] = pd.to_datetime(df['startDate'])
    df['startDate'] = df['startDate'] - pd.Timedelta(hours=12)

    df['date'] = df['startDate'].dt.date

    def round_time_to_nearest_interval(time):
        minutes = (time.hour * 60) + time.minute
        rounded_minutes = round(minutes / interval_minutes) * interval_minutes
        return dt.time(hour=(rounded_minutes // 60) % 24, minute=rounded_minutes % 60)

    df['time'] = df['startDate'].dt.time.map(round_time_to_nearest_interval)

    df_grouped = df.groupby(['date', 'time'])['value'].mean().reset_index()

    df_pivot = df_grouped.pivot_table(index='date', columns='time', values='value').reset_index()

    df_pivot.set_index('date', inplace=True)

    time_start = dt.time(hour=9)
    time_end = dt.time(hour=21)
    df_filtered = df_pivot.loc[:, (df_pivot.columns >= time_start) & (df_pivot.columns <= time_end)]


    def add_12_hours_to_time(time_obj):
        datetime_obj = dt.datetime.combine(dt.date(1, 1, 1), time_obj)
        datetime_obj += dt.timedelta(hours=12)
        return datetime_obj.time()

    df_filtered = df_filtered.rename(columns=add_12_hours_to_time)
    df_filtered.iloc[:, 1:] = df_filtered.iloc[:, 1:].interpolate(axis=1).ffill(axis=1).bfill(axis=1)
    df_filtered = df_filtered.reset_index()

    return df_filtered

filename = "./data/xml_export/HeartRate.csv"
interval_minutes = 5

df_filtered = process_heart_rate_data(filename, interval_minutes)


df_filtered

time,date,21:00:00,21:05:00,21:10:00,21:15:00,21:20:00,21:25:00,21:30:00,21:35:00,21:40:00,...,08:15:00,08:20:00,08:25:00,08:30:00,08:35:00,08:40:00,08:45:00,08:50:00,08:55:00,09:00:00
0,2015-04-20,,,,,,,,,,...,,,,,,,,,,
1,2015-04-21,,95.0000,95.0000,105.644599,96.183333,124.206667,124.063333,137.3000,144.273333,...,112.509901,112.509901,112.509901,112.509901,112.509901,112.509901,112.509901,112.509901,112.509901,112.509901
2,2015-04-22,,,,,,,,,,...,,,,,,,,,,
3,2015-04-23,,,,,,,,,,...,,,,,,,,,,
4,2015-04-24,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,2023-03-13,84.0,86.0000,86.0000,95.500000,83.250000,71.000000,76.000000,76.2500,76.500000,...,56.000000,54.500000,53.000000,54.000000,52.000000,52.250000,52.500000,49.000000,51.000000,52.000000
916,2023-03-14,76.0,73.5000,73.5000,77.000000,73.750000,70.500000,70.500000,70.5000,70.000000,...,78.000000,80.000000,85.000000,91.500000,94.000000,96.500000,89.000000,92.500000,96.000000,94.000000
917,2023-03-15,,60.0856,60.0856,60.085600,60.085600,60.085600,60.085600,60.0856,60.085600,...,99.000000,115.000000,99.000000,95.000000,97.000000,100.000000,99.000000,100.000000,96.000000,96.000000
918,2023-03-16,64.0,76.0000,76.0000,76.000000,76.000000,76.000000,76.000000,76.0000,76.000000,...,67.000000,66.500000,66.000000,64.000000,59.000000,60.000000,60.250000,60.500000,61.000000,61.000000


In [34]:
df_filtered.shape

(920, 146)