In [161]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", 50)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score

In [151]:
physio = pd.read_csv('physiological_cycles.csv')
journal = pd.read_csv('journal_entries.csv')

<h4>Data Cleaning</h4>

In [152]:
# Convert 'Cycle start time' to datetime
journal["Cycle start time"] = pd.to_datetime(journal["Cycle start time"])
journal["Cycle end time"] = pd.to_datetime(journal["Cycle end time"])

# Filter for specific 'Question text' (replace with your picks)
desired_questions = ["Viewed a screen device in bed?", "Have any alcoholic drinks?", "See artificial light upon waking up?",
                    "Experience bloating?", "Read (non-screened device) while in bed?", "See direct sunlight upon waking up?",
                    "Feeling sick or ill?", "Consume fruits and/or vegetables?", "Avoid consuming processed foods?",
                    "Eat all your meals during daylight hours?"]  
filtered_journal = journal[journal["Question text"].isin(desired_questions)]

# Transpose questions as columns
pivot_journal = filtered_journal.pivot_table(
    index=["Cycle start time", "Cycle end time"], 
    columns="Question text", 
    values="Answered yes", 
    aggfunc="sum"
).reset_index()

#Fill nans with 0, assuming False for the day with no entry
pivot_journal = pivot_journal.fillna(0.0)
pivot_journal = pivot_journal.reset_index(drop=True)

# Shift metrics up one row (previous cycle’s value that impacted recovery)
physio["Previous energy burned"] = physio["Energy burned (cal)"].shift(-1)
physio["Previous max hr"] = physio["Max HR (bpm)"].shift(-1)
physio["Previous average hr"] = physio["Average HR (bpm)"].shift(-1)

# Clean datetime values
physio["Cycle start time"] = pd.to_datetime(physio["Cycle start time"])
physio["Cycle end time"] = pd.to_datetime(physio["Cycle end time"])
physio["Sleep onset"] = pd.to_datetime(physio["Sleep onset"])
physio["Wake onset"] = pd.to_datetime(physio["Wake onset"])

# Convert numeric value nans with median of column
numeric_cols = physio.select_dtypes(include=["int64", "float64"]).columns
for col in numeric_cols:
    median_value = physio[col].median()
    physio.loc[:, col] = physio[col].fillna(median_value)
    
selected_physio_columns = ['Cycle start time', 'Cycle end time',
       'Recovery score %', 'Resting heart rate (bpm)',
       'Heart rate variability (ms)', 
       'Day Strain','Previous energy burned','Previous max hr', 'Previous average hr',
       'Sleep onset', 'Wake onset', 'Sleep performance %',
       'Asleep duration (min)',
       'In bed duration (min)', 'Light sleep duration (min)',
       'Deep (SWS) duration (min)', 'REM duration (min)',
       'Awake duration (min)', 'Sleep consistency %']
df = physio[selected_physio_columns]
df = df.merge(pivot_journal, how='left', on=['Cycle start time', 'Cycle end time'])

# Fill merged nans with 0, assuming False like before for the day with no entry logged
numeric_cols_journal = pivot_journal.select_dtypes(include=["int64", "float64"]).columns
for col in numeric_cols_journal:
    df.loc[:, col] = df[col].fillna(0)

# Clean up column names
df.columns = df.columns.str.lower().str.replace(" ", "_")
df.columns = df.columns.str.replace("(", "")
df.columns = df.columns.str.replace(")", "")
df.columns = df.columns.str.replace("?", "")

<h4>Feature Engineering</h4>

In [154]:
# Extract features from 'Cycle start time'
df["cycle_day_of_week"] = df["cycle_start_time"].dt.dayofweek  # 0-6 (Mon-Sun)
df["sleep_start_hour"] = df["sleep_onset"].dt.hour              # 0-23
df["start_day"] = df["cycle_start_time"].dt.day                # 1-31

# Extract features from 'Cycle end time'
df["end_day_of_week"] = df["cycle_end_time"].dt.dayofweek
df["sleep_end_hour"] = df["wake_onset"].dt.hour

# Extract cycle duration
df["cycle_duration"] = (df["cycle_end_time"] - df["cycle_start_time"]).dt.total_seconds() / 60
df["time_awake"] = df["cycle_duration"] - df["asleep_duration_min"]
df["awake_sleep_ratio"] = df["time_awake"]/df["asleep_duration_min"]

# Only use data where valid cycles and sleep exist
df = df[~df["cycle_end_time"].isna()]
df = df[~df["cycle_start_time"].isna()]
df = df[~df["sleep_onset"].isna()]
df = df[~df["wake_onset"].isna()]

del df["cycle_start_time"]
del df["cycle_end_time"]
del df["sleep_onset"]
del df["wake_onset"]

<h4>Train Model</h4>

In [160]:
# Identify numeric columns (excluding target 'recovery' for now)
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
features = [col for col in numeric_cols if col != "recovery_score_%"]  # All numeric except target

# Standardize features
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

# Check it
print("Means:", df[features].mean().round(2))  # Should be ~0
print("Stds:", df[features].std().round(2))    # Should be ~1

Means: resting_heart_rate_bpm                      0.0
heart_rate_variability_ms                  -0.0
day_strain                                  0.0
previous_energy_burned                      0.0
previous_max_hr                             0.0
previous_average_hr                         0.0
sleep_performance_%                        -0.0
asleep_duration_min                        -0.0
in_bed_duration_min                         0.0
light_sleep_duration_min                    0.0
deep_sws_duration_min                       0.0
rem_duration_min                            0.0
awake_duration_min                         -0.0
sleep_consistency_%                         0.0
avoid_consuming_processed_foods            -0.0
consume_fruits_and/or_vegetables           -0.0
eat_all_your_meals_during_daylight_hours    0.0
experience_bloating                        -0.0
feeling_sick_or_ill                        -0.0
have_any_alcoholic_drinks                   0.0
read_non-screened_device_while_in

In [166]:
# Features and target
X = df[features].values  # All numeric except 'recovery'
y = df["recovery_score_%"].values

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Models to test
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
}

# Train and evaluate
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name}: MAE = {mae:.2f}, R² = {r2:.3f}")

Linear Regression: MAE = 6.64, R² = 0.901
Random Forest: MAE = 6.23, R² = 0.898
XGBoost: MAE = 7.42, R² = 0.861


In [165]:
# Feature importance (for Random Forest)
rf = models["XGBoost"]
importances = pd.Series(rf.feature_importances_, index=features).sort_values(ascending=False)
print("\nRandom Forest Feature Importances:")
print(importances.head(10))


Random Forest Feature Importances:
heart_rate_variability_ms        0.726333
asleep_duration_min              0.062693
awake_sleep_ratio                0.048380
sleep_performance_%              0.036678
deep_sws_duration_min            0.014371
viewed_a_screen_device_in_bed    0.011117
resting_heart_rate_bpm           0.010124
sleep_consistency_%              0.009335
rem_duration_min                 0.009151
awake_duration_min               0.008259
dtype: float32


In [169]:
activities = pd.read_csv('workouts.csv')

In [171]:
# One-hot encode 'Activity name'
activity_dummies = pd.get_dummies(activities["Activity name"], prefix="is")

# Combine with original data
activities_transformed = pd.concat([activities.drop("Activity name", axis=1), activity_dummies], axis=1)

In [178]:
activities_transformed

Unnamed: 0,Cycle start time,Cycle end time,Cycle timezone,Workout start time,Workout end time,Duration (min),Activity Strain,Energy burned (cal),Max HR (bpm),Average HR (bpm),HR Zone 1 %,HR Zone 2 %,HR Zone 3 %,HR Zone 4 %,HR Zone 5 %,GPS enabled,Distance (meters),Altitude gain (meters),Altitude change (meters),is_Activity,is_Box Fitness,is_Hiking,is_Manual Labor,is_Motocross,is_Running,is_Soccer,is_Softball,is_Strength Trainer,is_Swimming,is_Track & Field,is_Volleyball,is_Walking,is_Weightlifting
0,2025-02-02 04:25:46,2025-02-03 03:36:34,UTC-05:00,2025-02-02 22:07:30,2025-02-02 23:08:59,61,12.2,583.0,182.0,136.0,29.0,35.0,20.0,4.0,0.0,False,,,,False,False,False,False,False,False,True,False,False,False,False,False,False,False
1,2025-01-31 01:31:22,2025-02-01 02:11:52,UTC-05:00,2025-01-31 19:59:07,2025-01-31 20:56:43,57,4.3,104.0,131.0,93.0,9.0,0.0,0.0,0.0,0.0,False,,,,False,False,False,False,False,False,False,False,True,False,False,False,False,False
2,2025-01-30 01:52:42,2025-01-31 01:31:22,UTC-05:00,2025-01-30 20:06:00,2025-01-30 22:02:53,116,12.7,895.0,163.0,128.0,60.0,27.0,4.0,0.0,0.0,False,,,,False,False,False,False,False,False,False,False,False,False,False,True,False,False
3,2025-01-29 02:44:33,2025-01-30 01:52:42,UTC-05:00,2025-01-29 19:31:36,2025-01-29 20:42:19,70,4.8,142.0,135.0,98.0,12.0,0.0,0.0,0.0,0.0,False,,,,False,False,False,False,False,False,False,False,True,False,False,False,False,False
4,2025-01-28 01:50:00,2025-01-29 02:44:33,UTC-05:00,2025-01-28 22:19:02,2025-01-28 23:07:33,48,4.2,85.0,133.0,88.0,8.0,0.0,0.0,0.0,0.0,False,,,,False,False,False,False,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306,2024-03-14 01:32:30,2024-03-15 02:58:34,UTC-04:00,2024-03-14 16:23:39,2024-03-14 17:26:34,62,5.9,212.0,137.0,104.0,51.0,33.0,5.0,0.0,0.0,False,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,True
307,2024-03-11 23:08:57,2024-03-13 01:07:27,UTC-04:00,2024-03-12 21:42:32,2024-03-12 22:03:33,21,9.8,259.0,177.0,149.0,0.0,4.0,23.0,58.0,14.0,True,2795.08,94.94,4.5,False,False,False,False,False,True,False,False,False,False,False,False,False,False
308,2024-03-11 23:08:57,2024-03-13 01:07:27,UTC-04:00,2024-03-12 06:43:55,2024-03-12 07:30:36,46,4.2,85.0,133.0,97.0,63.0,13.0,0.0,0.0,0.0,False,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,True
309,2024-03-10 00:00:00,2024-03-11 02:50:29,UTC-04:00,2024-03-10 21:53:35,2024-03-10 22:58:07,64,14.5,745.0,195.0,141.0,0.0,17.0,48.0,23.0,12.0,False,,,,False,False,False,False,False,False,True,False,False,False,False,False,False,False


In [177]:
df

Unnamed: 0,recovery_score_%,resting_heart_rate_bpm,heart_rate_variability_ms,day_strain,previous_energy_burned,previous_max_hr,previous_average_hr,sleep_performance_%,asleep_duration_min,in_bed_duration_min,light_sleep_duration_min,deep_sws_duration_min,rem_duration_min,awake_duration_min,sleep_consistency_%,avoid_consuming_processed_foods,consume_fruits_and/or_vegetables,eat_all_your_meals_during_daylight_hours,experience_bloating,feeling_sick_or_ill,have_any_alcoholic_drinks,read_non-screened_device_while_in_bed,see_artificial_light_upon_waking_up,see_direct_sunlight_upon_waking_up,viewed_a_screen_device_in_bed,cycle_day_of_week,sleep_start_hour,start_day,end_day_of_week,sleep_end_hour,cycle_duration,time_awake,awake_sleep_ratio
1,53.0,0.165842,-0.479590,-1.373019,0.041743,1.210064,-0.434907,0.333031,0.611899,0.770342,-0.134456,2.301994,0.113842,1.036449,-0.362908,-0.750605,-1.138913,-0.460642,-0.390244,-0.300501,-0.551598,-0.410437,-0.612803,-0.708734,-0.820233,0,-0.079407,3,-0.981056,2.071345,-0.778487,-1.238434,-0.569338
2,94.0,-1.177374,1.569174,0.714895,-0.374219,-1.371675,-1.686824,0.023014,-0.007566,-0.239766,1.183618,-0.666497,-1.761530,-1.159629,-1.296304,1.206973,0.836011,-0.460642,-0.390244,-0.300501,-0.551598,-0.410437,-0.612803,1.410966,-0.820233,6,0.088689,2,-1.477677,1.435508,-0.420949,-0.412524,-0.235216
3,82.0,-0.841570,0.544792,-1.028412,-0.346093,0.159865,-0.904376,0.271027,0.533058,0.332976,0.094189,0.335850,0.780093,-0.793616,0.337139,1.206973,-1.138913,-0.460642,-0.390244,-0.300501,-0.551598,-0.410437,-0.612803,1.410966,-0.820233,5,-0.247503,1,1.502051,0.163835,1.155894,0.743549,-0.216860
4,47.0,-0.337864,-0.319530,-0.481094,0.475469,0.378657,1.129989,-0.100993,0.352850,0.457938,-0.134456,0.837024,0.483982,0.670436,0.259356,-0.750605,0.836011,-0.460642,-0.390244,-0.300501,-0.551598,-0.410437,-0.612803,-0.708734,-0.820233,4,-0.415600,31,1.005430,0.163835,0.351540,0.081224,-0.270693
5,22.0,0.165842,-0.127459,0.836521,-0.680639,-0.846575,-0.747886,-2.147107,-2.125010,-2.270395,-1.439080,-2.247122,-0.577084,-1.525642,1.037186,1.206973,-1.138913,-0.460642,-0.390244,-0.300501,-0.551598,-0.410437,1.631846,-0.708734,-0.820233,3,-0.415600,30,0.508808,-2.379511,-0.180963,1.434683,1.914493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,49.0,0.333744,-0.511602,0.674353,1.035019,1.297581,0.034562,-0.535017,-0.683346,-0.541757,-0.739694,0.567161,-0.503056,0.461285,-0.129559,-0.750605,-1.138913,-0.460642,-0.390244,-0.300501,-0.551598,-0.410437,-0.612803,-0.708734,1.160478,4,-0.247503,15,1.005430,0.163835,0.088015,0.606635,0.281787
327,97.0,-1.177374,1.056983,1.404109,-0.491162,-1.634225,-1.060865,1.263082,1.141260,0.957785,0.605279,0.952680,0.780093,-0.479891,0.026007,-0.750605,-1.138913,-0.460642,-0.390244,-0.300501,-0.551598,-0.410437,1.631846,1.410966,1.160478,3,-0.415600,14,0.508808,0.799672,0.743957,-0.127856,-0.524737
328,93.0,-0.169962,0.224673,-0.967599,0.136482,1.035031,-0.121927,0.519041,0.848422,0.853650,0.820475,-0.049668,0.385278,0.356710,-0.596257,-0.750605,-1.138913,-0.460642,-0.390244,-0.300501,-0.551598,2.436426,-0.612803,-0.708734,-0.820233,2,-0.415600,13,0.012187,0.163835,0.218486,-0.427499,-0.498707
329,92.0,-0.505766,0.160649,0.309475,-1.001863,-1.240400,-0.121927,-0.162997,-0.165248,-0.260593,-0.685895,0.567161,0.533334,-0.532178,0.103790,-0.750605,-1.138913,-0.460642,-0.390244,-0.300501,-0.551598,2.436426,1.631846,-0.708734,1.160478,0,3.282515,11,-0.484434,-1.743674,1.023270,1.142313,0.127702
