In [161]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", 50)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score

In [151]:
physio = pd.read_csv('physiological_cycles.csv')
journal = pd.read_csv('journal_entries.csv')

<h4>Data Cleaning</h4>

In [152]:
# Convert 'Cycle start time' to datetime
journal["Cycle start time"] = pd.to_datetime(journal["Cycle start time"])
journal["Cycle end time"] = pd.to_datetime(journal["Cycle end time"])

# Filter for specific 'Question text' (replace with your picks)
desired_questions = ["Viewed a screen device in bed?", "Have any alcoholic drinks?", "See artificial light upon waking up?",
                    "Experience bloating?", "Read (non-screened device) while in bed?", "See direct sunlight upon waking up?",
                    "Feeling sick or ill?", "Consume fruits and/or vegetables?", "Avoid consuming processed foods?",
                    "Eat all your meals during daylight hours?"]  
filtered_journal = journal[journal["Question text"].isin(desired_questions)]

# Transpose questions as columns
pivot_journal = filtered_journal.pivot_table(
    index=["Cycle start time", "Cycle end time"], 
    columns="Question text", 
    values="Answered yes", 
    aggfunc="sum"
).reset_index()

#Fill nans with 0, assuming False for the day with no entry
pivot_journal = pivot_journal.fillna(0.0)
pivot_journal = pivot_journal.reset_index(drop=True)

# Shift metrics up one row (previous cycle’s value that impacted recovery)
physio["Previous energy burned"] = physio["Energy burned (cal)"].shift(-1)
physio["Previous max hr"] = physio["Max HR (bpm)"].shift(-1)
physio["Previous average hr"] = physio["Average HR (bpm)"].shift(-1)

# Clean datetime values
physio["Cycle start time"] = pd.to_datetime(physio["Cycle start time"])
physio["Cycle end time"] = pd.to_datetime(physio["Cycle end time"])
physio["Sleep onset"] = pd.to_datetime(physio["Sleep onset"])
physio["Wake onset"] = pd.to_datetime(physio["Wake onset"])

# Convert numeric value nans with median of column
numeric_cols = physio.select_dtypes(include=["int64", "float64"]).columns
for col in numeric_cols:
    median_value = physio[col].median()
    physio.loc[:, col] = physio[col].fillna(median_value)
    
selected_physio_columns = ['Cycle start time', 'Cycle end time',
       'Recovery score %', 'Resting heart rate (bpm)',
       'Heart rate variability (ms)', 
       'Day Strain','Previous energy burned','Previous max hr', 'Previous average hr',
       'Sleep onset', 'Wake onset', 'Sleep performance %',
       'Asleep duration (min)',
       'In bed duration (min)', 'Light sleep duration (min)',
       'Deep (SWS) duration (min)', 'REM duration (min)',
       'Awake duration (min)', 'Sleep consistency %']
df = physio[selected_physio_columns]
df = df.merge(pivot_journal, how='left', on=['Cycle start time', 'Cycle end time'])

# Fill merged nans with 0, assuming False like before for the day with no entry logged
numeric_cols_journal = pivot_journal.select_dtypes(include=["int64", "float64"]).columns
for col in numeric_cols_journal:
    df.loc[:, col] = df[col].fillna(0)

# Clean up column names
df.columns = df.columns.str.lower().str.replace(" ", "_")
df.columns = df.columns.str.replace("(", "")
df.columns = df.columns.str.replace(")", "")
df.columns = df.columns.str.replace("?", "")

<h4>Feature Engineering</h4>

In [154]:
# Extract features from 'Cycle start time'
df["cycle_day_of_week"] = df["cycle_start_time"].dt.dayofweek  # 0-6 (Mon-Sun)
df["sleep_start_hour"] = df["sleep_onset"].dt.hour              # 0-23
df["start_day"] = df["cycle_start_time"].dt.day                # 1-31

# Extract features from 'Cycle end time'
df["end_day_of_week"] = df["cycle_end_time"].dt.dayofweek
df["sleep_end_hour"] = df["wake_onset"].dt.hour

# Extract cycle duration
df["cycle_duration"] = (df["cycle_end_time"] - df["cycle_start_time"]).dt.total_seconds() / 60
df["time_awake"] = df["cycle_duration"] - df["asleep_duration_min"]
df["awake_sleep_ratio"] = df["time_awake"]/df["asleep_duration_min"]

# Only use data where valid cycles and sleep exist
df = df[~df["cycle_end_time"].isna()]
df = df[~df["cycle_start_time"].isna()]
df = df[~df["sleep_onset"].isna()]
df = df[~df["wake_onset"].isna()]

del df["cycle_start_time"]
del df["cycle_end_time"]
del df["sleep_onset"]
del df["wake_onset"]

<h4>Train Model</h4>

In [None]:
# Identify numeric columns (excluding target 'recovery' for now)
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
features = [col for col in numeric_cols if col != "recovery_score_%"]  # All numeric except target

# Standardize features
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

# Check it
print("Means:", df[features].mean().round(2))  # Should be ~0
print("Stds:", df[features].std().round(2))    # Should be ~1

In [166]:
# Features and target
X = df[features].values  # All numeric except 'recovery'
y = df["recovery_score_%"].values

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Models to test
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
}

# Train and evaluate
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name}: MAE = {mae:.2f}, R² = {r2:.3f}")

Linear Regression: MAE = 6.64, R² = 0.901
Random Forest: MAE = 6.23, R² = 0.898
XGBoost: MAE = 7.42, R² = 0.861


In [165]:
# Feature importance (for Random Forest)
rf = models["XGBoost"]
importances = pd.Series(rf.feature_importances_, index=features).sort_values(ascending=False)
print("\nRandom Forest Feature Importances:")
print(importances.head(10))


Random Forest Feature Importances:
heart_rate_variability_ms        0.726333
asleep_duration_min              0.062693
awake_sleep_ratio                0.048380
sleep_performance_%              0.036678
deep_sws_duration_min            0.014371
viewed_a_screen_device_in_bed    0.011117
resting_heart_rate_bpm           0.010124
sleep_consistency_%              0.009335
rem_duration_min                 0.009151
awake_duration_min               0.008259
dtype: float32


In [169]:
activities = pd.read_csv('workouts.csv')

In [171]:
# One-hot encode 'Activity name'
activity_dummies = pd.get_dummies(activities["Activity name"], prefix="is")

# Combine with original data
activities_transformed = pd.concat([activities.drop("Activity name", axis=1), activity_dummies], axis=1)