In [19]:
#import
import warnings, json
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    roc_auc_score, f1_score, accuracy_score
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Optional, if installed
try:
    import xgboost as xgb
    HAS_XGB = True
except Exception:
    # xgboost is an optional extra: any ordinary failure while importing it
    # (package missing, native library not loadable, ...) just disables the
    # XGB baselines. `Exception` is used instead of a bare `except:` so that
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    HAS_XGB = False

In [20]:
import numpy as np
import sklearn
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Notebook-wide configuration: one seed for every stochastic step and the
# location of the input CSV.
RANDOM_SEED = 42
DATA_PATH = Path("../data/fitness.csv")   # place Kaggle CSV here

# Seed NumPy's legacy global RNG so re-runs are repeatable.
np.random.seed(RANDOM_SEED)

# Record the library version next to the results it produced.
print("sklearn version:", sklearn.__version__)  # sanity check

def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    The value is computed as ``sqrt(mean_squared_error(...))`` on every
    scikit-learn version. The previous implementation first tried
    ``mean_squared_error(..., squared=False)`` and fell back on ``TypeError``;
    that keyword is deprecated in scikit-learn 1.4 and removed in 1.6, so the
    attempt either emitted a deprecation warning or always failed, and the
    broad ``except TypeError`` could also hide a genuine input error.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, same shape as ``y_true``.

    Returns
    -------
    float
        The RMSE as a built-in ``float`` (prints and serialises cleanly).
    """
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

def regression_report(y_true, y_pred):
    """Summarise regression quality as a ``{"RMSE", "MAE", "R2"}`` dict.

    Each entry is the corresponding metric evaluated on ``(y_true, y_pred)``;
    keys are inserted in the order RMSE, MAE, R2.
    """
    scorers = (
        ("RMSE", _rmse),
        ("MAE", mean_absolute_error),
        ("R2", r2_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}


def classification_report_simple(y_true, y_prob, thresh=0.5):
    """Summarise binary-classification quality from predicted probabilities.

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    y_prob : array-like
        Predicted probability (or score) of the positive class. Any
        array-like is accepted; it is converted with ``np.asarray`` so plain
        Python lists work as well as NumPy arrays and pandas Series.
    thresh : float, default 0.5
        Scores ``>= thresh`` are labelled positive for Accuracy and F1.

    Returns
    -------
    dict
        ``{"Accuracy", "F1", "ROC-AUC"}``. ROC-AUC is computed on the raw
        scores and is therefore independent of ``thresh``.
    """
    # A bare list does not support `>=` against a float; normalise first.
    y_prob = np.asarray(y_prob, dtype=float)
    y_hat = (y_prob >= thresh).astype(int)
    return {
        "Accuracy": accuracy_score(y_true, y_hat),
        "F1": f1_score(y_true, y_hat),
        "ROC-AUC": roc_auc_score(y_true, y_prob)
    }

sklearn version: 1.7.2


In [21]:
# =======================
# 1) Load & sanity check
# =======================
df = pd.read_csv(DATA_PATH)

# Expected columns (rename here if needed):
# user_id, age, gender, date, steps, heart_rate_avg, sleep_hours,
# calories_burned, exercise_minutes, stress_level

# Normalize/parse
if "date" in df.columns:
    # Unparseable dates become NaT instead of raising.
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
if "gender" in df.columns:
    # Reduce to an upper-case initial, e.g. 'M'/'F'. Missing values must stay
    # missing: astype(str) alone turns NaN into the text "nan", which the
    # slice below would convert into a spurious "N" category.
    gender_raw = df["gender"]
    df["gender"] = (
        gender_raw.astype(str).str.upper().str[:1].where(gender_raw.notna())
    )

# Drop rows with missing core signals we rely on
core_cols = ["stress_level","steps","exercise_minutes","sleep_hours","heart_rate_avg","calories_burned"]
present_core_cols = [c for c in core_cols if c in df.columns]
if present_core_cols:
    # One pass over the frame instead of one boolean filter per column.
    df = df.dropna(subset=present_core_cols)

# Sort for time-aware operations
sort_keys = [c for c in ["user_id", "date"] if c in df.columns]
if sort_keys:
    df = df.sort_values(sort_keys)

print("Rows after basic cleaning:", len(df))
display(df.head())

Rows after basic cleaning: 365000


Unnamed: 0,user_id,age,gender,date,steps,heart_rate_avg,sleep_hours,calories_burned,exercise_minutes,stress_level,weight_kg,bmi
0,0,56,F,2024-09-06,9341,62.029621,9.368819,2230.230419,0.623979,2,73.496429,22.471978
1,0,56,F,2024-09-07,10873,59.062818,6.358311,1840.454777,109.208987,3,68.237867,22.569858
2,0,56,F,2024-09-08,6638,58.494078,6.099619,2284.231946,3.083319,4,81.68789,17.595609
3,0,56,F,2024-09-09,6062,56.546095,7.584023,1620.464266,22.023327,10,86.379884,20.154137
4,0,56,F,2024-09-10,10399,59.507172,7.327957,2264.528312,76.483061,8,81.782982,32.62404


In [22]:
# ==========================================
# 2) Time-aware stress targets (next day)
#    - next_day_stress (regression)
#    - high_stress_next (classification, e.g., >= 7)
# ==========================================
# `grp` is either a per-user GroupBy or, when there is no user_id column,
# a one-element list holding a single (key, frame) pair for the whole table.
if "user_id" in df.columns:
    grp = df.groupby("user_id", group_keys=False)
else:
    grp = [(None, df)]

def add_targets(g, threshold=7):
    """Attach next-row stress targets to one user's rows.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows for a single user. Sorted by ``date`` when that column exists;
        otherwise the existing row order is used. The input is never modified.
    threshold : int or float, default 7
        ``next_day_stress >= threshold`` is labelled as high stress.

    Returns
    -------
    pandas.DataFrame
        A new frame. When ``stress_level`` is present it gains:

        * ``next_day_stress`` - ``stress_level`` of the following row
          (NaN on the last row);
        * ``high_stress_next`` - nullable ``Int64`` flag, 1/0, and ``<NA>``
          where ``next_day_stress`` is missing.

    Notes
    -----
    "Next day" means "next row"; this presumes one row per calendar day with
    no gaps - TODO confirm for the dataset in use.
    """
    # sort_values already returns a new object; copy explicitly otherwise so
    # the column assignments below never write into the caller's frame.
    g = g.sort_values("date") if "date" in g.columns else g.copy()
    if "stress_level" in g.columns:
        next_stress = g["stress_level"].shift(-1)
        g["next_day_stress"] = next_stress
        high = (next_stress >= threshold).astype("Int64")
        # NaN >= threshold evaluates to False, which would label the row with
        # no known next day as "not high stress"; keep it missing instead.
        high.loc[next_stress.isna()] = pd.NA
        g["high_stress_next"] = high
    return g

# Walk the groups explicitly instead of calling GroupBy.apply: pandas 2.2
# deprecates apply() operating on the grouping column, and the notebook's
# blanket warning filter would hide that. Iterating yields complete per-user
# frames (user_id included), and the same loop also handles the
# `[(None, df)]` fallback used when there is no user_id column.
target_frames = [add_targets(user_rows) for _, user_rows in grp]
# pd.concat rejects an empty list, so fall back to the whole frame.
df = pd.concat(target_frames) if target_frames else add_targets(df)

# Remove last day per user (no next-day target)
df = df[df["next_day_stress"].notna()]

In [23]:
# =====================================================
# 3) Lag & rolling features (leakage-safe, per user)
#    Use only past info to predict tomorrow's stress.
# =====================================================
def make_lags_rolls(g, cols_numeric, lags=(1, 2, 7), rolls=((3, "mean"), (7, "mean"), (7, "std"))):
    """Add lagged and trailing-window features for one user's rows.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows for a single user. Sorted by ``date`` when that column exists;
        otherwise the existing row order is used. The input is never modified.
    cols_numeric : iterable of str
        Source columns to derive features from; names absent from ``g`` are
        skipped.
    lags : iterable of int
        For each ``L`` a column ``{col}_lag{L}`` holding the value ``L`` rows
        earlier.
    rolls : iterable of (int, str)
        ``(window, agg)`` pairs with ``agg`` in ``{"mean", "std"}``. Produces
        ``{col}_roll{window}_{agg}`` computed over the rows strictly before
        the current one.

    Returns
    -------
    pandas.DataFrame
        A new frame with the feature columns appended in the order
        lags-then-rolls for each source column.

    Raises
    ------
    ValueError
        If ``rolls`` requests an aggregation other than ``"mean"``/``"std"``
        (previously such entries were skipped without notice).
    """
    if "date" in g.columns:
        g = g.sort_values("date")

    # Minimum number of observed values a window needs before it yields a number.
    min_periods_for = {"mean": 2, "std": 3}

    new_cols = {}
    for c in cols_numeric:
        if c not in g.columns:
            continue
        for lag in lags:
            new_cols[f"{c}_lag{lag}"] = g[c].shift(lag)
        # Shift by one row so a window ending "today" only sees earlier rows.
        history = g[c].shift(1)
        for win, agg in rolls:
            if agg not in min_periods_for:
                raise ValueError(
                    f"Unsupported rolling aggregation {agg!r}; expected one of "
                    f"{sorted(min_periods_for)}"
                )
            # pandas rejects min_periods > window, so clamp for short windows.
            window = history.rolling(win, min_periods=min(win, min_periods_for[agg]))
            new_cols[f"{c}_roll{win}_{agg}"] = getattr(window, agg)()

    # assign() returns a new frame: the caller's data is left untouched and a
    # re-run simply overwrites feature columns that already exist.
    return g.assign(**new_cols)

num_source_cols = [c for c in [
    "stress_level","steps","exercise_minutes","sleep_hours","heart_rate_avg","calories_burned"
] if c in df.columns]

# Build the features user by user. Groups are iterated explicitly rather than
# via GroupBy.apply: pandas 2.2 deprecates apply() operating on the grouping
# column, whereas iteration always yields complete frames (user_id included).
# The check looks at the current `df` instead of the `grp` object left over
# from the target-building cell.
if "user_id" in df.columns:
    feature_frames = [
        make_lags_rolls(user_rows, num_source_cols)
        for _, user_rows in df.groupby("user_id")
    ]
    # pd.concat rejects an empty list, so fall back to the whole frame.
    df = pd.concat(feature_frames) if feature_frames else make_lags_rolls(df, num_source_cols)
else:
    df = make_lags_rolls(df, num_source_cols)

# After creating lags/rolls, build modeling frame
feature_cols = [c for c in df.columns if any(x in c for x in ["_lag", "_roll"])]
keep_cols = ["user_id","date","gender","age","next_day_stress","high_stress_next"] + feature_cols
keep_cols = [c for c in keep_cols if c in df.columns]
# dropna() removes the warm-up rows whose longest lag/window is not yet
# available, along with any row missing one of the kept columns.
df_model = df[keep_cols].dropna().copy()

print("Model rows after lag/rolling & dropna:", len(df_model))
display(df_model.head())

Model rows after lag/rolling & dropna: 357000


Unnamed: 0,user_id,date,gender,age,next_day_stress,high_stress_next,stress_level_lag1,stress_level_lag2,stress_level_lag7,stress_level_roll3_mean,...,heart_rate_avg_lag7,heart_rate_avg_roll3_mean,heart_rate_avg_roll7_mean,heart_rate_avg_roll7_std,calories_burned_lag1,calories_burned_lag2,calories_burned_lag7,calories_burned_roll3_mean,calories_burned_roll7_mean,calories_burned_roll7_std
7,0,2024-09-13,F,56,10.0,1,7.0,1.0,2.0,5.333333,...,62.029621,53.656194,56.728742,5.381481,2226.499339,1660.834309,2230.230419,2050.620653,2018.177624,299.247239
8,0,2024-09-14,F,56,7.0,1,3.0,7.0,3.0,3.666667,...,59.062818,52.841562,56.019264,4.86914,2353.914903,2226.499339,1840.454777,2080.416184,2035.846836,316.980858
9,0,2024-09-15,F,56,10.0,1,10.0,3.0,4.0,6.666667,...,58.494078,57.07781,55.879228,4.780304,2469.393097,2353.914903,2284.231946,2349.93578,2125.695167,340.621107
10,0,2024-09-16,F,56,3.0,0,7.0,10.0,10.0,6.666667,...,56.546095,58.987648,56.353946,5.227352,1574.421061,2469.393097,1620.464266,2132.576354,2024.293612,387.9282
11,0,2024-09-17,F,56,4.0,0,10.0,7.0,8.0,9.0,...,59.507172,57.907565,55.964936,5.311318,2094.353869,1574.421061,2264.528312,2046.056009,2091.992127,344.64436


In [24]:
# =====================================
# 4) Time split (80% oldest → train/val; 20% newest → test)
# =====================================
assert "date" in df_model.columns, "Need a date column for time-aware split."

# Outer split: rows dated up to the 80th-percentile date are train/val,
# everything later is held out as test.
cutoff = df_model["date"].quantile(0.80)
in_trainval = df_model["date"].le(cutoff)
in_test = df_model["date"].gt(cutoff)
df_trainval = df_model.loc[in_trainval].copy()
df_test     = df_model.loc[in_test].copy()

print("Cutoff date:", cutoff.date())
print("Train/Val rows:", len(df_trainval), " Test rows:", len(df_test))

# Inner split: rows after the 90th-percentile date of the train/val block
# become the validation set.
cutoff_tv = df_trainval["date"].quantile(0.90)
in_train = df_trainval["date"].le(cutoff_tv)
in_val = df_trainval["date"].gt(cutoff_tv)
df_train = df_trainval.loc[in_train].copy()
df_val   = df_trainval.loc[in_val].copy()

print("Train rows:", len(df_train), " Val rows:", len(df_val))

Cutoff date: 2025-06-25
Train/Val rows: 286000  Test rows: 71000
Train rows: 258000  Val rows: 28000


In [25]:
# ===========================
# 5) REGRESSION: next_day_stress
# ===========================
target_reg = "next_day_stress"

from sklearn.preprocessing import OneHotEncoder

# ---- columns that must not reach the model: timestamp, both targets, IDs ----
drop_cols = ["date", target_reg, "high_stress_next", "user_id"]  # add other IDs if present

# One (features, target) pair per split; errors="ignore" tolerates absent columns.
X_train_r, y_train_r = df_train.drop(columns=drop_cols, errors="ignore"), df_train[target_reg]
X_val_r,   y_val_r   = df_val.drop(columns=drop_cols, errors="ignore"),   df_val[target_reg]
X_test_r,  y_test_r  = df_test.drop(columns=drop_cols, errors="ignore"),  df_test[target_reg]

# optional: ensure 'age' is numeric if it came in as string
if "age" in X_train_r.columns:
    for split_features in (X_train_r, X_val_r, X_test_r):
        split_features["age"] = pd.to_numeric(split_features["age"], errors="coerce")

# Partition the training columns by dtype: numeric vs. everything else.
num_cols, cat_cols = [], []
for col_name in X_train_r.columns:
    if pd.api.types.is_numeric_dtype(X_train_r[col_name]):
        num_cols.append(col_name)
    else:
        cat_cols.append(col_name)

from sklearn.preprocessing import OneHotEncoder

# Numeric columns: fill gaps with the median, then standardise.
numeric_steps = [
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
]
numeric_pipe = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the mode, then one-hot encode
# (categories unseen during fit are ignored at transform time).
categorical_steps = [
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_pipe = Pipeline(steps=categorical_steps)

# Route each column group through its pipeline; unlisted columns are dropped.
pre = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop",
)


# Baseline 1: Linear Regression
# Pipeline.fit() fits its steps in place, so handing the same `pre` object to
# several pipelines makes every later fit overwrite the preprocessing state of
# the earlier ones. clone() gives this pipeline its own unfitted copy.
lin_r = Pipeline([("pre", clone(pre)), ("model", LinearRegression())])
lin_r.fit(X_train_r, y_train_r)
pred_val_lin = lin_r.predict(X_val_r)
pred_test_lin = lin_r.predict(X_test_r)
print("REG | Linear (VAL):", regression_report(y_val_r, pred_val_lin))
print("REG | Linear (TEST):", regression_report(y_test_r, pred_test_lin))

# Baseline 2: Random Forest
# clone(pre): give this pipeline its own preprocessing instance. Pipeline.fit()
# fits steps in place, so a shared `pre` would be refit by every other pipeline.
rf_r = Pipeline([("pre", clone(pre)), ("model", RandomForestRegressor(
    n_estimators=500, max_depth=None, random_state=RANDOM_SEED, n_jobs=-1
))])
rf_r.fit(X_train_r, y_train_r)
pred_val_rf = rf_r.predict(X_val_r)
pred_test_rf = rf_r.predict(X_test_r)
print("REG | RF (VAL):", regression_report(y_val_r, pred_val_rf))
print("REG | RF (TEST):", regression_report(y_test_r, pred_test_rf))

# Optional 3: XGBoost (skipped when the package could not be imported)
if HAS_XGB:
    # clone(pre): give this pipeline its own preprocessing instance. Pipeline.fit()
    # fits steps in place, so a shared `pre` would be refit by every other pipeline.
    xgb_r = Pipeline([("pre", clone(pre)), ("model", xgb.XGBRegressor(
        n_estimators=800, max_depth=6, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8,
        random_state=RANDOM_SEED, n_jobs=-1
    ))])
    xgb_r.fit(X_train_r, y_train_r)
    pred_val_xgb = xgb_r.predict(X_val_r)
    pred_test_xgb = xgb_r.predict(X_test_r)
    print("REG | XGB (VAL):", regression_report(y_val_r, pred_val_xgb))
    print("REG | XGB (TEST):", regression_report(y_test_r, pred_test_xgb))


REG | Linear (VAL): {'RMSE': np.float64(2.8745620897056137), 'MAE': 2.5040251018101856, 'R2': -0.00013277682597512275}
REG | Linear (TEST): {'RMSE': np.float64(2.8664242143775134), 'MAE': 2.49406384080066, 'R2': 6.107036323166337e-05}
REG | RF (VAL): {'RMSE': np.float64(2.8786294845687537), 'MAE': 2.504368285714286, 'R2': -0.0029650780792833675}
REG | RF (TEST): {'RMSE': np.float64(2.870848853699674), 'MAE': 2.493772366197183, 'R2': -0.003028342456031119}
REG | XGB (VAL): {'RMSE': np.float64(2.887785315693779), 'MAE': 2.507820689865521, 'R2': -0.009355329217192399}
REG | XGB (TEST): {'RMSE': np.float64(2.8772211679415833), 'MAE': 2.4948136238783176, 'R2': -0.007486052162677614}


In [26]:
# ===========================
# 6) CLASSIFICATION: high_stress_next (>=7)
# ===========================
if "high_stress_next" in df_train.columns:
    target_clf = "high_stress_next"

    # Same exclusions as the regression cell. user_id is listed explicitly so
    # the identifier can never become a feature, rather than relying on the
    # preprocessor's column lists to discard it.
    drop_cols_c = ["date", "user_id", target_clf, "next_day_stress"]

    X_train_c = df_train.drop(columns=drop_cols_c, errors="ignore")
    y_train_c = df_train[target_clf].astype(int)

    X_val_c   = df_val.drop(columns=drop_cols_c, errors="ignore")
    y_val_c   = df_val[target_clf].astype(int)

    X_test_c  = df_test.drop(columns=drop_cols_c, errors="ignore")
    y_test_c  = df_test[target_clf].astype(int)

    # Every pipeline below gets clone(pre): Pipeline.fit() fits its steps in
    # place, so sharing the single `pre` instance would let each fit overwrite
    # the preprocessing state of all previously fitted pipelines (including
    # the regression ones).

    # Logistic Regression (balanced)
    logit = Pipeline([("pre", clone(pre)), ("model", LogisticRegression(
        max_iter=2000, class_weight="balanced"
    ))])
    logit.fit(X_train_c, y_train_c)
    val_prob = logit.predict_proba(X_val_c)[:,1]
    test_prob = logit.predict_proba(X_test_c)[:,1]
    print("CLF | LogReg (VAL):", classification_report_simple(y_val_c, val_prob))
    print("CLF | LogReg (TEST):", classification_report_simple(y_test_c, test_prob))

    # Random Forest Classifier
    rf_c = Pipeline([("pre", clone(pre)), ("model", RandomForestClassifier(
        n_estimators=500, max_depth=None, random_state=RANDOM_SEED, n_jobs=-1, class_weight="balanced"
    ))])
    rf_c.fit(X_train_c, y_train_c)
    val_prob_rf = rf_c.predict_proba(X_val_c)[:,1]
    test_prob_rf = rf_c.predict_proba(X_test_c)[:,1]
    print("CLF | RF (VAL):", classification_report_simple(y_val_c, val_prob_rf))
    print("CLF | RF (TEST):", classification_report_simple(y_test_c, test_prob_rf))

    # Optional XGBClassifier
    if HAS_XGB:
        # NOTE(review): scale_pos_weight=None leaves this model without class
        # re-weighting, while the two models above use class_weight="balanced".
        # Confirm whether that difference is intended before comparing F1.
        xgb_c = Pipeline([("pre", clone(pre)), ("model", xgb.XGBClassifier(
            n_estimators=800, max_depth=6, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8,
            random_state=RANDOM_SEED, n_jobs=-1,
            eval_metric="auc", scale_pos_weight=None
        ))])
        xgb_c.fit(X_train_c, y_train_c)
        val_prob_xgb = xgb_c.predict_proba(X_val_c)[:,1]
        test_prob_xgb = xgb_c.predict_proba(X_test_c)[:,1]
        print("CLF | XGB (VAL):", classification_report_simple(y_val_c, val_prob_xgb))
        print("CLF | XGB (TEST):", classification_report_simple(y_test_c, test_prob_xgb))

CLF | LogReg (VAL): {'Accuracy': 0.5009642857142858, 'F1': 0.4427072946994775, 'ROC-AUC': 0.5003722004728373}
CLF | LogReg (TEST): {'Accuracy': 0.5049295774647887, 'F1': 0.44744867482000816, 'ROC-AUC': 0.5056521623732576}
CLF | RF (VAL): {'Accuracy': 0.5957142857142858, 'F1': 0.0001766472354707649, 'ROC-AUC': 0.49835082375795475}
CLF | RF (TEST): {'Accuracy': 0.5991549295774647, 'F1': 0.0001405283867341203, 'ROC-AUC': 0.5022863891369723}
CLF | XGB (VAL): {'Accuracy': 0.5929285714285715, 'F1': 0.029461852861035424, 'ROC-AUC': 0.5009192098617925}
CLF | XGB (TEST): {'Accuracy': 0.5957887323943662, 'F1': 0.0314535452735311, 'ROC-AUC': 0.5021451563007668}


In [27]:
# =======================================
# 7) Simple fairness / slice checks
# =======================================
def print_reg_slice(dfX, y_true, y_pred, name):
    """Print a one-line regression summary for one slice of the data.

    ``dfX`` is used only for its row count; the metrics come from
    ``regression_report(y_true, y_pred)``. ``name`` labels the line.
    """
    rep = regression_report(y_true=y_true, y_pred=y_pred)
    summary_line = f"[{name}] n={len(dfX)} | RMSE={rep['RMSE']:.2f} MAE={rep['MAE']:.2f} R2={rep['R2']:.3f}"
    print(summary_line)

# Report the Random Forest regressor's test metrics separately per gender value.
if "gender" in X_test_r.columns:
    gender_values = sorted(X_test_r["gender"].dropna().unique())
    for g in gender_values:
        idx = X_test_r["gender"].eq(g)
        print_reg_slice(X_test_r[idx], y_test_r[idx], pred_test_rf[idx], f"gender={g}")

[gender=F] n=36281 | RMSE=2.87 MAE=2.49 R2=-0.003
[gender=M] n=34719 | RMSE=2.87 MAE=2.49 R2=-0.004


In [28]:
# ===========================
# 8) Save lightweight artifacts
# ===========================
Path("artifacts").mkdir(exist_ok=True)

# Test-set regression metrics, keyed by model; XGB only when it was trained.
out = {}
out["reg_linear_test"] = regression_report(y_test_r, pred_test_lin)
out["reg_rf_test"] = regression_report(y_test_r, pred_test_rf)
if HAS_XGB:
    out["reg_xgb_test"] = regression_report(y_test_r, pred_test_xgb)

# Serialise first, then write the text in one call.
metrics_json = json.dumps(out, indent=2)
Path("artifacts/metrics_stress.json").write_text(metrics_json)
print("Saved artifacts/metrics_stress.json")

Saved artifacts/metrics_stress.json
