# Stress Level Prediction Model

This notebook builds an XGBoost classifier to predict stress levels (Low, Medium, High) based on lifestyle and health factors.

In [None]:
import pandas as pd
import numpy as np
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the preprocessed dataset, relative to the notebook's working directory.
data_path = Path("../data/processed/stress_data_processed.csv")

# Read the CSV into a DataFrame and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer=data_path)
print(f"Dataset shape: {df.shape}")

# Last expression of the cell: the notebook renders the first five rows.
df.head(n=5)

In [None]:
# Numeric input columns used by the model.
numeric_features = [
    'Sleep_Duration',
    'Sleep_Quality',
    'Screen_Time',
    'Physical_Activity',
    'Caffeine_Intake',
    'Work_Hours',
    'Travel_Time',
    'Social_Interactions',
]

# Meditation_Practice is already stored as 0/1, so it is used without further encoding.
categorical_features = ['Meditation_Practice']

# Exercise_Type needs encoding: LabelEncoder assigns an integer code to each
# distinct value. The fitted encoder is kept in a variable so the same mapping
# can be reused later.
exercise_encoder = LabelEncoder()
exercise_encoder.fit(df['Exercise_Type'])
df['Exercise_Type_Encoded'] = exercise_encoder.transform(df['Exercise_Type'])

# Final column order of the feature matrix.
all_features = [*numeric_features, *categorical_features, 'Exercise_Type_Encoded']
print(f"Features: {all_features}")

In [None]:
# Feature matrix and target vector, taken as copies so later steps do not
# write back into df.
X = df.loc[:, all_features].copy()
y = df.loc[:, 'Stress_Level_Encoded'].copy()

print(f"X shape: {X.shape}")
# Class counts ordered by encoded label, to make any imbalance visible.
print(f"Target distribution:\n{y.value_counts().sort_index()}")

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")

In [None]:
# Learn the per-feature mean and standard deviation from the training split only,
# then apply that same transformation to both splits so no test-set statistics
# influence the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Oversample the training split with SMOTE (seeded for repeatability).
# The test split is left untouched.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train_scaled, y_train)
X_train_resampled, y_train_resampled = resampled

print(f"After SMOTE: {X_train_resampled.shape}")
# Per-class counts after oversampling, ordered by encoded label.
print(f"Resampled distribution:\n{pd.Series(y_train_resampled).value_counts().sort_index()}")

In [None]:
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200],
    'min_child_weight': [1, 3]
}

xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    xgb, param_grid, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1
)
grid_search.fit(X_train_resampled, y_train_resampled)

print(f"Best params: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

In [None]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Low', 'Medium', 'High']))

print(f"\nAccuracy: {accuracy_score(y_test, y_pred):.4f}")

In [None]:
# Confusion matrix on the held-out split: rows are true classes, columns are
# predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print("Confusion Matrix:")
print(cm)

In [None]:
# Output directory for everything produced by this notebook; created if it
# does not exist yet.
models_dir = Path("../models")
models_dir.mkdir(exist_ok=True)

# File name -> object to pickle, written in this order.
artifacts = {
    "xgb_stress_model.pkl": best_model,          # trained model
    "scaler.pkl": scaler,                        # fitted feature scaler
    "feature_names.pkl": all_features,           # feature column order
    "exercise_encoder.pkl": exercise_encoder,    # fitted Exercise_Type encoder
    "confusion_matrix.pkl": cm,                  # confusion matrix for dashboard
}

for file_name, artifact in artifacts.items():
    with (models_dir / file_name).open("wb") as f:
        pickle.dump(artifact, f)

print("All artifacts saved to models/")