# Stress Level Prediction Model

This notebook builds an XGBoost classifier to predict stress levels (Low, Medium, High) based on lifestyle and health factors.

In [1]:
import pandas as pd
import numpy as np
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
warnings.filterwarnings('ignore')

In [2]:
data_path = Path("../data/processed/stress_data_processed.csv")
df = pd.read_csv(data_path)
print(f"Dataset shape: {df.shape}")
df.head()

Dataset shape: (773, 33)


Unnamed: 0,Age,Gender,Occupation,Marital_Status,Sleep_Duration,Sleep_Quality,Wake_Up_Time,Bed_Time,Physical_Activity,Screen_Time,...,Wake_Up_Time_Minutes,Stress_Level_Encoded,Sleep_Efficiency,Screen_Activity_Ratio,Work_Travel_Total,High_Screen_Time,Low_Sleep,Sleep_Duration_Bin,Work_Hours_Bin,Screen_Time_Bin
0,30,Male,Software Engineer,Single,7.0,4.0,7:00 AM,10:00 PM,2.0,4.0,...,420,0,0.571429,1.904762,9.0,0,0,Normal (6-7h),Normal (6-8h),Moderate (2-4h)
1,35,Female,Marketing Manager,Married,6.0,3.0,6:00 AM,11:00 PM,1.0,3.0,...,360,1,0.5,2.727273,11.0,0,0,Low (5-6h),High (8-10h),Moderate (2-4h)
2,40,Male,Data Scientist,Divorced,7.0,4.0,7:00 AM,10:00 PM,2.0,4.0,...,420,2,0.571429,1.904762,9.0,0,0,Normal (6-7h),Normal (6-8h),Moderate (2-4h)
3,35,Male,Software Engineer,Single,7.0,4.0,7:00 AM,10:00 PM,2.0,4.0,...,420,0,0.571429,1.904762,9.0,0,0,Normal (6-7h),Normal (6-8h),Moderate (2-4h)
4,29,Female,Teacher,Single,8.0,5.0,6:30 AM,10:30 PM,3.0,2.0,...,390,0,0.625,0.645161,8.0,0,0,Good (7-8h),Normal (6-8h),Low (<2h)


In [3]:
# Select features for the model
numeric_features = [
    'Sleep_Duration', 'Sleep_Quality', 'Screen_Time',
    'Physical_Activity', 'Caffeine_Intake', 'Work_Hours',
    'Travel_Time', 'Social_Interactions'
]

# Meditation_Practice is already encoded as 0/1
categorical_features = ['Meditation_Practice']

# Exercise_Type needs encoding
exercise_encoder = LabelEncoder()
df['Exercise_Type_Encoded'] = exercise_encoder.fit_transform(df['Exercise_Type'])

all_features = numeric_features + categorical_features + ['Exercise_Type_Encoded']
print(f"Features: {all_features}")

Features: ['Sleep_Duration', 'Sleep_Quality', 'Screen_Time', 'Physical_Activity', 'Caffeine_Intake', 'Work_Hours', 'Travel_Time', 'Social_Interactions', 'Meditation_Practice', 'Exercise_Type_Encoded']


In [4]:
X = df[all_features].copy()
y = df['Stress_Level_Encoded'].copy()

print(f"X shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts().sort_index()}")

X shape: (773, 10)
Target distribution:
Stress_Level_Encoded
0    162
1    310
2    301
Name: count, dtype: int64


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")

Train: (618, 10), Test: (155, 10)


In [6]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
print(f"After SMOTE: {X_train_resampled.shape}")
print(f"Resampled distribution:\n{pd.Series(y_train_resampled).value_counts().sort_index()}")

After SMOTE: (744, 10)
Resampled distribution:
Stress_Level_Encoded
0    248
1    248
2    248
Name: count, dtype: int64


In [8]:
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200],
    'min_child_weight': [1, 3]
}

xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    xgb, param_grid, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1
)
grid_search.fit(X_train_resampled, y_train_resampled)

print(f"Best params: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best params: {'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 100}
Best CV score: 0.7433


In [9]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Low', 'Medium', 'High']))

print(f"\nAccuracy: {accuracy_score(y_test, y_pred):.4f}")

Classification Report:
              precision    recall  f1-score   support

         Low       0.64      0.48      0.55        33
      Medium       0.63      0.69      0.66        62
        High       0.73      0.75      0.74        60

    accuracy                           0.67       155
   macro avg       0.67      0.64      0.65       155
weighted avg       0.67      0.67      0.67       155


Accuracy: 0.6710


In [10]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[16 12  5]
 [ 7 43 12]
 [ 2 13 45]]


In [11]:
models_dir = Path("../models")
models_dir.mkdir(exist_ok=True)

# Save model
with open(models_dir / "xgb_stress_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

# Save scaler
with open(models_dir / "scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Save feature names
with open(models_dir / "feature_names.pkl", "wb") as f:
    pickle.dump(all_features, f)

# Save exercise encoder
with open(models_dir / "exercise_encoder.pkl", "wb") as f:
    pickle.dump(exercise_encoder, f)

# Save confusion matrix for dashboard
with open(models_dir / "confusion_matrix.pkl", "wb") as f:
    pickle.dump(cm, f)

print("All artifacts saved to models/")

All artifacts saved to models/
