In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

# Read the four raw CSV files into DataFrames.
daily_logs = pd.read_csv("../data/raw/daily_logs.csv")
daily_all = pd.read_csv("../data/raw/daily_all.csv")
interventions = pd.read_csv("../data/raw/interventions.csv")
weekly = pd.read_csv("../data/raw/weekly_summaries.csv")

# Print the (rows, columns) shape of each frame as a quick load check.
for label, frame in (
    ("Daily logs:", daily_logs),
    ("Daily all:", daily_all),
    ("Interventions:", interventions),
    ("Weekly summaries:", weekly),
):
    print(label, frame.shape)


Daily logs: (731000, 26)
Daily all: (731000, 53)
Interventions: (332, 6)
Weekly summaries: (105000, 10)


In [9]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Step 1: pull out the weekly columns that feed the composite burnout score.
burnout_features = weekly[
    [
        "perceived_stress_scale",
        "anxiety_score",
        "depression_score",
        "sleep_debt_hours",
        "job_satisfaction",
    ]
]

# Step 2: z-score every feature, then wrap the resulting array back into a
# DataFrame that carries the original column names and the index of `weekly`.
scaler = StandardScaler()
burnout_z = pd.DataFrame(
    scaler.fit_transform(burnout_features),
    columns=burnout_features.columns,
    index=weekly.index,
)

# Step 3: composite score.
#   - stress, anxiety, depression and sleep debt are added,
#   - job satisfaction is subtracted,
#   - the signed total is divided by the number of features (5).
strain_z_sum = (
    burnout_z["perceived_stress_scale"]
    + burnout_z["anxiety_score"]
    + burnout_z["depression_score"]
    + burnout_z["sleep_debt_hours"]
)
weekly["burnout_score"] = (strain_z_sum - burnout_z["job_satisfaction"]) / 5.0


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the composite
# burnout score, displayed as this cell's output.
weekly["burnout_score"].describe()


count    1.050000e+05
mean    -1.808162e-16
std      6.748538e-01
min     -2.011962e+00
25%     -4.795661e-01
50%     -1.075208e-01
75%      3.998777e-01
max      4.497371e+00
Name: burnout_score, dtype: float64

In [12]:
# Cut points for the three burnout levels: the 0.33 and 0.66 quantiles of the
# composite score.
score_column = weekly["burnout_score"]
low_thr = score_column.quantile(0.33)
high_thr = score_column.quantile(0.66)


def burnout_class(score):
    """Map one burnout score to an ordinal level.

    Returns 0 (low) when the score is below ``low_thr``, 1 (medium) when it is
    below ``high_thr``, and 2 (high) otherwise. The thresholds are read from
    the module-level ``low_thr`` / ``high_thr`` at call time.
    """
    if score < low_thr:
        return 0  # low
    if score < high_thr:
        return 1  # medium
    return 2  # high


# Label every weekly row, then display how many rows landed in each level.
weekly["burnout_level"] = score_column.apply(burnout_class)
weekly["burnout_level"].value_counts()


burnout_level
2    35700
1    34658
0    34642
Name: count, dtype: int64

In [13]:
# Parse the date columns so the `.dt` accessors below operate on timestamps.
daily_logs["date"] = pd.to_datetime(daily_logs["date"])
weekly["week_start"] = pd.to_datetime(weekly["week_start"])

# An ISO week number on its own (1-53) repeats every year, so it cannot
# identify a week once the data covers more than one year: joining on it would
# match each daily row to the same-numbered week of *every* year, duplicating
# daily rows and attaching labels from the wrong year. Pair it with the ISO
# year (not the calendar year: days around New Year can belong to the
# neighbouring ISO year) to get a unique week key.
# NOTE(review): this assumes `week_start` lies in the same ISO (Monday-based)
# week as the daily dates it summarizes - confirm against the data source.
daily_iso = daily_logs["date"].dt.isocalendar()
weekly_iso = weekly["week_start"].dt.isocalendar()

daily_logs["iso_year"] = daily_iso["year"]
daily_logs["week"] = daily_iso["week"]
weekly["iso_year"] = weekly_iso["year"]
weekly["week"] = weekly_iso["week"]

# Attach the weekly burnout labels to every daily row of the same user/week.
# `how="left"` keeps daily rows that have no weekly summary (labels are NaN).
# `validate="many_to_one"` raises pandas.errors.MergeError if the weekly side
# has duplicate (user_id, iso_year, week) keys, so the merge can never
# silently multiply daily rows.
merged = pd.merge(
    daily_logs,
    weekly[["user_id", "iso_year", "week", "burnout_score", "burnout_level"]],
    on=["user_id", "iso_year", "week"],
    how="left",
    validate="many_to_one",
)
