In [55]:
import numpy as np
import pandas as pd

np.random.seed(42)  # seed NumPy's global RNG so repeated runs produce the same data
N_USERS = 1000

# Target distribution: 20% High, 30% Moderate, and whatever remains is Low,
# so the three counts always add up to N_USERS. Insertion order matters:
# records are generated label by label in this order.
TARGET_DISTRIBUTION = {
    "High": int(N_USERS * 0.20),
    "Moderate": int(N_USERS * 0.30),
}
TARGET_DISTRIBUTION["Low"] = N_USERS - sum(TARGET_DISTRIBUTION.values())

# Burnout cutoff ranges (weekly averages), stored as (lower, upper) bounds
# for each level of each subscale. Note that PA runs the other way round:
# its lowest averages belong to the "High" level.
RANGES = {
    "EE": dict(Low=(0.0, 2.0), Moderate=(2.1, 2.9), High=(3.0, 6.0)),
    "DP": dict(Low=(0.0, 1.0), Moderate=(1.1, 1.9), High=(2.0, 6.0)),
    "PA": dict(Low=(4.88, 6.0), Moderate=(4.0, 4.75), High=(0.0, 3.88)),
}

# Classification functions
def classify_ee(avg_tired):
    """Map a weekly average "tired" score to an EE level.

    Returns "High" for averages of 3.0 or more, "Moderate" for averages of
    at least 2.1 but below 3.0, and "Low" for everything else.
    """
    # Cutoffs are tried from the highest band downwards; the first hit wins.
    for cutoff, level in ((3.0, "High"), (2.1, "Moderate")):
        if avg_tired >= cutoff:
            return level
    return "Low"

def classify_dp(avg_meaningful):
    """Map a weekly average "meaningful" score to a DP level.

    The score is inverted first (DP = 6 - avg_meaningful), so a lower
    "meaningful" average yields a higher DP value.

    Returns "High" when DP is 2.0 or more, "Moderate" when DP is at least
    1.1 but below 2.0, and "Low" otherwise.
    """
    # Round away binary floating-point noise from the subtraction so that
    # boundary inputs land in the intended band: 6 - 4.9 evaluates to
    # 1.0999999999999996, which would otherwise miss the 1.1 cutoff.
    dp = round(6 - avg_meaningful, 6)  # Inverted
    if dp >= 2.0:
        return "High"
    elif dp >= 1.1:
        return "Moderate"
    else:
        return "Low"

def classify_pa(avg_capable):
    """Map a weekly average "capable" score to a PA level.

    This scale runs in the opposite direction to the raw score: averages
    of 3.88 or less are "High", averages above 3.88 up to and including
    4.75 are "Moderate", and anything else is "Low".
    """
    within_high = avg_capable <= 3.88
    within_moderate = avg_capable <= 4.75
    if within_high:
        return "High"
    return "Moderate" if within_moderate else "Low"

def compute_burnout_label(ee, dp, pa):
    """Combine the three subscale levels into one overall burnout label.

    Rules, applied in order:
      * two or more "High" subscales              -> "High"
      * a single "High", or two or more "Moderate" -> "Moderate"
      * anything else                              -> "Low"
    """
    subscales = (ee, dp, pa)
    high_votes = sum(level == "High" for level in subscales)
    moderate_votes = sum(level == "Moderate" for level in subscales)

    if high_votes >= 2:
        return "High"
    if high_votes >= 1 or moderate_votes >= 2:
        return "Moderate"
    return "Low"

def choose_subscale_levels(target_label):
    """Return the [EE, DP, PA] levels to aim for when simulating a label.

    All three subscales are set to the same level: "High" for a "High"
    target (max risk), "Moderate" for a "Moderate" target (balanced
    subscales), and "Low" for any other value (least risk). A fresh list
    is returned on every call.
    """
    for known_label in ("High", "Moderate"):
        if target_label == known_label:
            return [known_label] * 3
    # Every label other than "High"/"Moderate" falls back to all-"Low".
    return ["Low"] * 3

# Score generator with loosened tolerance
def generate_scores_for_range(target_avg, tolerance=0.3, n=7):
    """Draw `n` random integer scores whose mean is close to `target_avg`.

    Scores are sampled uniformly from the integers within roughly one point
    of the target, clipped to the 0-6 scale, using NumPy's global random
    state. Sampling is retried up to 1000 times until the mean of the draw
    lies within `tolerance` of `target_avg`.

    Returns:
        A NumPy integer array of length `n`.

    Raises:
        ValueError: if none of the 1000 draws meets the tolerance.
    """
    lowest = max(0, int(np.floor(target_avg - 1)))
    highest = min(6, int(np.ceil(target_avg + 1)))

    attempts_left = 1000
    while attempts_left > 0:
        attempts_left -= 1
        # randint's upper bound is exclusive, hence the +1.
        candidate = np.random.randint(lowest, highest + 1, size=n)
        if abs(candidate.mean() - target_avg) <= tolerance:
            return candidate

    raise ValueError(f"Failed to generate scores with target average: {target_avg}")

# Fully validated record generator
def generate_user_validated(user_id, target_label):
    """Simulate one user's weekly averages that classify as `target_label`.

    On each attempt a target average is drawn for every subscale from the
    matching RANGES band, daily scores are generated around those targets,
    and the resulting averages are re-classified. The record is accepted
    only when the recomputed overall label equals `target_label`; otherwise
    the attempt is discarded and a new one is made (up to 1000 attempts).

    Returns:
        A dict with keys "user_id", "avg_tired", "avg_capable",
        "avg_meaningful" (averages rounded to 2 decimals) and "burnout".

    Raises:
        ValueError: if no attempt yields a record matching `target_label`.
    """
    # The wanted subscale levels depend only on the label, so look them up once.
    ee_level, dp_level, pa_level = choose_subscale_levels(target_label)
    ee_band = RANGES["EE"][ee_level]
    dp_band = RANGES["DP"][dp_level]
    pa_band = RANGES["PA"][pa_level]

    attempts_left = 1000
    while attempts_left > 0:
        attempts_left -= 1

        # Draw order (EE, DP, PA) is kept fixed so seeded runs are reproducible.
        ee_target = np.round(np.random.uniform(*ee_band), 2)
        dp_target = np.round(np.random.uniform(*dp_band), 2)
        pa_target = np.round(np.random.uniform(*pa_band), 2)

        try:
            tired = generate_scores_for_range(ee_target)
            # "meaningful" is stored on the inverted scale relative to DP.
            meaningful = generate_scores_for_range(6 - dp_target)
            capable = generate_scores_for_range(pa_target)
        except ValueError:
            # No score set matched one of the targets; start a fresh attempt.
            continue

        tired_mean = np.mean(tired)
        meaningful_mean = np.mean(meaningful)
        capable_mean = np.mean(capable)

        # The generated means may drift from their targets (within the
        # generator's tolerance), so re-classify before accepting.
        achieved_label = compute_burnout_label(
            classify_ee(tired_mean),
            classify_dp(meaningful_mean),
            classify_pa(capable_mean),
        )
        if achieved_label != target_label:
            continue

        return {
            "user_id": user_id,
            "avg_tired": round(tired_mean, 2),
            "avg_capable": round(capable_mean, 2),
            "avg_meaningful": round(meaningful_mean, 2),
            "burnout": target_label
        }

    raise ValueError(f"Could not generate a conforming record for {target_label}")

# Generate dataset: labels are processed in TARGET_DISTRIBUTION order and
# user ids are handed out consecutively, starting at 1.
records_clean = []
user_id = 1
for label, count in TARGET_DISTRIBUTION.items():
    next_id = user_id + count
    records_clean.extend(
        generate_user_validated(uid, label) for uid in range(user_id, next_id)
    )
    user_id = next_id

# Shuffle the rows with a fixed seed so the classes are interleaved, then
# save to CSV
df_clean = (
    pd.DataFrame(records_clean)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df_clean.to_csv("./Data/simulated_weekly_burnout.csv", index=False)

# Print counts
print("🔥 Burnout class counts:")
print(df_clean["burnout"].value_counts())


🔥 Burnout class counts:
burnout
Low         500
Moderate    300
High        200
Name: count, dtype: int64
