In [1]:
import numpy as np
import pandas as pd

In [2]:
def generate_realistic_employee_data(num_employees=200, random_state=42):
    """Build a synthetic per-employee feature table with two outcome scores.

    Features are sampled independently from simple distributions (normal,
    Poisson, beta, uniform) and clipped to fixed ranges. A few are derived
    from others: ``communication_balance`` from sent/received messages,
    ``avg_tasks_completed_per_week`` from assigned tasks and completion rate,
    ``overdue_task_ratio`` from completion rate plus noise, and the sentiment
    mean is centred on ``0.1 - overdue_ratio``. The two outcome columns are
    noisy linear combinations of features, clipped to ``[0, 1]``.

    Parameters
    ----------
    num_employees : int, default 200
        Number of rows to generate.
    random_state : int, None or numpy.random.RandomState, default 42
        Seed for the generator. An int or ``None`` creates a private
        ``RandomState``; an existing ``RandomState`` instance is used as-is
        (and advanced). The global ``np.random`` state is never touched.

    Returns
    -------
    pandas.DataFrame
        One row per employee, 21 feature columns followed by
        ``performance_score`` and ``burnout_risk_score``.
    """
    # Use a private generator instead of np.random.seed(): reseeding the
    # global RNG would silently alter every later random draw in the caller.
    # RandomState(seed) yields the same stream as seed() + np.random.*, so
    # output for a given seed is unchanged.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # NOTE: the order of the rng.* calls below determines the generated
    # values; reordering them changes the dataset for a given seed.

    # ----- Calendar -----
    meeting_hours = rng.normal(8, 3, num_employees).clip(2, 15)
    meeting_counts = rng.poisson(12, num_employees).clip(5, 25)

    # ----- Communication -----
    messages_sent = rng.normal(80, 25, num_employees).clip(20, 150).astype(int)
    messages_received = rng.normal(100, 30, num_employees).clip(30, 180).astype(int)
    latency = rng.normal(10, 5, num_employees).clip(1, 30)
    burstiness = rng.beta(2, 4, num_employees)
    after_hours_ratio = rng.beta(1.5, 10, num_employees)
    # +1 in the denominator keeps the ratio finite; result is clamped to [0.6, 1.4].
    balance = np.clip(messages_sent / (messages_received + 1), 0.6, 1.4)
    conversation_len = rng.normal(12, 5, num_employees).clip(5, 30)

    # ----- Project Management -----
    tasks_assigned = rng.poisson(25, num_employees).clip(10, 40)
    completion_rate = rng.uniform(0.7, 1.0, num_employees)
    tasks_completed = (tasks_assigned * completion_rate).round().astype(int)
    task_age = rng.normal(6, 3, num_employees).clip(1, 15)
    overdue_ratio = np.clip(
        1 - completion_rate + rng.normal(0, 0.05, num_employees), 0, 0.3
    )
    # Per-employee mean: output size is taken from the overdue_ratio array.
    sentiment = rng.normal(0.1 - overdue_ratio, 0.3).clip(-1, 1)

    # ----- Attendance -----
    logged_hours = rng.normal(42, 4, num_employees).clip(35, 50)
    var_hours = rng.normal(1.5, 0.5, num_employees).clip(0, 3)
    late_starts = rng.poisson(2, num_employees).clip(0, 6)
    early_exits = rng.poisson(1, num_employees).clip(0, 4)
    absenteeism = rng.uniform(0, 0.08, num_employees)
    avg_break = rng.normal(45, 15, num_employees).clip(30, 90)

    # ----- Outcomes (y) -----
    performance = (
        0.4 * completion_rate
        + 0.2 * (1 - absenteeism)
        + 0.1 * (1 - overdue_ratio)
        + 0.1 * sentiment
        + 0.1 * (1 - after_hours_ratio)
        + rng.normal(0, 0.03, num_employees)
    )
    performance = np.clip(performance, 0, 1)

    burnout = (
        0.3 * (meeting_hours / 15)
        + 0.3 * after_hours_ratio
        + 0.2 * (logged_hours / 50)
        - 0.2 * sentiment
        + rng.normal(0, 0.05, num_employees)
    )
    burnout = np.clip(burnout, 0, 1)

    # ----- Assemble -----
    df = pd.DataFrame({
        "meeting_hours_per_week": meeting_hours.round(1),
        "meeting_counts_per_week": meeting_counts,
        "messages_sent_per_day": messages_sent,
        "messages_received_per_day": messages_received,
        "avg_response_latency_min": latency.round(1),
        "communication_burstiness": burstiness.round(2),
        "after_hours_message_ratio": after_hours_ratio.round(3),
        "communication_balance": balance.round(2),
        "conversation_length_avg": conversation_len.round(1),
        "avg_tasks_assigned_per_week": tasks_assigned,
        "avg_tasks_completed_per_week": tasks_completed,
        "task_completion_rate": completion_rate.round(2),
        "avg_task_age_days": task_age.round(1),
        "overdue_task_ratio": overdue_ratio.round(2),
        "task_comment_sentiment_mean": sentiment.round(2),
        "logged_hours_per_week": logged_hours.round(1),
        "variance_in_work_hours": var_hours.round(2),
        "late_start_count_per_month": late_starts,
        "early_exit_count_per_month": early_exits,
        "absenteeism_rate": absenteeism.round(3),
        "avg_break_length_minutes": avg_break.round(1),
        "performance_score": performance.round(2),
        "burnout_risk_score": burnout.round(2),
    })

    return df

In [3]:
# Generate a 2000-row synthetic dataset with the function's default seed.
df = generate_realistic_employee_data(num_employees=2000)

In [4]:
# Preview the first five rows as the cell's rich output.
df.head(n=5)

Unnamed: 0,meeting_hours_per_week,meeting_counts_per_week,messages_sent_per_day,messages_received_per_day,avg_response_latency_min,communication_burstiness,after_hours_message_ratio,communication_balance,conversation_length_avg,avg_tasks_assigned_per_week,...,overdue_task_ratio,task_comment_sentiment_mean,logged_hours_per_week,variance_in_work_hours,late_start_count_per_month,early_exit_count_per_month,absenteeism_rate,avg_break_length_minutes,performance_score,burnout_risk_score
0,9.5,11,93,133,9.6,0.37,0.201,0.69,12.2,19,...,0.29,0.04,43.4,1.04,5,0,0.044,59.0,0.64,0.38
1,7.6,10,61,112,16.6,0.2,0.072,0.6,20.7,23,...,0.15,0.07,40.1,1.27,4,0,0.03,52.3,0.75,0.3
2,9.9,12,106,71,6.5,0.29,0.104,1.4,11.6,19,...,0.15,0.26,46.1,1.93,3,0,0.007,59.8,0.7,0.41
3,12.6,6,48,42,19.3,0.48,0.18,1.12,9.8,30,...,0.09,-0.27,41.2,1.38,0,1,0.068,41.0,0.72,0.53
4,7.3,15,92,47,6.6,0.54,0.054,1.4,17.7,25,...,0.26,0.12,42.4,1.51,3,0,0.054,54.0,0.67,0.31


In [5]:
# Write the dataset to disk without the integer row index.
df.to_csv(path_or_buf="employee_data.csv", index=False)