In [5]:
import numpy as np
import pandas as pd
import random

# Pin both random streams (stdlib `random` and NumPy's global generator)
# so that re-running the cell reproduces the same dataset.
random.seed(42)
np.random.seed(42)

# Per-class quota: 666 rows for each of the three priority labels,
# i.e. about 2000 rows split evenly.
target_samples_per_class = 666
num_samples = 3 * target_samples_per_class  # 1998 rows in total

# Column-oriented storage: every column starts as its own empty list.
data = {
    column: []
    for column in (
        "Days_Left",
        "Task_Type",
        "Estimated_Duration",
        "Deadline_Time",
        "Task_Importance",
        "Past_Completion_Rate",
        "Number_Of_Overdue_Tasks",
        "Priority_Level",
    )
}

# Categories a task can be drawn from
task_types = ["Work", "Personal", "Student"]

def generate_deadline_time():
    """Return a random 12-hour clock time formatted like "07:05 PM".

    Draws, in this order, from the stdlib ``random`` module: an hour in
    1..12, a minute in 0..59, and an "AM"/"PM" marker. Hour and minute
    are zero-padded to two digits.
    """
    hh = random.randint(1, 12)
    mm = random.randint(0, 59)
    meridiem = random.choice(["AM", "PM"])
    clock = "%02d:%02d" % (hh, mm)
    return " ".join((clock, meridiem))

# Running tally of accepted rows per priority label
priority_counts = {"High": 0, "Medium": 0, "Low": 0}

# Rejection sampling: keep drawing candidate tasks until every label has
# reached its quota; candidates for an already-full label are discarded.
while any(count < target_samples_per_class for count in priority_counts.values()):
    # Draw one candidate. The order of the RNG calls is kept fixed so the
    # seeded run stays reproducible.
    days_left = np.random.randint(0, 31)  # 0..30 days
    task_type = random.choice(task_types)
    estimated_duration = np.random.randint(1, 5)  # 1..4 hours
    deadline_time = generate_deadline_time()
    task_importance = np.random.randint(1, 6)  # 1..5
    past_completion_rate = np.random.randint(50, 101)  # 50..100 percent
    number_of_overdue_tasks = np.random.randint(0, 6)  # 0..5 tasks

    # Weighted priority score: urgency, importance and backlog raise it,
    # a high past completion rate lowers it.
    urgency_term = 0.4 * ((30 - days_left) / 30)
    importance_term = 0.3 * (task_importance / 5)
    backlog_term = 0.2 * (number_of_overdue_tasks / 5)
    completion_term = 0.1 * (past_completion_rate / 100)
    score = urgency_term + importance_term + backlog_term - completion_term

    # Bucket the score: below 0.3 is Low, below 0.6 is Medium, else High
    if score < 0.3:
        priority = "Low"
    elif score < 0.6:
        priority = "Medium"
    else:
        priority = "High"

    # Drop the candidate when its label already has enough rows
    if priority_counts[priority] >= target_samples_per_class:
        continue

    row = {
        "Days_Left": days_left,
        "Task_Type": task_type,
        "Estimated_Duration": estimated_duration,
        "Deadline_Time": deadline_time,
        "Task_Importance": task_importance,
        "Past_Completion_Rate": past_completion_rate,
        "Number_Of_Overdue_Tasks": number_of_overdue_tasks,
        "Priority_Level": priority,
    }
    for column, value in row.items():
        data[column].append(value)
    priority_counts[priority] += 1

# Assemble the accumulated columns into a table and persist it as CSV
df = pd.DataFrame(data)
df.to_csv("synthetic_task_data_balanced.csv", index=False)

# Report the row count and the per-label distribution
print(f"Synthetic balanced dataset generated with {len(df)} rows and saved to 'synthetic_task_data_balanced.csv'.")
print("Priority Level Distribution:")
print(df["Priority_Level"].value_counts())

Synthetic balanced dataset generated with 1998 rows and saved to 'synthetic_task_data_balanced.csv'.
Priority Level Distribution:
Priority_Level
High      666
Medium    666
Low       666
Name: count, dtype: int64
