In [22]:
import pandas as pd
import numpy as np
from datetime import timedelta
from datetime import datetime



In [2]:
#Load task duration dataframe
task_duration_df = pd.read_csv('task_duration_config.csv')
task_duration_df.set_index('Task', inplace=True)


In [3]:
task_duration_df.head()

Unnamed: 0_level_0,Avg,Max
Task,Unnamed: 1_level_1,Unnamed: 2_level_1
New Client Onboarding Request,0.5,1.5
Review Documents,0.25,2.5
Automated Scoreboarding,0.1,0.15
Manual Scoreboarding,1.0,3.0
Update Backend Systems,0.25,0.5


In [4]:
def build_instance_task_list(instance_id, process_variant, start_date_time, task_duration_df):
    instance_task_list = []
    rnd = np.random.default_rng()  # a random number generator

    # This code generates a random duration by leveraging the numpy lognormal function that gives a random
    # number drawn from a log normal distribution. Look up a picture of log normal distributions and you'll see 
    # why this is useful for generating random durations based on a mean and std deviation.
    # most samples are near the average with a long tail stretching towards infinity.

    # process_variant is a series
    for task_type in process_variant:
        # get the avg and max durations from the task_df dataframe using task_type as the key
        avg_dur = task_duration_df.loc[task_type, 'Avg']
        max_dur = task_duration_df.loc[task_type, 'Max']

        sigma = (max_dur - avg_dur) / max_dur  # std dev
        log_mean = np.log(avg_dur)  # can't pass the mean duration into lognormal until it has been logged itself 
        delta = rnd.lognormal(log_mean, sigma)  # get a random sample from a log normal distribution with a std dev

        task = [instance_id, task_type, start_date_time]
        instance_task_list.append(task)

        # increment the start time by the delta so that the next task start after this one
        start_date_time = start_date_time + timedelta(hours=delta)

    return instance_task_list

In [9]:
def generate_process_instances(variant, qty):

    global start_date_time
    global instance_counter
    task_list = []
    for x in range(0, qty):
        tasks = build_instance_task_list(instance_counter, variant, start_date_time, task_duration_df)
        for task in tasks:
            task_list.append(task)

        # Increment the start time by 24 hour
        start_date_time = start_date_time + timedelta(hours=24)
        # Increment the process_id
        instance_counter = instance_counter +1
        
    return task_list

In [38]:
start_date_time = datetime(2017, 11, 28, 18, 00, 00)
instance_time_offset = 24
instance_counter = 1

In [39]:

qty = 1
happy_path = ["New Client Onboarding Request", 
               "Review Documents", 
               "Automated Scoreboarding",
               "Update Backend Systems", 
               "Notification Review Request Completed"]

happy_path_task_list = []
happy_path_task_list = generate_process_instances(happy_path, qty)
happy_path_task_list_df = pd.DataFrame(happy_path_task_list)

# Increment the start time by 24 hour
start_date_time = start_date_time + timedelta(hours=24)

In [40]:

qty = 1
manual_path = ["New Client Onboarding Request", 
               "Review Documents", 
               "Automated Scoreboarding",
               "Manual Scoreboarding",
               "Update Backend Systems", 
               "Notification Review Request Completed"]

manual_path_task_list = []
manual_path_task_list = generate_process_instances(manual_path, qty)
manual_path_task_list_df = pd.DataFrame(manual_path_task_list)

# Increment the start time by 24 hour
start_date_time = start_date_time + timedelta(hours=24)

In [41]:
combined_df = pd.concat([happy_path_task_list_df,manual_path_task_list_df], axis=0)


In [43]:
combined_df.columns = ['process_id', 'task', 'start_time']

In [44]:
combined_df.head(25)

Unnamed: 0,process_id,task,start_time
0,1,New Client Onboarding Request,2017-11-28 18:00:00.000000
1,1,Review Documents,2017-11-28 18:07:02.697932
2,1,Automated Scoreboarding,2017-11-28 18:18:12.742701
3,1,Update Backend Systems,2017-11-28 18:26:53.495595
4,1,Notification Review Request Completed,2017-11-28 18:59:43.473563
0,2,New Client Onboarding Request,2017-11-30 18:00:00.000000
1,2,Review Documents,2017-11-30 18:36:02.036339
2,2,Automated Scoreboarding,2017-11-30 19:00:24.181003
3,2,Manual Scoreboarding,2017-11-30 19:07:47.673843
4,2,Update Backend Systems,2017-11-30 19:34:04.452911


In [47]:
combined_df.to_csv('pm_tasks.csv', index=False, date_format='%d-%m-%Y %H:%M:%S')