In [1]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta
from datetime import datetime



## Load Task Duration Config
For each task type that could be generated we need an average duration and a maximum duration.

In [2]:
#Load task duration dataframe
#task_duration_df = pd.read_csv('task_duration_config.csv')
#task_duration_df.set_index('Task', inplace=True)

data = [['New Client Onboarding Request', 0.5,1.5], 
        ['Review Documents', 0.25,  2.5], 
        ['Automated Scoreboarding', 0.1, 0.15],
        ['Manual Scoreboarding', 1.0, 3.0],
        ['Update Backend Systems', 0.25, 0.5],
        ['Notification Review Request Completed', 0.1, 0.15]
       ]

task_duration_df = pd.DataFrame(data, columns=['Task', 'Avg', 'Max'])
task_duration_df.set_index('Task', inplace=True)

In [4]:
task_duration_df.head(10)

Unnamed: 0_level_0,Avg,Max
Task,Unnamed: 1_level_1,Unnamed: 2_level_1
New Client Onboarding Request,0.5,1.5
Review Documents,0.25,2.5
Automated Scoreboarding,0.1,0.15
Manual Scoreboarding,1.0,3.0
Update Backend Systems,0.25,0.5
Notification Review Request Completed,0.1,0.15


## Utility Functions

### Generate Process Instances
This function manages the creation of process instances for a specific process variant. A process variant is simply the list of tasks in execution order, including any loops. For example ['Wake Up', 'Breakfast', 'Work', 'Dinner' 'Sleep']

In [5]:
def generate_process_instances(process_variant, qty):

    global start_date_time
    global instance_counter
    task_list = []
    for x in range(0, qty):
        tasks = build_task_list(instance_counter, process_variant, start_date_time, task_duration_df)
        for task in tasks:
            task_list.append(task)

        # Increment the start time by 24 hour
        start_date_time = start_date_time + timedelta(hours=24)
        # Increment the process_id
        instance_counter = instance_counter +1
        
    return task_list

## Build Task List
The build_task_list function creates a list of tasks for a specific instance of a process. The sequence of tasks is defined in the variable called process_variant. Random task durations are calculated using the task_durations_df that contains the task name, the average duration and the max duration.  

This code generates a random duration by leveraging the numpy lognormal function that gives a random
number drawn from a log normal distribution. Look up a picture of log normal distributions and you'll see 
why this is useful for generating random durations based on a mean and std deviation.
most samples are near the average with a long tail stretching towards infinity.


In [50]:
def build_task_list(instance_id, process_variant, start_date_time, task_duration_df):
    instance_task_list = []
    rnd = np.random.default_rng()  # a random number generator

    # process_variant is a series
    for task_type in process_variant:
        # get the avg and max durations from the task_df dataframe using task_type as the key
        avg_dur = task_duration_df.loc[task_type, 'Avg']
        max_dur = task_duration_df.loc[task_type, 'Max']

        sigma = (max_dur - avg_dur) / max_dur  # std dev
        log_mean = np.log(avg_dur)  # can't pass the mean duration into lognormal until it has been logged itself 
        delta = rnd.lognormal(log_mean, sigma)  # get a random sample from a log normal distribution with a std dev
        end_time = start_date_time + timedelta(hours=delta)
        task = [instance_id, task_type, start_date_time, end_time]
        instance_task_list.append(task)

        # set the start time of the next tasks to be 5 mins after the end time of this task
        start_date_time = end_time + timedelta(minutes=5)

    return instance_task_list

## Delay Task Start Time
This function shifts the task start time for a specific task with a specific attribute set to a specific value. For example, to delay the start time of breakfast by 1hour "task_list, User, Gerry, Eat Breakfast, 1"

This function will apply the same delay to every subsequent task of the process instance. 

In [145]:
def delayStartTime(tasks, target_attribute, target_value, target_task, timeShift):

    # Logic requires we sort by process_id and start_date
    tasks = tasks.sort_values(['process_id', 'start_time'])
    tasks = tasks.reset_index(drop=True)
    process_being_modified = -1

    # iterate through the dataframe using the index value
    for x in tasks.index:
        row = tasks.loc[x]
        task_attribute = row[target_attribute]
        row_task = row['task']
        current_process = row['process_id']


        # Have we found the target activity
        if row_task == target_task and task_attribute == target_value:
            process_being_modified = current_process

        # Are we still processing the same process that we were when we found the target activity ?
        # If so we are moving all subsequent tasks back by the value of the timeshift parameter,
        # this means shifting the start time and the end time
        if current_process == process_being_modified:
            # shift the start time
            current_start = row['start_time']
            shifted_start = current_start + timedelta(hours=timeShift)
            tasks.loc[x, 'start_time'] = shifted_start
            
            # shift the end time
            current_end = row['end_time']
            shifted_end = current_end + timedelta(hours=timeShift)
            tasks.loc[x, 'end_time'] = shifted_end
            
    return tasks

In [128]:
def increaseDuration(tasks, target_attribute, target_value, target_task, timeShift):

    # Logic requires we sort by process_id and start_date
    tasks = tasks.sort_values(['process_id', 'start_time'])
    tasks = tasks.reset_index(drop=True)
    process_being_modified = -1

    # iterate through the dataframe using the index value
    for x in tasks.index:
        row = tasks.loc[x]
        task_attribute = row[target_attribute]
        row_task = row['task']
        current_process = row['process_id']
        processing_target_task = False

        # Have we found the target activity
        if row_task == target_task and task_attribute == target_value:
            process_being_modified = current_process
            processing_target_task = True

        # Are we still processing the same process that we were when we found the target activity ?
        # If so we are moving all subsequent tasks back by the value of the timeshift parameter,
        # this means shifting the start time and the end time
        if current_process == process_being_modified:
            
            if processing_target_task :
                # just increase the duration by increasing the end_time 
                current_end = row['end_time']
                shifted_end = current_end + timedelta(hours=timeShift)
                tasks.loc[x, 'end_time'] = shifted_end
            else:
                
                # increase start_time and end-time
                current_start = row['start_time']
                shifted_start = current_start + timedelta(hours=timeShift)
                tasks.loc[x, 'start_time'] = shifted_start
            
                # shift the end time
                current_end = row['end_time']
                shifted_end = current_end + timedelta(hours=timeShift)
                tasks.loc[x, 'end_time'] = shifted_end
            
    return tasks

## Initialise Data Generator

In [146]:
start_date_time = datetime(2017, 11, 28, 18, 00, 00)
instance_time_offset = 24
instance_counter = 1


## Create Happy Path Instances

In [147]:

qty = 900
happy_path = ["New Client Onboarding Request", 
               "Review Documents", 
               "Automated Scoreboarding",
               "Update Backend Systems", 
               "Notification Review Request Completed"]

happy_path_task_list = []
happy_path_task_list = generate_process_instances(happy_path, qty)
happy_path_task_list_df = pd.DataFrame(happy_path_task_list)

# Increment the start time by 24 hour
start_date_time = start_date_time + timedelta(hours=24)

In [148]:
happy_path_task_list_df.head()

Unnamed: 0,0,1,2,3
0,1,New Client Onboarding Request,2017-11-28 18:00:00.000000,2017-11-28 19:10:42.056829
1,1,Review Documents,2017-11-28 19:15:42.056829,2017-11-28 19:34:23.279261
2,1,Automated Scoreboarding,2017-11-28 19:39:23.279261,2017-11-28 19:46:24.140452
3,1,Update Backend Systems,2017-11-28 19:51:24.140452,2017-11-28 20:14:10.679723
4,1,Notification Review Request Completed,2017-11-28 20:19:10.679723,2017-11-28 20:23:52.289747


## Create Instances Requiring Manual Scoreboarding

In [149]:

qty = 80
manual_path = ["New Client Onboarding Request", 
               "Review Documents", 
               "Automated Scoreboarding",
               "Manual Scoreboarding",
               "Update Backend Systems", 
               "Notification Review Request Completed"]

manual_path_task_list = []
manual_path_task_list = generate_process_instances(manual_path, qty)
manual_path_task_list_df = pd.DataFrame(manual_path_task_list)

# Increment the start time by 24 hour
start_date_time = start_date_time + timedelta(hours=24)

In [150]:
manual_path_task_list_df.head()

Unnamed: 0,0,1,2,3
0,901,New Client Onboarding Request,2020-05-17 18:00:00.000000,2020-05-17 18:20:36.054000
1,901,Review Documents,2020-05-17 18:25:36.054000,2020-05-17 18:29:13.100988
2,901,Automated Scoreboarding,2020-05-17 18:34:13.100988,2020-05-17 18:39:26.956161
3,901,Manual Scoreboarding,2020-05-17 18:44:26.956161,2020-05-17 19:07:50.067849
4,901,Update Backend Systems,2020-05-17 19:12:50.067849,2020-05-17 19:23:41.470973


## Create Instance that loop
Create processes that loop back to Review Docs from Manual Scoreboarding.

In [151]:
qty = 20
loop_path = ["New Client Onboarding Request", 
               "Review Documents", 
               "Automated Scoreboarding",
               "Manual Scoreboarding",
               "Review Documents",
               "Update Backend Systems", 
               "Notification Review Request Completed"]

loop_path_task_list = []
loop_path_task_list = generate_process_instances(loop_path, qty)
loop_path_task_list_df = pd.DataFrame(loop_path_task_list)

# Increment the start time by 24 hour
start_date_time = start_date_time + timedelta(hours=24)

In [152]:
# Can only concat two dataframes at a time
df = pd.concat([happy_path_task_list_df,manual_path_task_list_df], axis=0)
df = pd.concat([df,loop_path_task_list_df], axis=0)

In [153]:
df.columns = ['process_id', 'task', 'start_time', 'end_time']

In [154]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5120 entries, 0 to 139
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   process_id  5120 non-null   int64         
 1   task        5120 non-null   object        
 2   start_time  5120 non-null   datetime64[ns]
 3   end_time    5120 non-null   datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(1)
memory usage: 200.0+ KB


In [155]:
# Confirm how many processes
len(df["process_id"].unique())

1000

In [156]:
df.head(10)

Unnamed: 0,process_id,task,start_time,end_time
0,1,New Client Onboarding Request,2017-11-28 18:00:00.000000,2017-11-28 19:10:42.056829
1,1,Review Documents,2017-11-28 19:15:42.056829,2017-11-28 19:34:23.279261
2,1,Automated Scoreboarding,2017-11-28 19:39:23.279261,2017-11-28 19:46:24.140452
3,1,Update Backend Systems,2017-11-28 19:51:24.140452,2017-11-28 20:14:10.679723
4,1,Notification Review Request Completed,2017-11-28 20:19:10.679723,2017-11-28 20:23:52.289747
5,2,New Client Onboarding Request,2017-11-29 18:00:00.000000,2017-11-29 18:30:49.766804
6,2,Review Documents,2017-11-29 18:35:49.766804,2017-11-29 18:47:05.447282
7,2,Automated Scoreboarding,2017-11-29 18:52:05.447282,2017-11-29 19:00:33.064888
8,2,Update Backend Systems,2017-11-29 19:05:33.064888,2017-11-29 19:10:52.786846
9,2,Notification Review Request Completed,2017-11-29 19:15:52.786846,2017-11-29 19:23:23.910859


## Add Task Level Business Data : User

In [157]:

# Add a column for User
df["user"] = ""

In [158]:
def setRandomUser(row):
    match row["task"]:
        case "Review Documents":
            return random.choice(['Rod','Jane','Freddy'])
        case "New Client Onboarding Request":
            return random.choice(['Clive','Francis','Nick','Seb','Tom'])
        case "Manual Scoreboarding":
            return random.choice(['Sharon','Susan', 'Sam'])
        case "Update Backend Systems":
            return "RPA"
        case "Automated Scoreboarding":
            return "SYSTEM"
        case "Notification Review Request Completed":
            return "SYSTEM"
        case _:
            return row["user"]

In [159]:
df["user"] = df.apply(setRandomUser, axis=1)

In [160]:
df.head(20)

Unnamed: 0,process_id,task,start_time,end_time,user
0,1,New Client Onboarding Request,2017-11-28 18:00:00.000000,2017-11-28 19:10:42.056829,Nick
1,1,Review Documents,2017-11-28 19:15:42.056829,2017-11-28 19:34:23.279261,Rod
2,1,Automated Scoreboarding,2017-11-28 19:39:23.279261,2017-11-28 19:46:24.140452,SYSTEM
3,1,Update Backend Systems,2017-11-28 19:51:24.140452,2017-11-28 20:14:10.679723,RPA
4,1,Notification Review Request Completed,2017-11-28 20:19:10.679723,2017-11-28 20:23:52.289747,SYSTEM
5,2,New Client Onboarding Request,2017-11-29 18:00:00.000000,2017-11-29 18:30:49.766804,Clive
6,2,Review Documents,2017-11-29 18:35:49.766804,2017-11-29 18:47:05.447282,Rod
7,2,Automated Scoreboarding,2017-11-29 18:52:05.447282,2017-11-29 19:00:33.064888,SYSTEM
8,2,Update Backend Systems,2017-11-29 19:05:33.064888,2017-11-29 19:10:52.786846,RPA
9,2,Notification Review Request Completed,2017-11-29 19:15:52.786846,2017-11-29 19:23:23.910859,SYSTEM


## Add Task Level Business Data : UserGroup

In [161]:
# Add a column for UserGroup
df["user_group"] = ""

In [162]:
def setUserGroup(row):
    match row["task"]:
        case "Review Documents":
            return "Operations"
        case "New Client Onboarding Request":
            return "Sales" 
        case "Manual Scoreboarding":
            return "Risk"
        case "Update Backend Systems":
            return "SYSTEM"
        case "Automated Scoreboarding":
            return "SYSTEM"
        case "Notification Review Request Completed":
            return "SYSTEM"
        case _:
            return ""

In [163]:
df["user_group"] = df.apply(setUserGroup, axis=1)

In [164]:
df.head(20)

Unnamed: 0,process_id,task,start_time,end_time,user,user_group
0,1,New Client Onboarding Request,2017-11-28 18:00:00.000000,2017-11-28 19:10:42.056829,Nick,Sales
1,1,Review Documents,2017-11-28 19:15:42.056829,2017-11-28 19:34:23.279261,Rod,Operations
2,1,Automated Scoreboarding,2017-11-28 19:39:23.279261,2017-11-28 19:46:24.140452,SYSTEM,SYSTEM
3,1,Update Backend Systems,2017-11-28 19:51:24.140452,2017-11-28 20:14:10.679723,RPA,SYSTEM
4,1,Notification Review Request Completed,2017-11-28 20:19:10.679723,2017-11-28 20:23:52.289747,SYSTEM,SYSTEM
5,2,New Client Onboarding Request,2017-11-29 18:00:00.000000,2017-11-29 18:30:49.766804,Clive,Sales
6,2,Review Documents,2017-11-29 18:35:49.766804,2017-11-29 18:47:05.447282,Rod,Operations
7,2,Automated Scoreboarding,2017-11-29 18:52:05.447282,2017-11-29 19:00:33.064888,SYSTEM,SYSTEM
8,2,Update Backend Systems,2017-11-29 19:05:33.064888,2017-11-29 19:10:52.786846,RPA,SYSTEM
9,2,Notification Review Request Completed,2017-11-29 19:15:52.786846,2017-11-29 19:23:23.910859,SYSTEM,SYSTEM


## Add Process Instance Business Data : Industry
Industy won't change during the process so all tasks for a given process ID must have the same value

In [165]:
df["industry"] = ""

In [166]:
process_id_list = df["process_id"].unique()

for process_id in process_id_list:
    industry = random.choice(['Federal','Finance','Healthcare','Insurance','Telecom'])
    df.loc[df["process_id"].eq(process_id), "industry"] = industry
    
                          
    

In [167]:
df.head(20)

Unnamed: 0,process_id,task,start_time,end_time,user,user_group,industry
0,1,New Client Onboarding Request,2017-11-28 18:00:00.000000,2017-11-28 19:10:42.056829,Nick,Sales,Telecom
1,1,Review Documents,2017-11-28 19:15:42.056829,2017-11-28 19:34:23.279261,Rod,Operations,Telecom
2,1,Automated Scoreboarding,2017-11-28 19:39:23.279261,2017-11-28 19:46:24.140452,SYSTEM,SYSTEM,Telecom
3,1,Update Backend Systems,2017-11-28 19:51:24.140452,2017-11-28 20:14:10.679723,RPA,SYSTEM,Telecom
4,1,Notification Review Request Completed,2017-11-28 20:19:10.679723,2017-11-28 20:23:52.289747,SYSTEM,SYSTEM,Telecom
5,2,New Client Onboarding Request,2017-11-29 18:00:00.000000,2017-11-29 18:30:49.766804,Clive,Sales,Healthcare
6,2,Review Documents,2017-11-29 18:35:49.766804,2017-11-29 18:47:05.447282,Rod,Operations,Healthcare
7,2,Automated Scoreboarding,2017-11-29 18:52:05.447282,2017-11-29 19:00:33.064888,SYSTEM,SYSTEM,Healthcare
8,2,Update Backend Systems,2017-11-29 19:05:33.064888,2017-11-29 19:10:52.786846,RPA,SYSTEM,Healthcare
9,2,Notification Review Request Completed,2017-11-29 19:15:52.786846,2017-11-29 19:23:23.910859,SYSTEM,SYSTEM,Healthcare


## Add Process Instance Business Data : Service Charge
Service charge will be a random choice based on industry.

In [168]:
df["service_charge"] = 0

In [169]:
def getServiceChargeByIndustry(industry):
    match industry:
        case "Federal":
            return random.choice([3000, 6000, 8000])
        case "Finance":
            return random.choice([10000, 12000, 20000])
        case "Healthcare":
            return random.choice([15000, 20000, 25000])
        case "Insurance":
            return 45000
        case "Telecom":
            return 49000
        case _:
            return 64000

In [170]:
last_process = -1
service_charge = 0

process_id_list = df["process_id"].unique()

for process_id in process_id_list:
        # get the tasks for this process instance, use copy() to make it clear to the interpreter we know
        # this is a copy to supress any warnings about updating copies
        process_instance_tasks = df.loc[df.process_id == process_id].copy()
        # Get the fir row out of the task list and get the industry value
        industry = process_instance_tasks.iloc[0]['industry']
        
        # call function to get a semi-random service charge for the industry
        service_charge = getServiceChargeByIndustry(industry)
        
        # set the service charge in every row for this process instance
        df.loc[df.process_id == process_id, 'service_charge'] = service_charge
    


In [171]:
df.head(50)

Unnamed: 0,process_id,task,start_time,end_time,user,user_group,industry,service_charge
0,1,New Client Onboarding Request,2017-11-28 18:00:00.000000,2017-11-28 19:10:42.056829,Nick,Sales,Telecom,49000
1,1,Review Documents,2017-11-28 19:15:42.056829,2017-11-28 19:34:23.279261,Rod,Operations,Telecom,49000
2,1,Automated Scoreboarding,2017-11-28 19:39:23.279261,2017-11-28 19:46:24.140452,SYSTEM,SYSTEM,Telecom,49000
3,1,Update Backend Systems,2017-11-28 19:51:24.140452,2017-11-28 20:14:10.679723,RPA,SYSTEM,Telecom,49000
4,1,Notification Review Request Completed,2017-11-28 20:19:10.679723,2017-11-28 20:23:52.289747,SYSTEM,SYSTEM,Telecom,49000
5,2,New Client Onboarding Request,2017-11-29 18:00:00.000000,2017-11-29 18:30:49.766804,Clive,Sales,Healthcare,20000
6,2,Review Documents,2017-11-29 18:35:49.766804,2017-11-29 18:47:05.447282,Rod,Operations,Healthcare,20000
7,2,Automated Scoreboarding,2017-11-29 18:52:05.447282,2017-11-29 19:00:33.064888,SYSTEM,SYSTEM,Healthcare,20000
8,2,Update Backend Systems,2017-11-29 19:05:33.064888,2017-11-29 19:10:52.786846,RPA,SYSTEM,Healthcare,20000
9,2,Notification Review Request Completed,2017-11-29 19:15:52.786846,2017-11-29 19:23:23.910859,SYSTEM,SYSTEM,Healthcare,20000


# Stories
Make a copy of the dataframe so we can restart here if required.

In [172]:
story_df = df.copy()

## Story : 1 - Rod is Too Busy
Rod is always very busy doing other work, so he may not be able to start right away.
Lets take a look at two `Review Documents` tasks that have been done by Rod. 

In [173]:
filter = (story_df['task'] == 'Review Documents') & (story_df['user'] == 'Rod')
story_df[filter].head(2)

Unnamed: 0,process_id,task,start_time,end_time,user,user_group,industry,service_charge
1,1,Review Documents,2017-11-28 19:15:42.056829,2017-11-28 19:34:23.279261,Rod,Operations,Telecom,49000
6,2,Review Documents,2017-11-29 18:35:49.766804,2017-11-29 18:47:05.447282,Rod,Operations,Healthcare,20000


Delay the start time of Rod's `Review Documents` tasks by 1 hour

In [174]:
# If user is Rod and activity is `Review Documents` shift the start_time by 1 hour
# 
story_df = delayStartTime(story_df, 'user','Rod', 'Review Documents', 1)

Lets see the result of this change:

In [175]:
filter = (story_df['task'] == 'Review Documents') & (story_df['user'] == 'Rod')
story_df[filter].head(2)

Unnamed: 0,process_id,task,start_time,end_time,user,user_group,industry,service_charge
1,1,Review Documents,2017-11-28 20:15:42.056829,2017-11-28 20:34:23.279261,Rod,Operations,Telecom,49000
6,2,Review Documents,2017-11-29 19:35:49.766804,2017-11-29 19:47:05.447282,Rod,Operations,Healthcare,20000


## Story : 2 - Complex Doc Review
Increase the time taken for doc review by 2hrs when the industry is federal. 
The rationale here is that the documentation requirements are greater.

Lets look at some `Review Documents` where the industry is `Federal`

In [176]:
filter = (story_df['task'] == 'Review Documents') & (story_df['industry'] == 'Federal')
story_df[filter].head(2)

Unnamed: 0,process_id,task,start_time,end_time,user,user_group,industry,service_charge
36,8,Review Documents,2017-12-05 20:01:27.536445,2017-12-05 20:08:23.137902,Rod,Operations,Federal,3000
81,17,Review Documents,2017-12-14 18:16:06.059101,2017-12-14 18:27:08.259339,Freddy,Operations,Federal,8000


In [177]:
story_df[story_df['process_id'] == 8]

Unnamed: 0,process_id,task,start_time,end_time,user,user_group,industry,service_charge
35,8,New Client Onboarding Request,2017-12-05 18:00:00.000000,2017-12-05 18:56:27.536445,Francis,Sales,Federal,3000
36,8,Review Documents,2017-12-05 20:01:27.536445,2017-12-05 20:08:23.137902,Rod,Operations,Federal,3000
37,8,Automated Scoreboarding,2017-12-05 20:13:23.137902,2017-12-05 20:17:43.106496,SYSTEM,SYSTEM,Federal,3000
38,8,Update Backend Systems,2017-12-05 20:22:43.106496,2017-12-05 20:53:39.267909,RPA,SYSTEM,Federal,3000
39,8,Notification Review Request Completed,2017-12-05 20:58:39.267909,2017-12-05 21:03:41.934813,SYSTEM,SYSTEM,Federal,3000


In [178]:
story_df = increaseDuration(story_df, 'industry','Federal', 'Review Documents', 2)

In [179]:
story_df[story_df['process_id'] == 8]

Unnamed: 0,process_id,task,start_time,end_time,user,user_group,industry,service_charge
35,8,New Client Onboarding Request,2017-12-05 18:00:00.000000,2017-12-05 18:56:27.536445,Francis,Sales,Federal,3000
36,8,Review Documents,2017-12-05 20:01:27.536445,2017-12-05 22:08:23.137902,Rod,Operations,Federal,3000
37,8,Automated Scoreboarding,2017-12-05 22:13:23.137902,2017-12-05 22:17:43.106496,SYSTEM,SYSTEM,Federal,3000
38,8,Update Backend Systems,2017-12-05 22:22:43.106496,2017-12-05 22:53:39.267909,RPA,SYSTEM,Federal,3000
39,8,Notification Review Request Completed,2017-12-05 22:58:39.267909,2017-12-05 23:03:41.934813,SYSTEM,SYSTEM,Federal,3000


## Export Finished Dataset

In [180]:
story_df.to_csv('pm_tasks_1k.csv', index=False, date_format='%d-%m-%Y %H:%M:%S')