In [1]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta
from datetime import datetime



## Load Task Duration Config

In [103]:
#Load task duration dataframe
#task_duration_df = pd.read_csv('task_duration_config.csv')
#task_duration_df.set_index('Task', inplace=True)

data = [['New Client Onboarding Request', 0.5,1.5], 
        ['Review Documents', 0.25,  2.5], 
        ['Automated Scoreboarding', 0.1, 0.15],
        ['Manual Scoreboarding', 1.0, 3.0],
        ['Update Backend Systems', 0.25, 0.5]]

task_duration_df = pd.DataFrame(data, columns=['Task', 'Avg', 'Max'])

In [104]:
task_duration_df.head()

Unnamed: 0,Task,Avg,Max
0,New Client Onboarding Request,0.5,1.5
1,Review Documents,0.25,2.5
2,Automated Scoreboarding,0.1,0.15
3,Manual Scoreboarding,1.0,3.0
4,Update Backend Systems,0.25,0.5


For each task type that could be generated we need an average duration and a maximum duration.

## Build Task List
The build_task_list function creates a list of tasks for a specific instance of a process. The sequence of tasks is defined in the variable called process_variant.


In [33]:
def build_task_list(instance_id, process_variant, start_date_time, task_duration_df):
    instance_task_list = []
    rnd = np.random.default_rng()  # a random number generator

    # This code generates a random duration by leveraging the numpy lognormal function that gives a random
    # number drawn from a log normal distribution. Look up a picture of log normal distributions and you'll see 
    # why this is useful for generating random durations based on a mean and std deviation.
    # most samples are near the average with a long tail stretching towards infinity.

    # process_variant is a series
    for task_type in process_variant:
        # get the avg and max durations from the task_df dataframe using task_type as the key
        avg_dur = task_duration_df.loc[task_type, 'Avg']
        max_dur = task_duration_df.loc[task_type, 'Max']

        sigma = (max_dur - avg_dur) / max_dur  # std dev
        log_mean = np.log(avg_dur)  # can't pass the mean duration into lognormal until it has been logged itself 
        delta = rnd.lognormal(log_mean, sigma)  # get a random sample from a log normal distribution with a std dev

        task = [instance_id, task_type, start_date_time]
        instance_task_list.append(task)

        # increment the start time by the delta so that the next task start after this one
        start_date_time = start_date_time + timedelta(hours=delta)

    return instance_task_list

In [34]:
def generate_process_instances(process_variant, qty):

    global start_date_time
    global instance_counter
    task_list = []
    for x in range(0, qty):
        tasks = build_task_list(instance_counter, process_variant, start_date_time, task_duration_df)
        for task in tasks:
            task_list.append(task)

        # Increment the start time by 24 hour
        start_date_time = start_date_time + timedelta(hours=24)
        # Increment the process_id
        instance_counter = instance_counter +1
        
    return task_list

In [70]:
def shiftActivityTime(df, target_attribute, target_value, target_task, timeShift):

    # Logic requires we sort by process_id and start_date
    df = df.sort_values(['process_id', 'start_time'])
    df = df.reset_index(drop=True)
    process_being_modified = -1

    # iterate through the dataframe using the index value
    for x in df.index:
        row = df.loc[x]
        task_attribute = row[target_attribute]
        row_task = row['task']
        current_process = row['process_id']


        # Have we found the target activity
        if row_task == target_task and task_attribute == target_value:
            process_being_modified = current_process

        # Are we still processing the same process that we were when we found the target activity ?
        # If so we are moving all subsequent tasks back by the value of the timeshift parameter
        if current_process == process_being_modified:
            # shift the time
            current_ts = row['start_time']
            shifted_ts = current_ts + timedelta(hours=timeShift)
            df.loc[x, 'start_time'] = shifted_ts

    return df

## Initialise Data Generator

In [71]:
start_date_time = datetime(2017, 11, 28, 18, 00, 00)
instance_time_offset = 24
instance_counter = 1


## Create Happy Path Instances

In [72]:

qty = 10
happy_path = ["New Client Onboarding Request", 
               "Review Documents", 
               "Automated Scoreboarding",
               "Update Backend Systems", 
               "Notification Review Request Completed"]

happy_path_task_list = []
happy_path_task_list = generate_process_instances(happy_path, qty)
happy_path_task_list_df = pd.DataFrame(happy_path_task_list)

# Increment the start time by 24 hour
start_date_time = start_date_time + timedelta(hours=24)

In [73]:
happy_path_task_list_df.head()

Unnamed: 0,0,1,2
0,1,New Client Onboarding Request,2017-11-28 18:00:00.000000
1,1,Review Documents,2017-11-28 18:44:27.770446
2,1,Automated Scoreboarding,2017-11-28 19:04:30.280830
3,1,Update Backend Systems,2017-11-28 19:11:32.020260
4,1,Notification Review Request Completed,2017-11-28 19:29:57.368799


## Create Instances Requiring Manual Scoreboarding

In [74]:

qty = 5
manual_path = ["New Client Onboarding Request", 
               "Review Documents", 
               "Automated Scoreboarding",
               "Manual Scoreboarding",
               "Update Backend Systems", 
               "Notification Review Request Completed"]

manual_path_task_list = []
manual_path_task_list = generate_process_instances(manual_path, qty)
manual_path_task_list_df = pd.DataFrame(manual_path_task_list)

# Increment the start time by 24 hour
start_date_time = start_date_time + timedelta(hours=24)

In [75]:
manual_path_task_list_df.head()

Unnamed: 0,0,1,2
0,11,New Client Onboarding Request,2017-12-09 18:00:00.000000
1,11,Review Documents,2017-12-09 18:34:36.920259
2,11,Automated Scoreboarding,2017-12-09 19:01:08.445443
3,11,Manual Scoreboarding,2017-12-09 19:05:41.599380
4,11,Update Backend Systems,2017-12-09 19:43:04.229421


## Create Instance that loop

In [76]:
qty = 1
loop_path = ["New Client Onboarding Request", 
               "Review Documents", 
               "Automated Scoreboarding",
               "Manual Scoreboarding",
               "Review Documents",
               "Update Backend Systems", 
               "Notification Review Request Completed"]

loop_path_task_list = []
loop_path_task_list = generate_process_instances(loop_path, qty)
loop_path_task_list_df = pd.DataFrame(loop_path_task_list)

# Increment the start time by 24 hour
start_date_time = start_date_time + timedelta(hours=24)

In [77]:
# Can only concat two dataframes at a time
combined_df = pd.concat([happy_path_task_list_df,manual_path_task_list_df], axis=0)
combined_df = pd.concat([combined_df,loop_path_task_list_df], axis=0)

In [78]:
combined_df.columns = ['process_id', 'task', 'start_time']

In [79]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 87 entries, 0 to 6
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   process_id  87 non-null     int64         
 1   task        87 non-null     object        
 2   start_time  87 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 2.7+ KB


In [80]:
# Confirm how many processes
len(combined_df["process_id"].unique())

16

In [81]:
combined_df.head(50)

Unnamed: 0,process_id,task,start_time
0,1,New Client Onboarding Request,2017-11-28 18:00:00.000000
1,1,Review Documents,2017-11-28 18:44:27.770446
2,1,Automated Scoreboarding,2017-11-28 19:04:30.280830
3,1,Update Backend Systems,2017-11-28 19:11:32.020260
4,1,Notification Review Request Completed,2017-11-28 19:29:57.368799
5,2,New Client Onboarding Request,2017-11-29 18:00:00.000000
6,2,Review Documents,2017-11-29 18:53:53.439293
7,2,Automated Scoreboarding,2017-11-29 19:18:25.611178
8,2,Update Backend Systems,2017-11-29 19:22:26.930142
9,2,Notification Review Request Completed,2017-11-29 19:39:07.545029


## Add Task Level Business Data : User

In [82]:

# Add a column for User
combined_df["user"] = ""

In [83]:
def setRandomUser(row):
    match row["task"]:
        case "Review Documents":
            return random.choice(['Rod','Jane','Freddy'])
        case "New Client Onboarding Request":
            return random.choice(['Clive','Francis','Nick','Seb','Tom'])
        case "Manual Scoreboarding":
            return random.choice(['Sharon','Susan', 'Sam'])
        case "Update Backend Systems":
            return "RPA"
        case "Automated Scoreboarding":
            return "SYSTEM"
        case "Notification Review Request Completed":
            return "SYSTEM"
        case _:
            return row["user"]

In [84]:
combined_df["user"] = combined_df.apply(setRandomUser, axis=1)

In [85]:
combined_df.head(20)

Unnamed: 0,process_id,task,start_time,user
0,1,New Client Onboarding Request,2017-11-28 18:00:00.000000,Seb
1,1,Review Documents,2017-11-28 18:44:27.770446,Jane
2,1,Automated Scoreboarding,2017-11-28 19:04:30.280830,SYSTEM
3,1,Update Backend Systems,2017-11-28 19:11:32.020260,RPA
4,1,Notification Review Request Completed,2017-11-28 19:29:57.368799,SYSTEM
5,2,New Client Onboarding Request,2017-11-29 18:00:00.000000,Seb
6,2,Review Documents,2017-11-29 18:53:53.439293,Freddy
7,2,Automated Scoreboarding,2017-11-29 19:18:25.611178,SYSTEM
8,2,Update Backend Systems,2017-11-29 19:22:26.930142,RPA
9,2,Notification Review Request Completed,2017-11-29 19:39:07.545029,SYSTEM


## Add Process Instance Business Data : Industry
Industy won't change during the process so all tasks for a given process ID must have the same value

In [86]:
combined_df["industry"] = ""

In [87]:
process_id_list = combined_df["process_id"].unique()

for process_id in process_id_list:
    industry = random.choice(['Federal','Finance','Healthcare','Insurance','Telecom'])
    combined_df.loc[combined_df["process_id"].eq(process_id), "industry"] = industry
    
                          
    

In [88]:
combined_df.head(20)

Unnamed: 0,process_id,task,start_time,user,industry
0,1,New Client Onboarding Request,2017-11-28 18:00:00.000000,Seb,Telecom
1,1,Review Documents,2017-11-28 18:44:27.770446,Jane,Telecom
2,1,Automated Scoreboarding,2017-11-28 19:04:30.280830,SYSTEM,Telecom
3,1,Update Backend Systems,2017-11-28 19:11:32.020260,RPA,Telecom
4,1,Notification Review Request Completed,2017-11-28 19:29:57.368799,SYSTEM,Telecom
5,2,New Client Onboarding Request,2017-11-29 18:00:00.000000,Seb,Federal
6,2,Review Documents,2017-11-29 18:53:53.439293,Freddy,Federal
7,2,Automated Scoreboarding,2017-11-29 19:18:25.611178,SYSTEM,Federal
8,2,Update Backend Systems,2017-11-29 19:22:26.930142,RPA,Federal
9,2,Notification Review Request Completed,2017-11-29 19:39:07.545029,SYSTEM,Federal


## Add Process Instance Business Data : Service Charge
Service charge will be a random choice based on industry.

In [89]:
combined_df["service_charge"] = 0

In [90]:
def getServiceChargeByIndustry(industry):
    match industry:
        case "Federal":
            return random.choice([3000, 6000, 8000])
        case "Finance":
            return random.choice([10000, 12000, 20000])
        case "Healthcare":
            return random.choice([15000, 20000, 25000])
        case "Insurance":
            return 45000
        case "Telecom":
            return 49000
        case _:
            return 64000

In [96]:
last_process = -1
service_charge = 0

process_id_list = combined_df["process_id"].unique()

for process_id in process_id_list:
        # what industry is set for this process_id
        process_instance_tasks = combined_df.loc[combined_df.process_id == process_id].copy()
        industry = process_instance_tasks.iloc[0]['industry']
        service_charge = getServiceChargeByIndustry(industry)
        combined_df.loc[combined_df.process_id == process_id, 'service_charge'] = service_charge
    


In [97]:
combined_df.head(50)

Unnamed: 0,process_id,task,start_time,user,industry,service_charge
0,1,New Client Onboarding Request,2017-11-28 18:00:00.000000,Seb,Telecom,49000
1,1,Review Documents,2017-11-28 18:44:27.770446,Jane,Telecom,49000
2,1,Automated Scoreboarding,2017-11-28 19:04:30.280830,SYSTEM,Telecom,49000
3,1,Update Backend Systems,2017-11-28 19:11:32.020260,RPA,Telecom,49000
4,1,Notification Review Request Completed,2017-11-28 19:29:57.368799,SYSTEM,Telecom,49000
5,2,New Client Onboarding Request,2017-11-29 18:00:00.000000,Seb,Federal,8000
6,2,Review Documents,2017-11-29 18:53:53.439293,Freddy,Federal,8000
7,2,Automated Scoreboarding,2017-11-29 19:18:25.611178,SYSTEM,Federal,8000
8,2,Update Backend Systems,2017-11-29 19:22:26.930142,RPA,Federal,8000
9,2,Notification Review Request Completed,2017-11-29 19:39:07.545029,SYSTEM,Federal,8000


In [56]:
# next look at increasing the time taken for automated scoreboarding to start when the industry is federal
# do th same for healthcare
# rational here is that the documentation requirements are greater
 

In [98]:
# If user is Rod and activity is `Review Documents` shift the start_time by 1 hour
updated_df = shiftActivityTime(combined_df, 'user','Rod', 'Review Documents', 1)

In [99]:
filter = (combined_df['process_id'] == 3)
combined_df[filter]

Unnamed: 0,process_id,task,start_time,user,industry,service_charge
10,3,New Client Onboarding Request,2017-11-30 18:00:00.000000,Seb,Federal,6000
11,3,Review Documents,2017-11-30 18:35:44.896309,Rod,Federal,6000
12,3,Automated Scoreboarding,2017-11-30 19:05:10.328138,SYSTEM,Federal,6000
13,3,Update Backend Systems,2017-11-30 19:10:29.103089,RPA,Federal,6000
14,3,Notification Review Request Completed,2017-11-30 19:18:24.568572,SYSTEM,Federal,6000


In [100]:
filter = (updated_df['process_id'] == 3)
updated_df[filter]

Unnamed: 0,process_id,task,start_time,user,industry,service_charge
10,3,New Client Onboarding Request,2017-11-30 18:00:00.000000,Seb,Federal,6000
11,3,Review Documents,2017-11-30 19:35:44.896309,Rod,Federal,6000
12,3,Automated Scoreboarding,2017-11-30 20:05:10.328138,SYSTEM,Federal,6000
13,3,Update Backend Systems,2017-11-30 20:10:29.103089,RPA,Federal,6000
14,3,Notification Review Request Completed,2017-11-30 20:18:24.568572,SYSTEM,Federal,6000


## Export Finished Dataset

In [102]:
updated_df.to_csv('pm_tasks.csv', index=False, date_format='%d-%m-%Y %H:%M:%S')