In [1]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta
from datetime import datetime
from faker import Faker


## Load Task Duration Config

In [2]:
#Load task duration dataframe
task_duration_df = pd.read_csv('task_duration_config.csv')
task_duration_df.set_index('Task', inplace=True)


In [3]:
task_duration_df.head()

Unnamed: 0_level_0,Avg,Max
Task,Unnamed: 1_level_1,Unnamed: 2_level_1
New Client Onboarding Request,0.5,1.5
Review Documents,0.25,2.5
Automated Scoreboarding,0.1,0.15
Manual Scoreboarding,1.0,3.0
Update Backend Systems,0.25,0.5


For each task type that could be generated we need an average duration and a maximum duration.

## Build Task List
The build_task_list function creates a list of tasks for a specific instance of a process. The sequence of tasks is defined in the variable called process_variant.


In [4]:
def build_task_list(instance_id, process_variant, start_date_time, task_duration_df):
    instance_task_list = []
    rnd = np.random.default_rng()  # a random number generator

    # This code generates a random duration by leveraging the numpy lognormal function that gives a random
    # number drawn from a log normal distribution. Look up a picture of log normal distributions and you'll see 
    # why this is useful for generating random durations based on a mean and std deviation.
    # most samples are near the average with a long tail stretching towards infinity.

    # process_variant is a series
    for task_type in process_variant:
        # get the avg and max durations from the task_df dataframe using task_type as the key
        avg_dur = task_duration_df.loc[task_type, 'Avg']
        max_dur = task_duration_df.loc[task_type, 'Max']

        sigma = (max_dur - avg_dur) / max_dur  # std dev
        log_mean = np.log(avg_dur)  # can't pass the mean duration into lognormal until it has been logged itself 
        delta = rnd.lognormal(log_mean, sigma)  # get a random sample from a log normal distribution with a std dev

        task = [instance_id, task_type, start_date_time]
        instance_task_list.append(task)

        # increment the start time by the delta so that the next task start after this one
        start_date_time = start_date_time + timedelta(hours=delta)

    return instance_task_list

In [5]:
def generate_process_instances(process_variant, qty):

    global start_date_time
    global instance_counter
    task_list = []
    for x in range(0, qty):
        tasks = build_task_list(instance_counter, process_variant, start_date_time, task_duration_df)
        for task in tasks:
            task_list.append(task)

        # Increment the start time by 24 hour
        start_date_time = start_date_time + timedelta(hours=24)
        # Increment the process_id
        instance_counter = instance_counter +1
        
    return task_list

## Initialise Data Generator

In [61]:
start_date_time = datetime(2017, 11, 28, 18, 00, 00)
instance_time_offset = 24
instance_counter = 1
faker = Faker()

## Create Happy Path Instances

In [62]:

qty = 100
happy_path = ["New Client Onboarding Request", 
               "Review Documents", 
               "Automated Scoreboarding",
               "Update Backend Systems", 
               "Notification Review Request Completed"]

happy_path_task_list = []
happy_path_task_list = generate_process_instances(happy_path, qty)
happy_path_task_list_df = pd.DataFrame(happy_path_task_list)

# Increment the start time by 24 hour
start_date_time = start_date_time + timedelta(hours=24)

In [63]:
happy_path_task_list_df.head()

Unnamed: 0,0,1,2
0,1,New Client Onboarding Request,2017-11-28 18:00:00.000000
1,1,Review Documents,2017-11-28 18:27:33.158564
2,1,Automated Scoreboarding,2017-11-28 19:34:11.197355
3,1,Update Backend Systems,2017-11-28 19:41:34.003747
4,1,Notification Review Request Completed,2017-11-28 19:52:52.120246


## Create Instances Requiring Manual Scoreboarding

In [64]:

qty = 25
manual_path = ["New Client Onboarding Request", 
               "Review Documents", 
               "Automated Scoreboarding",
               "Manual Scoreboarding",
               "Update Backend Systems", 
               "Notification Review Request Completed"]

manual_path_task_list = []
manual_path_task_list = generate_process_instances(manual_path, qty)
manual_path_task_list_df = pd.DataFrame(manual_path_task_list)

# Increment the start time by 24 hour
start_date_time = start_date_time + timedelta(hours=24)

In [65]:
combined_df = pd.concat([happy_path_task_list_df,manual_path_task_list_df], axis=0)


In [66]:
combined_df.columns = ['process_id', 'task', 'start_time']

In [67]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 650 entries, 0 to 149
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   process_id  650 non-null    int64         
 1   task        650 non-null    object        
 2   start_time  650 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 20.3+ KB


In [70]:
# Confirm how many processes
len(combined_df["process_id"].unique())

125

## Add Business Data

In [71]:

# Add a column for User
combined_df["user"] = ""

In [72]:
def setRandomUser(row):
    match row["task"]:
        case "Review Documents":
            return random.choice(['Rod','Jane','Freddy'])
        case "New Client Onboarding Request":
            return random.choice(['Clive','Francis','Nick','Seb','Tom'])
        case "Manual Scoreboarding":
            return random.choice(['Sharon','Susan', 'Sam'])
        case "Update Backend Systems":
            return "RPA"
        case "Automated Scoreboarding":
            return "SYSTEM"
        case "Notification Review Request Completed":
            return "SYSTEM"
        case _:
            return row["user"]

In [73]:
combined_df.head()

Unnamed: 0,process_id,task,start_time,user
0,1,New Client Onboarding Request,2017-11-28 18:00:00.000000,
1,1,Review Documents,2017-11-28 18:27:33.158564,
2,1,Automated Scoreboarding,2017-11-28 19:34:11.197355,
3,1,Update Backend Systems,2017-11-28 19:41:34.003747,
4,1,Notification Review Request Completed,2017-11-28 19:52:52.120246,


In [74]:
combined_df["user"] = combined_df.apply(setRandomUser, axis=1)

In [78]:
combined_df.head(20)

Unnamed: 0,process_id,task,start_time,user
0,1,New Client Onboarding Request,2017-11-28 18:00:00.000000,Clive
1,1,Review Documents,2017-11-28 18:27:33.158564,Freddy
2,1,Automated Scoreboarding,2017-11-28 19:34:11.197355,SYSTEM
3,1,Update Backend Systems,2017-11-28 19:41:34.003747,RPA
4,1,Notification Review Request Completed,2017-11-28 19:52:52.120246,SYSTEM
5,2,New Client Onboarding Request,2017-11-29 18:00:00.000000,Francis
6,2,Review Documents,2017-11-29 18:48:40.592878,Rod
7,2,Automated Scoreboarding,2017-11-29 19:03:38.849192,SYSTEM
8,2,Update Backend Systems,2017-11-29 19:09:15.681705,RPA
9,2,Notification Review Request Completed,2017-11-29 19:24:21.563921,SYSTEM


In [None]:
combined_df['process_id

In [115]:
def shiftActivityTime(df, target_attribute, target_value, target_task, timeShift):

    # Logi requires we sort by process_id and start_date
    df = df.sort_values(['process_id', 'start_time'])
    df = df.reset_index(drop=True)
    process_being_modified = -1

    for x in df.index:
        row = df.loc[x]
        task_attribute = row[target_attribute]
        row_task = row['task']
        current_process = row['process_id']


        # Have we found the target activity
        if row_task == target_task and task_attribute == target_value:
            process_being_modified = current_process

        # Are we still processing the same process that we were when we found the target activity ?
        # If so we are moving all subsequent tasks back by the value of the timeshift parameter
        if current_process == process_being_modified:
            # shift the time
            current_ts = row['start_time']
            shifted_ts = current_ts + timedelta(hours=timeShift)
            df.loc[x, 'start_time'] = shifted_ts

    return df

In [116]:
# If user is Rod and activity is `Review Documents` shift the start_time by 1 hour
updated_df = shiftActivityTime(combined_df, 'user','Rod', 'Review Documents', 1)

In [117]:
filter = (combined_df['process_id'] == 2)
combined_df[filter]

Unnamed: 0,process_id,task,start_time,user
5,2,New Client Onboarding Request,2017-11-29 18:00:00.000000,Francis
6,2,Review Documents,2017-11-29 18:48:40.592878,Rod
7,2,Automated Scoreboarding,2017-11-29 19:03:38.849192,SYSTEM
8,2,Update Backend Systems,2017-11-29 19:09:15.681705,RPA
9,2,Notification Review Request Completed,2017-11-29 19:24:21.563921,SYSTEM


In [118]:
filter = (updated_df['process_id'] == 2)
updated_df[filter]

Unnamed: 0,process_id,task,start_time,user
5,2,New Client Onboarding Request,2017-11-29 18:00:00.000000,Francis
6,2,Review Documents,2017-11-29 19:48:40.592878,Rod
7,2,Automated Scoreboarding,2017-11-29 20:03:38.849192,SYSTEM
8,2,Update Backend Systems,2017-11-29 20:09:15.681705,RPA
9,2,Notification Review Request Completed,2017-11-29 20:24:21.563921,SYSTEM


## Export Finished Dataset

In [None]:
combined_df.to_csv('pm_tasks.csv', index=False, date_format='%d-%m-%Y %H:%M:%S')