In [1]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta
from datetime import datetime



## Load Task Duration Config
For each task type that could be generated we need an average duration and a maximum duration, both expressed in hours.

In [None]:
# Task duration configuration: one (task name, average, maximum) record per task type.
# To read the same table from disk instead, use:
#   task_duration_df = pd.read_csv('task_duration_config.csv').set_index('Task')
task_duration_records = [
    ('New Client Onboarding Request', 0.5, 1.5),
    ('Review Documents', 0.25, 2.5),
    ('Automated Scoreboarding', 0.1, 0.15),
    ('Manual Scoreboarding', 1.0, 3.0),
    ('Update Backend Systems', 0.25, 0.5),
    ('Notification Review Request Completed', 0.1, 0.15),
]

# Index by task name so durations can be looked up with .loc[task, 'Avg' / 'Max'].
task_duration_df = pd.DataFrame.from_records(
    task_duration_records, columns=['Task', 'Avg', 'Max']
).set_index('Task')

In [None]:

#task_duration_df = pd.read_csv('task_duration_config.csv')
#task_duration_df.set_index('Task', inplace=True)

In [None]:
# Preview the task duration configuration (up to the first 10 task types).
task_duration_df.head(10)

## Utility Functions

### Generate Process Instances
This function manages the creation of process instances for a specific process variant. A process variant is simply the list of tasks in execution order, including any loops. For example: ['Wake Up', 'Breakfast', 'Work', 'Dinner', 'Sleep'].

def generate_process_instances(process_variant, qty):
    """Generate `qty` process instances that all follow `process_variant`.

    Each instance is built by `build_task_list` using the notebook-level
    `task_duration_df`. The function is stateful: after every instance it
    advances the global `start_date_time` by `instance_time_offset` hours and
    increments the global `instance_counter`, so successive calls continue
    where the previous call stopped.

    Args:
        process_variant: ordered iterable of task names; a loop is expressed
            by repeating a task name.
        qty: number of process instances to create.

    Returns:
        A single flat list containing the task records of every generated
        instance, in generation order.
    """
    global start_date_time
    global instance_counter

    task_list = []
    for _ in range(qty):
        task_list.extend(
            build_task_list(instance_counter, process_variant, start_date_time, task_duration_df)
        )

        # Space instances apart by the configured offset (in hours) instead of a
        # hard-coded 24, so changing `instance_time_offset` actually takes effect.
        start_date_time = start_date_time + timedelta(hours=instance_time_offset)
        # Next instance gets the next process id.
        instance_counter = instance_counter + 1

    return task_list

## Build Task List
The build_task_list function creates a list of tasks for a specific instance of a process. The sequence of tasks is defined in the variable called process_variant. Random task durations are calculated using task_duration_df, which contains the task name, the average duration and the max duration.

This code generates a random duration using the numpy lognormal function, which returns a random
number drawn from a log-normal distribution. Look up a picture of a log-normal distribution and you'll see
why it is useful for generating random durations from a mean and a standard deviation:
most samples are near the average, with a long tail stretching towards infinity.


In [None]:
def build_task_list(instance_id, process_variant, start_date_time, task_duration_df, rng=None):
    """Build the task records for one process instance.

    For every task in `process_variant` a record `[instance_id, task_type,
    start_time]` is emitted, and the start time of the following task is pushed
    forward by a random duration (in hours) drawn from a log-normal
    distribution.

    The distribution is parameterised from the task's configuration row:
    `log(Avg)` is used as the mean of the underlying normal distribution (so
    `Avg` is the median of the sampled durations) and `(Max - Avg) / Max` as its
    standard deviation. `Max` only controls the spread; it is not a hard cap,
    so a sampled duration can exceed it.

    Args:
        instance_id: identifier written into every record of this instance.
        process_variant: ordered iterable of task names (list, Series, ...).
        start_date_time: datetime at which the first task starts.
        task_duration_df: DataFrame indexed by task name with 'Avg' and 'Max'
            columns.
        rng: optional `numpy.random.Generator`. Pass a seeded generator to get
            reproducible durations; when omitted a fresh unseeded generator is
            created, as before.

    Returns:
        List of `[instance_id, task_type, start_time]` lists, one per task.

    Raises:
        KeyError: if a task name is missing from `task_duration_df`.
        ValueError: if a task has `Avg <= 0` or `Max < Avg`, which cannot be
            turned into valid log-normal parameters.
    """
    if rng is None:
        rng = np.random.default_rng()

    instance_task_list = []
    for task_type in process_variant:
        # Look up the duration configuration for this task type.
        avg_dur = task_duration_df.loc[task_type, 'Avg']
        max_dur = task_duration_df.loc[task_type, 'Max']

        # Fail early with a clear message instead of log(0), a division by
        # zero or a negative sigma further down.
        if not (avg_dur > 0 and max_dur >= avg_dur):
            raise ValueError(
                f"Invalid duration config for task {task_type!r}: "
                f"need 0 < Avg <= Max, got Avg={avg_dur}, Max={max_dur}"
            )

        sigma = (max_dur - avg_dur) / max_dur  # spread of the underlying normal
        log_mean = np.log(avg_dur)  # lognormal() expects the mean in log space
        delta = rng.lognormal(log_mean, sigma)  # sampled duration in hours

        instance_task_list.append([instance_id, task_type, start_date_time])

        # The next task starts once this one has finished.
        start_date_time = start_date_time + timedelta(hours=delta)

    return instance_task_list

In [None]:
## Shift Activity Start Time
This function shifts the activity start time for a specific task that has a specific attribute set to a specific value. For example, to delay the start time of breakfast by 1 hour when the user is Gerry, pass the arguments `task_list, 'User', 'Gerry', 'Eat Breakfast', 1`.

This function will apply the same time shift to every subsequent task in the process instance.

In [None]:
def shiftActivityStartTime(tasks, target_attribute, target_value, target_task, timeShift):
    """Delay a target task, and everything after it in the same process, by `timeShift` hours.

    A row is a target when its 'task' equals `target_task` and its
    `target_attribute` column equals `target_value`. Within each process
    instance, the first target row and every row that follows it (in
    start-time order) have 'start_time' moved by `timeShift` hours. The shift
    is applied once per process instance, even if the target occurs several
    times in it.

    Args:
        tasks: DataFrame with at least 'process_id', 'task', 'start_time' and
            the `target_attribute` column.
        target_attribute: name of the column to test (e.g. 'user').
        target_value: value the attribute must have for a row to be a target.
        target_task: task name the row must have to be a target.
        timeShift: number of hours to add to the affected start times.

    Returns:
        A new DataFrame sorted by process_id and start_time with a fresh
        0..n-1 index; the DataFrame passed in is not modified.
    """
    # Sorting keeps each process's rows together in chronological order, which
    # is what "subsequent task" means below. sort_values returns a new frame.
    tasks = tasks.sort_values(['process_id', 'start_time']).reset_index(drop=True)

    is_target = (tasks['task'] == target_task) & (tasks[target_attribute] == target_value)

    # Running count of target rows inside each process: it becomes > 0 at the
    # first target and stays > 0 for all later rows of that process. Working
    # per group avoids a sentinel process id that could collide with real ids.
    from_target_onwards = is_target.astype(int).groupby(tasks['process_id']).cumsum().gt(0)

    if from_target_onwards.any():
        tasks.loc[from_target_onwards, 'start_time'] = (
            tasks.loc[from_target_onwards, 'start_time'] + timedelta(hours=timeShift)
        )

    return tasks

## Initialise Data Generator

In [None]:
# Seed the stdlib `random` module so the random.choice() draws made later in the
# notebook (users, industries, service charges) are repeatable after
# Restart & Run All. Task durations come from a numpy generator created inside
# build_task_list and are not affected by this seed.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Start time of the first generated process instance.
start_date_time = datetime(2017, 11, 28, 18, 0, 0)
# Intended gap, in hours, between the start of consecutive process instances.
instance_time_offset = 24
# Process id assigned to the first generated instance.
instance_counter = 1


## Create Happy Path Instances

In [None]:

# Happy path variant: five tasks, with no manual scoreboarding step.
qty = 10
happy_path = [
    "New Client Onboarding Request",
    "Review Documents",
    "Automated Scoreboarding",
    "Update Backend Systems",
    "Notification Review Request Completed",
]

# Generate the instances and wrap the raw task records in a DataFrame
# (columns are positional here; they are named once the variants are combined).
happy_path_task_list = generate_process_instances(happy_path, qty)
happy_path_task_list_df = pd.DataFrame(happy_path_task_list)

# Push the shared start time on by a further 24 hours before the next batch.
start_date_time += timedelta(hours=24)

In [None]:
# Preview the first rows; the columns are still unnamed (0, 1, 2) at this point.
happy_path_task_list_df.head()

## Create Instances Requiring Manual Scoreboarding

In [None]:

# Manual variant: automated scoreboarding is followed by a manual scoreboarding step.
qty = 5
manual_path = [
    "New Client Onboarding Request",
    "Review Documents",
    "Automated Scoreboarding",
    "Manual Scoreboarding",
    "Update Backend Systems",
    "Notification Review Request Completed",
]

# Generate the instances and wrap the raw task records in a DataFrame
# (columns are positional here; they are named once the variants are combined).
manual_path_task_list = generate_process_instances(manual_path, qty)
manual_path_task_list_df = pd.DataFrame(manual_path_task_list)

# Push the shared start time on by a further 24 hours before the next batch.
start_date_time += timedelta(hours=24)

In [None]:
# Preview the first rows; the columns are still unnamed (0, 1, 2) at this point.
manual_path_task_list_df.head()

## Create Instances That Loop

In [None]:
# Loop variant: "Review Documents" is repeated after manual scoreboarding.
qty = 1
loop_path = [
    "New Client Onboarding Request",
    "Review Documents",
    "Automated Scoreboarding",
    "Manual Scoreboarding",
    "Review Documents",
    "Update Backend Systems",
    "Notification Review Request Completed",
]

# Generate the instances and wrap the raw task records in a DataFrame
# (columns are positional here; they are named once the variants are combined).
loop_path_task_list = generate_process_instances(loop_path, qty)
loop_path_task_list_df = pd.DataFrame(loop_path_task_list)

# Push the shared start time on by a further 24 hours.
start_date_time += timedelta(hours=24)

In [None]:
# pd.concat accepts any number of frames, so stack all three variants in one call.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each source frame's own 0-based labels, which would make label-based
# lookups such as combined_df.loc[3] return rows from several variants.
combined_df = pd.concat(
    [happy_path_task_list_df, manual_path_task_list_df, loop_path_task_list_df],
    axis=0,
    ignore_index=True,
)

In [None]:
# Name the three positional columns; they hold the
# [instance_id, task_type, start_date_time] records emitted by build_task_list.
combined_df.columns = ['process_id', 'task', 'start_time']

In [None]:
# Check row count, column dtypes and missing values of the combined log.
combined_df.info()

In [None]:
# Number of distinct process instances in the combined log.
combined_df["process_id"].nunique(dropna=False)

In [None]:
# Inspect the first 50 task rows of the combined log.
combined_df.head(50)

## Add Task Level Business Data : User

In [None]:

# Add a column for User, initialised to an empty string. setRandomUser keeps
# this existing value for any task name it does not recognise.
combined_df["user"] = ""

In [None]:
def setRandomUser(row):
    """Return the user to record against the task in `row`.

    Tasks with a pool of candidates get a user drawn with `random.choice`;
    tasks with a single fixed performer return that name; any other task
    keeps the value already stored in `row["user"]`.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a "task" entry,
            and a "user" entry that is read only for unrecognised tasks.

    Returns:
        The user name as a string, or the existing `row["user"]` value.
    """
    # Tasks whose user is picked at random from a list of candidates.
    random_pools = {
        "Review Documents": ['Rod','Jane','Freddy'],
        "New Client Onboarding Request": ['Clive','Francis','Nick','Seb','Tom'],
        "Manual Scoreboarding": ['Sharon','Susan', 'Sam'],
    }
    # Tasks that are always recorded against the same user.
    fixed_users = {
        "Update Backend Systems": "RPA",
        "Automated Scoreboarding": "SYSTEM",
        "Notification Review Request Completed": "SYSTEM",
    }

    task_name = row["task"]
    if task_name in random_pools:
        return random.choice(random_pools[task_name])
    if task_name in fixed_users:
        return fixed_users[task_name]
    # Unknown task: leave whatever user is already set.
    return row["user"]

In [None]:
# Assign a user to every task row; axis=1 passes each row to setRandomUser.
combined_df["user"] = combined_df.apply(setRandomUser, axis=1)

In [None]:
# Check the user assigned to the first 20 task rows.
combined_df.head(20)

## Add Process Instance Business Data : Industry
Industry won't change during the process, so all tasks for a given process ID must have the same value.

In [None]:
# Add the industry column, initialised to an empty string for every row.
combined_df["industry"] = ""

In [None]:
process_id_list = combined_df["process_id"].unique()

# Draw one industry per process instance (in order of first appearance) ...
industry_by_process = {
    process_id: random.choice(['Federal','Finance','Healthcare','Insurance','Telecom'])
    for process_id in process_id_list
}
# ... and write it to every task row belonging to that instance.
combined_df["industry"] = combined_df["process_id"].map(industry_by_process)
    
                          
    

In [None]:
# Check that every task row of a process instance carries the same industry.
combined_df.head(20)

## Add Process Instance Business Data : Service Charge
Service charge will be a random choice based on industry.

In [None]:
# Add the service_charge column, initialised to 0 for every row.
combined_df["service_charge"] = 0

In [None]:
def getServiceChargeByIndustry(industry):
    """Return a service charge for the given industry.

    Federal, Finance and Healthcare get a charge drawn with `random.choice`
    from three candidate amounts; Insurance and Telecom have a fixed charge;
    any other industry falls back to 64000.

    Args:
        industry: industry name as a string.

    Returns:
        The service charge as an int.
    """
    # Industries whose charge is picked at random from a list of amounts.
    charge_options = {
        "Federal": [3000, 6000, 8000],
        "Finance": [10000, 12000, 20000],
        "Healthcare": [15000, 20000, 25000],
    }
    # Industries with a single fixed charge.
    fixed_charges = {
        "Insurance": 45000,
        "Telecom": 49000,
    }

    if industry in charge_options:
        return random.choice(charge_options[industry])
    # Unrecognised industries get the default charge.
    return fixed_charges.get(industry, 64000)

In [None]:
# Assign one service charge per process instance, based on that instance's industry.
for process_id in combined_df["process_id"].unique():
    # Build the row mask once and reuse it for both the read and the write.
    is_this_process = combined_df["process_id"] == process_id
    # Every row of an instance carries the same industry, so the first row is enough.
    industry = combined_df.loc[is_this_process, "industry"].iloc[0]
    service_charge = getServiceChargeByIndustry(industry)
    combined_df.loc[is_this_process, "service_charge"] = service_charge
    


In [None]:
# Inspect the first 50 rows with user, industry and service charge populated.
combined_df.head(50)

In [None]:
# TODO: next, look at increasing the time taken for automated scoreboarding to start when the industry is Federal.
# Do the same for Healthcare.
# The rationale here is that the documentation requirements are greater.
 

In [None]:
# If user is Rod and activity is `Review Documents`, shift its start_time (and that of
# every later task in the same process) by 1 hour. shiftActivityStartTime returns a new,
# sorted DataFrame, so combined_df keeps the original timings for comparison.
updated_df = shiftActivityStartTime(combined_df, 'user','Rod', 'Review Documents', 1)

In [None]:
# Tasks of process 3 with their original start times.
# (Named mask instead of `filter`, which would shadow the Python builtin.)
is_process_3 = combined_df['process_id'] == 3
combined_df[is_process_3]

In [None]:
# Tasks of process 3 after the time shift has been applied.
# (Named mask instead of `filter`, which would shadow the Python builtin.)
is_process_3_updated = updated_df['process_id'] == 3
updated_df[is_process_3_updated]