In [None]:
#necessary import:
import pandas as pd
import time
import datetime

In [None]:
def fix_time(time):
    '''Turn an ISO formatted timestamp string into a datetime.datetime object.

    time is the string to parse (the parameter name hides the time module
    inside this function only).
    Returns whatever datetime.datetime.fromisoformat produces for that string.'''
    parsed = datetime.datetime.fromisoformat(time)
    return parsed

In [None]:
def load_dataset_csv(file_name : str, file_attribute_name : str) -> tuple:
    '''Read the event log and its attribute table from two CSV files.

    file_name is the path of the event-log CSV. It must contain a
    'time:timestamp' column of ISO formatted strings; each value is parsed
    with fix_time.
    file_attribute_name is the path of the attribute CSV, which is read as-is.

    Returns the pair (events, attributes), both as DataFrames.'''
    events = pd.read_csv(file_name)
    # Parse the date column so later cells can subtract timestamps
    events['time:timestamp'] = events['time:timestamp'].apply(fix_time)
    attributes = pd.read_csv(file_attribute_name)
    return events, attributes

In [None]:
# The current version of the tool works with the 2012 dataset!!!
# Load the event log (df) and the attribute table (df_attr); paths are relative to the notebook
df, df_attr = load_dataset_csv('DBL 2012/BPI.csv', 'DBL 2012/BPI_attr.csv')
df.head()

In the raw CSV, the first unnamed column (`Unnamed: 0`) holds the case number and the second unnamed column (`Unnamed: 1`) holds the step number within that case.

In [None]:
#Give every column a descriptive name; the first two become the case number and the step number
#The assignment is positional, so the list must have exactly one name per column of df
df.columns = ['case_id', 'step_number', 'org:resource', 'lifecycle:transition',
       'concept:name', 'time:timestamp']

In [None]:
df.head(10) #here let's take a look at the df after some really basic pre-processing

Creating a smaller df to test functions on:

In [None]:
#Keep only the events whose lifecycle transition is COMPLETE; every other row is dropped from df
df = df[df['lifecycle:transition'] == 'COMPLETE']

In [None]:
#Renumber the rows 0..n-1 after the filtering; the old row labels are kept in a new 'index' column
df = df.reset_index()

In [None]:
#DataFrame.drop returns a new frame and leaves df itself untouched, so the result has to be
#assigned back to df - showing or hiding the cell output has nothing to do with it
#errors='ignore' keeps this cell safe to re-run once the helper column is already gone
df = df.drop(columns='index', errors='ignore')
df.head()

In [None]:
def compute_time_difference(frame : pd.DataFrame = None):
    '''Add a 'time:time_between' column holding the time elapsed since the previous row.

    frame is the DataFrame to update in place; when omitted, the notebook-level
    df is used, so existing calls without arguments keep working.
    The frame needs the columns 'time:timestamp' and 'case_id', and the rows
    of one case are expected to be consecutive.

    The first row of every case gets a difference of zero. A case start is
    detected by a change of 'case_id' rather than by step_number == 0, because
    the step-0 row of a case may have been removed together with the
    non-complete actions; its first remaining row would otherwise inherit the
    gap to the previous case.
    Returns nothing.'''
    if frame is None:
        frame = df

    #Difference between each row and the row right above it
    frame['time:time_between'] = frame['time:timestamp'].diff()

    #A row starts a new case when its case_id differs from the one above (always true for the first row)
    starts_case = frame['case_id'] != frame['case_id'].shift()
    frame.loc[starts_case, 'time:time_between'] = pd.Timedelta(0)

In [None]:
#Add the 'time:time_between' column to df and preview the result
compute_time_difference()
df.head(10)

In [None]:
def parse_timestamp_data(frame : pd.DataFrame = None):
    '''Add the helper columns 'time:weekday' and 'time:hour' derived from 'time:timestamp'.

    frame is the DataFrame to update in place; when omitted, the notebook-level
    df is used, so existing calls without arguments keep working.
    'time:weekday' holds the result of weekday() of each timestamp
    (0:Monday,..., 6:Sunday) and 'time:hour' holds its hour attribute.
    Returns nothing.'''
    if frame is None:
        frame = df

    stamps = frame['time:timestamp']
    frame['time:weekday'] = [stamp.weekday() for stamp in stamps]
    frame['time:hour'] = [stamp.hour for stamp in stamps]

In [None]:
#Add the weekday and hour columns to df and preview the result
parse_timestamp_data()
df.head()

In [None]:
#Peek at a slice of the activity names
df['concept:name'][15:30]

The baseline functions:

In [None]:
def creating_dict_for_next_step_stats (df : pd.DataFrame, concept_name : str) -> tuple:
    '''For an input action checks for all the possible next actions and counts their occurence

    df needs the columns 'case_id', 'concept:name' and 'time:time_between';
    the rows of one case are expected to be consecutive.
    concept_name is the action whose followers are collected.

    Returns the pair (dic_occurrence, dic_avg_time):
    dic_occurrence maps each follower to how often it directly follows
    concept_name within the same case, and dic_avg_time maps it to the
    average 'time:time_between' of those follower rows. An occurrence that is
    the last row of its case is counted under 'editor: close_case' with a
    time of zero. Keys appear in the order in which they are first met.

    Rows are walked by position, so the result does not depend on the index
    labels of df.'''

    case_ids = df['case_id'].tolist()
    names = df['concept:name'].tolist()
    gaps = df['time:time_between'].tolist()
    row_count = len(names)

    dic_occurrence = {}
    dic_total_time = {}

    for pos in range(row_count):
        if names[pos] != concept_name:
            continue

        nxt = pos + 1
        if nxt < row_count and case_ids[nxt] == case_ids[pos]: #an instance of the same case
            follower = names[nxt]
            gap = gaps[nxt]
        else: #the last instance of the case
            follower = 'editor: close_case'
            #Closing takes no time; the next row belongs to another case, so its gap must not be used
            gap = pd.Timedelta(0)

        if follower in dic_occurrence:
            dic_occurrence[follower] += 1
            dic_total_time[follower] += gap
        else:
            dic_occurrence[follower] = 1
            dic_total_time[follower] = gap

    #Compute average time
    dic_avg_time = {key: dic_total_time[key] / dic_occurrence[key] for key in dic_total_time}

    return(dic_occurrence, dic_avg_time)

In [None]:
#Example: which actions follow 'A_SUBMITTED', how often, and after how much time on average
creating_dict_for_next_step_stats(df, 'A_SUBMITTED')

In [None]:
def choosing_next_action(dic : dict):
    '''Returns the key of dic that carries the largest value

    When several keys share the largest value, the one met first while
    iterating over dic is returned. An empty dic makes max() raise ValueError.'''

    best_key, _ = max(dic.items(), key=lambda pair: pair[1])
    return best_key

In [None]:
#Example: the key with the largest count is picked
choosing_next_action({'A_PARTLYSUBMITTED': 910, "wow": 21, "not_wow": 37})

In [None]:
def cycles_shortcut(actions : list, concept_name : str, max_length : int) -> 'list | bool':
    '''For saving the operating time, we will try to terminate the baseline early if we get into a loop
    max_length is the longest_trace parameter

    actions is the list of actions chosen so far and concept_name is the
    action that would come next.

    Returns False when concept_name is not in actions yet (no loop).
    Otherwise returns a NEW list that starts with actions and is filled up to
    max_length entries: with concept_name itself when it equals the most
    recent action (self-loop), or else by repeating the part of actions that
    starts at the first occurrence of concept_name. When actions already has
    max_length or more entries, an unchanged copy is returned.

    The input list is never modified.'''

    if concept_name not in actions: #no loop yet
        return(False)

    missing = max(max_length - len(actions), 0) #how many entries still have to be filled in

    if actions[-1] == concept_name: #self-loop: the same action over and over again
        return(list(actions) + [concept_name] * missing)

    placement = actions.index(concept_name) #locating the index of the "duplicate"
    cycle = actions[placement:] #the actions that will keep repeating
    #actions ends exactly where the cycle ends, so the filling restarts at the beginning of the cycle
    return(list(actions) + [cycle[k % len(cycle)] for k in range(missing)])

In [None]:
#Example: 3 is already in the list and is not the last entry, so the tail starting at 3 is repeated up to length 15
cycles_shortcut([2, 1, 3, 7], 3, 15)

In [None]:
def iterating_expected_actions(df : pd.DataFrame, concept_name : str, n : int) -> list:
    '''Predicts a whole trace by always following the most frequent next action

    df is the event log handed to creating_dict_for_next_step_stats.
    concept_name is the starting point (first action)
    n is the maximum number of steps to predict; it stands in for the length
    of the longest trace ever observed, which is too slow to use for now.

    Returns the list of predicted actions. When the prediction runs into a
    loop, the list produced by cycles_shortcut is returned right away.
    Otherwise the list ends with 'editor: close_case', either because that
    was the most frequent next action or because n steps were reached.'''

    actions = [concept_name] #Here is the list that will store all the subsequent actions the algorithm decides to perform

    for _ in range(n): #terminate if we are exceeding the max number of steps
        occurrences = creating_dict_for_next_step_stats(df, concept_name)[0] #list all possible options
        concept_name = choosing_next_action(occurrences) #Choose the most commonly used option

        looped = cycles_shortcut(actions, concept_name, n) #evaluated once and reused
        if looped is not False: #we are stuck in a loop
            return(looped)

        if concept_name == 'editor: close_case': #If it is the "terminate" option - terminate
            break
        actions.append(concept_name) #Add the action to the list

    actions.append('editor: close_case')

    return(actions)

In [None]:
def add_expected_events(df : pd.DataFrame) -> tuple:
    '''Finds, for every action in the log, the most frequent next action and the average time until it

    df is the event log handed to creating_dict_for_next_step_stats.

    Returns the pair (next_event_name_dic, next_event_duration_dic):
    the first maps every distinct 'concept:name' to its most frequent
    follower, the second maps it to the average time until that follower.
    Both also hold the key 'editor: close_case', mapped to itself and to a
    zero duration.'''
    next_event_name_dic = {'editor: close_case': 'editor: close_case'}
    next_event_duration_dic = {'editor: close_case': pd.Timedelta(0)}

    for event in df['concept:name'].unique():
        occurrences, avg_times = creating_dict_for_next_step_stats(df, event) #list all possible options
        likely_next = choosing_next_action(occurrences) #Choose the most commonly used option
        next_event_name_dic[event] = likely_next
        next_event_duration_dic[event] = avg_times[likely_next]

    return next_event_name_dic, next_event_duration_dic

In [None]:
#Get list of all expected next events and the expected time till that next event
all_expected_events = add_expected_events(df)
all_expected_events

In [None]:
#Add column to dataframe with expected next events and times
df['expect:next_event'] = df['concept:name'].map(all_expected_events[0])
df['expect:next_time'] = df['concept:name'].map(all_expected_events[1]) + df['time:timestamp']

In [None]:
#Preview the frame with the two prediction columns
df.head()

In [None]:
#Save the frame with the predictions next to the notebook
#NOTE(review): index=False is not passed, so the row index is written as an extra unnamed first column - confirm this is wanted
df.to_csv('BPI_with_predictions.csv')

In [None]:
#Example: predict a trace that starts at 'A_SUBMITTED' and is at most 15 steps long
iterating_expected_actions(df, 'A_SUBMITTED', 15)

## Here let us try to measure the performance of the baseline

### We measure its running time against the input size and plot the result for use on the poster

In [None]:
#Input sizes (number of rows taken from the top of df) to time the baseline on
limits = [50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000]
runtimes = [] #runtimes[i] is the running time in seconds for limits[i]

for i, limit in enumerate(limits):
    print(i)
    df_small = df[:limit] #a limit above len(df) simply yields the whole df
    #perf_counter is the clock meant for measuring short durations; time.time() is a wall clock that can be adjusted
    time_start = time.perf_counter()
    iterating_expected_actions(df_small, 'A_SUBMITTED', 15)
    time_end = time.perf_counter()
    runtimes.append(time_end - time_start)

In [None]:
#Measured running times in seconds, one per entry of limits
runtimes

In [None]:
import matplotlib.pyplot as plt

In [None]:
#Scatter plot of running time against input size with both axes on a logarithmic scale
#NOTE(review): log-scaled axes still show the plain values (s and n), so the '[log(...)]' units in the labels may mislead - confirm
wow = plt.figure(figsize=(7.5, 5))
plt.scatter(x = limits, y=runtimes, color = '#AB3334')
plt.title('Runtime of the baseline algorithm (log scale)', fontsize = 16)
plt.ylabel('Operating time [log(s)]', fontsize = 13)
plt.xlabel('Input size [log(n)]', fontsize = 13)
plt.xscale("log")
plt.yscale('log')
plt.show();

In [None]:
#The same scatter plot on linear axes
wow2 = plt.figure(figsize=(7.5, 5))
plt.scatter(x = limits, y=runtimes, color = '#420CDA')
plt.title('Runtime of the baseline algorithm', fontsize = 16)
plt.ylabel('Operating time [s]', fontsize = 13)
plt.xlabel('Input size [n]', fontsize = 13)
plt.show();

In [None]:
#One figure with two panels side by side: ax1 for the linear plot, ax2 for the log-log plot
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(18,6))

In [None]:
#Left panel: running time against input size on linear axes
ax1.scatter(x = limits, y=runtimes, color = '#420CDA')
ax1.set_title('Runtime of the baseline algorithm', fontsize = 16)
ax1.set_xlabel('Input size [n]', fontsize = 13)
ax1.set_ylabel('Operating time [s]', fontsize = 13)

#Right panel: the same data with both axes on a logarithmic scale
ax2.scatter(x = limits, y=runtimes, color = '#AB3334')
ax2.set_title('Runtime of the baseline algorithm (log scale)', fontsize = 16)
ax2.set_ylabel('Operating time [log(s)]', fontsize = 13)
ax2.set_xlabel('Input size [log(n)]', fontsize = 13)
ax2.set_xscale("log")
ax2.set_yscale('log')

In [None]:
#Display the combined two-panel figure
f