In [47]:
#All the imports here
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.visualization.process_tree import visualizer as pt_visualizer
from pm4py.objects.conversion.process_tree import converter as pt_converter
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
from pm4py.visualization.dfg import visualizer as dfg_visualization
from pm4py.objects.conversion.dfg import converter as dfg_mining
from pm4py.visualization.petrinet import factory as pn_vis_factory
from collections import defaultdict 
import pandas as pd
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.adapters.pandas import csv_import_adapter
from pm4py.objects.conversion.log import factory as conversion_factory
from pm4py.util import constants
import math
from datetime import date
import numpy as np

In [2]:
%config IPCompleter.greedy=True

In [3]:
def explore_log(log):
    """Print every case of ``log`` followed by the activity name of each event.

    Parameters
    ----------
    log
        Iterable of cases. Each case exposes an ``attributes`` mapping with a
        ``"concept:name"`` entry and iterates over events that are indexable by
        ``"concept:name"``.
    """
    for case_number, trace in enumerate(log):
        case_id = trace.attributes["concept:name"]
        print("\n case index: %d  case id: %s" % (case_number, case_id))
        # Lazily pull the activity name out of each event, in trace order.
        activity_names = (entry["concept:name"] for entry in trace)
        for position, activity in enumerate(activity_names):
            print("event index: %d  event activity: %s" % (position, activity))

In [4]:
class Graph:
    """Directed graph stored as adjacency lists, with a BFS reachability test.

    Vertices must be hashable (the notebook uses integer indices). The vertex
    count passed to the constructor is kept in ``self.V`` for callers, but it
    no longer limits which vertex identifiers may be used.
    """

    def __init__(self, vertices):
        """Create an empty graph.

        Parameters
        ----------
        vertices
            Declared number of vertices; stored as ``self.V``.
        """
        self.V = vertices  # declared number of vertices
        self.graph = defaultdict(list)  # vertex -> list of direct successors

    def addEdge(self, u, v):
        """Add a directed edge from ``u`` to ``v``."""
        self.graph[u].append(v)

    def isReachable(self, s, d):
        """Return True if ``d`` can be reached from ``s`` by following edges.

        A vertex is considered reachable from itself. The search is a
        breadth-first traversal starting at ``s``.
        """
        from collections import deque  # local import keeps the block self-contained

        # A set (instead of a list sized by self.V) avoids an IndexError when
        # the graph holds a vertex index >= self.V.
        visited = {s}
        queue = deque([s])  # deque gives O(1) dequeue, unlike list.pop(0)

        while queue:
            n = queue.popleft()

            if n == d:
                return True

            # .get() avoids inserting empty adjacency lists into the
            # defaultdict while only reading the graph.
            for successor in self.graph.get(n, ()):
                if successor not in visited:
                    visited.add(successor)
                    queue.append(successor)

        # BFS finished without ever dequeuing d
        return False



In [5]:
def find_weakness(log, forbidden_sequence):
    """Report two kinds of weakness found in an event log.

    Weakness 1 (duplicates / loops): for every case, print its events and the
    activities that occur more than once in that case.

    Weakness 2 (forbidden sequences): print every pair from
    ``forbidden_sequence`` that occurs as a directly-follows relation in the
    log, together with the total number of times it occurs. Only *direct*
    succession is checked; indirect succession is not implemented.

    Parameters
    ----------
    log
        Event log; cases expose ``attributes["concept:name"]`` and events are
        indexable by ``"concept:name"``.
    forbidden_sequence
        Iterable of ``(activity, following_activity)`` pairs that must not
        directly follow each other.
    """
    # Weakness 1: duplicate or loop -> the same activity repeating within a case
    for case_index, case in enumerate(log):
        print("\n case index: %d  case id: %s" % (case_index, case.attributes["concept:name"]))
        event_list = []

        for event_index, event in enumerate(case):
            print("event index: %d  event activity: %s" % (event_index, event["concept:name"]))
            event_list.append(event["concept:name"])

        print("The events which got repeated in the trace are", find_duplicate_events(event_list))

    # Weakness 2: check whether a forbidden directly-follows pair exists in the log.
    # NOTE(review): assumes dfg_discovery.apply returns a Counter-like mapping
    # {(a, b): frequency}; the previous implementation relied on `.elements()`.
    dfg_simple = dfg_discovery.apply(log)
    violated_restrictions = []
    for restriction in forbidden_sequence:
        # One entry per restriction with its total frequency. The previous loop
        # appended a new (restriction, running_count) tuple for every single
        # occurrence, producing (r, 1), (r, 2), ... instead of one total.
        occurrences = dfg_simple.get(tuple(restriction), 0)
        if occurrences > 0:
            violated_restrictions.append((restriction, occurrences))

    print("Violated restrictions, Number of times violated: ", violated_restrictions)

In [6]:

# Create a graph for the given dfg

log = xes_importer.apply('running-example.xes')
dfg_simple = dfg_discovery.apply(log)

# Distinct directly-follows edges. `.elements()` repeats an edge once per
# occurrence, so it is de-duplicated here.
edges = set(dfg_simple.elements())

# `l` maps a vertex index to its activity name (sorted for a stable numbering);
# `index_of` is the inverse lookup, replacing repeated O(n) `l.index(...)` calls.
l = sorted({activity for edge in edges for activity in edge})
index_of = {activity: position for position, activity in enumerate(l)}

# Size the graph by the number of distinct activities (vertices), not by the
# number of edge occurrences as before.
g = Graph(len(l))
for source_activity, target_activity in edges:
    g.addEdge(index_of[source_activity], index_of[target_activity])

u = index_of["register request"]; v = index_of["decide"]

if g.isReachable(u, v):
    print("There is a path from %s to %s" % (l[u], l[v]))
else:
    print("There is no path from %s to %s" % (l[u], l[v]))



HBox(children=(FloatProgress(value=0.0, description='parsing log, completed traces :: ', max=6.0, style=Progre…


There is a path from register request to decide


In [7]:
# Re-import the example log and rebuild its directly-follows graph, so this
# cell does not depend on variables left over from earlier cells.
log = xes_importer.apply('running-example.xes')
dfg_simple = dfg_discovery.apply(log)

# Print the Python type of every case contained in the log.
for case_index, case in enumerate(log):
    print(type(case))
    # Disabled experiment: DFG discovery on a single case.
    #dfg_simple1 = dfg_discovery.apply(case)
    

# Print the type of the log container itself.
print(type(log))

HBox(children=(FloatProgress(value=0.0, description='parsing log, completed traces :: ', max=6.0, style=Progre…


<class 'pm4py.objects.log.log.Trace'>
<class 'pm4py.objects.log.log.Trace'>
<class 'pm4py.objects.log.log.Trace'>
<class 'pm4py.objects.log.log.Trace'>
<class 'pm4py.objects.log.log.Trace'>
<class 'pm4py.objects.log.log.Trace'>
<class 'pm4py.objects.log.log.EventLog'>


In [8]:
from pm4py.objects.dfg.utils import dfg_utils
from pm4py.objects.petri.petrinet import PetriNet, Marking
from pm4py.objects.petri import utils as pn_util
from enum import Enum
from pm4py.util import exec_utils


class Parameters(Enum):
    """Keys accepted in the ``parameters`` dict of ``obtain_petrinet_from_dfg``."""
    # Key under which callers may pass the start activities explicitly.
    START_ACTIVITIES = 'start_activities'
    # Key under which callers may pass the end activities explicitly.
    END_ACTIVITIES = 'end_activities'



# Module-level aliases for the two enum members above.
PARAM_KEY_START_ACTIVITIES = Parameters.START_ACTIVITIES
PARAM_KEY_END_ACTIVITIES = Parameters.END_ACTIVITIES

#obtain petrinet from dfg
def obtain_petrinet_from_dfg(dfg, parameters=None):
    """
    Build a Petri net (with initial and final marking) from a directly-follows graph.

    The net contains a ``source`` place, a ``sink`` place and one place per
    activity. A labelled transition leads from ``source`` into the place of
    every start activity, a silent transition (label ``None``) leads from the
    place of every end activity into ``sink``, and every DFG edge ``(a, b)``
    becomes a transition labelled ``b`` from the place of ``a`` to the place of ``b``.

    Parameters
    -------------
    dfg
        Directly-follows graph: a mapping whose keys are
        ``(source_activity, target_activity)`` pairs
    parameters
        Optional dict. ``Parameters.START_ACTIVITIES`` / ``Parameters.END_ACTIVITIES``
        are looked up through ``exec_utils.get_param_value``; the activities
        inferred from the DFG are passed as the default value

    Returns
    -------------
    net, im, fm
        The Petri net, its initial marking (one token in ``source``) and its
        final marking (one token in ``sink``)
    """
    parameters = {} if parameters is None else parameters

    inferred_starts = dfg_utils.infer_start_activities(dfg)
    start_activities = exec_utils.get_param_value(Parameters.START_ACTIVITIES, parameters, inferred_starts)
    inferred_ends = dfg_utils.infer_end_activities(dfg)
    end_activities = exec_utils.get_param_value(Parameters.END_ACTIVITIES, parameters, inferred_ends)
    activities = dfg_utils.get_activities_from_dfg(dfg)

    net = PetriNet("")
    source = PetriNet.Place("source")
    sink = PetriNet.Place("sink")
    net.places.add(source)
    net.places.add(sink)

    im = Marking()
    im[source] = 1
    fm = Marking()
    fm[sink] = 1

    # One place per activity, addressable by activity name.
    place_of = {}
    for activity in activities:
        place_of[activity] = PetriNet.Place(activity)
        net.places.add(place_of[activity])

    serial = 0  # running counter that makes every transition name unique

    def _link(origin_place, activity, label, target_place):
        """Add a uniquely named transition between two places."""
        nonlocal serial
        serial += 1
        transition = PetriNet.Transition(activity + "_" + str(serial), label)
        net.transitions.add(transition)
        pn_util.add_arc_from_to(origin_place, transition, net)
        pn_util.add_arc_from_to(transition, target_place, net)

    # source -> start activities (visible transitions)
    for activity in start_activities:
        if activity in place_of:
            _link(source, activity, activity, place_of[activity])

    # end activities -> sink (silent transitions)
    for activity in end_activities:
        if activity in place_of:
            _link(place_of[activity], activity, None, sink)

    # one transition per directly-follows edge, labelled with the target activity
    for edge in dfg.keys():
        origin, target = edge[0], edge[1]
        _link(place_of[origin], target, target, place_of[target])

    return net, im, fm



In [9]:
def Unwanted_Activity(log, blacklist):
    """Print every event whose activity appears in ``blacklist``.

    Parameters
    ----------
    log
        Iterable of cases; each case iterates over events indexable by
        ``"Activity"`` and ``"Case ID"``.
    blacklist
        Container of activity names that are considered unwanted.
    """
    print("Unwanted activity function")
    report = "Unwanted activity=> activity: %s -> case: %s  "
    for trace in log:
        for entry in trace:
            activity = entry["Activity"]
            if activity not in blacklist:
                continue
            print(report % (activity, entry["Case ID"]))
            

In [10]:
#Checks for the largest common prefix  
def lcp(s, t):
    """Return the longest common prefix of the sequences ``s`` and ``t``.

    The result is a slice of ``s``, so it has the same type as ``s``.
    """
    limit = min(len(s), len(t))
    cut = limit  # prefix length if no mismatch is found within `limit`
    for position in range(limit):
        if s[position] != t[position]:
            cut = position
            break
    return s[0:cut]

def Find_sequence(eventList):
    """Print the longest sub-sequence of ``eventList`` that occurs at least twice.

    Every pair of suffixes is compared and the longest common prefix found is
    kept (the first one wins on ties). The two occurrences may overlap. When
    nothing repeats, an empty string is printed.
    """
    best = ""
    size = len(eventList)
    for left in range(size):
        for right in range(left + 1, size):
            # Length of the common prefix of the suffixes starting at `left`
            # and `right`; the later suffix is the shorter one.
            span = size - right
            common = span
            for offset in range(span):
                if eventList[left + offset] != eventList[right + offset]:
                    common = offset
                    break
            # Keep the candidate only if it is strictly longer than the best so far.
            if common > len(best):
                best = eventList[left:left + common]
    print("Longest repeating sequence: ", best)

In [11]:
def Backloop(log):
    """For every case, print the longest repeated run of activities.

    Parameters
    ----------
    log
        Iterable of cases; each case exposes ``attributes['concept:name']`` and
        iterates over events indexable by ``"Activity"``.
    """
    print("Backloop function")
    for trace in log:
        # Activity names of the case, in event order.
        activities = [entry["Activity"] for entry in trace]
        print("Repeating sequence for events in case:", trace.attributes['concept:name'])
        Find_sequence(activities)
        
    #for trace in event_log:
    #    print(trace)

In [12]:
def find_duplicate_events(x):
    """Return the elements of ``x`` that occur more than once.

    Each repeated element is listed once, ordered by its first occurrence in ``x``.
    """
    total = len(x)
    repeated = []
    for position in range(total):
        candidate = x[position]
        if candidate in repeated:
            # already reported
            continue
        # Does the same element appear again later in the sequence?
        if any(candidate == x[later] for later in range(position + 1, total)):
            repeated.append(candidate)
    return repeated

In [13]:
def Redundant_Activity(log):
    """Print the events of every case and the activities repeated within it.

    Parameters
    ----------
    log
        Iterable of cases; each case exposes ``attributes["concept:name"]`` and
        iterates over events indexable by ``"Activity"``.
    """
    print("Redundant_Activity function")
    for trace in log:
        print("\n Case Id: %s" % (trace.attributes["concept:name"]))
        seen = []
        for position, entry in enumerate(trace):
            activity = entry["Activity"]
            print("event index: %d  event activity: %s" % (position, activity))
            seen.append(activity)
        print("The events which got repeated in the trace are", find_duplicate_events(seen))

In [14]:
def Interface(log):
    """Report activities whose resource changes within a case.

    For every case, the resource of each event is compared with the resource
    of the previous event of the same activity in that case; a message is
    printed whenever they differ.

    Parameters
    ----------
    log
        Iterable of cases; each case exposes ``attributes["concept:name"]`` and
        iterates over events indexable by ``"Activity"`` and ``"Resource"``.
    """
    # Announce the function first, like the other analysis functions do;
    # previously this banner was printed after all of the output.
    print("Interface function")
    for case in log:
        print("\n Case Id: %s" % (case.attributes["concept:name"]))
        last_resource = {}  # activity -> resource of its most recent event in this case

        for event in case:
            activity = event["Activity"]
            resource = event["Resource"]
            if activity in last_resource and resource != last_resource[activity]:
                print("The resource has changed for the activity: %s from %s to %s" % (activity, last_resource[activity], resource))
            last_resource[activity] = resource

In [15]:
def Switch_of_media(log):
    """Placeholder analysis: only prints that it mirrors ``Interface``.

    ``log`` is accepted for signature parity with the other analyses but is not read.
    """
    for message in (
        "Switch_of_media function",
        "Logic is same as Interface function as there is no column for media in the given CSV ",
    ):
        print(message)

In [16]:
def Idle_time(log):
    """Print, per case, the idle time between consecutive events.

    The idle time of an event is its start timestamp minus the completion
    timestamp of the preceding event in the same case. For the first event of
    a case there is no predecessor, so ``0`` and an empty previous activity
    are printed.

    Parameters
    ----------
    log
        Iterable of cases; each case exposes ``attributes["concept:name"]`` and
        iterates over events indexable by ``"Activity"``, ``"Start Timestamp"``
        and ``"Complete Timestamp"`` (timestamps formatted ``%m/%d/%Y %H:%M:%S``).
    """
    print("Idle_time function")
    stamp_format = "%m/%d/%Y %H:%M:%S"
    line_template = "Idle time between previous activity:%s and current activity:%s is %s"
    for trace in log:
        print("\n Case Id: %s" % (trace.attributes["concept:name"]))
        last_completed = 0  # 0 means "no previous event seen yet"
        last_activity = ""
        gap = 0
        for entry in trace:
            if last_completed != 0:
                started = pd.to_datetime(entry["Start Timestamp"], format=stamp_format)
                gap = started - last_completed
            print(line_template % (last_activity, entry["Activity"], gap))
            # Remember this event as the predecessor of the next one.
            last_completed = pd.to_datetime(entry["Complete Timestamp"], format=stamp_format)
            last_activity = entry["Activity"]

In [45]:
def mean1(log):
    """Compute the mean processing time, in hours, of every activity in ``log``.

    The processing time of an event is its ``"Complete Timestamp"`` minus its
    ``"Start Timestamp"`` (both formatted ``%m/%d/%Y %H:%M:%S``).

    Parameters
    ----------
    log
        Iterable of cases; each case iterates over events indexable by
        ``"Activity"``, ``"Start Timestamp"`` and ``"Complete Timestamp"``.

    Returns
    -------
    dict
        Maps each activity name to its mean processing time in hours. The same
        dict is also printed, as before; previously nothing was returned.
    """
    timestamp_format = "%m/%d/%Y %H:%M:%S"
    one_hour = pd.Timedelta(hours=1)

    # activity -> [sum of durations in hours, number of events]
    # (The former `total_events=+1` counter was a typo for `+=` and was never
    # read, so it has been removed.)
    totals = {}
    for case in log:
        for event in case:
            completed = pd.to_datetime(event["Complete Timestamp"], format=timestamp_format)
            started = pd.to_datetime(event["Start Timestamp"], format=timestamp_format)
            entry = totals.setdefault(event["Activity"], [0.0, 0])
            entry[0] += (completed - started) / one_hour
            entry[1] += 1

    averages = {activity: total / count for activity, (total, count) in totals.items()}
    print(averages)
    return averages
        

In [61]:
def Variance_of_process_times(log):
    """Print the variance of the processing time (in hours) of every activity.

    The processing time of an event is its ``"Complete Timestamp"`` minus its
    ``"Start Timestamp"`` (both formatted ``%m/%d/%Y %H:%M:%S``). The variance
    is computed with ``np.var`` over all events of the same activity.

    Parameters
    ----------
    log
        Iterable of cases; each case iterates over events indexable by
        ``"Activity"``, ``"Start Timestamp"`` and ``"Complete Timestamp"``.
    """
    print("Variance_of_process_times function")
    stamp_format = "%m/%d/%Y %H:%M:%S"

    # activity -> list of observed durations in hours
    hours_by_activity = {}
    for trace in log:
        for entry in trace:
            samples = hours_by_activity.get(entry["Activity"], [])
            finished = pd.to_datetime(entry["Complete Timestamp"], format=stamp_format)
            begun = pd.to_datetime(entry["Start Timestamp"], format=stamp_format)
            samples.append((finished - begun) / pd.Timedelta(hours=1))
            hours_by_activity[entry["Activity"]] = samples

    variance_by_activity = {name: np.var(samples) for name, samples in hours_by_activity.items()}
    print("Variance for each activity:")
    print(variance_by_activity)
        
            
            

In [19]:
def Bottleneck(log):
    """Print, per case, the activity that took the longest to complete.

    The duration of an event is its ``"Complete Timestamp"`` minus its
    ``"Start Timestamp"`` (both formatted ``%m/%d/%Y %H:%M:%S``). On ties the
    earliest event wins. For a case without events an empty activity and a
    duration of ``0`` are printed.

    Parameters
    ----------
    log
        Iterable of cases; each case exposes ``attributes["concept:name"]`` and
        iterates over events indexable by ``"Activity"``, ``"Start Timestamp"``
        and ``"Complete Timestamp"``.
    """
    print("Bottleneck function")
    stamp_format = "%m/%d/%Y %H:%M:%S"
    for trace in log:
        print("\n Case Id: %s" % (trace.attributes["concept:name"]))
        slowest_activity = ""
        longest = 0  # 0 means "no duration recorded yet"
        for entry in trace:
            finished = pd.to_datetime(entry["Complete Timestamp"], format=stamp_format)
            begun = pd.to_datetime(entry["Start Timestamp"], format=stamp_format)
            elapsed = finished - begun
            if not (longest == 0 or elapsed > longest):
                continue
            longest = elapsed
            slowest_activity = entry["Activity"]
        print("Bottleneck Activity:%s took maximum time of %s to complete" % (slowest_activity, longest))
           
    

In [20]:
def Parallelizable_tasks():
    """Placeholder for the parallelizable-tasks analysis; only prints its banner."""
    banner = "Parallelizable_tasks function"
    print(banner)

In [62]:
# Defining main function 
def main(csv_path='Production_Data.csv'):
    """Load the production event log from a CSV file and run the enabled analyses.

    Parameters
    ----------
    csv_path
        Path of the comma-separated event log. It must provide the columns
        ``Case ID``, ``Activity``, ``Resource``, ``Start Timestamp`` and
        ``Complete Timestamp``. Defaults to the file used so far, so existing
        ``main()`` calls behave as before.
    """
    print("Welcome to Joint Master thesis:\nModelling of production expertise to extend the data-driven analysis of process models")

    # Disabled XES workflow, kept for reference (it used to live in a bare
    # string literal, which is evaluated at runtime and easy to mistake for a
    # docstring):
    #
    #   #Import a log
    #   log = xes_importer.apply('running-example.xes')
    #   print("Log imported")
    #
    #   #Explore the log
    #   #explore_log(log)
    #
    #   #Define the forbidden sequence of events
    #   #simple restriction which says you cannot decide without examining thoroughly
    #   forbidden_sequence=[( 'decide','examine thoroughly')]
    #
    #   #Find different kinds of weakness in the log
    #   find_weakness(log, forbidden_sequence)
    #
    #   #obtain_petrinet_from_dfg
    #   dfg_simple = dfg_discovery.apply(log)
    #   net, im, fm = obtain_petrinet_from_dfg(dfg_simple)
    #
    #   #Visualise the petrinet obtained
    #   gviz = pn_vis_factory.apply(net, im, fm)
    #   pn_vis_factory.view(gviz)

    #log = xes_importer.apply('running-example.xes')
    log_csv = pd.read_csv(csv_path, sep=',')
    # Tell the converter which CSV columns hold the case id, activity,
    # resource and the two timestamps.
    log = conversion_factory.apply(log_csv, parameters={constants.PARAMETER_CONSTANT_CASEID_KEY: "Case ID",
                                                   constants.PARAMETER_CONSTANT_ACTIVITY_KEY: "Activity",
                                                    constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY:"Start Timestamp",
                                                    constants.PARAMETER_CONSTANT_RESOURCE_KEY:"Resource",
                                                    constants.PARAMETER_CONSTANT_TIMESTAMP_KEY:"Complete Timestamp"
                                                   })
    print("Log imported\n\n\n")

    # Only needed by Unwanted_Activity, which is currently disabled.
    blacklist=[ 'Lapping - Machine 1','Turning & Milling - Machine 8']
    #Unwanted_Activity(log, blacklist)

    #Backloop(log)

    #Redundant_Activity(log)

    #Interface(log)

    #Switch_of_media(log) Same as Interface as there is no column for media in the given CSV

    #Idle_time(log)

    Variance_of_process_times(log)

    #Bottleneck(log)

    #Parallelizable_tasks()
main()


Welcome to Joint Master thesis:
Modelling of production expertise to extend the data-driven analysis of process models
Log imported



Variance_of_process_times function




Variance for each activity:
{'Turning & Milling - Machine 4': 16.528016432608823, 'Turning & Milling Q.C.': 2.577405883362774, 'Laser Marking - Machine 7': 0.5306985386747292, 'Lapping - Machine 1': 3.238505924843762, 'Round Grinding - Machine 3': 9.699499389499387, 'Final Inspection Q.C.': 3.0094172644628094, 'Packing': 0.0, 'Turning & Milling - Machine 9': 12.729769768466593, 'Turning Q.C.': 1.6897965105601467, 'Flat Grinding - Machine 11': 2.178565014227164, 'Turning & Milling - Machine 8': 8.186719135802468, 'Grinding Rework - Machine 12': 1.2100045913682278, 'Setup - Machine 8': 0.0, 'Round Grinding - Machine 12': 6.170809537973429, 'Round Grinding - Manual': 7.631096426996725, 'Round Grinding - Q.C.': 4.745569121261451, 'Turning & Milling - Machine 5': 14.454836673408101, 'Turning & Milling - Machine 10': 15.32080589462937, 'Round Grinding - Machine 2': 8.560435371827662, 'Turning & Milling - Machine 6': 15.112434744384585, 'Turning - Machine 4': 7.88447482638889, 'Grinding Rewor

In [None]:
# Call main() when this code runs as the top-level script.
# NOTE(review): the cell that defines main() already calls it directly, so if
# `__name__` is "__main__" here (presumably the case in a notebook kernel —
# confirm), the analysis runs a second time.
if __name__=="__main__": 
    main() 

In [None]:
# Scratch cell: how durations are expressed in hours and how dict entries are overwritten.
# t is the time between the two timestamps in hours (23:15 to 05:45 the next day = 6.5).
t=(pd.to_datetime("1/30/2012 5:45:00", format = "%m/%d/%Y %H:%M:%S")-pd.to_datetime("1/29/2012 23:15:00", format = "%m/%d/%Y %H:%M:%S"))/ pd.Timedelta(hours=1)
d={}
d["test"]=(t,2)
# Each of the following writes to 'test2' replaces the previous value;
# only the last one, (2, 4), remains.
d.update({'test2':(1.4,1)})
d.update({'test2':(1.5,1)})
d['test2']=(2,4)
# Bare last expression: the notebook displays the resulting dict.
d