<a href="https://colab.research.google.com/github/dijahanga/DL_Approach_To_Process_Mining/blob/main/PGraphDD_SS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import graphviz
import math
import numpy as np
import random
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from matplotlib import pyplot as plt
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import KFold
from keras.layers import Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
import pydotplus as pydot
from graphviz import Digraph
import copy
import csv
from google.colab import files
from collections import namedtuple

In [None]:
class dataDivider:
  """Split an event log into contiguous groups of whole traces (cases).

  The log is a DataFrame whose rows are expected to be grouped by case, i.e.
  all events of one case are adjacent.  Every offset produced by this class
  is positional (usable with ``iloc``) and end-exclusive.
  """

  def __init__(self, data):
    self.data = data
    self.tracesPerPart = 0
    self.totalCount = 0

  def getPartDictionary(self, labels):
    """Return one ``(start, end)`` row range per trace, in log order.

    ``labels.case`` names the column holding the case identifier.  A new
    trace begins whenever the case value differs from the previous row's.
    ``start`` is the first row of the trace and ``end`` is one past its last
    row.  An empty log yields an empty list.
    """
    caseLabel = labels.case
    indexes = []
    start = None
    activeCase = None
    position = -1
    # Positions (not index labels) are used because callers slice with iloc.
    for position, case in enumerate(self.data[caseLabel]):
      if start is None:
        start = position
        activeCase = case
      elif activeCase != case:
        indexes.append((start, position))
        # The row that triggers the change is the first row of the new
        # trace, so the new range starts here (not one row later).
        start = position
        activeCase = case
    if start is not None:
      indexes.append((start, position + 1))
    return indexes

  def setParts(self, labels, parts):
    """Divide the traces into ``parts`` consecutive, near-equal groups.

    Stores ``partIndexes`` (one ``[start, end]`` row range per part),
    ``partIndexesSeries`` (the per-trace ranges of each part), ``totalCount``
    (number of traces) and ``tracesPerPart`` (rounded average part size),
    and returns ``partIndexes``.

    Raises ValueError when ``parts`` is smaller than 1 or larger than the
    number of traces in the log.
    """
    indexes = self.getPartDictionary(labels)
    if parts < 1:
      raise ValueError('Part must be a positive integer')
    if parts > len(indexes):
      raise ValueError('Part cannot be greater than total events')
    total = len(indexes)
    partIndexes = []
    partIndexesSeries = []
    for i in range(0, parts):
      # Integer boundaries spread any remainder over the parts, so every
      # trace lands in exactly one part and no boundary exceeds the list.
      top = i * total // parts
      bottom = (i + 1) * total // parts
      series = indexes[top:bottom]
      partIndexes.append([series[0][0], series[-1][1]])
      partIndexesSeries.append(series)
    self.partIndexes = partIndexes
    self.totalCount = total
    self.tracesPerPart = round(total / parts)
    self.partIndexesSeries = partIndexesSeries
    return partIndexes

  def getPartIndex(self, indx):
    """Return the first and the last row of part ``indx``."""
    start, end = self.partIndexes[indx]
    # ``end`` is exclusive, so the last row of the part sits at ``end - 1``.
    return self.data.iloc[start], self.data.iloc[end - 1]

  def getPart(self, index, percentage = 100):
    """Return the rows of part ``index``.

    With ``percentage`` other than 100 only the leading traces of the part
    are returned; their number is ``percentage`` percent of the part's trace
    count, rounded and clamped to the part.
    """
    if percentage == 100:
      start, end = self.partIndexes[index]
      return self.data.iloc[start:end, :]
    thisSeries = self.partIndexesSeries[index]
    count = round((percentage / 100) * len(thisSeries))
    count = max(0, min(count, len(thisSeries)))
    topIndex = thisSeries[0][0]
    # The slice ends with the end offset of the count-th trace of the part.
    bottomIndex = thisSeries[count - 1][1] if count > 0 else topIndex
    return self.data.iloc[topIndex:bottomIndex, :]

In [None]:
class prepareData:
  """Build next-event prediction samples from an event log.

  ``data`` is the event log DataFrame and ``label`` is a two-item sequence:
  ``label[0]`` names the case column and ``label[1]`` the event column.
  """

  def __init__(self, data, label):
    self.data = data
    self.label = label

  def create_input_output(self, xy):
    """Expand one trace (list of events) into prefix/next-event samples.

    The first sample maps the string "NULL" to the first event; every later
    sample maps the list ``["NULL", e1, ..., ei]`` to the following event,
    and the full trace maps to "END".  Returns a DataFrame with columns
    ``X`` and ``Y``; an empty trace gives an empty frame.
    """
    columns = ('X', 'Y')
    if len(xy) == 0:
      return pd.DataFrame([], columns=columns)
    values = [("NULL", xy[0])]
    lastPosition = len(xy) - 1
    for i in range(len(xy)):
      prefix = ["NULL"] + list(xy[0:i + 1])
      target = xy[i + 1] if i < lastPosition else "END"
      values.append((prefix, target))
    return pd.DataFrame(values, columns=columns)

  def _buildSamples(self, validEvts = None):
    """Walk the log trace by trace and collect the samples of every trace."""
    nameLabel = self.label[0]
    valueLabel = self.label[1]
    columns = ('X', 'Y')
    allowed = None
    if validEvts is not None:
      allowed = {event.lower() for event in validEvts}
    frames = []
    trace = []
    activeCase = "NULL"  # sentinel meaning "no trace opened yet"
    for _, row in self.data.iterrows():
      if allowed is not None and row[valueLabel].lower() not in allowed:
        continue
      if nameLabel in row and (row[nameLabel] == activeCase or activeCase == "NULL"):
        trace.append(row[valueLabel])
        activeCase = row[nameLabel]
      else:
        frames.append(self.create_input_output(trace))
        activeCase = row[nameLabel]
        trace = [row[valueLabel]]
    # The loop only emits a trace when the next one begins, so the trace
    # still open at the end of the log has to be emitted here.
    if trace:
      frames.append(self.create_input_output(trace))
    if not frames:
      return pd.DataFrame([], columns=columns)
    # pd.concat replaces DataFrame.append (removed in pandas 2.0); the
    # per-trace row labels are kept, as append did.
    return pd.concat(frames)

  def prepare(self, validEvts = None, test_size = 0, tokenizer = None):
    """Create, tokenize and split the samples of the whole log.

    validEvts: optional list of event names; rows whose event is not in the
      list (compared case-insensitively) are skipped.
    test_size: fraction of traces placed in the test split.
    tokenizer: an already fitted tokenizer to reuse, or None to fit a new one.

    Sets ``self.X``, ``self.Y``, ``self.tokenizer`` and ``self.maindfObj``
    and returns ``(Xtrain, Xtest, Ytrain, Ytest)``.
    """
    maindfObj = self._buildSamples(validEvts)
    self.tokenize(maindfObj, tokenizer)
    self.maindfObj = maindfObj
    return self.custom_split(self.X, self.Y, test_size)

  def append_to_2d(self, former_2d, new_2d):
    """Append every row of ``new_2d`` to the list ``former_2d`` and return it."""
    former_2d.extend(new_2d[i] for i in range(len(new_2d)))
    return former_2d

  def custom_split(self, X, Y, test_size):
    """Split samples into train/test sets without cutting a trace in two.

    A trace is taken to start at every row of ``X`` whose second-to-last
    entry is 0 and to run until the row before the next start.  A random
    ``test_size`` fraction of the traces goes to the test set.
    Returns ``(Xtrain, Xtest, Ytrain, Ytest)`` as numpy arrays.
    """
    import random
    rowCount = X.shape[0]
    startList = []
    endList = []
    for i in range(rowCount):
      consid = X[i]
      if consid[len(consid) - 2] == 0:
        startList.append(i)
        if i > 0:
          endList.append(i - 1)
    endList.append(rowCount - 1)  # the last trace ends with the last row
    num_test = int(round(len(startList) * test_size))
    testStarts = set(random.sample(startList, num_test))
    Xtrain, Ytrain, Xtest, Ytest = [], [], [], []
    for start, end in zip(startList, endList):
      Xcase = np.array(X[start:end + 1])
      Ycase = np.array(Y[start:end + 1])
      if start in testStarts:
        Xtest = self.append_to_2d(Xtest, Xcase)
        Ytest = self.append_to_2d(Ytest, Ycase)
      else:
        Xtrain = self.append_to_2d(Xtrain, Xcase)
        Ytrain = self.append_to_2d(Ytrain, Ycase)
    return np.array(Xtrain), np.array(Xtest), np.array(Ytrain), np.array(Ytest)

  def tokenize(self, data, tokenizer):
    """Encode ``data['X']`` as padded integer sequences and ``data['Y']`` as
    one-hot columns.

    A new Tokenizer is fitted on ``data['X']`` when ``tokenizer`` is None.
    Stores the results in ``self.X``, ``self.Y`` and ``self.tokenizer`` and
    prints the word index together with its size.
    """
    if tokenizer is None:
      tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
      tokenizer.fit_on_texts(data['X'])
    X = tokenizer.texts_to_sequences(data['X'])
    word_index = tokenizer.word_index
    print(word_index)
    print('Found %s unique tokens.' % len(word_index))
    X = pad_sequences(X)
    Y = pd.get_dummies(data['Y'])
    self.X = X
    self.Y = Y
    self.tokenizer = tokenizer

In [None]:
class helper:
  """Small utilities for string lists and for result matrices whose
  integer-like column headers and event-name headers are handled apart."""

  def __init__(self):
    self.i= 0

  def _isIntLike(self, value):
    """Tell whether ``int(value)`` succeeds."""
    try:
      int(value)
    except (ValueError, TypeError, OverflowError):
      return False
    return True

  def datasetListMergeMinus(self, dataset, subset):
    """Concatenate every frame of ``dataset`` that does not equal ``subset``.

    Returns the integer 1 when no frame remains (the value the original
    accumulator started from), otherwise the concatenated DataFrame.
    """
    kept = [m for m in dataset if not m.equals(subset)]
    if not kept:
      return 1
    if len(kept) == 1:
      return kept[0]
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    return pd.concat(kept)

  def multiDimStrToUpper(self, string):
    """Upper-case every string of a list of string lists."""
    return [[x.upper() for x in strns] for strns in string]

  def multiDimStrToLower(self, string):
    """Lower-case every string of a list of string lists."""
    return [[x.lower() for x in strns] for strns in string]

  def oneDimStrToLower(self, string):
    """Lower-case every string of a flat, indexable sequence."""
    return [string[i].lower() for i in range(0, len(string))]

  def grabEventsFromHeader(self, header):
    """Return the headers that are not integer-like, ignoring the first one."""
    evs = []
    for position, ev in enumerate(header):
      if position > 0 and not self._isIntLike(ev):
        evs.append(ev)
    return evs

  def rowIsFirst(self, row, activities, headers):
    """Return True when none of ``row``'s values under ``headers`` is one of
    ``activities``."""
    return not any(row[header] in activities for header in headers)

  def rowIsLast(self, row, evName):
    """Return True when the label of ``row``'s maximum equals ``evName``,
    ignoring case.  A row whose maximum cannot be determined, or whose label
    is not a string, counts as not last."""
    try:
      return row.idxmax().lower() == evName.lower()
    except (AttributeError, TypeError, ValueError):
      return False

  def divideMatrix(self, matrix):
    """Split ``matrix`` column-wise into ``[leftData, rightData]``.

    ``leftData`` holds the columns whose header is integer-like and
    ``rightData`` all the others; column order is preserved in both.
    """
    leftHeaders = []
    rightHeaders = []
    for ev in list(matrix.columns.values):
      if self._isIntLike(ev):
        leftHeaders.append(ev)
      else:
        rightHeaders.append(ev)
    return [matrix[leftHeaders], matrix[rightHeaders]]

In [None]:
class training:
  """Embedding + LSTM + softmax classifier predicting the next event.

  ``X`` holds the padded integer sequences and ``Y`` the one-hot targets;
  the model's input length is ``X.shape[1]`` and its output width
  ``Y.shape[1]``.
  """

  def __init__(self, X, Y):
    self.X = X
    self.Y = Y
    # NOTE(review): MAX_NB_WORDS is passed as the Embedding vocabulary size,
    # so it presumably has to exceed the largest token id - confirm for logs
    # with many distinct events.
    MAX_NB_WORDS =50   #50
    EMBEDDING_DIM =50   #32
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(Y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    self.model = model

  def train(self, tindx = '0'):
    """Fit the model on ``self.X``/``self.Y`` and save it.

    Writes the architecture plot to ``model.png`` and the fitted model to
    ``Orig_model_<tindx>.h5``.  Returns ``model.evaluate`` on the training
    data itself.
    """
    model = self.model
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)
    print('Training...')
    X = self.X
    Y = self.Y
    model.fit(X, Y,  epochs=50, batch_size=250, verbose=0)
    self.model = model
    model.save('Orig_model_'+tindx+'.h5')
    return model.evaluate(X, Y)

  def getModelFrom(self, modelprefix):
    """Load ``Orig_model_<modelprefix>.h5``; train and save a new model when
    loading fails."""
    try:
      modelName = 'Orig_model_'+modelprefix+'.h5'
      self.model = load_model(modelName)
    except Exception:
      # Any load failure falls back to training, but KeyboardInterrupt and
      # SystemExit are no longer swallowed.
      self.train(modelprefix)

  def align(self, from_, to_):
    """Give ``from_`` the column set of ``to_``.

    Columns of ``from_`` that ``to_`` lacks are dropped; columns only
    ``to_`` has are added, filled with 0.  Returns the aligned copy together
    with the positions (in ``from_``) of the dropped columns.  ``from_``
    itself is left untouched.
    """
    originalColumnNamesArr = to_.columns.values
    driftedColumnNamesArr = from_.columns.values
    colNum = [i for i, col in enumerate(driftedColumnNamesArr)
              if col not in originalColumnNamesArr]
    # drop() always returns a new frame, so the additions below never reach
    # the caller's object; the keyword form works across pandas versions.
    aligned = from_.drop(columns=[driftedColumnNamesArr[i] for i in colNum])
    for col in originalColumnNamesArr:
      if col not in driftedColumnNamesArr:
        aligned[col] = 0
    return aligned, colNum

  def validateModel(self, Prep_data, tokenizer, model,  X, Y):
    """Predict on ``X`` and return the inputs next to the predictions.

    The result has the ``Prep_data.X`` entries expanded into one column per
    position, followed by one column per entry of ``Y`` holding the
    predicted probability as a rounded percentage.  ``tokenizer`` is not
    used.
    """
    predict_proba = model.predict(X)
    colName = list(Y)
    dfObj = pd.DataFrame(list(np.round(predict_proba*100, decimals=0)), columns = colName)
    Seq_Series=Prep_data.X.apply(pd.Series)
    # Both frames get a fresh 0..n-1 index so concat pairs rows by position.
    dfObj.reset_index(drop=True, inplace=True)
    Seq_Series.reset_index(drop=True, inplace=True)
    df_new = pd.concat([Seq_Series, dfObj], axis=1)
    return df_new

In [None]:
class resultGraphing:
  """Turn per-trace prediction tables into transition links, probabilities
  and a Graphviz drawing."""

  def __init__(self):
    self.i= 0

  def decomposeResult(self, results):
    """Cut ``results`` into one pair of tables per trace.

    The columns are first divided with ``helper.divideMatrix``.  A trace
    ends at every row whose largest value in the non-integer-header columns
    sits under "end" (case-insensitive).  Returns two lists of row slices:
    the non-integer-header part and the integer-header part of each trace.
    Rows after the last such row are not returned.

    Row labels are used as positions, so ``results`` is expected to carry a
    0..n-1 index.
    """
    helper_ = helper()
    matrices = []
    matricesLeft = []
    lastIndex = 0
    leftData, rightData = helper_.divideMatrix(results)
    for index, row in rightData.iterrows():
      if helper_.rowIsLast(row, "end"):
        matrices.append(rightData.iloc[lastIndex:index+1, :])
        matricesLeft.append(leftData.iloc[lastIndex:index+1, :])
        lastIndex = index + 1
    return matrices, matricesLeft

  def rowIsAnewSequence(self, row):
    """Return True when ``row["1"]`` is NaN; False when it is missing or
    not a number."""
    try:
      return math.isnan(row["1"])
    except (KeyError, IndexError, TypeError, ValueError):
      return False

  def linkAndProbabilities(self, matrices, count = 0):
    """Derive links from the most probable event of every row.

    For each matrix the highest-valued column of each row is taken as the
    event; consecutive events form a link ``"<from><--><to>"`` starting at
    "Start".  A link is recorded once, with the value of the row where it
    first appeared; self-links and a direct Start-to-end link are skipped.
    Returns ``(links, probabilities, sequences)``; each sequence is the
    matrix's event list without its final entry.  ``count`` is not used.
    """
    links = []
    probabilities = []
    sequences = []
    for thisMatrix in matrices:
      lastEvent = "Start"
      sequence = []
      for index, row in thisMatrix.iterrows():
        row = pd.to_numeric(row)
        # A row is a Series: idxmax takes no column axis (axis=1 raises on
        # current pandas).
        evName = row.idxmax()
        link = lastEvent + "<-->" + evName
        sequence.append(evName)
        if link not in links:
          if lastEvent != evName:
            if not (lastEvent.lower() == "start" and evName.lower() == "end"):
              links.append(link)
              probabilities.append(row[evName])
        lastEvent = evName
      # The last element is End and undesirable
      if sequence:
        sequence.pop()
      sequences.append(sequence)
    return links, probabilities, sequences

  def linkAndProbabilities_2(self, seqs, probs):
    """Like ``linkAndProbabilities`` but records every new link, using the
    column maximum of the target event as its probability.  ``seqs`` is not
    used."""
    links = []
    probabilities = []
    sequences = []
    for considprob in probs:
      lastEvent = "Start"
      sequence = []
      for index, row in considprob.iterrows():
        evName = row.idxmax()
        link = lastEvent + "<-->" + evName
        sequence.append(evName)
        prob = considprob[evName].max(axis=0)
        if link not in links:
          links.append(link)
          probabilities.append(prob)
        lastEvent = evName
      if sequence:
        sequence.pop()
      sequences.append(sequence)
    return links, probabilities, sequences

  def linkAndProbabilities_3(self, seqs, probs):
    """Derive links from the recorded event sequences.

    ``seqs[i]`` and ``probs[i]`` must describe the same trace.  The last row
    of ``seqs[i]`` is read as the complete sequence; its first entry is
    skipped.  Every entry that is a column of ``probs[0]`` extends the
    chain, with the column maximum in ``probs[i]`` as probability.  The
    first entry that is not such a column closes the chain with a link to
    "END" at probability 100.

    NOTE(review): a sequence that fills every column never reaches the
    closing branch, so no "END" link is produced for it - confirm this is
    intended.
    """
    links = []
    probabilities = []
    sequences = []
    if len(probs) == 0:
      return links, probabilities, sequences
    allowedActivities = set(probs[0].columns.values)
    for i in range(0, len(seqs)):
      lastEvent = "Start"
      sequence = []
      completeSequence = seqs[i].iloc[-1]
      for j in range(1, len(completeSequence)):
        if completeSequence[j] in allowedActivities:
          evName = completeSequence[j]
          sequence.append(evName)
          link = lastEvent + "<-->" + evName
          prob = probs[i][evName].max(axis=0)
          if link not in links:
            links.append(link)
            probabilities.append(prob)
          lastEvent = evName
        else:
          link = lastEvent + "<-->END"
          if link not in links:
            links.append(link)
            probabilities.append(100)
          break
      sequences.append(sequence)
    return links, probabilities, sequences

  def drawGraph(self, transitions, probabilities, fileName = ''):
    """Draw the links as a directed graph, open it in a viewer and return
    the Digraph.  Each edge is labelled with its probability; the source is
    saved as ``<fileName>.gv``."""
    G = Digraph('process_model', filename=fileName+'.gv')
    G.attr(rankdir='TB', size='8,6')
    G.attr('node', shape='doublecircle', style="filled", fillcolor="grey")
    G.node('Start')
    G.node('END')
    G.attr('node', shape='box', style="bold")
    for i in range(0,len(transitions)):
      G.attr('edge', style="bold", penwidth='3.0', label=str(probabilities[i]))
      fromto = transitions[i].split("<-->")
      G.edge(fromto[0], fromto[1])
    G.view()
    return G

  def getEventSequence(self, data, X_label, Y_label):
    """Group the ``Y_label`` values of consecutive rows sharing the same
    ``X_label`` value.

    A finished group is kept only when it has more than one entry; the
    group still open after the last row is always appended.
    """
    currentX_label = ''
    sequences = []
    sequence = []
    for index, row in data.iterrows():
      if currentX_label == row[X_label]:
        sequence.append(row[Y_label])
      else:
        if len(sequence) > 1:
          sequences.append(sequence)
        sequence = [row[Y_label]]
      currentX_label = row[X_label]
    sequences.append(sequence)
    return sequences

In [None]:
class performance:
  """Compare two transition matrices built from link/probability lists."""

  def __init__(self):
    self.i= 0

  def getM(self, a, b):
    """Count the cell positions where ``a`` or ``b`` holds a value > 0.

    Cells are matched by position over the shape of ``a``; labels are
    ignored.
    """
    left = a.to_numpy()
    right = b.iloc[:left.shape[0], :left.shape[1]].to_numpy()
    return int(np.count_nonzero((left > 0) | (right > 0)))

  def getMatrix(self, chainTable, uniqueEV, testEvents):
    """Build a square transition matrix labelled by ``uniqueEV``.

    chainTable: ``[links, probabilities]``; each link is
      ``"<from><--><to>"`` and shares its position with its probability.
    uniqueEV: list of event names used for both rows (source) and columns
      (target).
    testEvents: when not None, only links whose two events are both in it
      are entered.

    Links touching "Start" are skipped and probabilities above 1 are read
    as percentages.  Returns the matrix as a DataFrame (all zeros when no
    link qualifies).  Raises ValueError when an entered event is missing
    from ``uniqueEV``.
    """
    global dfObjio
    event_size = len(uniqueEV)
    matrix = np.zeros((event_size, event_size))
    # zip keeps every link with its own probability even when "Start" links
    # are skipped (a manual counter skipped on `continue` drifts).
    for link, probability in zip(chainTable[0], chainTable[1]):
      evs = link.split("<-->")
      if "Start" in evs:
        continue
      if testEvents is None or (evs[0] in testEvents and evs[1] in testEvents):
        if probability > 1:
          probability = probability/100
        matrix[uniqueEV.index(evs[0]), uniqueEV.index(evs[1])] = probability
    result = pd.DataFrame(matrix, index=uniqueEV, columns = uniqueEV)
    # The module-level name is still assigned in case other notebook cells
    # read it; the return value no longer depends on it.
    dfObjio = result
    return result

  def getAdjacency(self, matrixOne, matrixTwo, eventCount):
    """Return ``[a, b]``, two similarity scores of the matrices.

    ``a`` is 1 minus the summed absolute difference divided by M, where M
    is ``getM`` of the two matrices; ``b`` is the same after rounding every
    cell up with ``np.ceil``.  The subtraction aligns the frames on their
    labels.  M, and both sums are printed.  ``eventCount`` is not used.
    When M is 0 the division is undefined and the scores are not finite.
    """
    M = self.getM(matrixOne, matrixTwo)
    #case 1 Probability
    sumVal = matrixOne.subtract(matrixTwo).abs().sum().sum()
    #case 2 Logicals
    matrixOnelg = matrixOne.apply(np.ceil)
    matrixTwolg = matrixTwo.apply(np.ceil)
    sumValLogical = matrixOnelg.subtract(matrixTwolg).abs().sum().sum()
    #Estimate adjacency
    print(M)
    print(sumVal)
    print(sumValLogical)
    a = 1- sumVal/M
    b = 1- sumValLogical/M
    return [a, b]

In [None]:
class DatasetDefinitions:
  def getUnique(self, label, dataset):
    """Collect the distinct pairs of consecutive, differing events.

    ``label.event`` and ``label.case`` name the event and case columns.
    Within a case, each event is joined to the one before it; the joined
    text is lower-cased, stripped, and has every whitespace run turned into
    a single underscore.  Returns the distinct texts in order of first
    appearance, and for each the case in which it first appeared.

    NOTE(review): the two event names are joined with nothing in between,
    so different pairs can produce the same text - confirm this is intended.
    """
    eventLabel, caseLabel = label.event, label.case
    chains, chainTag = [], []
    seen = set()
    previousEvent = None
    previousCase = None
    for _, row in dataset.iterrows():
      case = row[caseLabel]
      event = row[eventLabel]
      # A change of case forgets the previous event, so no pair is formed
      # across two cases.
      if previousCase is None or previousCase != case:
        previousEvent = None
      if previousEvent is None:
        previousEvent, previousCase = event, case
        continue
      previousCase = case
      if previousEvent != event:
        joined = (previousEvent + "" + event).lower().strip()
        evChain = "_".join(joined.split())
        if evChain not in seen:
          seen.add(evChain)
          chains.append(evChain)
          chainTag.append(previousCase)
      previousEvent = event
    return chains, chainTag

In [None]:
class FindDrift:
  """Report the event pairs of a log that a base log does not contain."""

  def __init__(self, baseModel):
    self.baseModel = baseModel

  def executeAgainst(self, dataset, label):
    """Compare ``dataset`` with the base log.

    Both logs are reduced with ``DatasetDefinitions.getUnique``.  Returns
    the case tags and the pair texts of ``dataset`` that are absent from
    the base log, plus their positions in ``dataset``'s pair list.
    """
    definitions = DatasetDefinitions()
    baseChains, _ = definitions.getUnique(label, self.baseModel)
    chain, tag = definitions.getUnique(label, dataset)
    indx = self.getDrifts(baseChains, chain)
    tag_list = []
    chain_list = []
    for position in indx:
      tag_list.append(tag[position])
      chain_list.append(chain[position])
    return tag_list, chain_list, indx

  def getDrifts(self, chain_base, chain_drift):
    """Return the positions of the ``chain_drift`` entries that do not
    occur in ``chain_base``."""
    return [position
            for position, candidate in enumerate(chain_drift)
            if candidate not in chain_base]

In [None]:
# Record type naming the log columns that hold the case id and the event.
labels = namedtuple("labels", ["case", "event"])
# Column names of the log used in this notebook.
label = labels(case='case', event='event')

In [None]:
# Number of consecutive windows the event log is divided into.
parts = 10
# Event log read from a CSV file in the working directory.
dataset = pd.read_csv('rp-2500.csv', low_memory= False)
data_divider = dataDivider(dataset)
# p_ receives the list of row ranges, one per window, returned by setParts.
p_ = data_divider.setParts(label, parts)

In [None]:
# Show the window size (traces per window) and the number of traces found.
print(data_divider.tracesPerPart)
print(data_divider.totalCount)

250
2500


In [None]:
# Filled by windowManager.trainWindow with one (i, j, result) tuple per window pair.
adjacentScores = []

In [None]:
import csv
from google.colab import files

In [None]:
class windowManager:
  """Drive the window-by-window comparison of the divided event log.

  Relies on the notebook-level ``data_divider`` (already split with
  ``setParts``) and appends its results to the notebook-level
  ``adjacentScores`` list.
  """

  def trainWindow(self, parts):
    """Compare every window with the one after it (the last with the first).

    For each pair of windows, a model is loaded or trained for the reference
    window and for the detection window.  Both models' predictions are
    turned into transition links and drawn as graphs.  The two resulting
    transition matrices are then scored with ``performance.getAdjacency``,
    and ``(i, j, result)`` is appended to ``adjacentScores``.

    Side effects per window: ``referenceLogRP_<i>.csv``,
    ``detectionLogRP_<i>_0.csv``, the graph files written by ``drawGraph``
    and any model files written by ``training.getModelFrom``.
    """
    for i in range(0, parts):
      j = i + 1
      if(i == parts - 1):
        j = 0
      #Get the log part to be used
      referenceLog = data_divider.getPart(i)
      detectionLog = data_divider.getPart(j)
      referenceLog.to_csv('referenceLogRP_'+str(i)+'.csv')

      #Prepare the data Reference
      prepdata = prepareData(referenceLog, ['case', 'event'])
      X_train, X_test, Y_train, Y_test = prepdata.prepare(None, 0)

      #get built tokenizer and word_index
      tokenizer = prepdata.tokenizer
      X = prepdata.X
      Y = prepdata.Y
      X_O = X
      Y_O = Y
      v = tokenizer.word_index.keys()

      #Prepare the data Detection
      # The reference tokenizer is reused and events it does not know are
      # filtered out of the detection window.
      prepdata_t = prepareData(detectionLog, ['case', 'event'])
      X_train_t, X_test_t, Y_train_t, Y_test_t = prepdata_t.prepare(list(v), 0, tokenizer) #list(v)
      tokenizer_t = prepdata_t.tokenizer
      X_t = prepdata_t.X
      Y_t = prepdata_t.Y
      X_T = X_t
      Y_T = Y_t


      r = str(i)+"_0"
      r2 = str(i)+"_1"
      #Train the Models
      trainModel_O = training(X_train, Y_train)
      trainModel_O.getModelFrom(r)

      trainModel_T = training(X_train_t, Y_train_t)
      trainModel_T.getModelFrom(r2)

      #Use on their dataset
      resultDataset_O = trainModel_O.validateModel(prepdata.maindfObj, tokenizer, trainModel_O.model,  X, Y)
      resultDataset_T = trainModel_T.validateModel(prepdata_t.maindfObj, tokenizer_t, trainModel_T.model,  X_t, Y_t)
      resultDataset_O.to_csv('detectionLogRP_'+r+'.csv')
      #resultDataset_T.to_csv('referenceLog_'+r2+'.csv')


      #Result graphing
      resultGraphing_ = resultGraphing()
      #Reference
      probs_O, seqs_O = resultGraphing_.decomposeResult(resultDataset_O)
      link_O, probabilities_O, sequences_O = resultGraphing_.linkAndProbabilities_3(seqs_O, probs_O)
      resultGraphing_.drawGraph(link_O, probabilities_O, r)
      #Adjc log
      probs_T, seqs_T = resultGraphing_.decomposeResult(resultDataset_T)
      # linkAndProbabilities_3 reads seqs[i] and probs[i] as the same trace,
      # so the detection probabilities must be paired with the detection
      # sequences (not with the reference window's sequences).
      link_T, probabilities_T, sequences_T = resultGraphing_.linkAndProbabilities_3(seqs_T, probs_T)
      resultGraphing_.drawGraph(link_T, probabilities_T, r2)

      perf = performance()

      # The window with fewer distinct targets supplies the event list; the
      # other window's links are restricted to those events.
      h1 = h2 = None
      if len(Y_O.columns.values) > len(Y_T.columns.values):
        events = list(Y_T.columns.values)
        h1 = perf.getMatrix([link_T, probabilities_T], events, None)
        h2 = perf.getMatrix([link_O, probabilities_O], list(Y_O.columns.values), events)
      else:
        events = list(Y_O.columns.values)
        h1 = perf.getMatrix([link_O, probabilities_O], events, None)
        h2 = perf.getMatrix([link_T, probabilities_T], list(Y_T.columns.values), events)
      result = perf.getAdjacency(h1, h2, len(events))
      ra = (i, j, result)
      adjacentScores.append(ra)

In [None]:
# Run the comparison over all windows; results accumulate in adjacentScores.
_windowManager = windowManager()
_windowManager.trainWindow(parts)

In [None]:
# Display the collected (i, j, result) tuples as the cell output.
adjacentScores