<a href="https://colab.research.google.com/github/dijahanga/DL_Approach_To_Process_Mining/blob/main/PGraphDD_QM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import graphviz
import math
import numpy as np
import random
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from matplotlib import pyplot as plt
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import KFold
from keras.layers import Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
import pydotplus as pydot
from graphviz import Digraph
import copy
import csv
from google.colab import files
from collections import namedtuple

In [None]:
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

# Create a TF1-compat interactive session with `allow_growth` enabled on the
# GPU options, so memory is requested incrementally rather than all at once.
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

In [None]:
class dataDivider:
  """Split an event log into consecutive parts made of whole traces.

  A trace is a maximal run of consecutive rows sharing the same value in the
  case column. All row ranges handled by this class are positional
  (``iloc``-style) and half-open: ``start`` is inclusive, ``end`` exclusive.
  """

  def __init__(self, data):
    """Store the event log.

    Args:
      data: pandas DataFrame holding the log, already ordered so that the
        rows of one case are contiguous.
    """
    self.data = data
    self.partIndexes = []
    self.tracesPerPart = 0
    self.totalCount = 0

  def getPartDictionary(self, labels):
    """Return one ``(start, end)`` positional row range per trace.

    Args:
      labels: object whose ``case`` attribute names the case-id column.

    Returns:
      List of ``(start, end)`` tuples in log order; empty for an empty log.
    """
    caseValues = self.data[labels.case].tolist()
    indexes = []
    if not caseValues:
      return indexes
    start = 0
    for position in range(1, len(caseValues)):
      if caseValues[position] != caseValues[position - 1]:
        # The row at `position` is the first row of the next trace, so it is
        # both the exclusive end of the current trace and the start of the
        # next one.
        indexes.append((start, position))
        start = position
    indexes.append((start, len(caseValues)))
    return indexes

  def setParts(self, labels, parts):
    """Partition the log into `parts` consecutive groups of whole traces.

    Traces are distributed as evenly as possible: when the trace count is not
    a multiple of `parts`, the leading parts receive one extra trace, so no
    trace is dropped and no part is empty.

    Args:
      labels: object whose ``case`` attribute names the case-id column.
      parts: number of parts to create (at least 1).

    Returns:
      List of ``[start, end]`` positional row ranges, one per part.

    Raises:
      ValueError: if `parts` is smaller than 1 or larger than the number of
        traces in the log.
    """
    indexes = self.getPartDictionary(labels)
    total = len(indexes)
    if parts < 1:
      raise ValueError('Part must be at least 1')
    if parts > total:
      raise ValueError('Part cannot be greater than total events') 
    baseSize, extra = divmod(total, parts)
    partIndexes = []
    top = 0
    for i in range(0, parts):
      size = baseSize + 1 if i < extra else baseSize
      bottom = top + size - 1
      partIndexes.append([indexes[top][0], indexes[bottom][1]])
      top = bottom + 1
    self.partIndexes = partIndexes
    self.totalCount = total
    # Approximate size only; individual parts may differ by one trace.
    self.tracesPerPart = round(total / parts)
    return partIndexes

  def getPartIndex(self, indx):
    """Return the rows located at the stored start and end of part `indx`.

    NOTE(review): the stored end is exclusive, so the second row returned is
    the first row *after* the part, and the lookup raises IndexError for the
    final part. Confirm the intended behaviour before relying on it.
    """
    return self.data.iloc[self.partIndexes[indx][0]], self.data.iloc[self.partIndexes[indx][1]];

  def getPart(self, index):
    """Return the rows of part `index` as a DataFrame slice."""
    return self.data.iloc[self.partIndexes[index][0]:self.partIndexes[index][1], :]


In [None]:
class prepareData:
  """Turn an event log into next-event prediction samples for the LSTM.

  Every trace is expanded into (prefix, next event) pairs, the prefixes are
  tokenised and padded, and the targets are one-hot encoded.
  """

  def __init__(self, data, label):
    """Store the log and its column names.

    Args:
      data: pandas DataFrame holding the event log.
      label: two-item sequence ``[case column, event column]``.
    """
    self.data = data
    self.label = label

  def create_input_output(self, xy):
    """Expand one trace into prefix/next-event rows.

    The first row pairs the bare string "NULL" with the first event. Each
    following row pairs the list ``["NULL", e1, ..., ek]`` with event
    ``e(k+1)``; the complete trace is paired with "END".

    Args:
      xy: list of event names of one trace, in order.

    Returns:
      DataFrame with columns ``X`` and ``Y``; empty when `xy` is empty.
    """
    _ncols = ('X', 'Y')
    if len(xy) == 0:
      return pd.DataFrame([], columns=_ncols)
    values = [("NULL", xy[0])]
    for i in range(len(xy)):
      prefix = ["NULL"] + list(xy[0: i + 1])
      target = xy[i + 1] if i + 1 < len(xy) else "END"
      values.append((prefix, target))
    return pd.DataFrame(values, columns=_ncols) 

  def _build_samples(self, validEvts = None):
    """Collect the prefix/next-event rows of every trace in the log.

    Args:
      validEvts: optional iterable of event names; rows whose event is not
        in it (compared case-insensitively) are skipped.

    Returns:
      DataFrame with columns ``X`` and ``Y`` covering all traces, including
      the last one in the log.
    """
    nameLabel = self.label[0]
    valueLabel = self.label[1]
    _ncols = ('X', 'Y')
    allowed = None
    if validEvts is not None:
      allowed = {evt.lower() for evt in validEvts}
    frames = []
    _activeCase = "NULL"
    _tempxy = []
    for index, row in self.data.iterrows():
      if allowed is not None and row[valueLabel].lower() not in allowed:
        continue
      if nameLabel in row and (row[nameLabel] == _activeCase or _activeCase == "NULL"):
        _tempxy.append(row[valueLabel])
        _activeCase = row[nameLabel]
      else:
        # The case id changed: emit the finished trace and start a new one.
        frames.append(self.create_input_output(_tempxy))
        _activeCase = row[nameLabel]
        _tempxy = [row[valueLabel]]
    # The final trace has no following case change to trigger its emission.
    if _tempxy:
      frames.append(self.create_input_output(_tempxy))
    if not frames:
      return pd.DataFrame([], columns=_ncols)
    return pd.concat(frames)

  def prepare(self, validEvts = None, test_size = 0, tokenizer = None):
    """Build samples, tokenise them and split them into train/test sets.

    Sets ``self.maindfObj``, ``self.X``, ``self.Y`` and ``self.tokenizer``.

    Args:
      validEvts: optional iterable of event names to keep
        (case-insensitive); other events are dropped from the traces.
      test_size: fraction of traces to place in the test set.
      tokenizer: already fitted tokenizer to reuse; a new one is fitted when
        None.

    Returns:
      Tuple ``(X_train, X_test, Y_train, Y_test)`` of numpy arrays.
    """
    maindfObj = self._build_samples(validEvts)
    self.tokenize(maindfObj, tokenizer)
    self.maindfObj = maindfObj
    return self.custom_split(self.X, self.Y, test_size)

  def append_to_2d(self, former_2d, new_2d):
    """Append every row of `new_2d` to the list `former_2d` and return it."""
    for i in range(len(new_2d)):
      former_2d.append(new_2d[i])
    return former_2d

  def custom_split(self, X, Y, test_size):
    """Split samples into train and test sets without cutting a trace.

    A row starts a new trace when its second-to-last entry is 0, i.e. the
    padded prefix holds a single token.

    Args:
      X: 2-D array of padded prefixes.
      Y: row-aligned targets (array or DataFrame).
      test_size: fraction of traces drawn at random for the test set.

    Returns:
      Tuple ``(X_train, X_test, Y_train, Y_test)`` of numpy arrays.
    """
    rowCount = X.shape[0]
    startList = [i for i in range(rowCount) if X[i][len(X[i]) - 2] == 0]
    endList = [start - 1 for start in startList if start > 0]
    endList.append(rowCount - 1) #Tail End of the Array is the last element of endList
    num_test = int(round(len(startList)*test_size))
    testStarts = set(random.sample(startList, num_test))
    Xtrain = []
    Ytrain = []
    Xtest = []
    Ytest = []
    for start, end in zip(startList, endList):
      Xcase = np.array(X[start:end + 1])
      Ycase = np.array(Y[start:end + 1])
      if start in testStarts:
        Xtest = self.append_to_2d(Xtest, Xcase)
        Ytest = self.append_to_2d(Ytest, Ycase)
      else:
        Xtrain = self.append_to_2d(Xtrain, Xcase)
        Ytrain = self.append_to_2d(Ytrain, Ycase)
    return np.array(Xtrain), np.array(Xtest), np.array(Ytrain), np.array(Ytest)

  def tokenize(self, data, tokenizer):
    """Tokenise and pad the ``X`` column and one-hot encode the ``Y`` column.

    Stores the results in ``self.X``, ``self.Y`` and ``self.tokenizer`` and
    prints the tokenizer vocabulary.

    Args:
      data: DataFrame with columns ``X`` and ``Y``.
      tokenizer: fitted tokenizer to reuse, or None to fit a new one on
        ``data['X']``.
    """
    if tokenizer is None:
      tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
      tokenizer.fit_on_texts(data['X'])
    X = tokenizer.texts_to_sequences(data['X'])
    word_index = tokenizer.word_index
    print(word_index)
    print('Found %s unique tokens.' % len(word_index))
    X = pad_sequences(X)
    Y = pd.get_dummies(data['Y'])
    self.X = X
    self.Y = Y
    self.tokenizer = tokenizer

In [None]:
class helper:
  """Small utilities for string lists and probability result frames."""

  def __init__(self):
    self.i= 0

  def datasetListMergeMinus(self, dataset, subset):
    """Concatenate every frame in `dataset` that is not equal to `subset`.

    Args:
      dataset: iterable of DataFrames.
      subset: DataFrame to leave out (compared with ``DataFrame.equals``).

    Returns:
      The concatenated DataFrame; the single remaining frame itself when
      only one is left; the integer 1 when nothing is left (kept for
      backward compatibility).
    """
    remaining = [m for m in dataset if not m.equals(subset)]
    if not remaining:
      return 1
    if len(remaining) == 1:
      return remaining[0]
    return pd.concat(remaining)

  def multiDimStrToUpper(self, string):
    """Return a copy of a list of string lists with every item upper-cased."""
    return [[x.upper() for x in strns] for strns in string]

  def multiDimStrToLower(self, string):
    """Return a copy of a list of string lists with every item lower-cased."""
    return [[x.lower() for x in strns] for strns in string]

  def oneDimStrToLower(self, string):
    """Return a copy of a list of strings with every item lower-cased."""
    return [item.lower() for item in string]

  def grabEventsFromHeader(self, header):
    """Return the headers, after the first one, that are not integers.

    The first header is always skipped; a header counts as an integer when
    ``int()`` accepts it.
    """
    evs = []
    for ev in list(header)[1:]:
      try:
        int(ev)
      except (ValueError, TypeError, OverflowError):
        evs.append(ev)
    return evs

  def rowIsFirst(self, row, activities, headers):
    """Return True when none of the `headers` cells of `row` is an activity."""
    return not any(row[val] in activities for val in headers)

  def rowIsLast(self, row, evName):
    """Return True when the largest entry of `row` is labelled `evName`.

    Labels are compared case-insensitively; a label that is not a string
    yields False.
    """
    rowEv = row.idxmax()
    try:
      return rowEv.lower() == evName.lower()
    except AttributeError:
      return False

  def divideMatrix(self, matrix):
    """Split a frame into integer-named and other columns.

    Returns:
      ``[leftData, rightData]`` where `leftData` holds the columns whose
      name ``int()`` accepts and `rightData` holds all remaining columns.
    """
    leftHeaders = []
    rightHeaders = []
    for ev in list(matrix.columns.values):
      try:
        int(ev)
      except (ValueError, TypeError, OverflowError):
        rightHeaders.append(ev)
      else:
        leftHeaders.append(ev)
    leftData = matrix[leftHeaders]
    rightData = matrix[rightHeaders]
    return [leftData, rightData]

In [None]:
class training:
  """Embedding + LSTM next-event classifier with helpers to apply it."""

  def __init__(self, X, Y):
    """Build and compile the network.

    Args:
      X: 2-D array of padded token sequences; its width sets the input
        length.
      Y: 2-D one-hot target table; its width sets the number of output
        classes.
    """
    self.X = X
    self.Y = Y
    # NOTE(review): the embedding table has MAX_NB_WORDS rows, so token ids
    # must stay below 20 - confirm the vocabulary never exceeds that.
    MAX_NB_WORDS =20   #50
    EMBEDDING_DIM =20   #32
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(Y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    self.model = model

  def train(self, tindx = '0'):
    """Fit the model on the stored data and save it to disk.

    Writes ``model.png`` (architecture plot) and ``Orig_model_<tindx>.h5``.

    Args:
      tindx: suffix used in the saved model's file name.

    Returns:
      Result of ``model.evaluate`` on the training data.
    """
    model = self.model
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)
    print('Training...')
    X = self.X
    Y = self.Y
    model.fit(X, Y,  epochs=50, batch_size=250, verbose=0)
    self.model = model
    model.save('Orig_model_'+tindx+'.h5')
    return model.evaluate(X, Y)

  def getModelFrom(self, modelprefix):
    """Load ``Orig_model_<modelprefix>.h5``, training a new model on failure.

    Args:
      modelprefix: suffix identifying the saved model file.
    """
    try:
      modelName = 'Orig_model_'+modelprefix+'.h5';
      self.model = load_model(modelName)
    except Exception:
      # Any load failure (missing or unreadable file) falls back to
      # training; KeyboardInterrupt/SystemExit are no longer swallowed.
      self.train(modelprefix)

  def align(self, from_, to_):
    """Give `from_` the same set of columns as `to_`.

    Columns of `from_` that `to_` lacks are dropped; columns of `to_` that
    `from_` lacks are added and filled with 0. When nothing is dropped the
    added columns are written into the caller's frame in place.

    Args:
      from_: DataFrame to adjust.
      to_: DataFrame providing the reference columns.

    Returns:
      Tuple ``(aligned frame, positions of the dropped columns in the
      original from_)``.
    """
    originalColumnNames = list(to_.columns.values)
    driftedColumnNames = list(from_.columns.values)
    colNum = []
    for i, col in enumerate(driftedColumnNames):
      if col not in originalColumnNames:
        from_ = from_.drop(columns=col)
        colNum.append(i)
    for col in originalColumnNames:
      if col not in driftedColumnNames:
        from_[col] = 0
    return from_, colNum

  def validateModel(self, Prep_data, tokenizer, model,  X, Y):
    """Predict on `X` and join the predictions with the input prefixes.

    Args:
      Prep_data: DataFrame whose ``X`` column holds the prefixes; each entry
        is expanded into integer-named columns.
      tokenizer: unused; kept for interface compatibility.
      model: model used for prediction.
      X: padded input sequences passed to ``model.predict``.
      Y: iterable whose items name the probability columns (iterating a
        DataFrame yields its column names).

    Returns:
      DataFrame with the expanded prefixes followed by one column per class
      holding the predicted probability as a rounded percentage.
    """
    predict_proba = model.predict(X)
    colName = list(Y)
    dfObj = pd.DataFrame(list(np.round(predict_proba*100, decimals=0)), columns = colName)
    Seq_Series=Prep_data.X.apply(pd.Series)
    dfObj.reset_index(drop=True, inplace=True)
    Seq_Series.reset_index(drop=True, inplace=True)
    df_new = pd.concat([Seq_Series, dfObj], axis=1)
    return df_new

In [None]:
class resultGraphing:
  """Turn prediction tables into traces, transition links and a graph."""

  def __init__(self):
    self.i= 0

  def decomposeResult(self, results):
    """Cut a prediction table into one chunk per predicted trace.

    The table is split into integer-named columns (left) and the remaining
    columns (right). A chunk ends at each row whose largest right-hand value
    sits in the column named "end" (case-insensitive). Rows after the last
    such row are not returned.

    Args:
      results: DataFrame as produced by ``training.validateModel``.

    Returns:
      Tuple ``(matrices, matricesLeft)``: row-aligned lists of right-hand
      and left-hand chunks.
    """
    helper_ = helper()
    leftData, rightData = helper_.divideMatrix(results)
    matrices = []
    matricesLeft = []
    lastIndex = 0
    # Use row positions, not index labels, because the chunks are cut with
    # iloc.
    for position, (_, row) in enumerate(rightData.iterrows()):
      if helper_.rowIsLast(row, "end"):
        matrices.append(rightData.iloc[lastIndex:position+1, :])
        matricesLeft.append(leftData.iloc[lastIndex:position+1, :])
        lastIndex = position + 1
    return matrices, matricesLeft

  def rowIsAnewSequence(self, row):
    """Return True when ``row["1"]`` is NaN; False on any lookup/type error."""
    try:
      return math.isnan(row["1"])
    except:
      return False

  def linkAndProbabilities(self, matrices, count = 0):
    """Derive transitions and event sequences from probability chunks.

    In each chunk the column with the highest value of a row is taken as the
    predicted event. Consecutive predictions form links written as
    ``"<from><--><to>"``, starting from "Start". A link is recorded once,
    and self-loops and a direct Start-to-end link are skipped.

    Args:
      matrices: list of DataFrames, one per trace.
      count: unused; kept for interface compatibility.

    Returns:
      Tuple ``(links, probabilities, sequences)``: `probabilities` holds the
      value found where each link was first seen; `sequences` holds the
      predicted events of every chunk without the final one.
    """
    links = []
    seenLinks = set()
    probabilities = []
    sequences = []
    for thisMatrix in matrices:
      lastEvent = "Start"
      sequence = []
      for index, row in thisMatrix.iterrows():
        row = pd.to_numeric(row)
        # Series.idxmax takes no meaningful axis; passing axis=1 raises on
        # current pandas.
        evName = row.idxmax() # picks event with the highest probability
        link = lastEvent + "<-->" + evName
        sequence.append(evName)
        isStartToEnd = lastEvent.lower() == "start" and evName.lower() == "end"
        if link not in seenLinks and lastEvent != evName and not isStartToEnd:
          seenLinks.add(link)
          links.append(link)
          probabilities.append(row[evName])
        lastEvent = evName
      # The Last element is End and undesirable
      if sequence:
        sequence.pop()
      sequences.append(sequence)
    return links, probabilities, sequences

  def drawGraph(self, transitions, counter):
    """Render the transitions as a left-to-right Graphviz graph and open it.

    Args:
      transitions: list of ``"<from><--><to>"`` strings.
      counter: number used in the output file name ``dum_<counter>.gv``.

    Returns:
      The ``Digraph`` object.
    """
    G = Digraph('process_model', filename='dum_'+str(counter)+'.gv')
    G.attr(rankdir='LR', size='7,5')
    G.attr('node', shape='doublecircle', style="filled", fillcolor="grey")
    G.node('Start')
    G.node('END')
    G.attr('node', shape='box', style="bold")
    G.attr('edge', style="bold", penwidth='3.0')
    for transition in transitions:
      fromto = transition.split("<-->")
      G.edge(fromto[0], fromto[1])
    G.view()
    return G
  
  def getEventSequence(self, data, X_label, Y_label):
    """Group `Y_label` values into runs of consecutive equal `X_label`.

    Runs with fewer than two items are discarded, except for the final run,
    which is always appended (so an empty frame yields ``[[]]``).

    Args:
      data: DataFrame to scan row by row.
      X_label: column whose value identifies a run.
      Y_label: column whose values are collected.

    Returns:
      List of lists of `Y_label` values.
    """
    currentX_label = ''
    sequences = []
    sequence = []
    for index, row in data.iterrows():
      if currentX_label == row[X_label]:
        sequence.append(row[Y_label])
      else:
        if len(sequence) > 1:
          sequences.append(sequence)
        sequence = [row[Y_label]]
      currentX_label = row[X_label]
    sequences.append(sequence)
    return sequences  

In [None]:
class performance:
  """Trace-level quality measures comparing a log with model sequences."""

  def __init__(self):
    self.i= 0
    
  def fitness(self, holdOut, sequences):
    """Return the fraction of `holdOut` traces that occur in `sequences`.

    Duplicate traces in `holdOut` are each counted. Prints the counts.

    Args:
      holdOut: list of traces (lists of event names).
      sequences: list of traces produced by the model.

    Returns:
      Matching traces divided by ``len(holdOut)``; 0.0 when `holdOut` is
      empty.
    """
    TruePositives = sum(1 for trace in holdOut if trace in sequences)
    print("The fitness of the discovered model against the holdout part")
    print(" No. of True Positive: " , TruePositives)
    print(" No. of Traces in holdout: ", len(holdOut))
    if not holdOut:
      return 0.0
    return (TruePositives/len(holdOut))

  def precision(self, original, sequences):
    """Return the fraction of `original` traces that occur in `sequences`.

    NOTE(review): for the same arguments this equals ``fitness``. The
    printed message reports ``len(sequences)``, which suggests the
    denominator may have been meant to be the number of model traces -
    confirm before changing.

    Args:
      original: list of traces from the log.
      sequences: list of traces produced by the model.

    Returns:
      Matching traces divided by ``len(original)``; 0.0 when `original` is
      empty.
    """
    TruePositives = sum(1 for trace in original if trace in sequences)
    print("The precision of the discovered model against the complete log")
    print(" No. of True Positive: ", TruePositives)
    print("No. of Traces in the model: ", len(sequences))
    if not original:
      return 0.0
    return (TruePositives/len(original))

  def findFScore(self, fitness, precision):
    """Return the harmonic mean of `fitness` and `precision`.

    Returns 0 when both inputs sum to zero, instead of dividing by zero.
    """
    a = fitness
    b = precision
    if a + b == 0:
      return 0
    return (2 * (a * b)/(a + b))

In [None]:
class DatasetDefinitions:
  """Extract the distinct directly-follows event pairs of an event log."""

  def getUnique(self, label, dataset):
    """List every distinct pair of consecutive, different events in a case.

    A pair is the two event names joined without a separator, lower-cased,
    stripped, and with each whitespace run turned into a single underscore.

    Args:
      label: object with ``case`` and ``event`` attributes naming the
        columns.
      dataset: DataFrame holding the log.

    Returns:
      Tuple ``(chains, chainTag)``: the pairs in order of first appearance
      and, for each, the case in which it was first seen.
    """
    caseColumn = label.case
    eventColumn = label.event
    chains = []
    chainTag = []
    previousEvent = None
    currentCase = None
    for _, row in dataset.iterrows():
      rowCase = row[caseColumn]
      caseChanged = currentCase is None or currentCase != rowCase
      if caseChanged:
        # A new case starts: forget the event carried over from the old one.
        previousEvent = None
      currentCase = rowCase
      if previousEvent is None:
        previousEvent = row[eventColumn]
        continue
      rowEvent = row[eventColumn]
      if previousEvent != rowEvent:
        joined = (previousEvent +""+ rowEvent).lower().strip()
        pairName = " ".join(joined.split()).replace(' ', '_')
        if pairName not in chains:
          chains.append(pairName)
          chainTag.append(currentCase)
      previousEvent = rowEvent
    return chains, chainTag

In [None]:
class FindDrift:
  """Report event pairs that a dataset contains but the base log does not."""

  def __init__(self, baseModel):
    self.baseModel = baseModel

  def executeAgainst(self, dataset, label):
    """Compare `dataset` with the base log.

    Args:
      dataset: DataFrame holding the log to inspect.
      label: object with ``case`` and ``event`` attributes naming the
        columns.

    Returns:
      Tuple ``(tag_list, chain_list, indx)``: the cases and pair names of
      the pairs missing from the base log, and their positions among the
      dataset's pairs.
    """
    definitions = DatasetDefinitions()
    baseChains, _baseTags = definitions.getUnique(label, self.baseModel)
    newChains, newTags = definitions.getUnique(label, dataset)
    positions = self.getDrifts(baseChains, newChains)
    driftTags = [newTags[k] for k in positions]
    driftChains = [newChains[k] for k in positions]
    return driftTags, driftChains, positions

  def getDrifts(self, chain_base, chain_drift):
    """Return the positions in `chain_drift` of items absent from `chain_base`."""
    return [pos for pos, item in enumerate(chain_drift) if item not in chain_base]

In [None]:
# Column names of the event log: `case` is the case-id column, `event` the
# activity column.
labels = namedtuple("labels", "case event")
label = labels('case', 'event')

In [None]:
# Load the event log and split it into `parts` windows of whole traces.
parts = 20
dataset = pd.read_csv('roi-2500.csv', low_memory= False)
data_divider = dataDivider(dataset)
p_ = data_divider.setParts(label, parts)

In [None]:
# Accumulators filled by the drift-detection cells below:
# fScoreLog collects (reference window, detection window, F-score) tuples,
# DriftLog collects one message per window flagged as drifted.
fScoreLog = []
DriftLog = []

In [None]:
# Compare every window with its successor: window i is the reference log,
# window j = i + 1 the detection log. The last window wraps around and is
# compared with window 0.
for i in range(0, parts):
  j = i + 1
  if(i == parts - 1):
    j = 0
  #Get the log part to be used
  referenceLog = data_divider.getPart(i)
  detectionLog = data_divider.getPart(j)

  #Prepare the data Reference
  prepdata = prepareData(referenceLog, ['case', 'event'])
  X_train, X_test, Y_train, Y_test = prepdata.prepare(None, 0)

  #get built tokenizer and word_index
  tokenizer = prepdata.tokenizer
  X = prepdata.X
  Y = prepdata.Y
  v = tokenizer.word_index.keys()

  #Prepare the data Detection (only events known to the reference tokenizer)
  prepdata_t = prepareData(detectionLog, ['case', 'event'])
  X_train_t, X_test_t, Y_train_t, Y_test_t = prepdata_t.prepare(list(v), 0, tokenizer) #list(v)  
  tokenizer_t = prepdata_t.tokenizer
  X_t = prepdata_t.X
  Y_t = prepdata_t.Y

  #Train the Reference Model (or load it when a saved copy exists)
  trainModel = training(X_train, Y_train)
  trainModel.getModelFrom(str(i))

  #Make the detection input as wide as the reference input: drop leading
  #columns when it is wider, add leading zero columns when it is narrower
  while True:
    diff = - X.shape[1] + X_t.shape[1]
    if(diff > 0):
      X_t = np.delete(X_t, np.s_[0:1], axis=1)
      print("Reshaped RD. difference was "+str(diff)+" ")
    elif(diff < 0):
      X_t = np.insert(X_t, 0, [0], axis=1)
    else:
      break
  
  #Use on itself
  resultDataset_o = trainModel.validateModel(prepdata.maindfObj, tokenizer, trainModel.model,  X, Y)
  #Use on Detection log
  resultDataset_t = trainModel.validateModel(prepdata_t.maindfObj, tokenizer_t, trainModel.model,  X_t, Y)
  resultGraphing_ = resultGraphing()

  #Reference Log
  probs_o, seqs_o = resultGraphing_.decomposeResult(resultDataset_o)
  link_o, probabilities_o, sequences_o = resultGraphing_.linkAndProbabilities(probs_o)

  #Detection Log
  probs_t, seqs_t = resultGraphing_.decomposeResult(resultDataset_t)
  link_t, probabilities_t, sequences_t = resultGraphing_.linkAndProbabilities(probs_t)

  #Calculate performance
  performance_ = performance()
  fitness = performance_.fitness(sequences_o, sequences_t)
  precision = performance_.precision(sequences_o, sequences_t)
  try:
    fScore = performance_.findFScore(fitness, precision)
  except ZeroDivisionError:
    #fitness and precision are both 0: no reference trace was reproduced
    fScore = 0
  if fScore < 0.9:
    DriftLog.append("Drift found in Window "+str(j))
  ra = (i, j, fScore)
  fScoreLog.append(ra)

In [None]:
# Show the reference and detection sequences left over from the last loop
# iteration.
print(sequences_o)
print(sequences_t)

In [None]:
class windowManager:
  """Run the window-by-window drift detection over the partitioned log."""

  def trainWindow(self, parts, threshold=0.8):
    """Compare every window with its successor and log the outcome.

    Window i is the reference log and window i + 1 the detection log; the
    last window wraps around and is compared with window 0. Reads the
    module-level ``data_divider`` and appends to the module-level
    ``fScoreLog`` and ``DriftLog``.

    Args:
      parts: number of windows held by ``data_divider``.
      threshold: a window is reported as drifted when its F-score is below
        this value.
    """
    for i in range(0, parts):
      j = i + 1
      if(i == parts - 1):
        j = 0
      #Get the log part to be used
      referenceLog = data_divider.getPart(i)
      detectionLog = data_divider.getPart(j)

      #Prepare the data Reference
      prepdata = prepareData(referenceLog, ['case', 'event'])
      X_train, X_test, Y_train, Y_test = prepdata.prepare(None, 0)

      #get built tokenizer and word_index
      tokenizer = prepdata.tokenizer
      X = prepdata.X
      Y = prepdata.Y
      v = tokenizer.word_index.keys()

      #Prepare the data Detection (only events known to the reference tokenizer)
      prepdata_t = prepareData(detectionLog, ['case', 'event'])
      X_train_t, X_test_t, Y_train_t, Y_test_t = prepdata_t.prepare(list(v), 0, tokenizer) #list(v)  
      tokenizer_t = prepdata_t.tokenizer
      X_t = prepdata_t.X

      #Train the Reference Model (or load it when a saved copy exists)
      trainModel = training(X_train, Y_train)
      trainModel.getModelFrom(str(i))

      #Make the detection input as wide as the reference input: drop leading
      #columns when it is wider, add leading zero columns when it is narrower
      while True:
        diff = - X.shape[1] + X_t.shape[1]
        if(diff > 0):
          X_t = np.delete(X_t, np.s_[0:1], axis=1)
          print("Reshaped RD. difference was "+str(diff)+" ")
        elif(diff < 0):
          X_t = np.insert(X_t, 0, [0], axis=1)
        else:
          break
      
      #Use on itself
      resultDataset_o = trainModel.validateModel(prepdata.maindfObj, tokenizer, trainModel.model,  X, Y)
      #Use on Detection log
      resultDataset_t = trainModel.validateModel(prepdata_t.maindfObj, tokenizer_t, trainModel.model,  X_t, Y)
      resultGraphing_ = resultGraphing()

      #Reference Log
      probs_o, seqs_o = resultGraphing_.decomposeResult(resultDataset_o)
      link_o, probabilities_o, sequences_o = resultGraphing_.linkAndProbabilities(probs_o)

      #Detection Log
      probs_t, seqs_t = resultGraphing_.decomposeResult(resultDataset_t)
      link_t, probabilities_t, sequences_t = resultGraphing_.linkAndProbabilities(probs_t)

      #Calculate performance
      performance_ = performance()
      fitness = performance_.fitness(sequences_o, sequences_t)
      precision = performance_.precision(sequences_o, sequences_t)

      try:
        fScore = performance_.findFScore(fitness, precision)
      except ZeroDivisionError:
        #fitness and precision are both 0: no reference trace was reproduced
        fScore = 0
      if fScore < threshold:
        DriftLog.append("Drift found in Window "+str(j))
      ra = (i, j, fScore)
      fScoreLog.append(ra)

In [None]:
# Run the window-by-window comparison; results are appended to fScoreLog
# and DriftLog.
_windowManager = windowManager()
_windowManager.trainWindow(parts)

In [None]:
  # Display the drift messages collected so far.
  DriftLog

['Drift found in Window 2',
 'Drift found in Window 6',
 'Drift found in Window 10',
 'Drift found in Window 14',
 'Drift found in Window 18']

In [None]:
# Display the (reference window, detection window, F-score) tuples collected so far.
fScoreLog