In [None]:
# !pip install -U flask-cors

In [1]:
def dfg_create(filtered_event_data , act_perc = 1, path_perc = 1 , view_type = 'act_cnt',unit='hours',**kwargs):

    """
    Function to get Directly Follows Graph
    
    Required Parameters 
    
    filtered_event_data : Event Data (Filtered before pulling in for Process Explorer)
    
    act_perc(default : 1) : Percentage of Activities (top) by cases to be shown. Values between [0.00 , 1.00] 
    
    path_perc(default : 1) : Percentage of Paths (top) by cases to be shown. Values between [0.00 , 1.00] 
    
    view_type : Type of edge values 
                values can range from act_cnt : activity or event count
                                      case_cnt : case count
                                      performace : time between two activities
    
    unit : unit of time in case of performace type metric:
           values can range from (d : days) , (h = hours) , (m = mins) , (s : a - default) 
    
    """
    from pm4py.statistics.start_activities.log import get as start_activities_module
    from pm4py.statistics.end_activities.log import get as end_activities_module
    from pm4py.utils import get_properties

    start_activities = start_activities_module.get_start_activities(filtered_event_data, parameters=get_properties(filtered_event_data))
    end_activities = end_activities_module.get_end_activities(filtered_event_data, parameters=get_properties(filtered_event_data))

    activities_count = pm4py.get_event_attribute_values(filtered_event_data, "concept:name")
    
    if view_type == 'act_cnt':
        variant = pm4py.algo.discovery.dfg.variants.native
    elif view_type == 'performance':
        variant = pm4py.algo.discovery.dfg.variants.performance    
    elif view_type == 'case_cnt':
        variant = pm4py.algo.discovery.dfg.variants.case_attributes
        
    
    dfg = pm4py.algo.discovery.dfg.algorithm.apply(filtered_event_data,parameters={'AGGREGATION_MEASURE':'max'},variant=variant)

    if view_type == 'case_cnt':
        from collections import Counter
        dfg_old = dfg
        dfg = {}
        for key in dfg_old.keys():
            dfg[key] = len(dfg_old[key]['concept:name'])

        dfg=Counter(dfg)
        del(dfg_old)
    
    dfg, start_activities, end_activities, activities_count = pm4py.algo.filtering.dfg.dfg_filtering.filter_dfg_on_activities_percentage(dfg, start_activities, end_activities, activities_count, act_perc)

    dfg, start_activities, end_activities, activities_count = pm4py.algo.filtering.dfg.dfg_filtering.filter_dfg_on_paths_percentage(dfg, start_activities, end_activities, activities_count, path_perc)
    
    activities = set()
    for names in list(dfg.keys()):
        activities.add(names[0])
        activities.add(names[1])
  
    factor = 1
    
    if unit == 'h':
        factor = 3600
    elif unit == 'm':
        factor = 60        
    elif unit == 'd':
        factor = 3600*24 

    values_list = []
    for key in dfg:
        values_list.append([key[0],key[1],dfg[key]/factor,'{:.2f} {}'.format(dfg[key]/factor , unit)])

    for starts in start_activities:
        values_list.append(['Start',starts,start_activities[starts],start_activities[starts]])

    for ends in end_activities:
        values_list.append([ends,'End',end_activities[ends],end_activities[ends]])
    
    
    nodes = [{ 'id': act, 
                'data': { 'label': act,'volume': act+ '['+str(activities_count[act])+ ']' },
             }  for act in activities]
    
    nodes.append({ 'id':'Start', 
#                     'type': 'output',
                    'data': { 'label': 'Start','volume':'Start ['+str(sum(start_activities.values()))+ ']' },     
                 })
    
    nodes.append({'id':'End', 
#                   'type': 'input',
                  'data': { 'label': 'End', 'volume':'End ['+str(sum(end_activities.values()))+ ']' },     
                         
                 })

    edges = [{'source': elements[0], 
               'target': elements[1], 
               'weight': elements[2], 
               'cases': elements[3],
              'label': elements[3],
              'type':'smoothstep',
              'animated': True 
             }  for elements in values_list]
    
    
    return {'dfg':[nodes , edges] , "start":start_activities,"end":end_activities}
            
            

In [2]:
    
import pm4py
def variant_explorer(df):

    """
    return data_dict which has the name of the process steps as the key in the format ('a', 'b', 'c') and their corresponding value is the frequency of occurence of that key in the event_data available    
    
    """
    event_log = pm4py.format_dataframe(df, case_id='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp')
    event_log = pm4py.convert_to_event_log(event_log)

    data_dict = {}

    for keys,values in pm4py.get_variants_as_tuples(event_log).items():
        data_dict[keys] = len(values)
    
    return data_dict

In [3]:
def simulation(dataframe, sample_test_event_dataframe):
  # processing the test_sample_log_dataframe
  list_case = list(sample_test_event_dataframe['concept:name'])
  sample_test_log = [list_case[0:i] for i in range(1, len(list_case)+1)]

  # filter cancelled application from event log
  cancelled_data = dataframe[dataframe['concept:name'] == 'A_Cancelled']

  # flagging the applications having cancelled application
  cancelled_data['Case:cancelled'] = 'Cancelled'
  cancelled_data_filt = cancelled_data[['case:concept:name','Case:cancelled']]
  cancelled_data_filt.reset_index(drop = True, inplace = True)

  # merge flagged application
  dataframe_new = dataframe.merge(cancelled_data_filt, on = 'case:concept:name', how = 'left')
  dataframe_new['Case:cancelled'] = dataframe_new['Case:cancelled'].fillna('Non_Cancelled')

  ##  Add aggregated sequential flow of event for each application 
  dataframe_work =  dataframe_new[['Action','EventOrigin','concept:name','case:concept:name', 'Case:cancelled']]\
  # get all applications of data in list
  all_applications = list(dataframe_work['case:concept:name'].unique())
  column_to_add = []
  # iterate through all application to find the process probability
  for i in all_applications:
    sub = dataframe_work[dataframe_work['case:concept:name'].isin([i])]
    event_flow = list(sub['concept:name'])
    sub_list = [event_flow[0:i] for i in range(1, (len(event_flow)+1))]
    for j in sub_list:
      column_to_add.append(j)
  dataframe_work['sub_case:concept:name'] = column_to_add

  import numpy as np
  dataframe_work['prob_flag'] = np.nan

  ## write funtion to return value required to flag
  def normalise_row(row, value):
      if(row['sub_case:concept:name'] == value):
        return 1
      else:
        return row['prob_flag']

  ### iterate through the test event_log and generate a flag where sub-sequence of events are matching in data
  for i in sample_test_log:
    dataframe_work['prob_flag'] = dataframe_work.apply(lambda row : normalise_row(row, i), axis=1) 

  probability_of_cancellation = np.round((len(dataframe_work[dataframe_work['prob_flag'] == 1])/len(dataframe_work)),2)
  return (probability_of_cancellation)

In [4]:
# variant_explorer(os.path.join('zennovatefiles', 'data.csv'))

In [5]:
import pm4py
import datetime
import pandas as pd
import plotly.express as px
import numpy as np
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.importer.xes import importer as xes_importer

# process mining 
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner

# viz
from pm4py.visualization.dfg import visualizer as dfg_visualization
#from pm4py.visualization.petrinet import visualizer as pn_visualizer
from pm4py.visualization.process_tree import visualizer as pt_visualizer
from pm4py.visualization.heuristics_net import visualizer as hn_visualizer

# misc 
from pm4py.objects.conversion.process_tree import converter as pt_converter

# filters and stats
from pm4py.algo.filtering.log.attributes import attributes_filter
from pm4py.algo.filtering.log.variants import variants_filter
#from pm4py.statistics.traces.log import case_statistics

## tree visualizer
import math
from sklearn import tree
from pm4py.objects.log.util import get_class_representation
from pm4py.visualization.decisiontree import visualizer as dectree_visualizer
from pm4py.algo.transformation.log_to_features import algorithm as log_to_features
import plotly.io as pio

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


# def activities(event_log):
#   start_activities = pm4py.get_start_activities(event_log)
#   end_activities = pm4py.get_end_activities(event_log)
#   return(start_activities, end_activities)

# start_activities,end_activities = activities(event_log)
# # print(start_activities)
# # print(end_activities)

# """All our 492 cases started with "A_Create Application" while the majority ended with either "W_Validate application", "W_Call after offers", "o_cancelled" and "W_Call incomplete files"

# Filter on variants
# A variant is a set of cases that share the same control-flow perspective, so a set of cases that share the same classified events (activities) in the same order. In this section, we will focus for all methods first on log objects, then we will continue with the dataframe.
# """

#Other variants-based filters are offered. The filters on the top-k variants keeps in the log only the cases following one of the k most frequent variants:
def variants(event_log, n):
  k = n
  filtered_log = pm4py.filter_variants_top_k(event_log, k)
  return (filtered_log)
#filtered_log = variants(event_log, 4)
# print(filtered_log[0])
# print(filtered_log[1])
# print(filtered_log[2])
# print(filtered_log[3])

##  Data filtering for application based

pd.set_option("display.max_rows", None, "display.max_columns", None)
from scipy.stats import skew, mode
def filtered_data(dataframe, stats_of_requested_amount, stats_of_withdrawl_amount):
  agg_func_stats1 = {'case:ApplicationType': [pd.Series.mode], 'case:LoanGoal': [pd.Series.mode]}
  filtered_1 = pd.DataFrame(dataframe.groupby(['case:concept:name']).agg(agg_func_stats1)).reset_index(drop = False)
  ind = pd.Index([e[0] for e in filtered_1.columns.tolist()])
  filtered_1.columns = ind
  filtered_2 = pd.DataFrame(dataframe.groupby(['case:concept:name']).agg({'case:RequestedAmount': stats_of_requested_amount, 'FirstWithdrawalAmount':stats_of_withdrawl_amount})).reset_index(drop = False)
  filtered = pd.merge(filtered_1,filtered_2, on = 'case:concept:name', how = 'left')
  return(filtered)

#filtered = filtered_data(dataframe,'mean', 'sum')
#print(filtered.head())

"""#### Case:LoanGoal distribution"""


def loan_goal_dist(filtered_data):
  data_loan = pd.DataFrame(filtered_data['case:LoanGoal'].value_counts()).reset_index(drop = False)
  data_loan.rename(columns = {'index':'LoanGoal', 'case:LoanGoal':'count'}, inplace = True)
  fig = px.pie(data_loan, values='count', names='LoanGoal', title='distribution of loan goal')
  #fig.show()
  return(fig)

"""#### Case: Application type distribution"""


def application_type_dist(filtered_data):
  data_application = pd.DataFrame(filtered_data['case:ApplicationType'].value_counts()).reset_index(drop = False)
  data_application.rename(columns = {'index':'ApplicationType', 'case:ApplicationType':'count'}, inplace = True)
  fig = px.pie(data_application, values='count', names='ApplicationType', title='Distribution of application type')
  return(fig)

"""#### Distribution of requested amount by applicant"""

def requested_amount_dist(filtered_data):
  data_box = pd.DataFrame(filtered_data['case:RequestedAmount'])
  fig = px.box(data_box, y="case:RequestedAmount")
  return(fig)

"""#### Distribution of loan given """


# create a list of our conditions
def loan_dist(filtered_data):
  loan_info = filtered_data.copy()
  conditions = [
      (loan_info['case:RequestedAmount'] == loan_info['FirstWithdrawalAmount']),
      (loan_info['case:RequestedAmount'] > loan_info['FirstWithdrawalAmount']),
      ((loan_info['case:RequestedAmount'] == 0) & (loan_info['FirstWithdrawalAmount'] > 0)),
      (loan_info['FirstWithdrawalAmount'] == 0),
      (loan_info['case:RequestedAmount'] < loan_info['FirstWithdrawalAmount'])]

  # create a list of the values we want to assign for each condition
  values = ['Full loan given', 'Partial loan given','No amount requested, but got loan', 'No loan given', 'Multiple Withdrawl amount']

  # create a new column and use np.select to assign values to it using our lists as arguments
  loan_info['Loan info'] = np.select(conditions, values)
  loan_info.loc[loan_info['FirstWithdrawalAmount'] == 0.0]
  loan_info.loc[loan_info["FirstWithdrawalAmount"] == 0.0, "Loan info"] = 'No loan given'

 
  data_loan_info = pd.DataFrame(loan_info['Loan info'].value_counts()).reset_index(drop = False)
  data_loan_info.rename(columns = {'index':'Loan info', 'Loan info':'count'}, inplace = True)
  fig = px.pie(data_loan_info, values='count', names='Loan info', title='Distribution of Loan given')
  return(fig)

"""### Process discovery

With Process discovery we aim to find a suitable process model that can describe our business process and the sequence of events (traces) and activities that are performed within each trace. In addition to discovery of process model we can get statistics related such as frequency of events and time-to-execute, which help increase our understanding of the insufficiences are inherited in our process.

#### Before applying one of the many process mining algorithms, it will be informatives if we get some statistics describing our log and process will start by understanding how many variants we have? how many cases in each variant?

A process variant is a unique path from the very beginning to the very end of the process
"""

def total_variants(event_log_data):
  variants = variants_filter.get_variants(event_log_data)
  return(len(variants))


"""If the case would have such a way, that any specific variant is very high in %, then it would have become an interesting story to look into"""

## Let's see what activities do we have in log? including their frequencies and considering all cases(no filter)
def activites_distribution(event_log_data):
  activities = attributes_filter.get_attribute_values(event_log_data, "concept:name")
  activities_dist = pd.DataFrame(activities.items(), columns= ['Activities', 'Counts'])
  activities_dist.sort_values(by=['Counts'], inplace=True, ascending = False)
  activities_dist['Activities_distribution'] = (activities_dist['Counts']/activities_dist['Counts'].sum())*100
  return(activities_dist)


"""Few activities stands out "W_Validate application", "W_Call after offers", "W_Call incomplete files" and "W_Complete application", they have a lot of actions, it could be some sort of self-loop or rework or some other reason ofc, but clearly we should do something to prevent them from becoming bottlenecks

### Dicision tree (RCA)

A decision tree about the duration of a case helps to understand the reasons behind an high case duration (or, at least, a case duration that is above the threshold).First, a log has to be loaded. A representation of a log on a given set of features could be obtained Or an automatic representation (automatic selection of the attributes) could be obtained:
data, feature_names = log_to_features.apply(log)
Then, the target classes are formed. There are two classes: First, traces that are below the specified threshold (here, 200 days). Note that the time is given in seconds. Second, traces that are above the specified threshold. The decision tree could be then calculated and visualized.
"""

def tree_visualizer_RCA(event_log_data):
  data, feature_names = log_to_features.apply(event_log_data,parameters={"str_tr_attr": ["LoanGoal","ApplicationType"],
                                                                      "str_ev_attr": ["concept:name","org:resource","Accepted	"],
                                                                      "num_tr_attr": ["RequestedAmount"],
                                                                      "num_ev_attr": ["MonthlyCost","CreditScore","OfferedAmount"]})
  for row_num in range(len(data)):
      row= data[row_num]
      row = [0 if math.isnan(x) else x for x in row]
      data[row_num] = row

  target, classes = get_class_representation.get_class_representation_by_trace_duration(event_log_data, 20 * 86400)
  clf = tree.DecisionTreeClassifier(max_depth=3)
  clf.fit(data, target)


  gviz = dectree_visualizer.apply(clf, feature_names, classes)
  return(gviz)


def RCA(event_log_data_csv):
  dataframe  = pd.read_csv(event_log_data_csv)
  dataframe = pm4py.format_dataframe(dataframe, case_id='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp')
  event_log = pm4py.convert_to_event_log(dataframe)
  loan_goal_distribution = loan_goal_dist(filtered_data(dataframe,'mean', 'sum'))
  application_type_distribution = application_type_dist(filtered_data(dataframe,'mean', 'sum'))
  requested_amount_distribution = requested_amount_dist(filtered_data(dataframe,'mean', 'sum'))
  loan_distribution = loan_dist(filtered_data(dataframe,'mean', 'sum'))
  #veriant_distributions = veriant_distribution(event_log).head()
  #activites_distributions = activites_distribution(event_log).head()
  tree_visualizer_RCA1 = tree_visualizer_RCA(event_log)# dectree_visualizer.view()
  return {'Loan-distribution':pio.to_json(loan_goal_distribution),
          'application_type_distribution': pio.to_json(application_type_distribution), 
          'requested_amount_distribution':pio.to_json(requested_amount_distribution), 
          'loan_distribution':pio.to_json(loan_distribution),
#           'tree_visualizer_RCA1':tree_visualizer_RCA1
         }




In [None]:
import os
import urllib.request
from flask import Flask, request, redirect, jsonify
from werkzeug.utils import secure_filename
from flask_cors import CORS
import pandas as pd
import pm4py
# app = Flask(__name__)
app = Flask(__name__)
CORS(app)

UPLOAD_FOLDER = 'zennovatefiles'





ALLOWED_EXTENSIONS = set(['txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'])

def allowed_file(filename):
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


@app.route('/file-upload', methods=['POST'])
def upload_file():
    if request.method == 'POST':
        # check if the post request has the file part
        if 'file' not in request.files:
            return 'No file part'
        file = request.files['file']
        # if user does not select file, browser also
        # submit an empty part without filename
        if file.filename == '':
            return 'No selected file'
        if file:
            # save the file to a specified location
            dpath = os.path.join(UPLOAD_FOLDER, 'data.csv')
            if os.path.exists(dpath):
                os.remove(dpath)
            file.save(os.path.join(UPLOAD_FOLDER, 'data.csv'))
            return 'File successfully uploaded'

@app.route('/ping', methods=['GET'])
def ping():
    return 'pong'

@app.route('/getrca', methods=['POST','GET'])
def getrca():
    return RCA(os.path.join(UPLOAD_FOLDER, 'data.csv'))

@app.route('/getsimulation', methods=['POST','GET'])
def getsimulations():
#     warnings.filterwarnings("ignore")
    dataframe  = pd.read_csv(os.path.join(UPLOAD_FOLDER, 'data.csv'))
    dataframe = pm4py.format_dataframe(dataframe, case_id='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp')
    event_log = pm4py.convert_to_event_log(dataframe)

    # create test-case
    dataframe_to_test = dataframe[dataframe['case:concept:name'].isin(['Application_1017492916'])]
    prob = simulation(dataframe, dataframe_to_test)
    return {'Cases':{'Cancellation':prob}}
    
@app.route('/getrenderdata', methods=['POST','GET'])
def getrenderdata():
    fdata = request.json['body']
    fdata['act_perc']=float(fdata['act_perc'])
    fdata['path_perc']=float(fdata['path_perc'])
    df = pd.read_csv(os.path.join(UPLOAD_FOLDER, 'data.csv')) 
    event_log = pm4py.format_dataframe(df, case_id='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp')
    event_log = pm4py.convert_to_event_log(event_log)
    return dfg_create(event_log,**fdata)

if __name__ == "__main__":
    app.run()

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000 (Press CTRL+C to quit)
127.0.0.1 - - [04/Jan/2023 13:39:49] "GET /getrca HTTP/1.1" 200 -
127.0.0.1 - - [04/Jan/2023 13:39:54] "GET /getrca HTTP/1.1" 200 -
127.0.0.1 - - [04/Jan/2023 13:39:56] "GET /getrca HTTP/1.1" 200 -
127.0.0.1 - - [04/Jan/2023 13:39:57] "[33mGET /getsimulations HTTP/1.1[0m" 404 -


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

Se



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

127.0.0.1 - - [04/Jan/2023 13:40:03] "GET /getsimulation HTTP/1.1" 200 -
127.0.0.1 - - [04/Jan/2023 13:41:54] "OPTIONS /getrenderdata HTTP/1.1" 200 -
127.0.0.1 - - [04/Jan/2023 13:41:55] "POST /getrenderdata HTTP/1.1" 200 -
127.0.0.1 - - [04/Jan/2023 13:41:57] "GET /getrca HTTP/1.1" 200 -


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

127.0.0.1 - - [04/Jan/2023 13:47:25] "GET /getsimulation HTTP/1.1" 200 -


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [6]:

def variants(event_log, n):
  k = n
  filtered_log = pm4py.filter_variants_top_k(event_log, k)
  return (filtered_log)

In [11]:
dataframe  = pd.read_csv(os.path.join('zennovatefiles', 'data.csv'))
dataframe = pm4py.format_dataframe(dataframe, case_id='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp')
event_log = pm4py.convert_to_event_log(dataframe)
tree_visualizer_RCA1 = tree_visualizer_RCA(event_log)

In [13]:
tree_visualizer_RCA1.render()

ExecutableNotFound: failed to execute PosixPath('dot'), make sure the Graphviz executables are on your systems' PATH

In [8]:
import os
RCA(os.path.join('zennovatefiles', 'data.csv'))

{'Loan-distribution': '{"data":[{"domain":{"x":[0.0,1.0],"y":[0.0,1.0]},"hovertemplate":"LoanGoal=%{label}<br>count=%{value}<extra></extra>","labels":["Car","Home improvement","Existing loan takeover","Other, see explanation","Unknown","Remaining debt home","Not speficied","Extra spending limit","Caravan / Camper","Tax payments","Boat","Motorcycle","Business goal"],"legendgroup":"","name":"","showlegend":true,"values":[147,125,76,50,38,16,13,12,6,4,3,1,1],"type":"pie"}],"layout":{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"hist

In [30]:
# df = pd.read_csv(os.path.join('zennovatefiles', 'data.csv'))#'BPI Challenge 2017 Sample 20K.csv') 
# event_log = pm4py.format_dataframe(df, case_id='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp')
# event_log = pm4py.convert_to_event_log(event_log)
# variants(event_log,1)

In [31]:
dataframe  = pd.read_csv(os.path.join('zennovatefiles', 'data.csv'))
dataframe = pm4py.format_dataframe(dataframe, case_id='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp')
event_log = pm4py.convert_to_event_log(dataframe)
loan_goal_distribution = loan_goal_dist(filtered_data(dataframe,'mean', 'sum'))


In [36]:
loan_goal_distribution.

(Pie({
     'domain': {'x': [0.0, 1.0], 'y': [0.0, 1.0]},
     'hovertemplate': 'LoanGoal=%{label}<br>count=%{value}<extra></extra>',
     'labels': array(['Car', 'Home improvement', 'Existing loan takeover',
                      'Other, see explanation', 'Unknown', 'Remaining debt home',
                      'Not speficied', 'Extra spending limit', 'Caravan / Camper',
                      'Tax payments', 'Boat', 'Motorcycle', 'Business goal'], dtype=object),
     'legendgroup': '',
     'name': '',
     'showlegend': True,
     'values': array([147, 125,  76,  50,  38,  16,  13,  12,   6,   4,   3,   1,   1])
 }),)

In [39]:
import pm4py
import datetime
import pandas as pd
import plotly.express as px
import numpy as np
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.importer.xes import importer as xes_importer

# process mining 
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner

# viz
from pm4py.visualization.dfg import visualizer as dfg_visualization
#from pm4py.visualization.petrinet import visualizer as pn_visualizer
from pm4py.visualization.process_tree import visualizer as pt_visualizer
from pm4py.visualization.heuristics_net import visualizer as hn_visualizer

# misc 
from pm4py.objects.conversion.process_tree import converter as pt_converter

# filters and stats
from pm4py.algo.filtering.log.attributes import attributes_filter
from pm4py.algo.filtering.log.variants import variants_filter
#from pm4py.statistics.traces.log import case_statistics

## tree visualizer
import math
from sklearn import tree
from pm4py.objects.log.util import get_class_representation
from pm4py.visualization.decisiontree import visualizer as dectree_visualizer
from pm4py.algo.transformation.log_to_features import algorithm as log_to_features
import plotly.io as pio

# to display max row and columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# mounting g-drive to get data
# drive.mount('/content/drive')

# !ls "/content/drive/MyDrive/Colab Notebooks/Process mining"
# path = "/content/drive/MyDrive/Colab Notebooks/Process mining/"

# dataframe  = pd.read_csv(path + "data/BPI Challenge 2017 Sample 20K.csv")
# dataframe = pm4py.format_dataframe(dataframe, case_id='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp')
# event_log = pm4py.convert_to_event_log(dataframe)

# ## look into dataframe
# dataframe.columns

"""### Filter on start activities
In general, PM4Py is able to filter a log or a dataframe on start activities.


First of all, it might be necessary to know the starting activities. Therefore, code snippets are provided. Subsequently, an example of filtering is provided. The first snippet is working with log object, the second one is working on a dataframe. log_start is a dictionary that contains as key the activity and as value the number of occurrence.

### Filter on end activities
In general, PM4Py offers the possibility to filter a log or a dataframe on end activities.

This filter permits to keep only traces with an end activity among a set of specified activities. First of all, it might be necessary to know the end activities. Therefore, a code snippet is provided.
"""

def activities(event_log):
  start_activities = pm4py.get_start_activities(event_log)
  end_activities = pm4py.get_end_activities(event_log)
  return(start_activities, end_activities)

start_activities,end_activities = activities(event_log)
# print(start_activities)
# print(end_activities)

"""All our 492 cases started with "A_Create Application" while the majority ended with either "W_Validate application", "W_Call after offers", "o_cancelled" and "W_Call incomplete files"

Filter on variants
A variant is a set of cases that share the same control-flow perspective, so a set of cases that share the same classified events (activities) in the same order. In this section, we will focus for all methods first on log objects, then we will continue with the dataframe.
"""

#Other variants-based filters are offered. The filters on the top-k variants keeps in the log only the cases following one of the k most frequent variants:
def variants(event_log, n):
  k = n
  filtered_log = pm4py.filter_variants_top_k(event_log, k)
  return (filtered_log)
#filtered_log = variants(event_log, 4)
# print(filtered_log[0])
# print(filtered_log[1])
# print(filtered_log[2])
# print(filtered_log[3])

##  Data filtering for application based

pd.set_option("display.max_rows", None, "display.max_columns", None)
from scipy.stats import skew, mode
def filtered_data(dataframe, stats_of_requested_amount, stats_of_withdrawl_amount):
  agg_func_stats1 = {'case:ApplicationType': [pd.Series.mode], 'case:LoanGoal': [pd.Series.mode]}
  filtered_1 = pd.DataFrame(dataframe.groupby(['case:concept:name']).agg(agg_func_stats1)).reset_index(drop = False)
  ind = pd.Index([e[0] for e in filtered_1.columns.tolist()])
  filtered_1.columns = ind
  filtered_2 = pd.DataFrame(dataframe.groupby(['case:concept:name']).agg({'case:RequestedAmount': stats_of_requested_amount, 'FirstWithdrawalAmount':stats_of_withdrawl_amount})).reset_index(drop = False)
  filtered = pd.merge(filtered_1,filtered_2, on = 'case:concept:name', how = 'left')
  return(filtered)

#filtered = filtered_data(dataframe,'mean', 'sum')
#print(filtered.head())

"""#### Case:LoanGoal distribution"""


def loan_goal_dist(filtered_data):
  data_loan = pd.DataFrame(filtered_data['case:LoanGoal'].value_counts()).reset_index(drop = False)
  data_loan.rename(columns = {'index':'LoanGoal', 'case:LoanGoal':'count'}, inplace = True)
  fig = px.pie(data_loan, values='count', names='LoanGoal', title='distribution of loan goal')
  #fig.show()
  return(fig)

"""#### Case: Application type distribution"""


def application_type_dist(filtered_data):
  data_application = pd.DataFrame(filtered_data['case:ApplicationType'].value_counts()).reset_index(drop = False)
  data_application.rename(columns = {'index':'ApplicationType', 'case:ApplicationType':'count'}, inplace = True)
  fig = px.pie(data_application, values='count', names='ApplicationType', title='Distribution of application type')
  return(fig)

"""#### Distribution of requested amount by applicant"""

def requested_amount_dist(filtered_data):
  data_box = pd.DataFrame(filtered_data['case:RequestedAmount'])
  fig = px.box(data_box, y="case:RequestedAmount")
  return(fig)

"""#### Distribution of loan given """


# create a list of our conditions
def loan_dist(filtered_data):
  loan_info = filtered_data.copy()
  conditions = [
      (loan_info['case:RequestedAmount'] == loan_info['FirstWithdrawalAmount']),
      (loan_info['case:RequestedAmount'] > loan_info['FirstWithdrawalAmount']),
      ((loan_info['case:RequestedAmount'] == 0) & (loan_info['FirstWithdrawalAmount'] > 0)),
      (loan_info['FirstWithdrawalAmount'] == 0),
      (loan_info['case:RequestedAmount'] < loan_info['FirstWithdrawalAmount'])]

  # create a list of the values we want to assign for each condition
  values = ['Full loan given', 'Partial loan given','No amount requested, but got loan', 'No loan given', 'Multiple Withdrawl amount']

  # create a new column and use np.select to assign values to it using our lists as arguments
  loan_info['Loan info'] = np.select(conditions, values)
  loan_info.loc[loan_info['FirstWithdrawalAmount'] == 0.0]
  loan_info.loc[loan_info["FirstWithdrawalAmount"] == 0.0, "Loan info"] = 'No loan given'

 
  data_loan_info = pd.DataFrame(loan_info['Loan info'].value_counts()).reset_index(drop = False)
  data_loan_info.rename(columns = {'index':'Loan info', 'Loan info':'count'}, inplace = True)
  fig = px.pie(data_loan_info, values='count', names='Loan info', title='Distribution of Loan given')
  return(fig)

"""### Process discovery

With Process discovery we aim to find a suitable process model that can describe our business process and the sequence of events (traces) and activities that are performed within each trace. In addition to discovery of process model we can get statistics related such as frequency of events and time-to-execute, which help increase our understanding of the insufficiences are inherited in our process.

#### Before applying one of the many process mining algorithms, it will be informatives if we get some statistics describing our log and process will start by understanding how many variants we have? how many cases in each variant?

A process variant is a unique path from the very beginning to the very end of the process
"""

def total_variants(event_log_data):
  variants = variants_filter.get_variants(event_log_data)
  return(len(variants))

# cAalling the function
#print(f"We have:{total_variants(event_log)} variants in our log")

## Let's try to understand how many cases do those variants have?
# def veriant_distribution(event_log_data):
#   variants_count = case_statistics.get_variant_statistics(event_log_data)
#   variants_count = sorted(variants_count, key=lambda x: x['count'], reverse=True)
#   dict = {}
#   for i in range(0, len(variants_count)):
#     dict.update({'variant_' + str(i+1): variants_count[i]['count']})
#   variants_distribution = pd.DataFrame(dict.items(), columns = ['Variants', 'Counts'])
#   variants_distribution['Variant_distribution'] = (variants_distribution['Counts']/variants_distribution['Counts'].sum())*100
#   return(variants_distribution)

# calling the function


#print(f"Out of {len(event_log)} cases we have in our log, 9 of them (i.e 1.8%) are in 1 variant.\nOnly 1 variant out of {total_variants(event_log)}")

"""If the case would have such a way, that any specific variant is very high in %, then it would have become an interesting story to look into"""

## Let's see what activities do we have in log? including their frequencies and considering all cases(no filter)
def activites_distribution(event_log_data):
  activities = attributes_filter.get_attribute_values(event_log_data, "concept:name")
  activities_dist = pd.DataFrame(activities.items(), columns= ['Activities', 'Counts'])
  activities_dist.sort_values(by=['Counts'], inplace=True, ascending = False)
  activities_dist['Activities_distribution'] = (activities_dist['Counts']/activities_dist['Counts'].sum())*100
  return(activities_dist)

## calling the function  
#activites_distribution(event_log).head()
# loan_goal_dist(filtered_data(dataframe,'mean', 'sum')).show()
# application_type_dist(filtered_data(dataframe,'mean', 'sum')).show()
# requested_amount_dist(filtered_data(dataframe,'mean', 'sum')).show()
# loan_dist(filtered_data(dataframe,'mean', 'sum')).show()
# veriant_distribution(event_log).head()
# activites_distribution(event_log).head()

"""Few activities stands out "W_Validate application", "W_Call after offers", "W_Call incomplete files" and "W_Complete application", they have a lot of actions, it could be some sort of self-loop or rework or some other reason ofc, but clearly we should do something to prevent them from becoming bottlenecks

### Dicision tree (RCA)

A decision tree about the duration of a case helps to understand the reasons behind an high case duration (or, at least, a case duration that is above the threshold).First, a log has to be loaded. A representation of a log on a given set of features could be obtained Or an automatic representation (automatic selection of the attributes) could be obtained:
data, feature_names = log_to_features.apply(log)
Then, the target classes are formed. There are two classes: First, traces that are below the specified threshold (here, 200 days). Note that the time is given in seconds. Second, traces that are above the specified threshold. The decision tree could be then calculated and visualized.
"""

def tree_visualizer_RCA(event_log_data):
  data, feature_names = log_to_features.apply(event_log_data,parameters={"str_tr_attr": ["LoanGoal","ApplicationType"],
                                                                      "str_ev_attr": ["concept:name","org:resource","Accepted	"],
                                                                      "num_tr_attr": ["RequestedAmount"],
                                                                      "num_ev_attr": ["MonthlyCost","CreditScore","OfferedAmount"]})
  for row_num in range(len(data)):
      row= data[row_num]
      row = [0 if math.isnan(x) else x for x in row]
      data[row_num] = row

  target, classes = get_class_representation.get_class_representation_by_trace_duration(event_log_data, 20 * 86400)
  clf = tree.DecisionTreeClassifier(max_depth=3)
  clf.fit(data, target)


  gviz = dectree_visualizer.apply(clf, feature_names, classes)
  return(gviz)


def RCA(event_log_data_csv):
  dataframe  = pd.read_csv(event_log_data_csv)
  dataframe = pm4py.format_dataframe(dataframe, case_id='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp')
  event_log = pm4py.convert_to_event_log(dataframe)
  loan_goal_distribution = loan_goal_dist(filtered_data(dataframe,'mean', 'sum'))
  application_type_distribution = application_type_dist(filtered_data(dataframe,'mean', 'sum'))
  requested_amount_distribution = requested_amount_dist(filtered_data(dataframe,'mean', 'sum'))
  loan_distribution = loan_dist(filtered_data(dataframe,'mean', 'sum'))
  #veriant_distributions = veriant_distribution(event_log).head()
  #activites_distributions = activites_distribution(event_log).head()
  tree_visualizer_RCA1 = tree_visualizer_RCA(event_log)# dectree_visualizer.view()
  return {'Loan-distribution':pio.to_json(loan_goal_distribution),
          'application_type_distribution': pio.to_json(application_type_distribution), 
          'requested_amount_distribution':pio.to_json(requested_amount_distribution), 
          'loan_distribution':pio.to_json(loan_distribution),
#           'tree_visualizer_RCA1':tree_visualizer_RCA1
         }




In [16]:

# !pip install pm4py


In [6]:
import pandas as pd
df = pd.read_csv(os.path.join('zennovatefiles', 'data.csv'))#'BPI Challenge 2017 Sample 20K.csv') 
event_log = pm4py.format_dataframe(df, case_id='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp')
event_log = pm4py.convert_to_event_log(event_log)

ParserError: Error tokenizing data. C error: Expected 1 fields in line 3, saw 5


In [52]:
import pm4py
log = pm4py.read_xes('financial_log.xes') 
# import pm4py
# log = pm4py.format_dataframe(df, case_id='case_id',activity_key='activity',
#                              timestamp_key='timestamp')

parsing log, completed traces :: 100%|████████████████████████████████████████████████| 13087/13087 [00:06<00:00, 2095.70it/s]


In [68]:
re = dfg_create(event_log)

In [70]:
re['end']

{'W_Validate application': 203,
 'W_Call after offers': 135,
 'O_Cancelled': 79,
 'W_Call incomplete files': 73,
 'A_Cancelled': 1,
 'W_Complete application': 1}

In [None]:
a-b-c-f
e-f-g-g
x-y-z

In [None]:
a,e,x    f,g,z

In [None]:
pm4py.analysis

In [36]:
pd = pm4py.convert_to_dataframe(log)

In [30]:
# pmins = pm4py.format_dataframe(pd, case_id='org:resource',activity_key='concept:name',
#                              timestamp_key='time:timestamp')


In [37]:
net, initial_marking, final_marking = pm4py.discover_petri_net_alpha(log)

In [41]:
net, initial_marking, final_marking = pm4py.discover_petri_net_inductive(log)

In [44]:
final_marking

['sink:1']

In [1]:
def dfg_create(filtered_event_data , act_perc = 1, path_perc = 1 , view_type = 'act_cnt',unit='hours'):

    """
    Function to get Directly Follows Graph
    
    Required Parameters 
    
    filtered_event_data : Event Data (Filtered before pulling in for Process Explorer)
    
    act_perc(default : 1) : Percentage of Activities (top) by cases to be shown. Values between [0.00 , 1.00] 
    
    path_perc(default : 1) : Percentage of Paths (top) by cases to be shown. Values between [0.00 , 1.00] 
    
    view_type : Type of edge values 
                values can range from act_cnt : activity or event count
                                      case_cnt : case count
                                      performace : time between two activities
    
    unit : unit of time in case of performace type metric:
           values can range from (d : days) , (h = hours) , (m = mins) , (s : seconds - default) 
    
    """
    from pm4py.statistics.start_activities.log import get as start_activities_module
    from pm4py.statistics.end_activities.log import get as end_activities_module
    from pm4py.utils import get_properties

    start_activities = start_activities_module.get_start_activities(filtered_event_data, parameters=get_properties(filtered_event_data))
    end_activities = end_activities_module.get_end_activities(filtered_event_data, parameters=get_properties(filtered_event_data))

    activities_count = pm4py.get_event_attribute_values(filtered_event_data, "concept:name")
    
    if view_type == 'act_cnt':
        variant = pm4py.algo.discovery.dfg.variants.native
    elif view_type == 'performance':
        variant = pm4py.algo.discovery.dfg.variants.performance    
    elif view_type == 'case_cnt':
        variant = pm4py.algo.discovery.dfg.variants.case_attributes
        
    
    dfg = pm4py.algo.discovery.dfg.algorithm.apply(filtered_event_data,parameters={'AGGREGATION_MEASURE':'max'},variant=variant)

    if view_type == 'case_cnt':
        from collections import Counter
        dfg_old = dfg
        dfg = {}
        for key in dfg_old.keys():
            dfg[key] = len(dfg_old[key]['concept:name'])

        dfg=Counter(dfg)
        del(dfg_old)
    
    dfg, start_activities, end_activities, activities_count = pm4py.algo.filtering.dfg.dfg_filtering.filter_dfg_on_activities_percentage(dfg, start_activities, end_activities, activities_count, act_perc)

    dfg, start_activities, end_activities, activities_count = pm4py.algo.filtering.dfg.dfg_filtering.filter_dfg_on_paths_percentage(dfg, start_activities, end_activities, activities_count, path_perc)
    
    activities = set()
    for names in list(dfg.keys()):
        activities.add(names[0])
        activities.add(names[1])
  
    factor = 1
    
    if unit == 'h':
        factor = 3600
    elif unit == 'm':
        factor = 60        
    elif unit == 'd':
        factor = 3600*24 

    values_list = []
    for key in dfg:
        values_list.append([key[0],key[1],dfg[key]/factor,'{:.2f} {}'.format(dfg[key]/factor , unit)])

    for starts in start_activities:
        values_list.append(['Start',starts,start_activities[starts],start_activities[starts]])

    for ends in end_activities:
        values_list.append([ends,'End',end_activities[ends],end_activities[ends]])
    
    
    nodes = [{ 'id': act, 
                'data': { 'label': act,'volume': act+ '['+str(activities_count[act])+ ']' },
             }  for act in activities]
    
    nodes.append({ 'id':'Start', 
#                     'type': 'output',
                    'data': { 'label': 'Start','volume':'Start ['+str(sum(start_activities.values()))+ ']' },     
                 })
    
    nodes.append({'id':'End', 
#                   'type': 'input',
                  'data': { 'label': 'End', 'volume':'End ['+str(sum(end_activities.values()))+ ']' },     
                         
                 })

    edges = [{'source': elements[0], 
               'target': elements[1], 
               'weight': elements[2], 
               'cases': elements[3],
              'label': elements[3],
              'type':'smoothstep',
              'animated': True 
             }  for elements in values_list]
    
    
    return {'dfg':[nodes , edges] , "start":start_activities,"end":end_activities}
            
            

In [None]:
from pm4py.statistics.start_activities.log import get as start_activities_module
from pm4py.statistics.end_activities.log import get as end_activities_module
from pm4py.utils import get_properties


event_log = pm4py.format_dataframe(df, case_id='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp')
event_log = pm4py.convert_to_event_log(event_log)
    
start_activities = start_activities_module.get_start_activities(filtered_event_data, parameters=get_properties(filtered_event_data))
end_activities = end_activities_module.get_end_activities(filtered_event_data, parameters=get_properties(filtered_event_data))

activities_count = pm4py.get_event_attribute_values(filtered_event_data, "concept:name")

if view_type == 'act_cnt':
    variant = pm4py.algo.discovery.dfg.variants.native
elif view_type == 'performance':
    variant = pm4py.algo.discovery.dfg.variants.performance    
elif view_type == 'case_cnt':
    variant = pm4py.algo.discovery.dfg.variants.case_attributes

In [48]:
dfg, start_activities, end_activities = pm4py.discover_dfg(log)

In [50]:
end_activities

{'W_Valideren aanvraag': 2747,
 'W_Wijzigen contractgegevens': 4,
 'A_DECLINED': 3429,
 'W_Completeren aanvraag': 1939,
 'A_CANCELLED': 655,
 'W_Nabellen incomplete dossiers': 452,
 'W_Afhandelen leads': 2234,
 'W_Nabellen offertes': 1290,
 'W_Beoordelen fraude': 57,
 'O_CANCELLED': 279,
 'A_REGISTERED': 1}

In [47]:
pm4py.view_dfg(dfg, start_activities, end_activities)

ExecutableNotFound: failed to execute PosixPath('dot'), make sure the Graphviz executables are on your systems' PATH

In [46]:
dfg

Counter({('A_SUBMITTED', 'A_PARTLYSUBMITTED'): 13087,
         ('A_PARTLYSUBMITTED', 'A_PREACCEPTED'): 4852,
         ('A_PREACCEPTED', 'W_Completeren aanvraag'): 7367,
         ('W_Completeren aanvraag', 'W_Completeren aanvraag'): 38004,
         ('W_Completeren aanvraag', 'A_ACCEPTED'): 5113,
         ('A_ACCEPTED', 'O_SELECTED'): 2879,
         ('O_SELECTED', 'A_FINALIZED'): 2907,
         ('A_FINALIZED', 'O_CREATED'): 2907,
         ('O_CREATED', 'O_SENT'): 7030,
         ('O_SENT', 'W_Nabellen offertes'): 6633,
         ('W_Nabellen offertes', 'W_Completeren aanvraag'): 5018,
         ('W_Completeren aanvraag', 'W_Nabellen offertes'): 5015,
         ('W_Nabellen offertes', 'W_Nabellen offertes'): 36084,
         ('W_Nabellen offertes', 'O_SENT_BACK'): 3254,
         ('O_SENT_BACK', 'W_Valideren aanvraag'): 3454,
         ('W_Valideren aanvraag', 'W_Nabellen offertes'): 3255,
         ('W_Nabellen offertes', 'W_Valideren aanvraag'): 3208,
         ('W_Valideren aanvraag', 'A_REGIST

In [29]:
pm4py.get_start_activities(pd)

{'W_Nabellen offertes': 22,
 'W_Completeren aanvraag': 12,
 'W_Nabellen incomplete dossiers': 12,
 'W_Valideren aanvraag': 7,
 'A_ACCEPTED': 4,
 'W_Beoordelen fraude': 3,
 'A_DECLINED': 2,
 'W_Wijzigen contractgegevens': 2,
 'A_SUBMITTED': 1,
 'O_SELECTED': 1,
 'O_SENT_BACK': 1,
 'A_PREACCEPTED': 1}