### Imports:

In [1]:
import os
import pandas as pd
import pm4py

### Data Loading:

In [2]:
file_dir = '../data/'
file_name = 'Hospital_Billing_Eventlog.xes'
download_url = 'https://data.4tu.nl/file/6af6d5f0-f44c-49be-aac8-8eaa5fe4f6fd/28b83e72-375e-4da4-8459-a8506e898edf'
# Check if file exists and download it if not

# Check if directory exists
if not os.path.exists(file_dir):
    os.system("mkdir " + file_dir)
if not os.path.exists(file_dir + file_name):
    print('Downloading file...')
    os.system(f'wget -O {file_dir + file_name + ".gz"} {download_url}')
    os.system(f'gzip -d {file_dir + file_name + ".gz"}')


In [3]:
data = pm4py.read_xes(file_dir + file_name)
event_log = pm4py.convert_to_event_log(data)

  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 100000/100000 [00:11<00:00, 8915.86it/s]


In [4]:
# Get overview over data
data

Unnamed: 0,isCancelled,diagnosis,time:timestamp,caseType,speciality,org:resource,concept:name,blocked,isClosed,flagD,...,lifecycle:transition,case:concept:name,closeCode,actRed,actOrange,flagC,msgCount,version,msgType,msgCode
0,False,A,2012-12-16 19:33:10+00:00,A,A,ResA,NEW,False,True,True,...,complete,A,,,,,,,,
1,,,2013-12-15 19:00:37+00:00,,,,FIN,,,,...,complete,A,A,,,,,,,
2,,,2013-12-16 03:53:38+00:00,,,,RELEASE,,,,...,complete,A,,,,,,,,
3,,,2013-12-17 12:56:29+00:00,,,,CODE OK,,,,...,complete,A,,False,False,False,0.0,A,,
4,,,2013-12-19 03:44:31+00:00,,,ResB,BILLED,,,,...,complete,A,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451354,False,OM,2015-12-13 19:31:23+00:00,A,E,ResA,NEW,False,False,False,...,complete,AXQE,,,,,,,,
451355,False,,2016-01-14 21:17:47+00:00,B,L,ResDJ,NEW,False,False,False,...,complete,BXQE,,,,,,,,
451356,True,,2016-01-14 22:00:13+00:00,,,ResFR,DELETE,,,,...,complete,BXQE,,,,,,,,
451357,False,LL,2016-01-11 19:40:47+00:00,A,D,ResA,NEW,False,False,False,...,complete,CXQE,,,,,,,,


### Preprocessing and initial analysis:

Get some useful information first:

In [5]:
start_activities = pm4py.get_start_activities(event_log)
end_activities = pm4py.get_end_activities(event_log)
activities = pm4py.get_event_attribute_values(event_log, "concept:name")
event_attributes = pm4py.get_event_attributes(event_log)
trace_attributes = pm4py.get_trace_attributes(event_log)

print("====================================")
print("Start Activities")
print(start_activities)
print("\nEnd Activities")
print(end_activities)
print("\nActivities")
print(activities)
print("\nEvent Attributes")
print(event_attributes)
print("\nTrace Attributes")
print(trace_attributes)
print("====================================")

Start Activities
{'NEW': 100000}

End Activities
{'BILLED': 63498, 'DELETE': 8215, 'NEW': 22407, 'FIN': 3611, 'SET STATUS': 600, 'CODE OK': 948, 'MANUAL': 85, 'JOIN-PAT': 47, 'CHANGE DIAGN': 19, 'RELEASE': 107, 'CODE NOK': 14, 'EMPTY': 444, 'REJECT': 4, 'STORNO': 1}

Activities
{'NEW': 101289, 'FIN': 74738, 'RELEASE': 70926, 'CODE OK': 68006, 'BILLED': 67448, 'DELETE': 8225, 'REOPEN': 4669, 'CHANGE DIAGN': 45451, 'STORNO': 2973, 'REJECT': 2016, 'CODE NOK': 3620, 'SET STATUS': 705, 'CHANGE END': 38, 'MANUAL': 372, 'JOIN-PAT': 358, 'CODE ERROR': 75, 'ZDBC_BEHAN': 1, 'EMPTY': 449}

Event Attributes
['msgType', 'flagC', 'actOrange', 'state', 'msgCount', 'diagnosis', 'speciality', 'version', 'closeCode', 'concept:name', 'blocked', 'actRed', 'flagB', 'flagA', 'msgCode', 'isCancelled', 'isClosed', 'flagD', 'time:timestamp', 'org:resource', 'caseType']

Trace Attributes
[]


Out of the attributes, 'diagnosis', 'isCancelled' and 'isClosed' seem useful for consideration. We keep them in the dataset for now.

In [6]:
# Get traces
traces = data.groupby('case:concept:name')['concept:name'].apply(tuple).reset_index()
diagnosis = data[['case:concept:name', 'diagnosis']].groupby('case:concept:name')['diagnosis'].apply(lambda x: tuple(set(x.dropna()))).reset_index()
cancelled = data[['case:concept:name', 'isCancelled']].groupby('case:concept:name')['isCancelled'].apply(lambda x: tuple(set(x.dropna()))).reset_index()
closed = data[['case:concept:name', 'isClosed']].groupby('case:concept:name')['isClosed'].apply(lambda x: tuple(set(x.dropna()))).reset_index()

# Merge traces with cancelled and closed
traces = pd.merge(traces, diagnosis, on='case:concept:name')
traces = pd.merge(traces, cancelled, on='case:concept:name')
traces = pd.merge(traces, closed, on='case:concept:name')
traces.rename(columns={'case:concept:name': 'case_id', 'concept:name': 'trace'}, inplace=True)


In [7]:
traces

Unnamed: 0,case_id,trace,diagnosis,isCancelled,isClosed
0,A,"(NEW, FIN, RELEASE, CODE OK, BILLED)","(A,)","(False,)","(True,)"
1,AA,"(NEW, CHANGE DIAGN, FIN, RELEASE, CODE OK, BIL...","(CA,)","(False,)","(True,)"
2,AAA,"(NEW,)",(),"(False,)","(True,)"
3,AAAA,"(NEW, CHANGE DIAGN, FIN, RELEASE, CODE OK, BIL...","(JA,)","(False,)","(True,)"
4,AAAB,"(NEW, CHANGE DIAGN, FIN, RELEASE, CODE OK, BIL...","(Z,)","(False,)","(True,)"
...,...,...,...,...,...
99995,ZZZ,"(NEW, CHANGE DIAGN, FIN, RELEASE, CODE OK, BIL...","(EY,)","(False,)","(True,)"
99996,ZZZA,"(NEW, DELETE)",(),"(False, True)","(False,)"
99997,ZZZB,"(NEW, CHANGE DIAGN, DELETE)","(BC,)","(False, True)","(False,)"
99998,ZZZC,"(NEW,)",(),"(False,)","(True,)"


In [8]:
# Save as new, smaller csv
traces.to_csv('../output/preprocessed_data.csv', sep=';', index=False)

We might filter out outliers, that is traces that only show up once or twice. We can do this later but keep the information for now in a dict.

In [8]:
traces_dict = traces['trace'].value_counts().to_dict()
traces_dict

{('NEW', 'CHANGE DIAGN', 'FIN', 'RELEASE', 'CODE OK', 'BILLED'): 33673,
 ('NEW',): 22373,
 ('NEW', 'FIN', 'RELEASE', 'CODE OK', 'BILLED'): 20902,
 ('NEW', 'DELETE'): 4813,
 ('NEW', 'FIN'): 3508,
 ('NEW',
  'CHANGE DIAGN',
  'CHANGE DIAGN',
  'FIN',
  'RELEASE',
  'CODE OK',
  'BILLED'): 2121,
 ('NEW', 'CHANGE DIAGN', 'DELETE'): 1555,
 ('NEW', 'FIN', 'RELEASE', 'CODE NOK', 'BILLED'): 977,
 ('NEW',
  'CHANGE DIAGN',
  'FIN',
  'RELEASE',
  'CODE OK',
  'REOPEN',
  'FIN',
  'RELEASE',
  'CODE OK',
  'BILLED'): 869,
 ('NEW', 'FIN', 'RELEASE', 'CODE OK', 'BILLED', 'CODE OK'): 512,
 ('NEW', 'FIN', 'RELEASE', 'CODE NOK', 'EMPTY'): 400,
 ('NEW', 'FIN', 'RELEASE', 'BILLED'): 359,
 ('NEW',
  'CHANGE DIAGN',
  'FIN',
  'RELEASE',
  'CODE OK',
  'BILLED',
  'STORNO',
  'REJECT',
  'BILLED'): 314,
 ('NEW',
  'FIN',
  'RELEASE',
  'CODE OK',
  'REOPEN',
  'FIN',
  'RELEASE',
  'CODE OK',
  'BILLED'): 273,
 ('NEW',
  'CHANGE DIAGN',
  'FIN',
  'RELEASE',
  'CODE OK',
  'BILLED',
  'STORNO',
  'REJECT

In [9]:
# Get all traces where length of diagnosis is greater than 1
diag = traces[traces['diagnosis'].apply(lambda x: len(x) > 1)]

# Is there any trace that does not include 'CHANGE DIAGN'?
diag[diag['trace'].apply(lambda x: 'CHANGE DIAGN' not in x)]

Unnamed: 0,case_id,trace,diagnosis,isCancelled,isClosed
26,AAE,"(NEW, FIN, RELEASE, REOPEN, FIN, RELEASE, CODE...","(MN, Z)","(False,)","(True,)"
1968,ANH,"(NEW, NEW, NEW, DELETE)","(MN, Z, DJ)","(False, True)","(False,)"
2053,ANW,"(NEW, FIN, RELEASE, CODE OK, REOPEN, FIN, RELE...","(ZP, EM)","(False,)","(True,)"
2264,APH,"(NEW, NEW, NEW, NEW, NEW, NEW, NEW, NEW, NEW, ...","(RE, Z, J, MN, NC, TJ, NE, RB, TB, AJ, CM, TQ,...","(False,)","(True,)"
2436,AQL,"(NEW, NEW, NEW, NEW, NEW, NEW, NEW, NEW, NEW, ...","(RE, Z, J, MN, NC, TJ, NE, RB, TB, AJ, CM, TQ,...","(False,)","(True,)"
...,...,...,...,...,...
97974,ZMH,"(NEW, NEW, NEW, NEW, DELETE)","(T, UB, TQ, X)","(False, True)","(False,)"
98307,ZONA,"(NEW, FIN, RELEASE, BILLED, STORNO, REJECT, BI...","(K, RE)","(False,)","(True,)"
98862,ZSH,"(NEW, NEW, FIN, DELETE)","(M, L)","(False, True)","(True,)"
98864,ZSHB,"(NEW, FIN, RELEASE, CODE OK, MANUAL)","(DT, KT)","(False,)","(True,)"


In [10]:
# Get all traces where isCancelled includes True
cancelled = traces[traces['isCancelled'].apply(lambda x: True in x)]
cancelled.groupby('isCancelled')['trace'].value_counts()

# What is the last activity in the traces where isCancelled includes True?
copy = cancelled[['case_id', 'trace', 'isCancelled']].copy()
copy['last_activity'] = cancelled['trace'].apply(lambda x: x[-1])
copy.groupby('last_activity')['isCancelled'].value_counts()

last_activity  isCancelled  
DELETE         (False, True)    8215
JOIN-PAT       (False, True)       6
MANUAL         (False, True)       1
Name: count, dtype: int64

In [11]:
# Get all traces where isCancelled includes False
not_cancelled = traces[traces['isCancelled'].apply(lambda x: False in x)]
not_cancelled.groupby('isCancelled')['trace'].value_counts()

# What is the last activity in the traces where isCancelled includes False?
copy = not_cancelled[['case_id', 'trace', 'isCancelled']].copy()
copy['last_activity'] = not_cancelled['trace'].apply(lambda x: x[-1])
copy.groupby('last_activity')['isCancelled'].value_counts()

last_activity  isCancelled  
BILLED         (False,)         63498
CHANGE DIAGN   (False,)            19
CODE NOK       (False,)            14
CODE OK        (False,)           948
DELETE         (False, True)     8215
EMPTY          (False,)           444
FIN            (False,)          3611
JOIN-PAT       (False,)            41
               (False, True)        6
MANUAL         (False,)            84
               (False, True)        1
NEW            (False,)         22407
REJECT         (False,)             4
RELEASE        (False,)           107
SET STATUS     (False,)           600
STORNO         (False,)             1
Name: count, dtype: int64

In [12]:
# Get all traces where isClosed includes True
closed = traces[traces['isClosed'].apply(lambda x: True in x)]
closed.groupby('isClosed')['trace'].value_counts()

# What is the last activity in the traces where isClosed includes True?
copy = closed[['case_id', 'trace', 'isClosed']].copy()
copy['last_activity'] = closed['trace'].apply(lambda x: x[-1])
copy.groupby('last_activity')['isClosed'].value_counts()

last_activity  isClosed     
BILLED         (True,)          63466
               (False, True)       32
CODE NOK       (True,)             14
CODE OK        (True,)            948
DELETE         (True,)            803
               (False, True)        1
EMPTY          (True,)            444
FIN            (True,)           3611
JOIN-PAT       (True,)             42
MANUAL         (True,)             83
NEW            (True,)          22311
REJECT         (True,)              4
RELEASE        (True,)            107
SET STATUS     (True,)            582
               (False, True)        1
STORNO         (True,)              1
Name: count, dtype: int64

In [13]:
# Get all traces where isClosed includes False
not_closed = traces[traces['isClosed'].apply(lambda x: False in x)]
not_closed.groupby('isClosed')['trace'].value_counts()

# What is the last activity in the traces where isClosed includes False?
copy = not_closed[['case_id', 'trace', 'isClosed']].copy()
copy['last_activity'] = not_closed['trace'].apply(lambda x: x[-1])
copy.groupby('last_activity')['isClosed'].value_counts()

last_activity  isClosed     
BILLED         (False, True)      32
CHANGE DIAGN   (False,)           19
DELETE         (False,)         7411
               (False, True)       1
JOIN-PAT       (False,)            5
MANUAL         (False,)            2
NEW            (False,)           96
SET STATUS     (False,)           17
               (False, True)       1
Name: count, dtype: int64

In [14]:
# Get the number of unique diagnoses
traces.groupby('diagnosis')['trace'].value_counts()

diagnosis  trace                                                             
()         (NEW,)                                                                21833
           (NEW, DELETE)                                                          1900
           (NEW, NEW, JOIN-PAT, JOIN-PAT, JOIN-PAT, JOIN-PAT)                       27
           (NEW, SET STATUS)                                                         5
           (NEW, NEW, JOIN-PAT, JOIN-PAT, JOIN-PAT, JOIN-PAT, DELETE, DELETE)        2
                                                                                 ...  
(ZY,)      (NEW, DELETE)                                                             1
           (NEW, FIN, RELEASE, CODE OK, BILLED)                                      1
(ZZ,)      (NEW, CHANGE DIAGN, FIN, RELEASE, CODE OK, BILLED)                        3
           (NEW, FIN, RELEASE, CODE OK, BILLED)                                      1
(ZZ, HN)   (NEW, CHANGE DIAGN, CHANGE DIAGN, FIN, RE

#### Build profiles based on 
- ~~Transition~~ _(too tedious)_
- ~~Case Attributes~~ _(not available)_
- Event attributes
- Performance

See [1] for more details.

[1] Song, Minseok & Günther, Christian & Aalst, Wil. (2008). Trace Clustering in Process Mining. Lecture Notes in Business Information Processing. 17. 109-120. 10.1007/978-3-642-00328-8_11. 

<font color='red'>This notebook is abandoned from here</font>

Event attributes:

Performance: