In [104]:
import pandas as pd
import pm4py
from pm4py.algo.filtering.pandas.attributes import attributes_filter
import os

##### All data sets can be found here: http://www.processmining.org/event-data.html

##### "Detail incident activity" data set

In [105]:
# Load the raw "Detail incident activity" events from the local logs/ folder
csv_path = os.path.join("logs", "detail_incident_activity.csv")
df = pd.read_csv(csv_path)
df.head(10)

Unnamed: 0,Incident ID,DateStamp,IncidentActivity_Number,IncidentActivity_Type,Assignment Group,KM number,Interaction ID
0,IM0000004,7/1/2013 8:17,001A3689763,Reassignment,TEAM0001,KM0000553,SD0000007
1,IM0000004,4/11/2013 13:41,001A5852941,Reassignment,TEAM0002,KM0000553,SD0000007
2,IM0000004,4/11/2013 13:41,001A5852943,Update from customer,TEAM0002,KM0000553,SD0000007
3,IM0000004,4/11/2013 12:09,001A5849980,Operator Update,TEAM0003,KM0000553,SD0000007
4,IM0000004,4/11/2013 12:09,001A5849979,Assignment,TEAM0003,KM0000553,SD0000007
5,IM0000004,4/11/2013 13:41,001A5852942,Assignment,TEAM0002,KM0000553,SD0000007
6,IM0000004,4/11/2013 13:51,001A5852172,Closed,TEAM0003,KM0000553,SD0000007
7,IM0000004,4/11/2013 13:51,001A5852173,Caused By CI,TEAM0003,KM0000553,SD0000007
8,IM0000004,4/11/2013 12:09,001A5849978,Reassignment,TEAM0003,KM0000553,SD0000007
9,IM0000004,25-09-2013 08:27:40,001A5544096,Operator Update,TEAM0003,KM0000553,SD0000007


In [106]:
# Map the raw columns onto the standard pm4py column names
# (case:concept:name, concept:name, time:timestamp).
# NOTE(review): DateStamp mixes formats ("7/1/2013 8:17" vs "25-09-2013 08:27:40") and the
# ambiguous values come out month-first (7/1/2013 -> 2013-07-01) — confirm whether the
# source data is actually day-first before trusting the event order.
df = pm4py.format_dataframe(
    df,
    case_id="Incident ID",
    activity_key="IncidentActivity_Type",
    timestamp_key="DateStamp",
)

# org:resource = who performed a given activity. The original author notes that without
# this column a KeyError: "['org:resource'] not in index" is raised.
# The raw columns (and pm4py's helper '@@index') are no longer needed once the
# standard columns exist, so they are dropped in the same chain.
redundant_columns = [
    'Incident ID',
    'DateStamp',
    'IncidentActivity_Number',
    'IncidentActivity_Type',
    'KM number',
    'Interaction ID',
    '@@index',
]
df = df.rename(columns={'Assignment Group': 'org:resource'}).drop(columns=redundant_columns)

df.head(10)

Unnamed: 0,org:resource,case:concept:name,concept:name,time:timestamp
0,TEAM0003,IM0000004,Operator Update,2013-04-11 12:09:00+00:00
1,TEAM0003,IM0000004,Assignment,2013-04-11 12:09:00+00:00
2,TEAM0003,IM0000004,Reassignment,2013-04-11 12:09:00+00:00
3,TEAM0002,IM0000004,Reassignment,2013-04-11 13:41:00+00:00
4,TEAM0002,IM0000004,Update from customer,2013-04-11 13:41:00+00:00
5,TEAM0002,IM0000004,Assignment,2013-04-11 13:41:00+00:00
6,TEAM0003,IM0000004,Closed,2013-04-11 13:51:00+00:00
7,TEAM0003,IM0000004,Caused By CI,2013-04-11 13:51:00+00:00
8,TEAM0001,IM0000004,Reassignment,2013-07-01 08:17:00+00:00
9,TEAM0003,IM0000004,Operator Update,2013-09-25 08:27:40+00:00


In [107]:
# Convert the formatted dataframe into a pm4py event log;
# the filtering examples below take `logs` as their input.
logs = pm4py.convert_to_event_log(df)

#### Filtering examples
##### Based on documentation : https://pm4py.fit.fraunhofer.de/documentation#filtering

##### 1. Filtering on timeframe

In [108]:
# Keep the cases whose events all happened between 2014-03-31 00:00:00 and 2014-03-31 10:00:00
# (mode "traces_contained"; see the pm4py filtering documentation for the other modes).
filtered_log = pm4py.filter_time_range(
    logs,
    "2014-03-31 00:00:00",
    "2014-03-31 10:00:00",
    mode="traces_contained",
)
filtered_log_df = pm4py.convert_to_dataframe(filtered_log)
filtered_log_df.head(10)

Unnamed: 0,org:resource,concept:name,time:timestamp,case:concept:name
0,TEAM0008,Open,2014-03-31 08:06:10+00:00,IM0046838
1,TEAM0176,Update,2014-03-31 08:57:52+00:00,IM0046838
2,TEAM0176,Status Change,2014-03-31 08:57:52+00:00,IM0046838
3,TEAM0176,Assignment,2014-03-31 08:57:52+00:00,IM0046838
4,TEAM0176,Resolved,2014-03-31 09:55:42+00:00,IM0046838
5,TEAM0176,Closed,2014-03-31 09:55:49+00:00,IM0046838
6,TEAM0008,Open,2014-03-31 08:19:13+00:00,IM0046839
7,TEAM0190,Assignment,2014-03-31 08:50:45+00:00,IM0046839
8,TEAM0190,Closed,2014-03-31 09:41:44+00:00,IM0046839
9,TEAM0008,Open,2014-03-31 08:19:22+00:00,IM0046840


##### 2. Filter on case performance

In [109]:
# Keep the cases whose total duration is at most one hour (bounds are given in seconds)
ONE_HOUR_SECONDS = 3600
filtered_log = pm4py.filter_case_performance(logs, 0, ONE_HOUR_SECONDS)
filtered_log_df = pm4py.convert_to_dataframe(filtered_log)
filtered_log_df.head(10)

Unnamed: 0,org:resource,concept:name,time:timestamp,case:concept:name
0,TEAM0170,Closed,2014-01-13 15:13:13+00:00,IM0000006
1,TEAM0170,Caused By CI,2014-01-13 15:13:13+00:00,IM0000006
2,TEAM0006,Closed,2013-12-12 10:24:00+00:00,IM0000023
3,TEAM0006,Caused By CI,2013-12-12 10:24:00+00:00,IM0000023
4,TEAM0008,Closed,2013-11-30 10:39:10+00:00,IM0000033
5,TEAM0018,Open,2013-09-18 15:55:10+00:00,IM0000567
6,TEAM0018,Closed,2013-09-18 15:56:15+00:00,IM0000567
7,TEAM0018,Caused By CI,2013-09-18 15:56:16+00:00,IM0000567
8,TEAM0008,Open,2013-01-10 08:26:00+00:00,IM0001227
9,TEAM9999,Operator Update,2013-01-10 08:28:00+00:00,IM0001227


In [110]:
# Sanity check: measure the duration of one filtered case (IM0000567)
# as the gap between its earliest and latest event.
sample_event = filtered_log_df[
    filtered_log_df["case:concept:name"] == "IM0000567"
].sort_values("time:timestamp")
timestamps = sample_event["time:timestamp"]
start, end = timestamps.iloc[0], timestamps.iloc[-1]
duration = end - start
duration
# OK: well below the one-hour limit

Timedelta('0 days 00:01:06')

##### 3. Filter on start activities
We can list all the cases that start with a given activity.

In [111]:
# Keep the cases whose first activity is "Update" and view them as a dataframe
start_activities = pm4py.convert_to_dataframe(
    pm4py.filter_start_activities(logs, ["Update"])
)
start_activities.head(18)
# Note that case IM0000015 really does start with the activity "Update"

Unnamed: 0,org:resource,concept:name,time:timestamp,case:concept:name
0,TEAM0003,Update,2013-01-05 13:58:00+00:00,IM0000015
1,TEAM0003,Update,2013-01-05 14:06:00+00:00,IM0000015
2,TEAM0002,Update from customer,2013-02-13 12:31:12+00:00,IM0000015
3,TEAM0002,Update from customer,2013-02-14 11:07:09+00:00,IM0000015
4,TEAM0002,Assignment,2013-02-14 11:07:09+00:00,IM0000015
5,TEAM0002,Reassignment,2013-02-14 11:07:09+00:00,IM0000015
6,TEAM0003,Assignment,2013-02-14 12:05:15+00:00,IM0000015
7,TEAM0002,Update from customer,2013-08-11 14:06:00+00:00,IM0000015
8,TEAM0002,Assignment,2013-08-11 14:06:00+00:00,IM0000015
9,TEAM0002,Reassignment,2013-08-11 14:06:00+00:00,IM0000015


In [112]:
# One row per distinct case ID among the cases that start with "Update"
events_starting_with_update_activity = (
    start_activities["case:concept:name"]
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame()
)
events_starting_with_update_activity.head(10)

Unnamed: 0,case:concept:name
0,IM0000015
1,IM0000020
2,IM0000049
3,IM0000092
4,IM0000109
5,IM0000128
6,IM0000133
7,IM0000176
8,IM0000181
9,IM0000195


In [113]:
# 534 cases start with the "Update" activity
# (the frame holds one row per case ID, so this counts cases, not individual events)
events_starting_with_update_activity.count()

case:concept:name    534
dtype: int64

##### 4. Filter on end activities
We can list all the cases that end with a given activity.

In [114]:
# Keep the cases whose last activity is "Assignment" and view them as a dataframe
end_activities = pm4py.convert_to_dataframe(
    pm4py.filter_end_activities(logs, ["Assignment"])
)
end_activities.tail(10)
# Note that case IM0045367 really does end with the activity "Assignment"

Unnamed: 0,org:resource,concept:name,time:timestamp,case:concept:name
25976,TEAM0008,Vendor Reference,2014-03-21 11:18:33+00:00,IM0045173
25977,TEAM0008,Assignment,2014-03-21 11:18:33+00:00,IM0045173
25978,TEAM0008,Open,2014-03-21 15:41:04+00:00,IM0045367
25979,TEAM0008,Status Change,2014-03-21 15:41:46+00:00,IM0045367
25980,TEAM0008,External Vendor Assignment,2014-03-21 15:41:46+00:00,IM0045367
25981,TEAM0008,Pending vendor,2014-03-21 15:41:46+00:00,IM0045367
25982,TEAM0008,Vendor Reference,2014-03-24 08:06:14+00:00,IM0045367
25983,TEAM0008,Communication with vendor,2014-03-25 10:49:39+00:00,IM0045367
25984,TEAM0008,Communication with vendor,2014-03-26 10:56:21+00:00,IM0045367
25985,TEAM0008,Assignment,2014-03-26 10:56:21+00:00,IM0045367


In [115]:
# One row per distinct case ID among the cases that end with "Assignment"
cases_ending_with_assignment_activity = (
    end_activities[["case:concept:name"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
cases_ending_with_assignment_activity.head(10)

Unnamed: 0,case:concept:name
0,IM0000042
1,IM0000060
2,IM0000067
3,IM0000068
4,IM0000084
5,IM0000111
6,IM0000114
7,IM0000142
8,IM0000183
9,IM0000187


In [116]:
# 1312 cases end with the "Assignment" activity
# (the frame holds one row per case ID, so this counts cases, not individual events)
cases_ending_with_assignment_activity.count()

case:concept:name    1312
dtype: int64

##### 5. Filter on variants
A variant is a distinct sequence of activities followed by one or more cases. We can analyze all the variants that occur in the log.

In [117]:
# Group the cases of the log by variant and pick the first variant as an example
variants_dict = pm4py.get_variants(logs)
variants_list = [*variants_dict]  # all possible variants
sample_variant = variants_list[0]
sample_variant



'Operator Update,Assignment,Reassignment,Reassignment,Update from customer,Assignment,Closed,Caused By CI,Reassignment,Operator Update'

In [118]:
# Look up the cases that follow the sample variant picked in the previous cell
# ('Operator Update,Assignment,Reassignment,...,Reassignment,Operator Update').
# The lookup uses `sample_variant` rather than a pasted copy of the key, so the cell
# keeps working if the first variant (or the way pm4py represents variant keys) changes.
cases = variants_dict[sample_variant]
cases

# As shown below, this variant occurs in the case IM0000004

[{'attributes': {'concept:name': 'IM0000004'}, 'events': [{'org:resource': 'TEAM0003', 'concept:name': 'Operator Update', 'time:timestamp': Timestamp('2013-04-11 12:09:00+0000', tz='UTC')}, '..', {'org:resource': 'TEAM0003', 'concept:name': 'Operator Update', 'time:timestamp': Timestamp('2013-09-25 08:27:40+0000', tz='UTC')}]}]

##### 6. Filter on attributes values

In [119]:
# Look a case up by an attribute value: keep (retain=True) only the events
# whose "case:concept:name" is IM0000004, then view them as a dataframe.
tracefilter_log_pos = pm4py.convert_to_dataframe(
    pm4py.filter_event_attribute_values(
        logs,
        "case:concept:name",
        ["IM0000004"],
        level="event",
        retain=True,
    )
)
tracefilter_log_pos

Unnamed: 0,org:resource,concept:name,time:timestamp,case:concept:name
0,TEAM0003,Operator Update,2013-04-11 12:09:00+00:00,IM0000004
1,TEAM0003,Assignment,2013-04-11 12:09:00+00:00,IM0000004
2,TEAM0003,Reassignment,2013-04-11 12:09:00+00:00,IM0000004
3,TEAM0002,Reassignment,2013-04-11 13:41:00+00:00,IM0000004
4,TEAM0002,Update from customer,2013-04-11 13:41:00+00:00,IM0000004
5,TEAM0002,Assignment,2013-04-11 13:41:00+00:00,IM0000004
6,TEAM0003,Closed,2013-04-11 13:51:00+00:00,IM0000004
7,TEAM0003,Caused By CI,2013-04-11 13:51:00+00:00,IM0000004
8,TEAM0001,Reassignment,2013-07-01 08:17:00+00:00,IM0000004
9,TEAM0003,Operator Update,2013-09-25 08:27:40+00:00,IM0000004


In [120]:
# Columns available in the filtered dataframe
tracefilter_log_pos.columns

Index(['org:resource', 'concept:name', 'time:timestamp', 'case:concept:name'], dtype='object')

##### 7. Filter on numeric attribute values

In [121]:
# Keep all cases with a case ID between IM0000004 and IM0000006.
# NOTE(review): the bounds are strings, so this presumably relies on string ordering of the
# case IDs rather than on a truly numeric attribute — confirm this is the intended usage.
case_id_key = {attributes_filter.Parameters.ATTRIBUTE_KEY: "case:concept:name"}
filtered_df_cases = attributes_filter.apply_numeric(
    df,
    "IM0000004",
    "IM0000006",
    parameters=case_id_key,
)
filtered_df_cases

Unnamed: 0,org:resource,case:concept:name,concept:name,time:timestamp
0,TEAM0003,IM0000004,Operator Update,2013-04-11 12:09:00+00:00
1,TEAM0003,IM0000004,Assignment,2013-04-11 12:09:00+00:00
2,TEAM0003,IM0000004,Reassignment,2013-04-11 12:09:00+00:00
3,TEAM0002,IM0000004,Reassignment,2013-04-11 13:41:00+00:00
4,TEAM0002,IM0000004,Update from customer,2013-04-11 13:41:00+00:00
5,TEAM0002,IM0000004,Assignment,2013-04-11 13:41:00+00:00
6,TEAM0003,IM0000004,Closed,2013-04-11 13:51:00+00:00
7,TEAM0003,IM0000004,Caused By CI,2013-04-11 13:51:00+00:00
8,TEAM0001,IM0000004,Reassignment,2013-07-01 08:17:00+00:00
9,TEAM0003,IM0000004,Operator Update,2013-09-25 08:27:40+00:00


##### 8. Between Filter

In [122]:
# Extract all the sub-cases that run from a source activity to a target activity
# (here both are "Operator Update") and view them as a dataframe.
filtered_log = pm4py.convert_to_dataframe(
    pm4py.filter_between(logs, "Operator Update", "Operator Update")
)
filtered_log.tail(10)

# The sub-cases of IM0047010 and IM0047045 both start and end with "Operator Update", so it works

Unnamed: 0,org:resource,concept:name,time:timestamp,case:concept:name
100103,TEAM0176,Operator Update,2014-03-31 14:05:55+00:00,IM0047010
100104,TEAM0176,Reassignment,2014-03-31 14:05:55+00:00,IM0047010
100105,TEAM0176,Assignment,2014-03-31 14:05:55+00:00,IM0047010
100106,TEAM0070,Assignment,2014-03-31 14:45:20+00:00,IM0047010
100107,TEAM0070,Operator Update,2014-03-31 14:45:20+00:00,IM0047010
100108,TEAM0173,Operator Update,2014-03-31 14:38:08+00:00,IM0047045
100109,TEAM0173,Status Change,2014-03-31 15:08:25+00:00,IM0047045
100110,TEAM0173,Assignment,2014-03-31 15:08:25+00:00,IM0047045
100111,TEAM0173,Reassignment,2014-03-31 15:08:25+00:00,IM0047045
100112,TEAM0176,Operator Update,2014-03-31 15:52:29+00:00,IM0047045


##### 9. Case Size Filter
We can search for cases according to their number of activities (events).

In [123]:
# Keep the cases that consist of at most two events and view them as a dataframe
small_cases = pm4py.convert_to_dataframe(
    pm4py.filter_case_size(logs, 0, 2)
)
small_cases.head(2)
# Case IM0000006 has only two activities (Closed, Caused By CI)

Unnamed: 0,org:resource,concept:name,time:timestamp,case:concept:name
0,TEAM0170,Closed,2014-01-13 15:13:13+00:00,IM0000006
1,TEAM0170,Caused By CI,2014-01-13 15:13:13+00:00,IM0000006


In [124]:
# Inspect the other end of the filtered frame as a second spot check
small_cases.tail(2)
# Case IM0042372 also has only two activities (Open, Closed)

Unnamed: 0,org:resource,concept:name,time:timestamp,case:concept:name
69,TEAM0008,Open,2014-10-03 13:10:00+00:00,IM0042372
70,TEAM0199,Closed,2014-10-03 13:36:00+00:00,IM0042372


##### 10. Rework Filter
All the cases in which a given activity is repeated at least a given number of times

In [125]:
# All the cases having at least 6 occurrences of the activity "Update"
filtered_log = pm4py.convert_to_dataframe(
    pm4py.filter_activities_rework(logs, "Update", 6)
)
filtered_log.tail(12)

Unnamed: 0,org:resource,concept:name,time:timestamp,case:concept:name
47393,TEAM0008,Open,2014-03-31 14:14:13+00:00,IM0046994
47394,TEAM0181,Status Change,2014-03-31 16:02:56+00:00,IM0046994
47395,TEAM0181,Update,2014-03-31 16:02:56+00:00,IM0046994
47396,TEAM0181,Assignment,2014-03-31 16:02:56+00:00,IM0046994
47397,TEAM0181,Update,2014-03-31 16:03:16+00:00,IM0046994
47398,TEAM0181,Update,2014-03-31 16:50:57+00:00,IM0046994
47399,TEAM0181,Update,2014-03-31 17:00:33+00:00,IM0046994
47400,TEAM0181,Update,2014-03-31 17:52:59+00:00,IM0046994
47401,TEAM0181,Update,2014-03-31 18:17:38+00:00,IM0046994
47402,TEAM0181,Update,2014-03-31 18:19:12+00:00,IM0046994


##### 11. Paths Performance Filter
All the cases in which a given path between two activities takes a duration within a specified range

In [126]:
# Keep the cases in which the path from "Open" to "Closed" takes between 0 and 120
# (presumably seconds, i.e. at most two minutes, as the variable name suggests).
two_minutes_envents = pm4py.convert_to_dataframe(
    pm4py.filter_paths_performance(logs, ("Open", "Closed"), 0, 120)
)
two_minutes_envents.tail(3)

Unnamed: 0,org:resource,concept:name,time:timestamp,case:concept:name
764,TEAM0008,Open,2014-03-28 16:34:55+00:00,IM0046800
765,TEAM0191,Closed,2014-03-28 16:36:51+00:00,IM0046800
766,TEAM0191,Caused By CI,2014-03-28 16:36:51+00:00,IM0046800
