In [1]:
# ---------------------------------------------------------------------------
# Dataset configuration. Exactly one of the dataset sections below should be
# active; the others are kept commented out as ready-made alternatives.
# ---------------------------------------------------------------------------

## BPIC2012 (active)
caseid = 'Case ID'                           # column used to group events into cases
activity = 'Activity'                        # column holding the activity name of each event
ts = 'Complete Timestamp'                    # column used to order the events inside a case
label = {'Activity' : 'O_ACCEPTED-COMPLETE'} # outcome definition consumed by add_label()
# presumably the extra feature columns used by encoding code outside this view -- TODO confirm
other_features = ['Resource', 'Variant index', '(case) AMOUNT_REQ']

## BPIC2017
# caseid = 'Case ID'
# activity = 'Activity'
# ts = 'Complete Timestamp'
# label = {'column' : 'Accepted'}
# other_features = ['Resource', 'CreditScore', 'FirstWithdrawalAmount', 'MonthlyCost', 'NumberOfTerms','OfferedAmount']

## BPIC2015
# caseid = 'Case ID'
# activity = 'Activity'
# ts = 'Complete Timestamp'
# label = {'column' : 'Label'}
# other_features = ['Resource', 'monitoringResource', '(case) Includes_subCases','(case) Responsible_actor','(case) caseProcedure','(case) caseStatus','(case) last_phase','(case) parts',
#                 '(case) requestComplete','(case) termName', '(case) SUMleges']

# Keys of `options` listed here; presumably the dimensions that the search
# combines -- TODO confirm against the code that consumes `combi`.
combi = ['bucketing', 'encoding', 'drop_act', 'params']

# Search space. NOTE(review): tuples look like (low, high) numeric ranges and
# lists like discrete choices -- confirm against the sampler that reads them.
options = {
    'bucketing' : (1,40), # a number of partitions

    'encoding' : ['index', 'aggregate'],

    'drop_act' : [2,4,6,8], # a number of activities to drop

    'models' : ['Decision Tree','Random Forest','LightGBM','Xgboost'],

    'params' : {'Decision Tree':{'max_depth': (2,20),
                           'min_samples_leaf': (5,100),
                           'criterion': ["gini", "entropy"]
            },
            'Random Forest':{"n_estimators": (10,1000),
                           "max_depth": (2,20),
                           # "auto" was deprecated in scikit-learn 1.1 and removed in 1.3;
                           # for classifiers it meant "sqrt", which every version accepts.
                           "max_features": ["sqrt", "log2"],
                           "bootstrap": [True, False],
                           "criterion": ["gini", "entropy"]
            },
            'LightGBM':{'max_depth': (2,20),
                      'num_leaves' : (10,500),
                      'min_child_samples' : (2,10)
            },
            'Xgboost':{"max_depth": (2,20),
                     "n_estimators": (10,1000),
                     "learning_rate": [0.01, 0.05, 0.1]

            }
            }
}

In [4]:
import os
import pickle

import pandas as pd
from tqdm import tqdm

def add_label(df):
    """Attach the case-outcome column ``Label`` to an event log.

    The behaviour is selected by the module-level ``label`` mapping:

    * ``{'Activity': name}`` -- every event of a case receives ``1`` when at
      least one event of that case has ``name`` in the ``activity`` column,
      and ``0`` otherwise.
    * ``{'column': name}`` -- the existing column ``name`` is renamed to
      ``Label``.
    * any other mapping -- the frame is returned as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log. In the activity-based mode it must contain the columns
        named by the module-level ``caseid`` and ``activity`` settings.

    Returns
    -------
    pandas.DataFrame
        The labelled frame. The input frame is never modified in place.
    """
    if 'Activity' in label:
        target_activity = label['Activity']
        # Cases in which the target activity occurs at least once.
        positive_cases = df.loc[df[activity] == target_activity, caseid].unique()
        # isin() yields a flag aligned with df's own rows, so the labels stay
        # correct whatever the row order or index of the log is (building the
        # labels in groupby order and joining them positionally does not).
        return df.assign(Label=df[caseid].isin(positive_cases).astype(int))

    if 'column' in label:
        return df.rename(columns={label['column']: 'Label'})

    return df

def drop_activity(df, n):
    """Remove all events belonging to the ``n`` least frequent activities.

    Frequencies are taken from ``df['Activity'].value_counts()``; the last
    ``n`` entries of that ranking are the activities that get dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with an ``'Activity'`` column.
    n : int
        Number of rarest activities to drop. ``0`` drops nothing.

    Returns
    -------
    pandas.DataFrame
        A new frame with the remaining rows, in their original order and with
        their original index labels.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative, got {!r}".format(n))
    if n == 0:
        # A slice of [-0:] would select *every* activity, so handle it explicitly.
        return df.copy()

    activity_counts = df['Activity'].value_counts()  # sorted by descending frequency
    rare_activities = activity_counts.index[-n:]
    return df[~df['Activity'].isin(rare_activities)]

def whole_bucket(df, min_prefix=2, max_prefix=41):
    """Build one prefix log per prefix length.

    For every prefix length ``p`` in ``min_prefix..max_prefix`` (inclusive),
    the result holds a frame made of the first ``p`` events (ordered by the
    module-level ``ts`` column) of every case that has at least ``p`` events.
    Cases are identified by the module-level ``caseid`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log containing the ``caseid`` and ``ts`` columns.
    min_prefix, max_prefix : int, optional
        Smallest and largest prefix length to generate. The defaults (2 and
        41) reproduce the original fixed range of 40 buckets.

    Returns
    -------
    list of pandas.DataFrame
        One frame per prefix length, in increasing prefix order. Inside a
        frame every case keeps its own 0-based index. A prefix length that no
        case reaches yields an empty frame with the columns of ``df``.
    """
    # Sorting each case is independent of the prefix length, so do it once
    # instead of once per prefix.
    traces = [
        group.sort_values(by=ts, ascending=True).reset_index(drop=True)
        for _, group in df.groupby(caseid)
    ]
    empty_bucket = df.iloc[0:0]

    result = []
    for prefix in tqdm(range(min_prefix, max_prefix + 1)):
        bucket = [trace.iloc[:prefix, :] for trace in traces if len(trace) >= prefix]
        # pd.concat([]) raises, so fall back to an empty frame when no case is long enough.
        result.append(pd.concat(bucket) if bucket else empty_bucket.copy())

    return result

def prefix_bound(m, drop_low=False):
    """Split the prefix-length range into ``m`` consecutive partitions.

    The range starts at 2 and spans 39 units (ending at 41), or, with
    ``drop_low``, starts at 4 and spans 37 units (also ending at 41). The span
    is divided as evenly as possible; the first ``span % m`` partitions are one
    unit wider. Each partition is a ``[lower, upper]`` pair and the ``upper``
    of one partition is the ``lower`` of the next.

    Parameters
    ----------
    m : int
        Requested number of partitions. Values larger than the span are
        capped so that no zero-width partition is produced.
    drop_low : bool, optional
        When true, the range starts at 4 instead of 2.

    Returns
    -------
    list of [int, int]
        The partition bounds, in increasing order.

    Raises
    ------
    ValueError
        If ``m`` is smaller than 1.
    """
    if m < 1:
        raise ValueError("m must be at least 1, got {!r}".format(m))

    start, span = (4, 37) if drop_low else (2, 39)

    if m == 1:
        # NOTE(review): the single-partition case ends at 40 whereas every
        # multi-partition split ends at 41; kept as in the original -- confirm
        # which upper bound the consumer expects.
        return [[start, 40]]

    # Cap in both modes (previously only done for drop_low) so that asking for
    # more partitions than available units cannot yield empty [41, 41] buckets.
    m = min(m, span)
    width, extra = divmod(span, m)

    bound = []
    lower = start
    for i in range(m):
        upper = lower + width + (1 if i < extra else 0)
        bound.append([lower, upper])
        lower = upper

    return bound

In [5]:
"""
GA : Genetic Algorithm
RS : Random Search
"""
# Location of the BPIC 2012 event log. The BPIC_DATA_PATH environment variable
# overrides the default so the notebook can run on another machine; when it is
# not set, the original absolute path is used.
DATA_PATH = os.environ.get(
    'BPIC_DATA_PATH',
    '/Users/nahyun/Desktop/Project/rule-based-predictive-monitoring-master/data/dataset/bpic/BPIC12.csv',
)
df = pd.read_csv(DATA_PATH)
# Attach (or rename to) the 'Label' outcome column according to the `label` setting.
df = add_label(df)

In [5]:
# Number of rarest activities to remove for each variant
# (the same values as options['drop_act']).
DROP_COUNTS = (2, 4, 6, 8)

# One filtered event log per drop count, then the prefix buckets of each log.
dropped_frames = [drop_activity(df, n_drop) for n_drop in DROP_COUNTS]
bucketed_frames = [whole_bucket(frame) for frame in dropped_frames]

# Keep the original variable names available for any cell that refers to them.
df1, df2, df3, df4 = dropped_frames
df_list1, df_list2, df_list3, df_list4 = bucketed_frames

# Persist each bucket list as df_list1.pkl ... df_list4.pkl in the working directory.
for idx, buckets in enumerate(bucketed_frames, start=1):
    with open(f'df_list{idx}.pkl', 'wb') as fh:
        pickle.dump(buckets, fh)