# Data Preparation for Process Analytics with bupaR

In [None]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

In [None]:
def get_activity_instance(x):
    """Number the rows of ``x`` consecutively, starting at 1.

    The numbering is written into the ``activity_instance`` column of ``x``
    itself (the argument is modified in place) and the same object is
    returned.
    """
    row_count = len(x)
    # 0-based positions shifted by one -> 1, 2, ..., row_count
    x['activity_instance'] = np.arange(row_count) + 1
    return x

def prep_log(filename):
    """Prepare a raw event log and write one CSV file per clustering column.

    The CSV at ``filename`` is read, its events are ordered chronologically
    within each case, and the helper columns ``activity_instance``,
    ``activity_instance_new``, ``status`` and ``resource`` are added.
    ``case:concept:name`` is renamed to ``case_id`` and the columns
    ``Unnamed: 0`` and ``@@index`` are dropped.

    For every clustering column listed below, a copy of the log that keeps
    only that clustering column (and drops the other ones) is written to
    ``ev_prep_<column>.csv`` in the current working directory.

    Parameters
    ----------
    filename : str or path-like
        CSV file with at least the columns ``case:concept:name``,
        ``concept:name``, ``time:timestamp`` (formatted as
        ``%Y-%m-%d %H:%M:%S``), ``Unnamed: 0``, ``@@index`` and all
        clustering columns.

    Returns
    -------
    None
    """
    log = pd.read_csv(filename)

    ## data wrangling
    # Parse the timestamps to real datetimes first: sorting must happen on
    # datetime values, because the 'dd.mm.YYYY' output strings would sort by
    # day of month instead of chronologically.
    log['time:timestamp'] = pd.to_datetime(
        log['time:timestamp'], format='%Y-%m-%d %H:%M:%S'
    )
    # sort values by process instance ID and increasing timestamp
    log = log.sort_values(
        by=['case:concept:name', 'time:timestamp'],
        ascending=True,
        kind="mergesort",
    )
    # only now convert the timestamp to the output format
    log['time:timestamp'] = log['time:timestamp'].dt.strftime('%d.%m.%Y %H:%M:%S')

    # create activity instances: 0-based position of the event inside its case
    log['activity_instance'] = log.groupby('case:concept:name').cumcount()
    log['activity_instance_new'] = (
        log['case:concept:name'].astype(str)
        + '_'
        + log['activity_instance'].astype(str)
    )
    # append lifecycle status
    log['status'] = 'start'
    # add resource
    log['resource'] = 'empty'
    # rename case id
    log = log.rename(columns={"case:concept:name": "case_id"})

    ## drop columns
    log = log.drop(['Unnamed: 0', '@@index'], axis=1)

    # columns containing clustered activities
    eventlog_list = [
        "Activity_kproto",
        'Activity_dbscan',
        'Activity_Aglo_Sgl',
        'Activity_Aglo_Cmp',
        'Activity_Aglo_Avg',
        'Activity_BOB',
    ]

    # Write one .csv file per clustering column (including the last one),
    # each time dropping all *other* clustering columns.
    for clust in eventlog_list:
        other_clusterings = [col for col in eventlog_list if col != clust]
        ev = log.drop(other_clusterings, axis=1)
        ev.to_csv('ev_prep_{0}'.format(clust) + '.csv')


def process_data_properties(log):
    """Return basic size figures of a prepared event log.

    The result is the tuple ``(events, cases, activities)``: the number of
    rows in ``log``, the number of distinct values in its ``case_id`` column
    and the number of distinct values in its ``concept:name`` column.
    Missing values count as one distinct value. Per-instance activity
    statistics (min, max, avg) are not computed.
    """
    event_total = log.shape[0]
    # dropna=False keeps NaN as its own value, like len(series.unique())
    case_total = log['case_id'].nunique(dropna=False)
    activity_total = log['concept:name'].nunique(dropna=False)
    return event_total, case_total, activity_total
    

In [None]:
# Prepare the raw event log stored below the current working directory.
path = os.getcwd()
raw_log_file = path + '/ev_raw/ev_ohnecluster-2.csv'
prep_log(raw_log_file)

### Quick EDA

In [None]:
# Load one of the prepared logs from the working directory; the file name
# matches the 'ev_prep_<column>.csv' pattern written by prep_log.
testlog = pd.read_csv('ev_prep_Activity_kproto.csv')

In [None]:
# Number of events, of distinct case ids and of distinct activity names.
process_data_properties(testlog)

In [None]:
# Show the first rows of the loaded log.
testlog.head()

***