# Generate Core Data Set

In [1]:
import pandas as pd
import numpy as np
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pickle as pickle

from src.log import Reformat

## Read raw file

In [2]:
bpi_log = pd.read_csv('../../Data/Raw/BPI_2019_Dataset_Classified_based_on_Sub_Spend_Area.zip')
sap_log = pd.read_csv('../../Data/Raw/CELONIS_2018_Dataset_Classified_based_on_Sub_Spend_Area.zip')

## Extract core columns

In [3]:
sap_log.columns

Index(['index', 'Case_ID', 'Purchase_Order_ID', 'Event_Name', 'Event_ID',
       'Timestamp', 'Company_Code', 'Spend_Area', 'Spend_Sub_Area',
       'EC_Case_Type', 'GR_Classification', 'Vendor_ID', 'Source_System',
       'User_ID', 'Organisation_ID', 'Case_Type', 'Process_Flow',
       'Process_Structure', 'Item_Category', 'Case_Name', 'Item_ID', 'Value',
       'Goods_Receipt', 'Classification', 'AVG_Throughput', 'Events_2',
       'Sequence'],
      dtype='object')

In [4]:
core_col = ['Case_ID', 'Event_Name', 'Timestamp', 'Purchase_Order_ID']
sap_log_core = sap_log[core_col]
bpi_log_core = bpi_log[core_col]
sap_log_core["Timestamp"] = pd.to_datetime(sap_log["Timestamp"])
bpi_log_core["Timestamp"] = pd.to_datetime(bpi_log["Timestamp"]) 
sap_log_core = sap_log_core.sort_values(by='Timestamp', ascending=True)
bpi_log_core = bpi_log_core.sort_values(by='Timestamp', ascending=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sap_log_core["Timestamp"] = pd.to_datetime(sap_log["Timestamp"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bpi_log_core["Timestamp"] = pd.to_datetime(bpi_log["Timestamp"])


## Inspect event log and save

In [5]:
for log in [sap_log_core, bpi_log_core]:
    print("Generated event log with num of events: ", log.shape[0])

Generated event log with num of events:  287956
Generated event log with num of events:  1593366


In [7]:
sap_log_core.to_pickle("../../Data/Log/source_log.pkl")
bpi_log_core.to_pickle("../../Data/Log/target_log.pkl")

In [8]:
source_trace = Reformat.roll_sequence(sap_log_core, time_column="Timestamp", case_column="Case_ID")
target_trace = Reformat.roll_sequence(bpi_log_core, time_column="Timestamp", case_column="Case_ID")

## Feature generation

In [9]:
def cal_case_druation(timestamps, resolution="m"):
    return (timestamps[-1] - timestamps[0]) / np.timedelta64(1, resolution)

source_trace["CaseDuration"] = source_trace["Timestamp"].apply(lambda x: cal_case_druation(x, "m"))
target_trace["CaseDuration"] = target_trace["Timestamp"].apply(lambda x: cal_case_druation(x, "m"))

In [10]:
def cal_case_interval(timestamps, resolution="m"):
    time_diff = (timestamps - np.roll(timestamps, 1)) / np.timedelta64(1, resolution)
    time_diff[0] = 0
    return time_diff

source_trace["CaseInterval"] = source_trace["Timestamp"].apply(lambda x: cal_case_interval(x, "m"))
target_trace["CaseInterval"] = target_trace["Timestamp"].apply(lambda x: cal_case_interval(x, "m"))

In [11]:
def cal_case_lapse(timestamps, resolution="m"):
    time_diff = (timestamps - timestamps[0]) / np.timedelta64(1, resolution)
    return time_diff

source_trace["CaseLapse"] = source_trace["Timestamp"].apply(lambda x: cal_case_lapse(x, "m"))
target_trace["CaseLapse"] = target_trace["Timestamp"].apply(lambda x: cal_case_lapse(x, "m"))

In [12]:
source_trace.to_pickle("../../Data/Trace/source_trace.pkl")
target_trace.to_pickle("../../Data/Trace/target_trace.pkl")