# Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pickle as pickle

## Load trace data

In [2]:
source_trace = pd.read_pickle("../../Data/Trace/source_trace.pkl")
target_trace = pd.read_pickle("../../Data/Trace/target_trace.pkl")

## Purge instant case

In [3]:
print("Num cases before cleaning: ", source_trace.shape[0], "   ", target_trace.shape[0])
source_trace = source_trace[source_trace["CaseDuration"] > 0]
target_trace = target_trace[target_trace["CaseDuration"] > 0]
print("Num cases after cleaning: ", source_trace.shape[0], "   ", target_trace.shape[0])

Num cases before cleaning:  35214     76327
Num cases after cleaning:  33316     75269


## Cut ultra long

In [4]:
print("Num cases before cleaning: ", source_trace.shape[0], "   ", target_trace.shape[0])
source_trace_c1 = source_trace[source_trace["CaseDuration"] < np.quantile(source_trace["CaseDuration"].values, 0.95)]
target_trace_c1 = target_trace[target_trace["CaseDuration"] < np.quantile(target_trace["CaseDuration"].values, 0.95)]
print("Num cases after cleaning: ", source_trace_c1.shape[0], "   ", target_trace_c1.shape[0])

Num cases before cleaning:  33316     75269
Num cases after cleaning:  31650     71505


## Cut unfinished

In [5]:
target_case_duration_90 = np.quantile(target_trace_c1["CaseDuration"].values, 0.9)
source_case_duration_90 = np.quantile(source_trace_c1["CaseDuration"].values, 0.9)

In [6]:
print("Num cases before cleaning: ", source_trace_c1.shape[0], "   ", target_trace_c1.shape[0])
target_trace_c1["EndTime"] = target_trace_c1["Timestamp"].apply(lambda x: x[-1])
log_end = target_trace_c1["EndTime"].max()
target_trace_c1["LogSlack"] = (log_end - target_trace_c1["Start Time"]) / np.timedelta64(1, "m")
target_trace_c2 = target_trace_c1[target_trace_c1["LogSlack"] > target_case_duration_90]

source_trace_c1["EndTime"] = source_trace_c1["Timestamp"].apply(lambda x: x[-1])
log_end = source_trace_c1["EndTime"].max()
source_trace_c1["LogSlack"] = (log_end - source_trace_c1["Start Time"]) / np.timedelta64(1, "m")
source_trace_c2 = source_trace_c1[source_trace_c1["LogSlack"] > source_case_duration_90]
print("Num cases before cleaning: ", source_trace_c2.shape[0], "   ", target_trace_c2.shape[0])

Num cases before cleaning:  31650     71505
Num cases before cleaning:  28607     56452


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_trace_c1["EndTime"] = target_trace_c1["Timestamp"].apply(lambda x: x[-1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_trace_c1["LogSlack"] = (log_end - target_trace_c1["Start Time"]) / np.timedelta64(1, "m")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  source_trace_c1["EndTime"] 

In [7]:
target_trace.shape[0]

75269

In [9]:
cut_ratio = 0.8
source_trace = source_trace.iloc[: int(source_trace.shape[0] * 0.8)]
target_trace = target_trace.iloc[: int(target_trace.shape[0] * 0.8)]

## Save cleaned trace data

In [10]:
source_trace_c2.to_pickle("../../Data/Trace/source_trace_cleaned.pkl")
target_trace_c2.to_pickle("../../Data/Trace/target_trace_cleaned.pkl")