In [2]:
!pip install pm4py

In [8]:
import pandas as pd

# Location of the CSV file that holds the free-text loan descriptions.
text_dataset_path = "./path/to/loan_applications.csv"

# Loan-purpose categories mapped to the keywords that count as evidence for each.
keyword_map_title = {
    "Home improvement": [
        "home", "bedroom", "bathroom", "basement",
        "kitchen", "floor", "property", "house",
        "relocation", "remodel", "renovation", "apartment",
    ],
    "Student Loan": [
        "student", "fee", "university", "tuition",
        "school", "degree", "class", "grad",
        "graduate",
    ],
    "Consume": [
        "mustang", "car", "machine", "auto",
        "purchase", "replacement", "sport", "christmas",
        "game", "gift", "bike", "scooter",
    ],
    "Medical": [
        "hospital", "cancer", "medical", "doctor",
        "uninsured", "medicine", "surgery", "insurance",
        "drug", "treatment", "dental",
    ],
    "Vacation": [
        "vacation", "summer", "winter", "country",
        "travel", "family", "wedding", "ring",
        "swim", "pool", "hotel",
    ],
    "Consolidation": [
        "refinance", "debt", "interest", "consolidation",
        "banks", "rate", "cut", "payoff",
        "limit", "reduction", "credit",
    ],
}

# One two-element label per category, positionally aligned with keyword_map_title.
# The two label values alternate across the six categories.
split = [(1.0, 0.0), (0.0, 1.0)] * 3

# --- Load the descriptions and keep only the usable rows ---
text_dataset = pd.read_csv(text_dataset_path)
print("Columns:", text_dataset.columns.tolist())
# Preview the first 50 descriptions without turning the whole column into a list first.
print(text_dataset["desc"].head(50).tolist())
print(f"Total descriptions: {len(text_dataset)}")

# Count whitespace-separated tokens. Counting single spaces (+1) would over-count
# descriptions with repeated, leading or trailing spaces and ignore tabs/newlines.
# Missing descriptions stay NaN here and are removed by the filters below.
text_dataset['desc_word_count'] = text_dataset['desc'].str.split().str.len()

# Stage 1: drop rows where any of the three text columns is missing.
text_dataset = text_dataset[
    text_dataset['desc'].notnull() & text_dataset['title'].notnull() & text_dataset['emp_title'].notnull()]
print(f"Descriptions after dropping rows with missing desc/title/emp_title: {len(text_dataset)}")

# Stage 2: keep only descriptions longer than 20 words.
text_dataset = text_dataset[text_dataset['desc_word_count'] > 20]

desc_list = text_dataset['desc'].tolist()
print(f"Descriptions after filtering: {len(desc_list)}")

# --- Categorize each description ---
# A description is assigned to the category for which it contains the largest number
# of distinct keywords (substring match on the lower-cased text; ties go to the
# category listed first). The category position selects its label from `split`;
# label (1.0, 0.0) sends the description to `accepted`, any other label to `rejected`.
categories = list(keyword_map_title.keys())
if len(split) != len(categories):
    raise ValueError(
        f"`split` has {len(split)} labels but there are {len(categories)} categories"
    )

accepted = []
rejected = []
unmatched = []  # descriptions that mention no keyword of any category

for desc in desc_list:
    desc_lower = desc.lower()

    # Count keyword matches per category (in the same order as `categories`)
    keyword_counts = [
        sum(kw in desc_lower for kw in keywords)
        for keywords in keyword_map_title.values()
    ]

    # Without any keyword hit there is no evidence for a category. Taking the
    # arg-max of an all-zero list would silently pick the first category (and
    # therefore `accepted`), so such descriptions are set aside instead.
    best_count = max(keyword_counts)
    if best_count == 0:
        unmatched.append(desc)
        continue

    # Pick the category with most matches
    max_idx = keyword_counts.index(best_count)
    label = split[max_idx]

    # Assign to accepted or rejected
    if label == (1.0, 0.0):
        accepted.append(desc)
    else:
        rejected.append(desc)

print(f"Accepted: {len(accepted)} descriptions")
print(f"Rejected: {len(rejected)} descriptions")
print(f"Unmatched (skipped): {len(unmatched)} descriptions")

In [8]:
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
from pm4py.objects.log.obj import EventLog, Trace
import os
import random

# Show the working directory so the relative paths below can be checked.
print("Current Working Directory:")
print(os.getcwd())
# === Configuration ===
log_name = "bpi_2012"
# "event" stores the text on the first event of each trace; any other value
# stores it as a trace (case) attribute.
text_target = "event"
input_xes = f"./path/to/{log_name}.xes"     # Path to the input .xes file
output_xes = f"./path/to/{log_name}_enriched_filtered_A.xes"   # Path to save the modified .xes file
target_activities = {"A_Denied", "A_Cancelled", "A_DECLINED", "A_CANCELLED"}  # Activities that mark a trace as rejected
attr_name = "text"
# Texts are drawn at random; fix the seed so a re-run produces the same enriched log.
random_seed = 42
random.seed(random_seed)


def enrich_log(log, activity_list, attr_name="text", text_target="case",
               accepted_texts=None, rejected_texts=None, rng=None):
    """Attach a randomly chosen text to every trace of ``log`` (in place).

    A trace that contains at least one activity from ``activity_list`` receives
    a text drawn from the rejected pool; every other trace receives a text drawn
    from the accepted pool.

    Args:
        log: Iterable of traces. Each trace is a sequence of events that can be
            indexed by ``"concept:name"`` and exposes an ``attributes`` mapping.
        activity_list: Iterable of activity names that mark a trace as rejected.
        attr_name: Key under which the text is stored.
        text_target: ``"event"`` stores the text on the first event of the trace
            (on the trace attributes if the trace has no events); any other
            value stores it on the trace attributes.
        accepted_texts: Pool of texts for non-rejected traces. Defaults to the
            notebook-level ``accepted`` list.
        rejected_texts: Pool of texts for rejected traces. Defaults to the
            notebook-level ``rejected`` list.
        rng: Object with a ``choice`` method (e.g. ``random.Random(seed)``) used
            for drawing. Defaults to the global ``random`` module.

    Returns:
        The same ``log`` object, with the text attribute added.

    Raises:
        IndexError: If a text is needed from a pool that is empty.
    """
    # Fall back to the lists built by the categorization cell only when the
    # caller did not supply explicit pools.
    if accepted_texts is None:
        accepted_texts = accepted
    if rejected_texts is None:
        rejected_texts = rejected
    chooser = random if rng is None else rng
    marker_activities = set(activity_list)

    for trace in log:
        # Get the activity names from the trace
        activities_in_trace = {event["concept:name"] for event in trace}

        # Determine which pool the trace draws from
        if activities_in_trace.isdisjoint(marker_activities):
            pool, pool_name = accepted_texts, "accepted"
        else:
            pool, pool_name = rejected_texts, "rejected"
        if not pool:
            raise IndexError(f"cannot draw a text: the {pool_name} pool is empty")
        value = chooser.choice(pool)

        # Write attribute according to target
        if text_target == "event" and len(trace) > 0:
            trace[0][attr_name] = value
        else:
            # Case-level target, or fallback for traces without events
            trace.attributes[attr_name] = value

    return log

def filter_log_by_prefix(log, filter_A=False, filter_O=False):
    """Return a cleaned copy of ``log`` restricted to selected activity prefixes.

    Two filters are applied in order:

    1. Traces with any event whose ``"time:timestamp"`` is missing or ``None``
       are dropped entirely.
    2. If ``filter_A`` and/or ``filter_O`` is set, only events whose
       ``"concept:name"`` starts with ``"A_"`` / ``"O_"`` are kept; traces left
       without events are dropped. With both flags off, traces pass unchanged.

    Args:
        log: Event log (iterable of traces) to filter. It is not modified.
        filter_A: Keep events whose activity name starts with ``"A_"``.
        filter_O: Keep events whose activity name starts with ``"O_"``.

    Returns:
        A new ``EventLog`` holding the surviving traces. Log-level metadata
        (attributes, extensions, globals, classifiers, properties) is carried
        over from ``log`` when present.
    """
    print(f"Original log: {len(log)} traces, total events = {sum(len(t) for t in log)}")

    # --- FILTER OUT TRACES WITH MISSING TIMESTAMPS ---
    filtered_traces = []
    for trace in log:
        has_missing = any("time:timestamp" not in e or e["time:timestamp"] is None for e in trace)
        if not has_missing:
            filtered_traces.append(trace)

    # --- FILTER ACTIVITIES BASED ON PREFIX ---
    prefixes_to_keep = tuple(
        prefix for prefix, enabled in (("A_", filter_A), ("O_", filter_O)) if enabled
    )
    if prefixes_to_keep:
        filtered_traces2 = []
        for trace in filtered_traces:
            # str.startswith accepts a tuple and matches any of its entries
            new_events = [e for e in trace if e["concept:name"].startswith(prefixes_to_keep)]
            if new_events:
                # The rebuilt trace shares the attribute mapping of the original trace
                filtered_traces2.append(Trace(new_events, attributes=trace.attributes))
    else:
        filtered_traces2 = filtered_traces

    # --- WRAP BACK INTO EVENT LOG ---
    # Carry the log-level metadata over; a bare EventLog(traces) would drop it
    # and the exported file would lose its extensions/classifiers/globals.
    log_metadata = {
        key: getattr(log, key)
        for key in ("attributes", "extensions", "omni_present", "classifiers", "properties")
        if hasattr(log, key)
    }
    clean_log = EventLog(filtered_traces2, **log_metadata)
    print(f"Filtered log: {len(clean_log)} traces, total events = {sum(len(t) for t in clean_log)}")

    return clean_log

# --- Read, filter, enrich, validate and export the log ---
print("Reading log...")
log = xes_importer.apply(input_xes)

print("Filtering log and attaching text descriptions...")
log = filter_log_by_prefix(log, filter_A=True, filter_O=False)
log = enrich_log(log, target_activities, text_target=text_target, attr_name=attr_name)

print("\n=== First 10 Cases (for validation) ===")
for i, trace in enumerate(log[:10]):
    case_id = trace.attributes.get("concept:name", f"Case_{i}")
    activities = [event["concept:name"] for event in trace]
    # Read the text from where enrich_log stored it: the first event for the
    # "event" target (trace attributes if the trace has no events), otherwise
    # the trace attributes. Use the configured key rather than a literal.
    if text_target == "event" and len(trace) > 0:
        text_value = trace[0].get(attr_name, "N/A")
    else:
        text_value = trace.attributes.get(attr_name, "N/A")
    print(f"Case: {case_id}")
    print(f"  Activities: {activities}")
    print(f"  Text: {text_value}")
    print("-" * 60)

xes_exporter.apply(log, output_xes)
print(f"\nEnriched log saved to: {output_xes}")