In [2]:
!pip install pm4py

Looking in indexes: https://nexus.iisys.de/repository/ki-awz-pypi-group/simple, https://pypi.org/simple


In [8]:
import pandas as pd

text_dataset_path = "./reddit-mining/data/loan_applications.csv"

keyword_map_title = {
    "Home improvement": ["home", "bedroom", "bathroom", "basement", "kitchen", "floor",
                          "property", "house", "relocation", "remodel",
                          "renovation", "apartment"],
    "Student Loan": ["student", "fee", "university", "tuition", "school", "degree", "class", "grad",
                      "graduate"],
    "Consume": ["mustang", "car", "machine", "auto", "purchase", "replacement", "sport", "christmas",
                 "game", "gift", "bike", "scooter"],
    "Medical": ["hospital", "cancer", "medical", "doctor", "uninsured",
                 "medicine", "surgery", "insurance", "drug", "treatment", "dental"],
    "Vacation": ["vacation", "summer", "winter", "country", "travel", "family", "wedding", "ring",
                  "swim", "pool", "hotel"],
    "Consolidation": ["refinance", "debt", "interest", "consolidation", "banks", "rate", "cut",
                       "payoff", "limit", "reduction", "credit"],
}

split = [(1.0, 0.0), (0.0, 1.0), (1.0, 0.0), (0.0, 1.0), (1.0, 0.0), (0.0, 1.0)]

text_dataset = pd.read_csv(text_dataset_path)
print("Columns:", text_dataset.columns.tolist())
print(text_dataset["desc"].tolist()[:50])
print(f"Total descriptions: {len(text_dataset)}")
text_dataset['desc_word_count'] = text_dataset['desc'].str.count(' ') + 1
text_dataset = text_dataset[
    text_dataset['desc'].notnull() & text_dataset['title'].notnull() & text_dataset['emp_title'].notnull()]
print(f"Descriptions after filtering: {len(text_dataset)}")
text_dataset = text_dataset[text_dataset['desc_word_count'] > 20]

desc_list = text_dataset['desc'].tolist()
print(f"Descriptions after filtering: {len(desc_list)}")

# --- Categorize each description ---
categories = list(keyword_map_title.keys())
accepted = []
rejected = []

for desc in desc_list:
    desc_lower = desc.lower()
    keyword_counts = []
    
    # Count keyword matches per category
    for cat, keywords in keyword_map_title.items():
        count = sum(kw in desc_lower for kw in keywords)
        keyword_counts.append(count)
    
    # Pick the category with most matches
    max_idx = keyword_counts.index(max(keyword_counts))
    cat = categories[max_idx]
    label = split[max_idx]
    
    # Assign to accepted or rejected
    if label == (1.0, 0.0):
        accepted.append(desc)
    else:
        rejected.append(desc)

print(f"Accepted: {len(accepted)} descriptions")
print(f"Rejected: {len(rejected)} descriptions")


  text_dataset = pd.read_csv(text_dataset_path)


Columns: ['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d', 'last_fico_range_high', 'last_fico_range_low', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code', 'application_type', 'annual_inc_joint', 'dti_joint', 'verification_status_joint', 'acc_now

In [8]:
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
import os

# print current working directory
print("Current Working Directory:")
print(os.getcwd())
# === Configuration ===
log_name = "bpi_2012"
input_xes = f"./reddit-mining/data/{log_name}.xes"     # Path to the input .xes file
output_xes = f"./reddit-mining/data/{log_name}_simple_labels.xes"   # Path to save the modified .xes file
target_activities = {"A_Denied", "A_Cancelled", "A_DECLINED", "A_CANCELLED"}  # List of activities to check


def add_binary_case_attribute(log, activity_list, attr_name="binary_flag"):
    """
    Adds a binary case attribute to each trace.
    If any event in the trace has an activity name in 'activity_list',
    assigns value 'A'; otherwise assigns 'B'.
    """
    print(log)
    for trace in log:
        # Get the set of all activities in this trace
        activities_in_trace = {event["concept:name"] for event in trace}

        # Check for intersection with the given activity list
        if activities_in_trace.intersection(activity_list):
            trace.attributes[attr_name] = "one more god rejected"
        else:
            trace.attributes[attr_name] = "i am the bug inside you"

    return log

# --- Step 1: Read the XES log ---
print("Reading log...")
log = xes_importer.apply(input_xes)

# --- Step 2: Add binary attribute ---
print("Adding binary case attribute...")
log = add_binary_case_attribute(log, target_activities)

# --- Step 3: Print first 10 cases for validation ---
print("\n=== First 10 Cases (for validation) ===")
for i, trace in enumerate(log[:10]):
    case_id = trace.attributes.get("concept:name", f"Case_{i}")
    activities = [event["concept:name"] for event in trace]
    binary_value = trace.attributes.get("binary_flag")
    print(f"Case: {case_id}")
    print(f"  Activities: {activities}")
    print(f"  Binary flag: {binary_value}")
    print("-" * 60)

# --- Step 4: Save updated log ---
xes_exporter.apply(log, output_xes)
print(f"\n✅ New case attribute added and saved to: {output_xes}")


Current Working Directory:
/home/jovyan
Reading log...


parsing log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]

Adding binary case attribute...
[{'attributes': {'REG_DATE': datetime.datetime(2011, 10, 1, 0, 38, 44, 546000, tzinfo=datetime.timezone.utc), 'concept:name': '173688', 'AMOUNT_REQ': '20000'}, 'events': [{'org:resource': '112', 'lifecycle:transition': 'COMPLETE', 'concept:name': 'A_SUBMITTED', 'time:timestamp': datetime.datetime(2011, 10, 1, 0, 38, 44, 546000, tzinfo=datetime.timezone.utc)}, '..', {'org:resource': '10629', 'lifecycle:transition': 'COMPLETE', 'concept:name': 'W_Valideren aanvraag', 'time:timestamp': datetime.datetime(2011, 10, 13, 10, 37, 37, 26000, tzinfo=datetime.timezone.utc)}]}, '....', {'attributes': {'REG_DATE': datetime.datetime(2012, 2, 29, 23, 51, 16, 799000, tzinfo=datetime.timezone.utc), 'concept:name': '214376', 'AMOUNT_REQ': '15000'}, 'events': [{'org:resource': '112', 'lifecycle:transition': 'COMPLETE', 'concept:name': 'A_SUBMITTED', 'time:timestamp': datetime.datetime(2012, 2, 29, 23, 51, 16, 799000, tzinfo=datetime.timezone.utc)}, '..', {'org:resource': '

exporting log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]


✅ New case attribute added and saved to: ./reddit-mining/data/bpi_2012_simple_labels.xes
