In [1]:
import re
import pandas as pd
import numpy as np

pred_path = "ftgemini1.jsonl"  # your raw text file

def coerce_value(v: str):
    v = v.strip()
    if v == "" or v.lower() in {"null", "none", "nan"}:
        return None
    if v.lower() == "true":
        return True
    if v.lower() == "false":
        return False
    # numeric
    if re.fullmatch(r"-?\d+", v):
        return int(v)
    if re.fullmatch(r"-?\d+\.\d+", v):
        return float(v)
    return v

def parse_events_linewise(path: str):
    text = open(path, "r", encoding="utf-8").read()

    # split events by blank lines
    blocks = [b.strip() for b in re.split(r"\n\s*\n+", text) if b.strip()]

    events = []
    for block in blocks:
        d = {}
        for ln in block.splitlines():
            ln = ln.strip()
            if not ln:
                continue
            # skip headers like "Event 2:"
            if re.match(r"^Event\s+\d+\s*:\s*$", ln, flags=re.I):
                continue

            # IMPORTANT: split only on ': ' (colon-space), once
            if ": " not in ln:
                continue
            key, val = ln.split(": ", 1)
            key = key.strip()
            val = val.strip()
            d[key] = coerce_value(val)

        if d:
            events.append(d)

    return events

pred_events = parse_events_linewise(pred_path)
pred_df = pd.DataFrame(pred_events)

print("Parsed events:", len(pred_df))
print("Columns include case:ApplicationID:", "case:ApplicationID" in pred_df.columns)
print("Columns include time:timestamp:", "time:timestamp" in pred_df.columns)
pred_df.head()


Parsed events: 168
Columns include case:ApplicationID: True
Columns include time:timestamp: True


Unnamed: 0,"{""Unnamed",Unnamed,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:concept:name,...,"""case:concept:name""","""case:MonthlyCost""","""case:Selected""","""case:ApplicationID""","""case:FirstWithdrawalAmount""","""case:CreditScore""","""case:OfferedAmount""","""case:NumberOfTerms""","""case:Accepted""","""OfferID"""
0,"0"": 98094, ""Action"": ""statechange"", ""org:resou...",,,,,,,,,,...,,,,,,,,,,
1,"0"": 98132, ""Action"": ""statechange"", ""org:resou...",,,,,,,,,,...,,,,,,,,,,
2,,0: 98083,statechange,User_49,O_Created,Offer,OfferState_839732054,complete,2016-07-15 13:25:43.521000+00:00,Offer_743621544,...,,,,,,,,,,
3,,0: 98084,statechange,User_49,O_Sent (mail and online),Offer,OfferState_1778925853,complete,2016-07-15 13:26:09.463000+00:00,Offer_743621544,...,,,,,,,,,,
4,,0: 99643,statechange,User_119,O_Returned,Offer,OfferState_1387568136,complete,2016-07-22 08:00:30.107000+00:00,Offer_743621544,...,,,,,,,,,,


In [2]:
CASE_COL = "case:ApplicationID"
TS_COL   = "time:timestamp"
Y_COL    = "case:Accepted"

pred_df[CASE_COL] = pred_df[CASE_COL].astype(str).str.strip()
pred_df[TS_COL] = pd.to_datetime(pred_df[TS_COL], utc=True, errors="coerce")

print("Timestamp parse success rate:", pred_df[TS_COL].notna().mean())
pred_df[[CASE_COL, TS_COL, "concept:name", Y_COL]].head(15)


Timestamp parse success rate: 0.9583333333333334


Unnamed: 0,case:ApplicationID,time:timestamp,concept:name,case:Accepted
0,,NaT,,
1,,NaT,,
2,Application_1224457292,2016-07-15 13:25:43.521000+00:00,O_Created,
3,Application_1224457292,2016-07-15 13:26:09.463000+00:00,O_Sent (mail and online),
4,Application_1224457292,2016-07-22 08:00:30.107000+00:00,O_Returned,
5,Application_1224457292,2016-07-22 09:00:26.493000+00:00,O_Accepted,True
6,,NaT,,
7,Application_426309193,2016-07-15 13:27:02.668000+00:00,O_Created,
8,Application_426309193,2016-07-15 13:27:24.180000+00:00,O_Sent (mail and online),
9,Application_426309193,2016-07-26 09:06:53.572000+00:00,O_Returned,


In [3]:
CASE_COL = "case:ApplicationID"
TS_COL   = "time:timestamp"
Y_COL    = "case:Accepted"

pred_df = pred_df.dropna(subset=[CASE_COL, TS_COL]).copy()
pred_df[CASE_COL] = pred_df[CASE_COL].astype(str).str.strip()

pred_by_case = {
    cid: grp.sort_values(TS_COL)
    for cid, grp in pred_df.groupby(CASE_COL)
}

print("Predicted cases:", len(pred_by_case))


Predicted cases: 43


In [4]:
import pandas as pd

X_COLS = [
    "case:FirstWithdrawalAmount",
    "case:CreditScore",
    "case:OfferedAmount",
    "case:NumberOfTerms",
]

pred_case_rows = []

for case_id, grp in pred_by_case.items():
    grp = grp.sort_values(TS_COL)

    # Steps = number of EVENTS (do not deduplicate concept:name)
    pred_step_count = len(grp)

    pred_first_ts = grp[TS_COL].iloc[0]
    pred_last_ts  = grp[TS_COL].iloc[-1]
    predicted_time_spent = (pred_last_ts - pred_first_ts).total_seconds()

    # IMPORTANT: y only from LAST event
    last_event = grp.iloc[-1]
    y_pred_accepted = last_event.get(Y_COL, None)

    # Case attributes from first event (stable per case)
    first_event = grp.iloc[0]

    pred_case_rows.append({
        CASE_COL: case_id,
        "pred_step_count": pred_step_count,
        "predicted_time_spent": predicted_time_spent,
        "y_pred_accepted": y_pred_accepted,
        **{c: first_event.get(c) for c in X_COLS},
    })

df_pred_case = pd.DataFrame(pred_case_rows)

print("y_pred_accepted distribution (raw):")
print(df_pred_case["y_pred_accepted"].value_counts(dropna=False))

df_pred_case.head()


y_pred_accepted distribution (raw):
y_pred_accepted
True     25
False    14
NaN       4
Name: count, dtype: int64


Unnamed: 0,case:ApplicationID,pred_step_count,predicted_time_spent,y_pred_accepted,case:FirstWithdrawalAmount,case:CreditScore,case:OfferedAmount,case:NumberOfTerms
0,Application_1029247764,4,1136569.663,True,5000.0,0.0,5000.0,100.0
1,Application_1030731936,3,2654497.319,,0.0,0.0,6000.0,138.0
2,Application_1037363488,4,1122716.063,True,6000.0,892.0,6000.0,48.0
3,Application_1056924645,4,1093784.341,False,12956.0,0.0,20700.0,80.0
4,Application_1101006464,3,324515.554,False,22317.0,0.0,47000.0,120.0


In [5]:
real_log = pd.read_csv("C:/Users/Deniz/Desktop/thesisData_part2.csv")  # adjust path

real_log[CASE_COL] = real_log[CASE_COL].astype(str).str.strip()
real_log[TS_COL] = pd.to_datetime(real_log[TS_COL], utc=True, errors="coerce")

# filter to only predicted cases
real_log_sub = real_log[real_log[CASE_COL].isin(df_pred_case[CASE_COL])].copy()
real_log_sub = real_log_sub.sort_values([CASE_COL, TS_COL])

real_case = (
    real_log_sub
    .groupby(CASE_COL)
    .agg(
        y_real_accepted=("case:Accepted", "last"),
    )
    .reset_index()
)

print("Matched real cases:", real_case[CASE_COL].nunique())
real_case.head()


Matched real cases: 43


Unnamed: 0,case:ApplicationID,y_real_accepted
0,Application_1029247764,True
1,Application_1030731936,True
2,Application_1037363488,True
3,Application_1056924645,False
4,Application_1101006464,False


In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

df_case = df_pred_case.merge(real_case, on=CASE_COL, how="inner")

# keep only cases where both are boolean
df_case = df_case[
    df_case["y_pred_accepted"].isin([True, False]) &
    df_case["y_real_accepted"].isin([True, False])
].copy()

df_case["y_pred_accepted"] = df_case["y_pred_accepted"].astype(int)
df_case["y_real_accepted"] = df_case["y_real_accepted"].astype(int)

y_true = df_case["y_real_accepted"]
y_pred = df_case["y_pred_accepted"]

print("Cases used for metrics:", len(df_case))
print("Accuracy :", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred, zero_division=0))
print("Recall   :", recall_score(y_true, y_pred, zero_division=0))
print("F1-score :", f1_score(y_true, y_pred, zero_division=0))


Cases used for metrics: 39
Accuracy : 0.8461538461538461
Precision: 0.88
Recall   : 0.88
F1-score : 0.88


In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import shap

FEATURES = [
    "predicted_time_spent",
    "case:FirstWithdrawalAmount",
    "case:CreditScore",
    "case:OfferedAmount",
    "case:NumberOfTerms",
    "pred_step_count",
]

TARGET = "y_pred_accepted"

df_model = df_case[FEATURES + [TARGET]].copy()

# numeric X
for c in FEATURES:
    df_model[c] = pd.to_numeric(df_model[c], errors="coerce")

df_model = df_model.dropna()

X = df_model[FEATURES]
y = df_model[TARGET].astype(int)

print("Rows:", len(df_model))
print("y distribution:")
print(y.value_counts())

# scale
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# fit logistic regression
model = LogisticRegression(max_iter=2000, solver="liblinear")
model.fit(X_scaled, y)

# coefficients table
coef_df = (
    pd.DataFrame({
        "feature": X_scaled.columns,
        "coef": model.coef_[0],
        "odds_ratio": np.exp(model.coef_[0]),
    })
    .sort_values("coef", ascending=False)
    .reset_index(drop=True)
)

print("=== ALL COEFFICIENTS ===")
display(coef_df)

# SHAP values (tables only)
explainer = shap.LinearExplainer(model, X_scaled)
shap_values = explainer.shap_values(X_scaled)

shap_matrix = pd.DataFrame(shap_values, columns=X_scaled.columns, index=X_scaled.index)

shap_overview = (
    pd.DataFrame({
        "feature": X_scaled.columns,
        "mean_abs_shap": shap_matrix.abs().mean().values,
        "mean_shap": shap_matrix.mean().values,
    })
    .sort_values("mean_abs_shap", ascending=False)
    .reset_index(drop=True)
)

print("=== SHAP OVERVIEW ===")
display(shap_overview)


Rows: 39
y distribution:
y_pred_accepted
1    25
0    14
Name: count, dtype: int64
=== ALL COEFFICIENTS ===


Unnamed: 0,feature,coef,odds_ratio
0,case:CreditScore,1.410107,4.096394
1,case:NumberOfTerms,0.839856,2.316034
2,pred_step_count,0.62233,1.863264
3,case:FirstWithdrawalAmount,0.385104,1.469768
4,predicted_time_spent,0.2536,1.288656
5,case:OfferedAmount,-0.101136,0.90381


=== SHAP OVERVIEW ===


Unnamed: 0,feature,mean_abs_shap,mean_shap
0,case:CreditScore,1.373616,-1.708035e-17
1,case:NumberOfTerms,0.771594,-3.4160710000000005e-17
2,pred_step_count,0.586738,-5.693451e-18
3,case:FirstWithdrawalAmount,0.311576,6.405133e-18
4,predicted_time_spent,0.187913,0.0
5,case:OfferedAmount,0.075098,0.0
