In [14]:
import re
import pandas as pd
import numpy as np

# Path to the raw prediction text. Despite the .jsonl extension, the file is read
# by parse_events_linewise as blank-line-separated blocks of "key: value" lines,
# not as JSON.
pred_path = "ftgpt1.jsonl"

def coerce_value(v: str):
    """Turn one raw text field into None, a bool, an int, a float or a string.

    The input is stripped first. Rules, applied in order:
      * empty text, or "null" / "none" / "nan" in any letter case -> None
      * "true" / "false" in any letter case -> the matching bool
      * an optional minus sign followed only by digits -> int
      * an optional minus sign, digits, a dot, digits -> float
      * anything else -> the stripped text unchanged
    """
    text = v.strip()
    lowered = text.lower()

    if not text or lowered in ("null", "none", "nan"):
        return None
    if lowered in ("true", "false"):
        return lowered == "true"

    # Integer pattern is tried before the float pattern; the whole text must match.
    for pattern, caster in ((r"-?\d+", int), (r"-?\d+\.\d+", float)):
        if re.fullmatch(pattern, text):
            return caster(text)

    return text

def parse_events_linewise(path: str):
    """Parse a text file of blank-line-separated "key: value" blocks into dicts.

    The file is read as UTF-8 and split into blocks wherever one or more
    blank (whitespace-only) lines occur. Inside each block:

      * header lines of the form "Event <number>:" are ignored;
      * lines that do not contain ": " (colon followed by a space) are ignored.
        Because each line is stripped first, a line that ends right after the
        colon (a key with an empty value) is ignored as well;
      * every other line is split once at the FIRST ": "; the left part is the
        key and the right part is passed through coerce_value. A key that
        itself contains ": " is therefore cut at its first occurrence.

    If a key appears more than once in a block, the last occurrence wins.
    Blocks that yield no key/value pair are dropped.

    Parameters
    ----------
    path : str
        Location of the text file to read.

    Returns
    -------
    list[dict]
        One dict per non-empty block, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content is not valid UTF-8.
    """
    # Use a context manager so the file handle is closed deterministically;
    # a bare open(...).read() leaves the handle open until garbage collection.
    with open(path, "r", encoding="utf-8") as fh:
        text = fh.read()

    # split events by blank lines
    blocks = [b.strip() for b in re.split(r"\n\s*\n+", text) if b.strip()]

    events = []
    for block in blocks:
        d = {}
        for ln in block.splitlines():
            ln = ln.strip()
            if not ln:
                continue
            # skip headers like "Event 2:"
            if re.match(r"^Event\s+\d+\s*:\s*$", ln, flags=re.I):
                continue

            # IMPORTANT: split only on ': ' (colon-space), once, so values such as
            # timestamps (which contain ':' without a following space) stay intact.
            if ": " not in ln:
                continue
            key, val = ln.split(": ", 1)
            key = key.strip()
            val = val.strip()
            d[key] = coerce_value(val)

        if d:
            events.append(d)

    return events

# Parse the raw prediction file and put one event per row.
pred_events = parse_events_linewise(pred_path)
pred_df = pd.DataFrame(pred_events)

# Quick sanity report: row count plus presence of the two key columns.
print("Parsed events:", len(pred_df))
for label, column in (
    ("Columns include case:ApplicationID:", "case:ApplicationID"),
    ("Columns include time:timestamp:", "time:timestamp"),
):
    print(label, column in pred_df.columns)
pred_df.head()


Parsed events: 183
Columns include case:ApplicationID: True
Columns include time:timestamp: True


Unnamed: 0,Unnamed,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:concept:name,case:MonthlyCost,case:Selected,case:ApplicationID,case:FirstWithdrawalAmount,case:CreditScore,case:OfferedAmount,case:NumberOfTerms,OfferID,case:Accepted,transition
0,0: 15936,statechange,User_46,O_Created,Offer,OfferState_230946698,complete,2016-07-15 13:22:15.587000+00:00,Offer_2026305556,300.0,False,Application_1703731762,0.0,0,25000.0,101,Offer_2026305556,,
1,0: 12561,statechange,User_42,O_Returned,Offer,OfferState_801146511,complete,2016-07-15 15:01:08.121000+00:00,Offer_2026305556,300.0,False,Application_1703731762,0.0,0,25000.0,101,Offer_2026305556,,
2,0: 327,statechange,User_63,O_Cancelled,Offer,OfferState_2036456641,complete,2016-07-22 07:40:20.673000+00:00,Offer_2026305556,300.0,False,Application_1703731762,0.0,0,25000.0,101,Offer_2026305556,False,
3,0: 4547,statechange,User_17,O_Created,Offer,OfferState_2075865286,complete,2016-07-15 13:22:58.793000+00:00,Offer_1944055847,205.49,True,Application_1218775385,20000.0,713,20000.0,120,Offer_1944055847,,
4,0: 12147,statechange,User_17,O_Sent,Offer,OfferState_72931051,complete,2016-07-15 13:23:39.101000+00:00,Offer_1944055847,205.49,True,Application_1218775385,20000.0,713,20000.0,120,Offer_1944055847,,


In [15]:
# Column names used by the rest of the evaluation.
CASE_COL = "case:ApplicationID"
TS_COL   = "time:timestamp"
Y_COL    = "case:Accepted"

# Normalise case IDs to stripped strings, but keep missing IDs missing.
# A blanket .astype(str) turns NaN/None into the literal strings "nan"/"None";
# those then survive any later dropna(subset=[CASE_COL, ...]) and end up as a
# bogus case of their own. The mask is taken BEFORE the string conversion.
has_case_id = pred_df[CASE_COL].notna()
pred_df[CASE_COL] = pred_df[CASE_COL].astype(str).str.strip().where(has_case_id)

# Parse timestamps as UTC; values that cannot be parsed become NaT (errors="coerce").
pred_df[TS_COL] = pd.to_datetime(pred_df[TS_COL], utc=True, errors="coerce")

# Fraction of rows whose timestamp was parsed successfully.
print("Timestamp parse success rate:", pred_df[TS_COL].notna().mean())
pred_df[[CASE_COL, TS_COL, "concept:name", Y_COL]].head(15)


Timestamp parse success rate: 0.9836065573770492


Unnamed: 0,case:ApplicationID,time:timestamp,concept:name,case:Accepted
0,Application_1703731762,2016-07-15 13:22:15.587000+00:00,O_Created,
1,Application_1703731762,2016-07-15 15:01:08.121000+00:00,O_Returned,
2,Application_1703731762,2016-07-22 07:40:20.673000+00:00,O_Cancelled,False
3,Application_1218775385,2016-07-15 13:22:58.793000+00:00,O_Created,
4,Application_1218775385,2016-07-15 13:23:39.101000+00:00,O_Sent,
5,Application_1218775385,2016-07-22 06:36:11.841000+00:00,O_Returned,
6,Application_1218775385,2016-07-22 12:24:55.568000+00:00,O_Accepted,True
7,Application_1224457292,2016-07-15 13:25:43.648000+00:00,O_Created,
8,Application_1224457292,2016-07-15 13:26:22.514000+00:00,O_Sent,
9,Application_1224457292,2016-07-18 07:43:53.537000+00:00,O_Returned,


In [16]:
def clean_accepted(x):
    """Map a raw acceptance value to True, False or None.

    * real booleans are returned unchanged;
    * None and float NaN give None;
    * anything else is stringified, stripped and lower-cased: "true" gives
      True, "false" gives False, every other text gives None.
    """
    if isinstance(x, bool):
        return x

    is_missing = x is None or (isinstance(x, float) and np.isnan(x))
    if is_missing:
        return None

    # dict.get returns None for any text that is neither "true" nor "false".
    return {"true": True, "false": False}.get(str(x).strip().lower())

# Overwrite the label column in place with its cleaned True / False / None form,
# then show how often each value occurs (dropna=False keeps the missing bucket).
pred_df[Y_COL] = pred_df[Y_COL].apply(clean_accepted)
pred_df[Y_COL].value_counts(dropna=False)



case:Accepted
None     128
True      28
False     27
Name: count, dtype: int64

In [17]:
# Column names (re-declared so the cell does not rely on an earlier one).
CASE_COL, TS_COL, Y_COL = "case:ApplicationID", "time:timestamp", "case:Accepted"

# Keep only events that have both a case ID and a parsed timestamp.
required_cols = [CASE_COL, TS_COL]
pred_df = pred_df.dropna(subset=required_cols).copy()
pred_df[CASE_COL] = pred_df[CASE_COL].astype(str).str.strip()

# Map each case ID to its events, ordered by timestamp.
pred_by_case = {}
for cid, case_events in pred_df.groupby(CASE_COL):
    pred_by_case[cid] = case_events.sort_values(TS_COL)

print("Predicted cases:", len(pred_by_case))


Predicted cases: 51


In [18]:
import pandas as pd

# Case-level attributes copied from the earliest event of each case.
X_COLS = [
    "case:FirstWithdrawalAmount",
    "case:CreditScore",
    "case:OfferedAmount",
    "case:NumberOfTerms",
]


def _summarise_case(case_id, events):
    """Collapse the events of one case into a single case-level record.

    The record holds the number of events (no de-duplication of activity
    names), the seconds between the earliest and latest timestamp, the
    acceptance label of the LATEST event only, and the X_COLS attributes of
    the EARLIEST event.
    """
    ordered = events.sort_values(TS_COL)
    stamps = ordered[TS_COL]
    earliest_event = ordered.iloc[0]
    latest_event = ordered.iloc[-1]
    elapsed = stamps.iloc[-1] - stamps.iloc[0]

    record = {
        CASE_COL: case_id,
        "pred_step_count": len(ordered),
        "predicted_time_spent": elapsed.total_seconds(),
        # Label comes strictly from the last event, even when it is missing there.
        "y_pred_accepted": latest_event.get(Y_COL, None),
    }
    for col in X_COLS:
        record[col] = earliest_event.get(col)
    return record


pred_case_rows = [
    _summarise_case(cid, events) for cid, events in pred_by_case.items()
]
df_pred_case = pd.DataFrame(pred_case_rows)

print("y_pred_accepted distribution (raw):")
print(df_pred_case["y_pred_accepted"].value_counts(dropna=False))

df_pred_case.head()


y_pred_accepted distribution (raw):
y_pred_accepted
True     26
False    16
None      9
Name: count, dtype: int64


Unnamed: 0,case:ApplicationID,pred_step_count,predicted_time_spent,y_pred_accepted,case:FirstWithdrawalAmount,case:CreditScore,case:OfferedAmount,case:NumberOfTerms
0,Application_1029247764,3,526492.843,False,5000.0,0,5000.0,100
1,Application_1030731936,4,492633.397,True,0.0,0,6000.0,138
2,Application_1037363488,4,1011605.185,True,6000.0,892,6000.0,48
3,Application_1056924645,3,315293.08,False,12956.0,0,20700.0,80
4,Application_1101006464,3,4601.484,,22317.0,0,47000.0,120


In [20]:
# NOTE(review): absolute, machine-specific path — consider a configurable data
# directory so the notebook can run elsewhere.
real_log = pd.read_csv("C:/Users/Deniz/Desktop/thesisData_part2.csv")  # adjust path

# Normalise case IDs to stripped strings, but keep missing IDs missing.
# A blanket .astype(str) would turn NaN into the literal string "nan", which
# could then be "matched" against a predicted case that carries the same
# placeholder. The mask is taken BEFORE the string conversion.
real_has_case_id = real_log[CASE_COL].notna()
real_log[CASE_COL] = real_log[CASE_COL].astype(str).str.strip().where(real_has_case_id)
real_log[TS_COL] = pd.to_datetime(real_log[TS_COL], utc=True, errors="coerce")

# filter to only predicted cases
real_log_sub = real_log[real_log[CASE_COL].isin(df_pred_case[CASE_COL])].copy()
real_log_sub = real_log_sub.sort_values([CASE_COL, TS_COL])

# One row per case with its ground-truth label. Uses Y_COL rather than a
# repeated literal so the label column is named in one place only.
# NOTE(review): groupby "last" skips missing values by default, so this is the
# last NON-NULL label of the case, whereas the predicted label is read strictly
# from the last event — confirm this asymmetry is intended.
real_case = (
    real_log_sub
    .groupby(CASE_COL)
    .agg(
        y_real_accepted=(Y_COL, "last"),
    )
    .reset_index()
)

print("Matched real cases:", real_case[CASE_COL].nunique())

# Surface predicted cases that have no counterpart in the real log; a later
# inner merge would otherwise drop them without any trace.
unmatched_case_ids = sorted(set(df_pred_case[CASE_COL]) - set(real_case[CASE_COL]))
print("Predicted cases without a real match:", unmatched_case_ids)

real_case.head()


Matched real cases: 50


Unnamed: 0,case:ApplicationID,y_real_accepted
0,Application_1029247764,True
1,Application_1030731936,True
2,Application_1037363488,True
3,Application_1056924645,False
4,Application_1101006464,False


In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Pair every predicted case with its real counterpart (unmatched cases are dropped).
df_case = df_pred_case.merge(real_case, on=CASE_COL, how="inner")

# Only cases where BOTH labels are proper booleans can be scored.
pred_is_bool = df_case["y_pred_accepted"].isin([True, False])
real_is_bool = df_case["y_real_accepted"].isin([True, False])
df_case = df_case.loc[pred_is_bool & real_is_bool].copy()

# Encode both labels as 0/1 integers.
df_case = df_case.astype({"y_pred_accepted": int, "y_real_accepted": int})

y_true = df_case["y_real_accepted"]
y_pred = df_case["y_pred_accepted"]

print("Cases used for metrics:", len(df_case))
print("Accuracy :", accuracy_score(y_true, y_pred))
# The remaining scores share the zero_division=0 setting.
for label, scorer in (
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-score :", f1_score),
):
    print(label, scorer(y_true, y_pred, zero_division=0))


Cases used for metrics: 41
Accuracy : 0.7804878048780488
Precision: 0.92
Recall   : 0.7666666666666667
F1-score : 0.8363636363636363


In [22]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import shap

# Explanatory model: a logistic regression fitted on case-level features, with
# coefficient and SHAP summary tables as output.

# Case-level inputs of the model, all taken from df_case.
FEATURES = [
    "predicted_time_spent",
    "case:FirstWithdrawalAmount",
    "case:CreditScore",
    "case:OfferedAmount",
    "case:NumberOfTerms",
    "pred_step_count",
]

# The model is fitted on the PREDICTED label, not the real one, so the tables
# below describe what drives the predictions rather than the true outcomes.
TARGET = "y_pred_accepted"

df_model = df_case[FEATURES + [TARGET]].copy()

# Force every feature to numeric; values that cannot be converted become NaN.
for c in FEATURES:
    df_model[c] = pd.to_numeric(df_model[c], errors="coerce")

# Drop any row with a missing feature or target.
df_model = df_model.dropna()

X = df_model[FEATURES]
y = df_model[TARGET].astype(int)

print("Rows:", len(df_model))
print("y distribution:")
print(y.value_counts())

# Standardise the features; the scaler is fitted on the same rows it transforms.
# Column names and index are restored so later tables stay labelled.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Fit the logistic regression on the standardised features.
model = LogisticRegression(max_iter=2000, solver="liblinear")
model.fit(X_scaled, y)

# Coefficient table, largest coefficient first. odds_ratio is exp(coef); since
# the inputs were standardised, each value refers to a one-standard-deviation
# change in that feature.
coef_df = (
    pd.DataFrame({
        "feature": X_scaled.columns,
        "coef": model.coef_[0],
        "odds_ratio": np.exp(model.coef_[0]),
    })
    .sort_values("coef", ascending=False)
    .reset_index(drop=True)
)

print("=== ALL COEFFICIENTS ===")
display(coef_df)

# SHAP values (tables only). X_scaled is passed both when building the explainer
# and as the rows to explain.
# NOTE(review): shap_values is assumed to be a single 2-D array (rows x features);
# confirm for the installed shap version.
explainer = shap.LinearExplainer(model, X_scaled)
shap_values = explainer.shap_values(X_scaled)

shap_matrix = pd.DataFrame(shap_values, columns=X_scaled.columns, index=X_scaled.index)

# Per-feature summary: mean absolute SHAP value (used for the ordering) and the
# signed mean SHAP value.
shap_overview = (
    pd.DataFrame({
        "feature": X_scaled.columns,
        "mean_abs_shap": shap_matrix.abs().mean().values,
        "mean_shap": shap_matrix.mean().values,
    })
    .sort_values("mean_abs_shap", ascending=False)
    .reset_index(drop=True)
)

print("=== SHAP OVERVIEW ===")
display(shap_overview)


Rows: 41
y distribution:
y_pred_accepted
1    25
0    16
Name: count, dtype: int64
=== ALL COEFFICIENTS ===


Unnamed: 0,feature,coef,odds_ratio
0,case:NumberOfTerms,0.361658,1.435708
1,case:CreditScore,0.354195,1.425033
2,case:FirstWithdrawalAmount,0.18579,1.204169
3,predicted_time_spent,0.092659,1.097088
4,case:OfferedAmount,-0.198133,0.820261
5,pred_step_count,-0.367712,0.692317


=== SHAP OVERVIEW ===


Unnamed: 0,feature,mean_abs_shap,mean_shap
0,case:CreditScore,0.347889,1.3539310000000001e-17
1,case:NumberOfTerms,0.331092,-5.4157220000000004e-18
2,pred_step_count,0.260918,2.030896e-18
3,case:FirstWithdrawalAmount,0.1509,-5.0772390000000004e-18
4,case:OfferedAmount,0.142748,2.030896e-18
5,predicted_time_spent,0.070617,3.3848259999999997e-19
