In [3]:
import json
from pathlib import Path

pred_path = Path("basegemini1.jsonl")  # change filename

pred_events = []
with pred_path.open("r", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue
        try:
            pred_events.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"Skipping line {i}: JSON error -> {e}")

print("Total predicted events:", len(pred_events))
print("Example event keys:", pred_events[0].keys())


Total predicted events: 247
Example event keys: dict_keys(['Unnamed: 0', 'Action', 'org:resource', 'concept:name', 'EventOrigin', 'EventID', 'lifecycle:transition', 'time:timestamp', 'case:concept:name', 'case:MonthlyCost', 'case:Selected', 'case:ApplicationID', 'case:FirstWithdrawalAmount', 'case:CreditScore', 'case:OfferedAmount', 'case:NumberOfTerms', 'case:Accepted', 'OfferID'])


In [4]:
import pandas as pd

CASE_COL = "case:ApplicationID"
TS_COL   = "time:timestamp"
ACT_COL  = "concept:name"
Y_COL    = "case:Accepted"

# Convert to DataFrame for easier handling
pred_df = pd.DataFrame(pred_events)

# Normalize types
pred_df[CASE_COL] = pred_df[CASE_COL].astype(str).str.strip()
pred_df[TS_COL] = pd.to_datetime(pred_df[TS_COL], utc=True, errors="coerce")

# Group into traces
pred_by_case = {
    cid: grp.sort_values(TS_COL).to_dict("records")
    for cid, grp in pred_df.groupby(CASE_COL)
}

print("Unique predicted cases:", len(pred_by_case))


Unique predicted cases: 50


In [5]:
real_log = pd.read_csv("C:/Users/Deniz/Desktop/thesisData_part2.csv")

real_log[CASE_COL] = real_log[CASE_COL].astype(str).str.strip()
real_log[TS_COL] = pd.to_datetime(real_log[TS_COL], utc=True, errors="coerce")

pred_case_ids = set(pred_by_case.keys())

real_log_sub = real_log[real_log[CASE_COL].isin(pred_case_ids)].copy()
real_log_sub = real_log_sub.sort_values([CASE_COL, TS_COL])

gt_map = real_log_sub.groupby(CASE_COL)[ACT_COL].apply(list).to_dict()

print("Matched real cases:", real_log_sub[CASE_COL].nunique())


Matched real cases: 50


In [6]:
def normalize_pred_events_to_start_after_first(gt_seq, pred_events, act_col=ACT_COL):
    if not gt_seq or not pred_events:
        return pred_events

    pred_names = [e.get(act_col) for e in pred_events]
    if not pred_names:
        return pred_events

    gt0 = gt_seq[0]
    gt1 = gt_seq[1] if len(gt_seq) > 1 else None
    p0  = pred_names[0]

    if p0 == gt0:
        return pred_events[1:]
    if gt1 is not None and p0 == gt1:
        return pred_events
    if gt0 in pred_names:
        idx = pred_names.index(gt0)
        return pred_events[idx+1:]
    return pred_events


norm_pred_by_case = {}

for case_id, events in pred_by_case.items():
    gt_seq = gt_map.get(case_id, [])
    norm_pred_by_case[case_id] = normalize_pred_events_to_start_after_first(gt_seq, events)


In [7]:
X_COLS = [
    "case:FirstWithdrawalAmount",
    "case:CreditScore",
    "case:OfferedAmount",
    "case:NumberOfTerms",
]

real_case = (
    real_log_sub
    .groupby(CASE_COL)
    .agg(
        real_first_ts=(TS_COL, "min"),
        real_last_ts=(TS_COL, "max"),
        real_step_count=(ACT_COL, "count"),
        **{c: (c, "first") for c in X_COLS},
    )
    .reset_index()
)

real_case["real_time_spent"] = (
    real_case["real_last_ts"] - real_case["real_first_ts"]
).dt.total_seconds()


In [8]:
pred_rows = []

for case_id, events in pred_by_case.items():
    norm_events = norm_pred_by_case.get(case_id, [])

    # y from LAST predicted event
    last_event = events[-1] if events else {}
    y_pred_accepted = last_event.get(Y_COL, None)

    # step count = number of EVENTS
    pred_step_count = len(norm_events)

    # predicted time spent = last predicted ts âˆ’ first real ts
    norm_times = [
        pd.to_datetime(e.get(TS_COL), utc=True, errors="coerce")
        for e in norm_events
        if TS_COL in e
    ]
    pred_last_ts = max([t for t in norm_times if pd.notna(t)], default=pd.NaT)

    pred_rows.append({
        CASE_COL: case_id,
        "y_pred_accepted": y_pred_accepted,
        "pred_step_count": pred_step_count,
        "pred_last_ts": pred_last_ts,
    })

pred_case = pd.DataFrame(pred_rows)

print("Predicted acceptance distribution:")
print(pred_case["y_pred_accepted"].value_counts(dropna=False))


Predicted acceptance distribution:
y_pred_accepted
True     28
False    21
False     1
Name: count, dtype: int64


In [9]:
df_case = real_case.merge(pred_case, on=CASE_COL, how="inner")

df_case["predicted_time_spent"] = (
    df_case["pred_last_ts"] - df_case["real_first_ts"]
).dt.total_seconds()

# clean y
df_case = df_case[df_case["y_pred_accepted"].isin([True, False])].copy()
df_case["y_pred_accepted"] = df_case["y_pred_accepted"].astype(int)

df_case.head()


Unnamed: 0,case:ApplicationID,real_first_ts,real_last_ts,real_step_count,case:FirstWithdrawalAmount,case:CreditScore,case:OfferedAmount,case:NumberOfTerms,real_time_spent,y_pred_accepted,pred_step_count,pred_last_ts,predicted_time_spent
0,Application_1029247764,2016-07-16 08:31:25.076000+00:00,2016-07-26 13:21:00.362000+00:00,5,5000.0,0,5000.0,100,881375.286,1,6,2016-07-16 09:07:29.627000+00:00,2164.551
1,Application_1030731936,2016-07-15 13:40:04.844000+00:00,2016-08-29 18:32:13.126000+00:00,5,0.0,0,6000.0,138,3905528.282,1,5,2016-07-15 13:40:05.564000+00:00,0.72
2,Application_1037363488,2016-07-15 13:29:51.190000+00:00,2016-07-22 14:21:14.151000+00:00,5,6000.0,892,6000.0,48,607882.961,0,6,2016-07-15 14:17:51.190000+00:00,2880.0
3,Application_1056924645,2016-07-15 16:10:56.236000+00:00,2016-07-25 12:14:39.185000+00:00,9,12956.0,0,20700.0,80,849822.949,1,2,NaT,
5,Application_1213104145,2016-07-15 15:31:18.015000+00:00,2016-07-26 13:44:53.918000+00:00,5,0.0,889,8000.0,45,944015.903,1,3,2016-07-15 15:31:52.687000+00:00,34.672


In [10]:
# real acceptance from real log (last real event)
real_accept_map = (
    real_log_sub
    .sort_values([CASE_COL, TS_COL])
    .groupby(CASE_COL)["case:Accepted"]
    .last()
    .to_dict()
)

df_case["y_real_accepted"] = df_case[CASE_COL].map(real_accept_map)

df_metrics = df_case[
    df_case["y_real_accepted"].isin([True, False])
].copy()

df_metrics["y_real_accepted"] = df_metrics["y_real_accepted"].astype(int)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_true = df_metrics["y_real_accepted"]
y_pred = df_metrics["y_pred_accepted"]

print("Accuracy :", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred, zero_division=0))
print("Recall   :", recall_score(y_true, y_pred, zero_division=0))
print("F1-score :", f1_score(y_true, y_pred, zero_division=0))


Accuracy : 0.42857142857142855
Precision: 0.6428571428571429
Recall   : 0.5
F1-score : 0.5625


In [11]:
FEATURES = [
    "predicted_time_spent",
    "case:FirstWithdrawalAmount",
    "case:CreditScore",
    "case:OfferedAmount",
    "case:NumberOfTerms",
    "pred_step_count",
]

TARGET = "y_pred_accepted"

# df_case should contain these columns (from the case-level build)
df_model_small = df_case[FEATURES + [TARGET]].copy()

# keep only valid boolean targets
df_model_small = df_model_small[df_model_small[TARGET].isin([0, 1, True, False])].copy()
df_model_small[TARGET] = df_model_small[TARGET].astype(int)

# ensure numeric X
for c in FEATURES:
    df_model_small[c] = pd.to_numeric(df_model_small[c], errors="coerce")

df_model_small = df_model_small.dropna()

X = df_model_small[FEATURES].copy()
y = df_model_small[TARGET].copy()

print("Rows:", len(df_model_small))
print("y distribution:")
print(y.value_counts())
X.head()


Rows: 47
y distribution:
y_pred_accepted
1    27
0    20
Name: count, dtype: int64


Unnamed: 0,predicted_time_spent,case:FirstWithdrawalAmount,case:CreditScore,case:OfferedAmount,case:NumberOfTerms,pred_step_count
0,2164.551,5000.0,0,5000.0,100,6
1,0.72,0.0,0,6000.0,138,5
2,2880.0,6000.0,892,6000.0,48,6
5,34.672,0.0,889,8000.0,45,3
6,0.0,20000.0,713,20000.0,120,1


In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index
)

model = LogisticRegression(max_iter=2000, solver="liblinear")
model.fit(X_scaled, y)

coef_df = (
    pd.DataFrame({
        "feature": X_scaled.columns,
        "coef": model.coef_[0],
        "odds_ratio": np.exp(model.coef_[0])
    })
    .sort_values("coef", ascending=False)
    .reset_index(drop=True)
)

coef_df

Unnamed: 0,feature,coef,odds_ratio
0,pred_step_count,0.734887,2.085247
1,case:CreditScore,0.509831,1.665009
2,case:NumberOfTerms,-0.027496,0.972879
3,case:OfferedAmount,-0.188125,0.828511
4,predicted_time_spent,-0.224737,0.798726
5,case:FirstWithdrawalAmount,-0.98824,0.372231


In [16]:
import shap

explainer = shap.LinearExplainer(model, X_scaled)
shap_values = explainer.shap_values(X_scaled)

shap_matrix = pd.DataFrame(shap_values, columns=X_scaled.columns, index=X_scaled.index)

# Global SHAP importance table
shap_overview = (
    pd.DataFrame({
        "feature": X_scaled.columns,
        "mean_abs_shap": shap_matrix.abs().mean().values,
        "mean_shap": shap_matrix.mean().values
    })
    .sort_values("mean_abs_shap", ascending=False)
    .reset_index(drop=True)
)

print("=== Coefficients ===")
display(coef_df)

print("\n=== SHAP overview (global) ===")
display(shap_overview)

=== Coefficients ===


Unnamed: 0,feature,coef,odds_ratio
0,pred_step_count,0.734887,2.085247
1,case:CreditScore,0.509831,1.665009
2,case:NumberOfTerms,-0.027496,0.972879
3,case:OfferedAmount,-0.188125,0.828511
4,predicted_time_spent,-0.224737,0.798726
5,case:FirstWithdrawalAmount,-0.98824,0.372231



=== SHAP overview (global) ===


Unnamed: 0,feature,mean_abs_shap,mean_shap
0,case:FirstWithdrawalAmount,0.801987,-6.318823000000001e-17
1,pred_step_count,0.631069,-3.307047e-17
2,case:CreditScore,0.50209,-1.1810880000000001e-17
3,case:OfferedAmount,0.141278,7.677074e-18
4,predicted_time_spent,0.095667,-2.362177e-18
5,case:NumberOfTerms,0.025313,1.033452e-18
