In [2]:
# Event-log column names used throughout the notebook.
CASE_COL = "case:ApplicationID"  # key that groups events into cases
ACT_COL = "concept:name"         # activity label of an event
TS_COL = "time:timestamp"        # event timestamp (parsed as UTC later on)
Y_COL = "case:Accepted"          # acceptance flag read from events

# Case attributes carried into the case-level feature table.
X_COLS = [
    "case:FirstWithdrawalAmount",
    "case:CreditScore",
    "case:OfferedAmount",
    "case:NumberOfTerms",
]
import json
from pathlib import Path

pred_path = Path("basegpt1.jsonl")   # change filename
# Read the whole prediction file as UTF-8 text; arrays are extracted from it below.
with pred_path.open("r", encoding="utf-8") as pred_file:
    text = pred_file.read()

def extract_top_level_json_arrays(text: str):
    """
    Collect every JSON array embedded in ``text``, in order of appearance.

    The text is scanned for ``[``. At each one the standard-library JSON
    decoder tries to parse a complete value starting there. On success the
    array is kept and scanning resumes right after its closing ``]``, so
    arrays nested inside a kept array are not reported separately.

    A ``[`` that does not start valid JSON (prose such as ``[draft]``, or an
    array cut off at the end of the text) is skipped instead of raising, and
    scanning resumes at the next character.

    Parameters
    ----------
    text : str
        Raw text that may contain JSON arrays mixed with other content.

    Returns
    -------
    list
        The decoded arrays, each a Python ``list``. Empty when none is found.
    """
    decoder = json.JSONDecoder()
    arrays = []
    pos = text.find('[')

    while pos != -1:
        try:
            value, end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            # Not a parsable array at this bracket: advance a single character
            # so a valid array nested inside the broken span is still found.
            end = pos + 1
        else:
            arrays.append(value)
        pos = text.find('[', end)

    return arrays

pred_traces = extract_top_level_json_arrays(text)
print("Predicted traces:", len(pred_traces))

# Report the size of the first trace, or None when nothing was extracted.
if pred_traces:
    first_trace_len = len(pred_traces[0])
else:
    first_trace_len = None
print("First trace length:", first_trace_len)


Predicted traces: 50
First trace length: 4


In [3]:
# Index predicted traces by the case ID found on their first event.
# If several traces share a case ID, the last one wins.
pred_by_case = {}
skipped_traces = 0

for trace in pred_traces:
    if not trace:
        continue

    first_event = trace[0]
    raw_case_id = first_event.get(CASE_COL) if isinstance(first_event, dict) else None
    if raw_case_id is None:
        # Without a case ID the trace cannot be matched to the real log;
        # storing it under the string "None" would only add a bogus case.
        skipped_traces += 1
        continue

    pred_by_case[str(raw_case_id).strip()] = trace

print("Unique predicted cases:", len(pred_by_case))
if skipped_traces:
    print("Traces skipped (no case ID on first event):", skipped_traces)


Unique predicted cases: 50


In [5]:
import pandas as pd

# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
real_log = pd.read_csv("C:/Users/Deniz/Desktop/thesisData_part2.csv")   # change filename

# Normalise the join key the same way as the predicted case IDs (string, stripped)
# and parse timestamps as UTC; unparseable values become NaT.
real_log[CASE_COL] = real_log[CASE_COL].astype(str).str.strip()
real_log[TS_COL] = pd.to_datetime(real_log[TS_COL], utc=True, errors="coerce")

pred_case_ids = set(pred_by_case.keys())

# Keep only the real events of cases that have a prediction, ordered per case by time.
has_prediction = real_log[CASE_COL].isin(pred_case_ids)
real_log_sub = real_log.loc[has_prediction].copy().sort_values([CASE_COL, TS_COL])

# case ID -> ordered list of real activity names
gt_map = {
    case_id: list(activities)
    for case_id, activities in real_log_sub.groupby(CASE_COL)[ACT_COL]
}

print("Matched real cases:", real_log_sub[CASE_COL].nunique())


Matched real cases: 50


In [6]:
def normalize_pred_events_to_start_after_first(gt_seq, pred_events, act_col=ACT_COL):
    """
    Align a predicted trace so it represents the continuation after the first real event.

    Activity names are NOT de-duplicated; every event counts as one step.

    Parameters
    ----------
    gt_seq : list
        Real activity names of the case, in order.
    pred_events : list of dict
        Predicted events; the activity name is read with ``event.get(act_col)``.
    act_col : str
        Key that holds the activity name in each predicted event.

    Returns
    -------
    tuple
        ``(events, status)`` where ``status`` is one of:

        - ``"empty_or_no_gt"``: either input is empty; ``pred_events`` is returned as is.
        - ``"dropped_prefix_gt0"``: the prediction starts with the first real
          activity, which is dropped.
        - ``"already_next_step"``: the prediction starts with the second real
          activity; returned unchanged.
        - ``"trimmed_to_gt0_inside_pred"``: the first real activity occurs later in
          the prediction; everything up to and including its first occurrence is dropped.
        - ``"unaligned"``: none of the above; returned unchanged.
    """
    if not gt_seq or not pred_events:
        return pred_events, "empty_or_no_gt"

    # pred_events is non-empty here, so pred_names has at least one entry.
    pred_names = [e.get(act_col) for e in pred_events]

    gt0 = gt_seq[0]
    gt1 = gt_seq[1] if len(gt_seq) > 1 else None
    p0 = pred_names[0]

    if p0 == gt0:
        return pred_events[1:], "dropped_prefix_gt0"
    if gt1 is not None and p0 == gt1:
        return pred_events, "already_next_step"

    # Single scan: locate the first occurrence of gt0, if any.
    try:
        idx = pred_names.index(gt0)
    except ValueError:
        return pred_events, "unaligned"
    return pred_events[idx + 1:], "trimmed_to_gt0_inside_pred"


norm_pred_by_case = {}
status_rows = []

for case_id, raw_trace in pred_by_case.items():
    # Only entries that carry an activity name count as events.
    events_with_activity = [event for event in raw_trace if ACT_COL in event]
    real_activities = gt_map.get(case_id, [])

    aligned_events, align_status = normalize_pred_events_to_start_after_first(
        real_activities, events_with_activity
    )

    norm_pred_by_case[case_id] = aligned_events
    status_rows.append(
        (
            case_id,
            align_status,
            len(events_with_activity),
            len(aligned_events),
            len(real_activities),
        )
    )

# One row per case: alignment outcome plus raw / normalised / real event counts.
status_df = pd.DataFrame(
    status_rows,
    columns=[CASE_COL, "status", "pred_events_raw", "pred_events_norm", "gt_len"],
)
status_df["status"].value_counts()


status
unaligned    50
Name: count, dtype: int64

In [7]:
# Per-case aggregations: time span, number of events, and the first value of
# every feature column in X_COLS.
case_aggregations = {
    "real_first_ts": (TS_COL, "min"),
    "real_last_ts": (TS_COL, "max"),
    "real_step_count": (ACT_COL, "count"),
}
case_aggregations.update({c: (c, "first") for c in X_COLS})

real_case = real_log_sub.groupby(CASE_COL).agg(**case_aggregations).reset_index()

# Real case duration in seconds (last event minus first event).
real_duration = real_case["real_last_ts"] - real_case["real_first_ts"]
real_case["real_time_spent"] = real_duration.dt.total_seconds()
real_case.head()


Unnamed: 0,case:ApplicationID,real_first_ts,real_last_ts,real_step_count,case:FirstWithdrawalAmount,case:CreditScore,case:OfferedAmount,case:NumberOfTerms,real_time_spent
0,Application_1029247764,2016-07-16 08:31:25.076000+00:00,2016-07-26 13:21:00.362000+00:00,5,5000.0,0,5000.0,100,881375.286
1,Application_1030731936,2016-07-15 13:40:04.844000+00:00,2016-08-29 18:32:13.126000+00:00,5,0.0,0,6000.0,138,3905528.282
2,Application_1037363488,2016-07-15 13:29:51.190000+00:00,2016-07-22 14:21:14.151000+00:00,5,6000.0,892,6000.0,48,607882.961
3,Application_1056924645,2016-07-15 16:10:56.236000+00:00,2016-07-25 12:14:39.185000+00:00,9,12956.0,0,20700.0,80,849822.949
4,Application_1101006464,2016-07-15 13:54:09.652000+00:00,2016-07-28 08:09:44.724000+00:00,10,22317.0,0,47000.0,120,1102535.072


In [8]:
import numpy as np

pred_rows = []


def _event_ts(event):
    """Return the event's timestamp as a UTC Timestamp, or NaT when missing or unparseable."""
    ts = pd.to_datetime(event.get(TS_COL), utc=True, errors="coerce")
    # pd.to_datetime(None) returns None rather than NaT.
    return pd.NaT if ts is None else ts


def _ts_sort_key(event):
    """Sort key giving a total order: undated events first, then by timestamp."""
    ts = _event_ts(event)
    # NaT compares False against everything, which would leave sorted() with an
    # inconsistent ordering; the leading flag keeps NaT out of the comparisons.
    return (0, 0) if pd.isna(ts) else (1, ts)


for case_id, trace in pred_by_case.items():
    # Order events by time. The sort is stable, so ties and undated events
    # keep their original relative order.
    trace_sorted = sorted(trace, key=_ts_sort_key)

    # y comes from the LAST event of the raw (un-normalised) predicted trace
    last_event = trace_sorted[-1] if trace_sorted else {}
    y_pred_accepted = last_event.get(Y_COL, None)

    # normalized predicted continuation EVENTS (steps counted as events)
    norm_events = norm_pred_by_case.get(case_id, [])
    pred_step_count = len(norm_events)

    # predicted last timestamp from normalized events; NaT when none is usable
    valid_times = [ts for ts in map(_event_ts, norm_events) if pd.notna(ts)]
    pred_last_ts = max(valid_times, default=pd.NaT)

    pred_rows.append({
        CASE_COL: case_id,
        "y_pred_accepted": y_pred_accepted,
        "pred_step_count": pred_step_count,
        "pred_last_ts": pred_last_ts,
    })

pred_case = pd.DataFrame(pred_rows)

print("y_pred_accepted distribution (raw):")
print(pred_case["y_pred_accepted"].value_counts(dropna=False))

pred_case.head()


y_pred_accepted distribution (raw):
y_pred_accepted
True     29
False    21
Name: count, dtype: int64


Unnamed: 0,case:ApplicationID,y_pred_accepted,pred_step_count,pred_last_ts
0,Application_1218775385,True,4,2016-07-15 13:27:00+00:00
1,Application_1703731762,False,4,2016-07-15 13:26:30+00:00
2,Application_1224457292,True,4,2016-07-15 13:30:00+00:00
3,Application_1808303712,False,4,2016-07-15 13:31:00+00:00
4,Application_426309193,False,4,2016-07-15 13:31:00+00:00


In [9]:
# One row per case present in both the real and the predicted tables.
df_case = pd.merge(real_case, pred_case, how="inner", on=CASE_COL)

# Predicted duration in seconds: last predicted timestamp minus first REAL timestamp.
predicted_elapsed = df_case["pred_last_ts"] - df_case["real_first_ts"]
df_case["predicted_time_spent"] = predicted_elapsed.dt.total_seconds()

# Keep only True/False accepted predictions
has_bool_label = df_case["y_pred_accepted"].isin([True, False])
df_case = df_case.loc[has_bool_label].copy()
df_case["y_pred_accepted"] = df_case["y_pred_accepted"].astype(int)

df_case.head()


Unnamed: 0,case:ApplicationID,real_first_ts,real_last_ts,real_step_count,case:FirstWithdrawalAmount,case:CreditScore,case:OfferedAmount,case:NumberOfTerms,real_time_spent,y_pred_accepted,pred_step_count,pred_last_ts,predicted_time_spent
0,Application_1029247764,2016-07-16 08:31:25.076000+00:00,2016-07-26 13:21:00.362000+00:00,5,5000.0,0,5000.0,100,881375.286,0,4,2016-07-16 08:35:00+00:00,214.924
1,Application_1030731936,2016-07-15 13:40:04.844000+00:00,2016-08-29 18:32:13.126000+00:00,5,0.0,0,6000.0,138,3905528.282,0,4,2016-07-15 13:44:00+00:00,235.156
2,Application_1037363488,2016-07-15 13:29:51.190000+00:00,2016-07-22 14:21:14.151000+00:00,5,6000.0,892,6000.0,48,607882.961,1,4,2016-07-15 13:34:00+00:00,248.81
3,Application_1056924645,2016-07-15 16:10:56.236000+00:00,2016-07-25 12:14:39.185000+00:00,9,12956.0,0,20700.0,80,849822.949,0,4,2016-07-15 16:14:00+00:00,183.764
4,Application_1101006464,2016-07-15 13:54:09.652000+00:00,2016-07-28 08:09:44.724000+00:00,10,22317.0,0,47000.0,120,1102535.072,0,4,2016-07-15 13:58:00+00:00,230.348


In [10]:
numeric_cols = X_COLS + [
    "real_step_count",
    "pred_step_count",
    "real_time_spent",
    "predicted_time_spent",
]

df_model = df_case[numeric_cols + ["y_pred_accepted"]].copy()

# ensure numeric (values that cannot be converted become NaN)
df_model[numeric_cols] = df_model[numeric_cols].apply(pd.to_numeric, errors="coerce")

# Dropping rows with any missing value also covers a missing target.
df_model = df_model.dropna()
print("Model rows:", len(df_model))
print("y distribution:")
print(df_model["y_pred_accepted"].value_counts())

df_model.head()


Model rows: 50
y distribution:
y_pred_accepted
1    29
0    21
Name: count, dtype: int64


Unnamed: 0,case:FirstWithdrawalAmount,case:CreditScore,case:OfferedAmount,case:NumberOfTerms,real_step_count,pred_step_count,real_time_spent,predicted_time_spent,y_pred_accepted
0,5000.0,0,5000.0,100,5,4,881375.286,214.924,0
1,0.0,0,6000.0,138,5,4,3905528.282,235.156,0
2,6000.0,892,6000.0,48,5,4,607882.961,248.81,1
3,12956.0,0,20700.0,80,9,4,849822.949,183.764,0
4,22317.0,0,47000.0,120,10,4,1102535.072,230.348,0


In [11]:
import pandas as pd
import numpy as np

# Inputs of the surrogate model; only predicted (not real) time and step counts are used.
FEATURES = [
    "predicted_time_spent",
    "case:FirstWithdrawalAmount",
    "case:CreditScore",
    "case:OfferedAmount",
    "case:NumberOfTerms",
    "pred_step_count",
]

TARGET = "y_pred_accepted"

# df_case should contain these columns (from the case-level build)
df_model_small = df_case.loc[:, [*FEATURES, TARGET]].copy()

# keep only valid boolean targets
binary_target = df_model_small[TARGET].isin([0, 1, True, False])
df_model_small = df_model_small.loc[binary_target].copy()
df_model_small[TARGET] = df_model_small[TARGET].astype(int)

# ensure numeric X (values that cannot be converted become NaN and are dropped next)
df_model_small[FEATURES] = df_model_small[FEATURES].apply(pd.to_numeric, errors="coerce")
df_model_small = df_model_small.dropna()

X = df_model_small[FEATURES].copy()
y = df_model_small[TARGET].copy()

print("Rows:", len(df_model_small))
print("y distribution:")
print(y.value_counts())
X.head()


Rows: 50
y distribution:
y_pred_accepted
1    29
0    21
Name: count, dtype: int64


Unnamed: 0,predicted_time_spent,case:FirstWithdrawalAmount,case:CreditScore,case:OfferedAmount,case:NumberOfTerms,pred_step_count
0,214.924,5000.0,0,5000.0,100,4
1,235.156,0.0,0,6000.0,138,4
2,248.81,6000.0,892,6000.0,48,4
3,183.764,12956.0,0,20700.0,80,4
4,230.348,22317.0,0,47000.0,120,4


In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Standardise the features so coefficient magnitudes are comparable across features.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

model = LogisticRegression(solver="liblinear", max_iter=2000)
model.fit(X_scaled, y)

# Binary problem: a single coefficient row. exp(coef) is the odds ratio for a
# one-standard-deviation increase of the (standardised) feature.
log_odds = model.coef_[0]
coef_df = pd.DataFrame({
    "feature": X_scaled.columns,
    "coef": log_odds,
    "odds_ratio": np.exp(log_odds),
})
coef_df = coef_df.sort_values("coef", ascending=False).reset_index(drop=True)

coef_df


Unnamed: 0,feature,coef,odds_ratio
0,case:CreditScore,1.640691,5.158731
1,predicted_time_spent,0.957547,2.605297
2,case:FirstWithdrawalAmount,0.409492,1.506052
3,pred_step_count,-0.235652,0.790056
4,case:OfferedAmount,-0.48678,0.614602
5,case:NumberOfTerms,-0.587341,0.555803


In [13]:
import shap

# SHAP values of the fitted linear model, using the scaled features as background data.
explainer = shap.LinearExplainer(model, X_scaled)
shap_values = explainer.shap_values(X_scaled)

shap_matrix = pd.DataFrame(shap_values, index=X_scaled.index, columns=X_scaled.columns)

# Global SHAP importance table
mean_abs_per_feature = shap_matrix.abs().mean()
mean_per_feature = shap_matrix.mean()
shap_overview = pd.DataFrame({
    "feature": X_scaled.columns,
    "mean_abs_shap": mean_abs_per_feature.values,
    "mean_shap": mean_per_feature.values,
})
shap_overview = (
    shap_overview
    .sort_values("mean_abs_shap", ascending=False)
    .reset_index(drop=True)
)

print("=== Coefficients ===")
display(coef_df)

print("\n=== SHAP overview (global) ===")
display(shap_overview)


=== Coefficients ===


Unnamed: 0,feature,coef,odds_ratio
0,case:CreditScore,1.640691,5.158731
1,predicted_time_spent,0.957547,2.605297
2,case:FirstWithdrawalAmount,0.409492,1.506052
3,pred_step_count,-0.235652,0.790056
4,case:OfferedAmount,-0.48678,0.614602
5,case:NumberOfTerms,-0.587341,0.555803



=== SHAP overview (global) ===


Unnamed: 0,feature,mean_abs_shap,mean_shap
0,case:CreditScore,1.619781,9.325873e-17
1,case:NumberOfTerms,0.535549,-4.2188470000000006e-17
2,predicted_time_spent,0.387886,-3.2196470000000005e-17
3,case:OfferedAmount,0.367795,5.551115e-18
4,case:FirstWithdrawalAmount,0.336363,-1.44329e-17
5,pred_step_count,0.065983,-3.885781e-18


In [14]:
# Real acceptance per case: the last non-null value of the outcome column in
# timestamp order. Uses Y_COL, the same column the predicted label is read from.
real_accept_map = (
    real_log_sub
    .sort_values([CASE_COL, TS_COL])
    .groupby(CASE_COL)[Y_COL]
    .last()
    .to_dict()
)

df_case["y_real_accepted"] = df_case[CASE_COL].map(real_accept_map)

# Keep only cases where both labels exist
df_metrics = df_case[
    df_case["y_real_accepted"].isin([True, False]) &
    df_case["y_pred_accepted"].isin([0, 1])
].copy()

df_metrics["y_real_accepted"] = df_metrics["y_real_accepted"].astype(int)
df_metrics["y_pred_accepted"] = df_metrics["y_pred_accepted"].astype(int)

print("Cases used for metrics:", len(df_metrics))
print("Real acceptance distribution:")
print(df_metrics["y_real_accepted"].value_counts())
print("Predicted acceptance distribution:")
print(df_metrics["y_pred_accepted"].value_counts())


Cases used for metrics: 50
Real acceptance distribution:
y_real_accepted
1    36
0    14
Name: count, dtype: int64
Predicted acceptance distribution:
y_pred_accepted
1    29
0    21
Name: count, dtype: int64


In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

y_true = df_metrics["y_real_accepted"]
y_pred = df_metrics["y_pred_accepted"]

# Classification metrics of the predicted label against the real label.
# zero_division=0 makes an undefined precision/recall/F1 count as 0.
metrics = {"accuracy": accuracy_score(y_true, y_pred)}
for metric_name, scorer in (
    ("precision", precision_score),
    ("recall", recall_score),
    ("f1_score", f1_score),
):
    metrics[metric_name] = scorer(y_true, y_pred, zero_division=0)

# Metrics as rows, one column for this model run.
metrics_df = pd.DataFrame(metrics, index=["BaseGPT1"]).T
metrics_df


Unnamed: 0,BaseGPT1
accuracy,0.54
precision,0.724138
recall,0.583333
f1_score,0.646154
