In [1]:
import re
import pandas as pd
import numpy as np

pred_path = "ftgemini2.jsonl"  # <-- your predictions text dump

def coerce_value(v: str):
    v = v.strip()
    if v == "" or v.lower() in {"null", "none", "nan"}:
        return None
    if v.lower() == "true":
        return True
    if v.lower() == "false":
        return False
    if re.fullmatch(r"-?\d+", v):
        return int(v)
    if re.fullmatch(r"-?\d+\.\d+", v):
        return float(v)
    return v

def parse_events_linewise(path: str):
    text = open(path, "r", encoding="utf-8").read()
    blocks = [b.strip() for b in re.split(r"\n\s*\n+", text) if b.strip()]

    events = []
    for block in blocks:
        d = {}
        for ln in block.splitlines():
            ln = ln.strip()
            if not ln:
                continue
            if re.match(r"^Event\s+\d+\s*:\s*$", ln, flags=re.I):
                continue

            # split only on ': ' once
            if ": " not in ln:
                continue
            key, val = ln.split(": ", 1)
            d[key.strip()] = coerce_value(val)

        if d:
            events.append(d)

    return events

pred_events = parse_events_linewise(pred_path)
pred_df = pd.DataFrame(pred_events)

print("Parsed predicted events:", len(pred_df))
print("Columns:", list(pred_df.columns)[:25])
pred_df.head()


Parsed predicted events: 208
Columns: ['id', 'org:resource', 'concept:name', 'time:timestamp', 'org:role', 'case:Rfp_id', 'case:Project', 'case:Task', 'case:concept:name', 'case:OrganizationalEntity', 'case:Cost Type', 'case:RequestedAmount', 'case:Activity', 'case:RfpNumber']


Unnamed: 0,id,org:resource,concept:name,time:timestamp,org:role,case:Rfp_id,case:Project,case:Task,case:concept:name,case:OrganizationalEntity,case:Cost Type,case:RequestedAmount,case:Activity,case:RfpNumber
0,st_step 159530_0,STAFF MEMBER,Request For Payment APPROVED by ADMINISTRATION,2018-02-20 15:43:24+00:00,ADMINISTRATION,request for payment 159525,project 155217,UNKNOWN,request for payment 159525,organizational unit 65454,0,983.391441,activity 505,request for payment number 159526
1,st_step 159528_0,STAFF MEMBER,Request For Payment APPROVED by BUDGET OWNER,2018-02-21 08:17:02+00:00,BUDGET OWNER,request for payment 159525,project 155217,UNKNOWN,request for payment 159525,organizational unit 65454,0,983.391441,activity 505,request for payment number 159526
2,st_step 159527_0,STAFF MEMBER,Request For Payment FINAL_APPROVED by SUPERVISOR,2018-02-22 12:35:47+00:00,SUPERVISOR,request for payment 159525,project 155217,UNKNOWN,request for payment 159525,organizational unit 65454,0,983.391441,activity 505,request for payment number 159526
3,rp_request for payment 159525_15,SYSTEM,Request Payment,2018-02-26 11:21:55+00:00,UNDEFINED,request for payment 159525,project 155217,UNKNOWN,request for payment 159525,organizational unit 65454,0,983.391441,activity 505,request for payment number 159526
4,rp_request for payment 159525_16,SYSTEM,Payment Handled,2018-03-01 17:31:08+00:00,UNDEFINED,request for payment 159525,project 155217,UNKNOWN,request for payment 159525,organizational unit 65454,0,983.391441,activity 505,request for payment number 159526


In [2]:
CASE_COL = "case:Rfp_id"
TS_COL   = "time:timestamp"
ACT_COL  = "concept:name"

pred_df[CASE_COL] = pred_df[CASE_COL].astype(str).str.strip()
pred_df[TS_COL] = pd.to_datetime(pred_df[TS_COL], utc=True, errors="coerce")

pred_df = pred_df.dropna(subset=[CASE_COL, TS_COL]).copy()

print("Pred timestamp parse rate:", pred_df[TS_COL].notna().mean())
print("Unique predicted cases:", pred_df[CASE_COL].nunique())
pred_df[[CASE_COL, TS_COL, ACT_COL]].head(10)


Pred timestamp parse rate: 1.0
Unique predicted cases: 50


Unnamed: 0,case:Rfp_id,time:timestamp,concept:name
0,request for payment 159525,2018-02-20 15:43:24+00:00,Request For Payment APPROVED by ADMINISTRATION
1,request for payment 159525,2018-02-21 08:17:02+00:00,Request For Payment APPROVED by BUDGET OWNER
2,request for payment 159525,2018-02-22 12:35:47+00:00,Request For Payment FINAL_APPROVED by SUPERVISOR
3,request for payment 159525,2018-02-26 11:21:55+00:00,Request Payment
4,request for payment 159525,2018-03-01 17:31:08+00:00,Payment Handled
5,request for payment 167294,2018-05-18 09:22:04+00:00,Request For Payment APPROVED by ADMINISTRATION
6,request for payment 167294,2018-05-21 14:46:31+00:00,Request For Payment APPROVED by BUDGET OWNER
7,request for payment 167294,2018-05-28 07:13:40+00:00,Request For Payment FINAL_APPROVED by SUPERVISOR
8,request for payment 167294,2018-05-28 11:05:12+00:00,Request Payment
9,request for payment 167294,2018-05-31 17:31:22+00:00,Payment Handled


In [3]:
pred_by_case = {
    cid: grp.sort_values(TS_COL)
    for cid, grp in pred_df.groupby(CASE_COL)
}

print("Predicted cases:", len(pred_by_case))


Predicted cases: 50


In [5]:
real_log = pd.read_csv("C:/Users/Deniz/Downloads/thesisData2.csv")  # <-- your real RFP log

real_log[CASE_COL] = real_log[CASE_COL].astype(str).str.strip()
real_log[TS_COL] = pd.to_datetime(real_log[TS_COL], utc=True, errors="coerce")

pred_case_ids = set(pred_by_case.keys())
real_log_sub = real_log[real_log[CASE_COL].isin(pred_case_ids)].copy()
real_log_sub = real_log_sub.sort_values([CASE_COL, TS_COL])

print("Real events after filtering:", real_log_sub.shape)
print("Matched real cases:", real_log_sub[CASE_COL].nunique())


Real events after filtering: (266, 15)
Matched real cases: 50


In [6]:
gt_map = real_log_sub.groupby(CASE_COL)[ACT_COL].apply(list).to_dict()
print("GT sequences built:", len(gt_map))


GT sequences built: 50


In [7]:
def normalize_pred_events_to_start_after_first(gt_seq, pred_events):
    if not gt_seq or pred_events is None or len(pred_events) == 0:
        return pred_events

    pred_names = pred_events[ACT_COL].tolist()

    gt0 = gt_seq[0]
    gt1 = gt_seq[1] if len(gt_seq) > 1 else None

    if pred_names and pred_names[0] == gt0:
        return pred_events.iloc[1:].copy()
    if gt1 is not None and pred_names and pred_names[0] == gt1:
        return pred_events.copy()
    if gt0 in pred_names:
        idx = pred_names.index(gt0)
        return pred_events.iloc[idx+1:].copy()
    return pred_events.copy()

norm_pred_by_case = {}

for case_id, grp in pred_by_case.items():
    gt_seq = gt_map.get(case_id, [])
    norm_pred_by_case[case_id] = normalize_pred_events_to_start_after_first(gt_seq, grp)

print("Example normalized lengths:",
      list((cid, len(pred_by_case[cid]), len(norm_pred_by_case[cid])) for cid in list(pred_by_case.keys())[:3]))


Example normalized lengths: [('request for payment 148298', 4, 4), ('request for payment 148339', 4, 4), ('request for payment 148598', 4, 4)]


In [8]:
def is_activity_known_fn(activity_value):
    # adjust rule if needed:
    # Here: known if non-null and not empty and not "nan"
    if activity_value is None:
        return 0
    s = str(activity_value).strip().lower()
    return 0 if (s == "" or s == "nan" or s == "none" or s == "null") else 1

case_rows = []

for case_id in pred_case_ids:
    # real events
    gt_events = real_log_sub[real_log_sub[CASE_COL] == case_id].sort_values(TS_COL)
    if gt_events.empty:
        continue

    real_first_ts = gt_events[TS_COL].iloc[0]
    real_last_ts  = gt_events[TS_COL].iloc[-1]
    real_step_count = len(gt_events)
    real_time_spent = (real_last_ts - real_first_ts).total_seconds()

    # predicted events (normalized)
    pred_events_norm = norm_pred_by_case.get(case_id)
    if pred_events_norm is None or len(pred_events_norm) == 0:
        pred_step_count = 0
        pred_last_ts = pd.NaT
        predicted_time_spent = np.nan
    else:
        pred_step_count = len(pred_events_norm)
        pred_last_ts = pred_events_norm[TS_COL].iloc[-1]
        predicted_time_spent = (pred_last_ts - real_first_ts).total_seconds()

    # X variables from real first row (stable per case)
    first_row = gt_events.iloc[0]
    requested_amount = first_row.get("case:RequestedAmount", np.nan)
    organizational_entity = first_row.get("case:OrganizationalEntity", None)
    project = first_row.get("case:Project", None)
    activity_val = first_row.get("case:Activity", None)
    is_activity_known = is_activity_known_fn(activity_val)

    # labels
    real_messy = int(real_step_count > 5)     # real has first step
    pred_messy = int(pred_step_count > 4)     # predictions start from second step

    case_rows.append({
        CASE_COL: case_id,
        "requested_amount": requested_amount,
        "organizational_entity": organizational_entity,
        "project": project,
        "is_activity_known": is_activity_known,
        "real_step_count": real_step_count,
        "pred_step_count": pred_step_count,
        "real_time_spent": real_time_spent,
        "predicted_time_spent": predicted_time_spent,
        "real_messy": real_messy,
        "pred_messy": pred_messy,
    })

df_case = pd.DataFrame(case_rows)
print("Case-level table:", df_case.shape)
df_case.head()


Case-level table: (50, 11)


Unnamed: 0,case:Rfp_id,requested_amount,organizational_entity,project,is_activity_known,real_step_count,pred_step_count,real_time_spent,predicted_time_spent,real_messy,pred_messy
0,request for payment 159525,983.391441,organizational unit 65454,project 155217,1,5,5,3203278.0,784075.0,0,1
1,request for payment 168286,123.277282,organizational unit 65468,project 147582,1,6,4,952907.0,348083.0,1,0
2,request for payment 177507,737.465719,organizational unit 65468,project 147546,1,5,4,1233044.0,628199.0,0,0
3,request for payment 166508,405.798802,organizational unit 65462,project 147556,1,5,4,1229591.0,624772.0,0,0
4,request for payment 167294,101.156679,organizational unit 65474,project 154479,1,5,5,3571821.0,1152607.0,0,1


In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

y_true = df_case["real_messy"]
y_pred = df_case["pred_messy"]

print("Accuracy :", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred, zero_division=0))
print("Recall   :", recall_score(y_true, y_pred, zero_division=0))
print("F1-score :", f1_score(y_true, y_pred, zero_division=0))

cm = confusion_matrix(y_true, y_pred)
pd.DataFrame(
    cm,
    index=["Real not messy (0)", "Real messy (1)"],
    columns=["Pred not messy (0)", "Pred messy (1)"]
)


Accuracy : 0.66
Precision: 0.5
Recall   : 0.29411764705882354
F1-score : 0.37037037037037035


Unnamed: 0,Pred not messy (0),Pred messy (1)
Real not messy (0),28,5
Real messy (1),12,5


In [10]:
import shap
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

FEATURES = ["requested_amount", "project", "organizational_entity", "is_activity_known"]
TARGET = "pred_messy"

df_model = df_case[FEATURES + [TARGET]].copy()

# encode categoricals with one-hot
df_model["project"] = df_model["project"].astype(str)
df_model["organizational_entity"] = df_model["organizational_entity"].astype(str)

X = pd.get_dummies(df_model[FEATURES], columns=["project", "organizational_entity"], drop_first=False)
y = df_model[TARGET].astype(int)

# scale numeric part (requested_amount, is_activity_known)
scaler = StandardScaler(with_mean=False)  # with_mean=False ok for sparse-like dummies
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Fit
model = LogisticRegression(max_iter=2000, solver="liblinear")
model.fit(X_scaled, y)

# Coefficients
import numpy as np
coef_df = (
    pd.DataFrame({
        "feature": X_scaled.columns,
        "coef": model.coef_[0],
        "odds_ratio": np.exp(model.coef_[0]),
    })
    .sort_values("coef", ascending=False)
    .reset_index(drop=True)
)

print("=== ALL COEFFICIENTS ===")
display(coef_df)

# SHAP (tables only)
explainer = shap.LinearExplainer(model, X_scaled)
shap_values = explainer.shap_values(X_scaled)

shap_matrix = pd.DataFrame(shap_values, columns=X_scaled.columns, index=X_scaled.index)

shap_overview = (
    pd.DataFrame({
        "feature": X_scaled.columns,
        "mean_abs_shap": shap_matrix.abs().mean().values,
        "mean_shap": shap_matrix.mean().values,
    })
    .sort_values("mean_abs_shap", ascending=False)
    .reset_index(drop=True)
)

print("=== SHAP OVERVIEW ===")
display(shap_overview)


=== ALL COEFFICIENTS ===


Unnamed: 0,feature,coef,odds_ratio
0,project_project 155217,0.784417,2.191128
1,organizational_entity_organizational unit 65468,0.685364,1.984494
2,project_project 149268,0.683642,1.98108
3,project_project 147572,0.608209,1.837138
4,project_UNKNOWN,0.58229,1.790133
5,organizational_entity_organizational unit 65474,0.288647,1.334621
6,project_project 154479,0.288647,1.334621
7,project_project 148052,0.2873,1.332823
8,organizational_entity_organizational unit 65473,0.2873,1.332823
9,organizational_entity_organizational unit 65458,0.276681,1.318746


=== SHAP OVERVIEW ===


Unnamed: 0,feature,mean_abs_shap,mean_shap
0,project_project 147546,0.970821,3.5527140000000005e-17
1,organizational_entity_organizational unit 65468,0.411218,-2.2204460000000003e-17
2,organizational_entity_organizational unit 65463,0.402601,-2.220446e-18
3,requested_amount,0.280068,-1.9984010000000002e-17
4,organizational_entity_organizational unit 65454,0.257766,1.3322680000000001e-17
5,project_project 155217,0.219637,7.771561e-18
6,organizational_entity_organizational unit 65469,0.205529,-8.881784e-18
7,project_project 503,0.201671,-1.7486010000000002e-17
8,project_project 149268,0.19142,7.21645e-18
9,organizational_entity_organizational unit 65462,0.182588,-4.440892e-18
