In [1]:
import sys
sys.path.append('..')
import datetime
import time
from collections import Counter, defaultdict, deque
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
from kneed import KneeLocator
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, log_loss, roc_auc_score
import json
from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

In [2]:
import warnings

# suppress only the “y_pred values do not sum to one” warning
warnings.filterwarnings(
    "ignore",
    message=".*y_pred values do not sum to one.*"
)

In [3]:
def cross_entropy_loss(model, x_test, y_test):
    """
    Per-sample cross-entropy of the model's predicted distribution.

    For each sample i:
      loss_i = -∑_c [1{c = y_true_i} · log P_model(c | x_i)]

    Parameters
    ----------
    model : object exposing ``predict_proba(x)`` and ``classes_``
    x_test : features passed straight to ``model.predict_proba``
    y_test : iterable of true labels, one per row of ``x_test``

    Returns
    -------
    ndarray with one loss value per entry of ``y_test``.

    Special cases
    -------------
    * True label missing from ``model.classes_``: a fixed fallback loss is
      returned (``log_loss`` of a completely wrong two-class prediction, plus 1).
    * Model that knows a single class: 0.0 if that class is the true label,
      otherwise the fallback loss.
    """
    probabilities = model.predict_proba(x_test)
    # loss of a maximally wrong prediction, shifted by 1 so it exceeds any real loss
    fallback_loss = log_loss([[1, 0]], [[0, 1]]) + 1

    def _loss_for(position, label):
        row = probabilities[position]
        known_classes = model.classes_

        # degenerate model: only one class was ever seen during fit
        if row.size == 1:
            return 0.0 if known_classes[0] == label else fallback_loss

        # column of the true label, if the model knows it at all
        matches = np.where(known_classes == label)[0]
        if matches.size == 0:
            return fallback_loss

        onehot = np.zeros_like(row)
        onehot[matches[0]] = 1

        # renormalise both vectors before handing them to log_loss
        row = row / row.sum()
        onehot = onehot / onehot.sum()
        return log_loss([onehot], [row])

    return np.array([_loss_for(pos, label) for pos, label in enumerate(y_test)])


In [4]:
def normal_loss(model, x_test, y_test):
    """
    Per-sample "one minus probability of the true class" loss.

    For each sample i:
      loss_i = 1 - P_model(y_true_i | x_i)

    Parameters
    ----------
    model : object exposing ``predict_proba(x)`` and ``classes_``
    x_test : features passed straight to ``model.predict_proba``
    y_test : iterable of true labels, one per row of ``x_test``

    Returns
    -------
    ndarray with one loss per entry of ``y_test``. A true label that does not
    appear in ``model.classes_`` yields the sentinel value 1.1.
    """
    # shape (n_samples, n_classes)
    probabilities = model.predict_proba(x_test)

    def _loss_for(position, label):
        row = probabilities[position]
        # column of the true label inside model.classes_
        hits = np.where(model.classes_ == label)[0]
        if hits.size == 0:
            return 1.1
        return 1 - row[hits[0]]

    return np.array([_loss_for(pos, label) for pos, label in enumerate(y_test)])

In [5]:
def safe_transform_target(encoder, targets, unknown_value=-1):
    """
    Encode ``targets`` with ``encoder``, tolerating labels it has never seen.

    Each target found in ``encoder.classes_`` is encoded through
    ``encoder.transform``; any other target is replaced by ``unknown_value``.
    Returns the encoded values as an ndarray, in input order.
    """
    known_labels = set(encoder.classes_)
    return np.array([
        encoder.transform([item])[0] if item in known_labels else unknown_value
        for item in targets
    ])

In [6]:
def get_clean_loss(normal_loss_value, cross_entropy_loss_value):
    """
    Drop every position whose normal loss is exactly 1.

    Both sequences are filtered with the same positions, so the returned
    arrays stay aligned.

    Returns
    -------
    (normal_losses, cross_entropy_losses) : tuple of ndarrays
    """
    kept_pairs = [
        (value, cross_entropy_loss_value[position])
        for position, value in enumerate(normal_loss_value)
        if value != 1
    ]
    kept_normal = np.array([normal for normal, _ in kept_pairs])
    kept_cross = np.array([cross for _, cross in kept_pairs])
    return kept_normal, kept_cross

In [7]:
def cleaning_cls_result(classification_result):
    """
    Make sure every per-key report contains an entry for class '1'.

    Prints each key together with the keys of its report. Any report without
    a '1' entry receives an all-zero one. The mapping is modified in place
    and also returned.
    """
    for name in classification_result.keys():
        report = classification_result[name]
        print(name, report.keys())

        if '1' in report.keys():
            continue
        # class '1' never appeared for this key: insert an all-zero placeholder
        report['1'] = {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0.0}
    return classification_result

In [8]:
def sample_with_min_anomalies(gt_labels, num_samples=10, min_anomalies=3, random_state=None):
    """
    Randomly sample up to `num_samples` indices from gt_labels (0/1 array),
    including `min_anomalies` true-anomaly (1) indices whenever possible.

    Parameters
    ----------
    gt_labels : array-like, shape (n_samples,)
        Ground-truth labels (0 = normal, 1 = anomaly).
    num_samples : int, default=10
        Maximum total number of indices to sample.
    min_anomalies : int, default=3
        Number of anomaly indices to include. It is capped by the number of
        anomalies available and by `num_samples`, so the result never holds
        more than `num_samples` indices.
    random_state : int or None
        Seed for reproducibility. When given, NumPy's *global* random state
        is reseeded with it (a side effect callers should be aware of).

    Returns
    -------
    selected_indices : ndarray, shape (<= num_samples,)
        Shuffled indices. The anomaly quota is filled first; all remaining
        slots are filled from normal indices only, so the result is shorter
        than `num_samples` when there are not enough normals.
    """
    gt_labels = np.asarray(gt_labels)
    if random_state is not None:
        np.random.seed(random_state)

    # locate anomaly vs normal indices
    anomaly_idx = np.where(gt_labels == 1)[0]
    normal_idx = np.where(gt_labels == 0)[0]

    # How many anomalies to pick: never more than exist, never more than the
    # total budget (previously min_anomalies > num_samples overshot the
    # budget), and never negative.
    n_anom = max(0, min(len(anomaly_idx), min_anomalies, num_samples))
    # pick anomalies without replacement
    picked_anom = np.random.choice(anomaly_idx, n_anom, replace=False) if n_anom > 0 else np.array([], dtype=int)

    # fill the rest of the budget from normals, as far as they are available
    n_normal = min(num_samples - n_anom, len(normal_idx))
    picked_norm = np.random.choice(normal_idx, n_normal, replace=False) if n_normal > 0 else np.array([], dtype=int)

    # combine and shuffle so anomalies are not grouped at the front
    selected = np.concatenate([picked_anom, picked_norm])
    np.random.shuffle(selected)

    return selected

In [10]:
# ----------------------------
# Step 1: Read and Process the Data
# ----------------------------
dataset = '0.099_noise.csv'
df = pd.read_csv("../data/%s" % (dataset))

# Parse the timestamps BEFORE sorting: read_csv leaves the column as text
# unless told otherwise, and sorting text orders rows lexicographically,
# which is only chronological for some timestamp formats.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
# A stable sort keeps events that share a timestamp in their original file
# order, so the simulated stream is deterministic.
df = df.sort_values(by='Timestamp', kind='mergesort')

# Process the 'noise' column:
# - If NaN, assume Normal (0).
# - Otherwise, treat True/1/'True'/'true' as anomaly (1); everything else as Normal (0).
df['noise'] = df['noise'].fillna(0).apply(lambda x: 1 if (x == True or x == 1 or x == 'True' or x=='true') else 0)
print(dataset)

# ----------------------------
# 2) Prefixes & global window settings
# ----------------------------
prefix_range = range(2, 35)   # prefix lengths 2..34 (the upper bound is exclusive)
WINDOW_EVENTS = 2500          # keep the last 2 500 samples in the global sliding window

0.099_noise.csv


In [11]:
# ----------------------------
# 3) Pre-fit encoders
# ----------------------------
# The label encoder (targets) and the one-hot encoders (features) are fit on
# the full activity vocabulary up front, so their output layout does not
# change while the stream is processed.
all_activities = df["Activity"].unique()
le = LabelEncoder()
le.fit(all_activities)

# one OHE per prefix length: a prefix of length p has p-1 feature positions
ohe_dict = {}
for p in prefix_range:
    # Dummy matrix with p-1 columns in which every column contains every
    # activity once, so each feature position gets the same category set.
    dummy = np.array([[act] * (p - 1) for act in all_activities])
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore").fit(dummy)
    ohe_dict[p] = ohe

In [12]:
# ----------------------------
# 4) Prepare global window & per-prefix buffers
# ----------------------------
# One bounded deque shared by all prefix lengths: once it holds
# WINDOW_EVENTS entries, appending a new one drops the oldest.
global_events = deque(maxlen=WINDOW_EVENTS)

# Per-prefix state. The deques are unbounded on purpose: entries are removed
# by hand when the matching sample leaves the global window.
buffers = {
    p: {
        "X": deque(),               # feature vectors for prefix length p
        "y": deque(),               # target labels
        "noise": deque(),           # noise/anomaly flags
        "model": None,              # classifier for this prefix length (None until first fit)
        "filled": False,            # becomes True once a model exists
        "update_counter": 0,        # per-prefix counter, initialised to 0
        # half the window size, but never below 1
        "retrain_batch": max(1, WINDOW_EVENTS // 2),
    }
    for p in prefix_range
}

case_events = defaultdict(list)    # case id -> list of events seen so far
online_reports = []                # one dict per prequential prediction


In [13]:
# ----------------------------
# 5) Simulate streaming & online learning
# ----------------------------
# The log is replayed row by row. Whenever a case reaches a tracked prefix
# length p, one sample is produced: features = its first p-1 activities,
# target = its p-th activity. The sample is first scored by the current
# model for p (test-then-train), then added to the sliding window.
total = len(df)

# global retrain trigger: after this many new samples every prefix model is refit
global_update_counter = 0
global_retrain_batch = WINDOW_EVENTS // 2   # 1250

# A sample is flagged as anomalous when its normal_loss, i.e.
# 1 - P(true next activity), is above this value.
ANOMALY_LOSS_THRESHOLD = 1 - 0.01


def _fit_prefix_model(buffer):
    """Fit a fresh random forest on every sample currently held in `buffer`."""
    model = RandomForestClassifier(
        n_estimators=50,
        random_state=42,
        n_jobs=-1
    )
    model.fit(np.vstack(buffer["X"]), np.array(buffer["y"]))
    return model


for i, (_, row) in enumerate(df.iterrows(), start=1):
    # progress logging
    if i % 10000 == 0 or i == total:
        pct = i / total * 100
        print(f"Processed {i}/{total} rows ({pct:.1f}%)")

    # accumulate per-case history
    cid = row["Case ID"]
    case_events[cid].append(row)
    cur_len = len(case_events[cid])

    # only act when the case has just reached one of the tracked prefix lengths
    if cur_len not in prefix_range:
        continue
    p = cur_len

    # build feature, target, noise flag
    group = case_events[cid]
    feats = [e["Activity"] for e in group[: p - 1]]
    target = group[p - 1]["Activity"]
    noise_flag = group[p - 1]["noise"]

    # encode the sample once; both the prediction and the buffers reuse it
    X_row = ohe_dict[p].transform([feats])   # 2-D, single row
    yp = le.transform([target])[0]

    # 1) Prequential prediction if the model is ready
    if buffers[p]["filled"]:
        rf = buffers[p]["model"]

        nl = normal_loss(rf, X_row, [yp])
        pred_anom = (nl > ANOMALY_LOSS_THRESHOLD).astype(int)

        online_reports.append({
            "i":          i,
            "prefix":     p,
            "case_id":    cid,
            "true_noise": noise_flag,
            "pred_noise": int(pred_anom[0]),
            "nap_prob":    nl,              # note: this is the loss 1 - P(true activity), as a 1-element array
            "nap_class":  rf.classes_,
            "predict_act":rf.predict(X_row),
            "actual_act": yp
        })

    # 2) Slide the global window: if full, remember the entry that the
    #    upcoming append() is going to evict
    dropped = global_events[0] if len(global_events) == WINDOW_EVENTS else None

    # append new sample to the global window
    Xp = X_row.ravel()
    global_events.append((p, Xp, yp, noise_flag))

    # add to this prefix's buffer
    buffers[p]["X"].append(Xp)
    buffers[p]["y"].append(yp)
    buffers[p]["noise"].append(noise_flag)

    # Mirror the eviction in the prefix buffer the dropped sample belongs to.
    # Each prefix buffer is filled in the same order as the global window,
    # so its leftmost entry is the evicted sample.
    if dropped is not None:
        old_p = dropped[0]
        buffers[old_p]["X"].popleft()
        buffers[old_p]["y"].popleft()
        buffers[old_p]["noise"].popleft()

    # 3) Initial training as soon as the buffer is non-empty; the very first
    #    model for a prefix is therefore fit on a single sample
    if buffers[p]["model"] is None:
        buffers[p]["model"] = _fit_prefix_model(buffers[p])
        buffers[p]["filled"] = True

    # 4) Global retrain trigger
    global_update_counter += 1
    if global_update_counter >= global_retrain_batch:
        print("=== Global retrain of all prefix models ===")
        for q in prefix_range:
            # a prefix with no samples in the window keeps its previous model (if any)
            if len(buffers[q]["X"]) == 0:
                continue
            buffers[q]["model"] = _fit_prefix_model(buffers[q])
        global_update_counter = 0


=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
Processed 10000/79441 rows (12.6%)
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
Processed 20000/79441 rows (25.2%)
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Gl

In [16]:
# 6) Summarize
# Print one classification report (true vs. predicted noise flag) per prefix
# length that produced predictions, then save all prediction records as CSV.
reports_df = pd.DataFrame(online_reports)

if reports_df.empty:
    # With no records the frame has no "prefix" column; filtering on it
    # would raise KeyError, so the per-prefix reports are skipped.
    print("No online predictions were recorded; skipping per-prefix reports.")
else:
    for p in prefix_range:
        sub = reports_df[reports_df["prefix"] == p]
        if sub.empty:
            continue
        print(f"\n--- Prefix {p} ---")
        print(classification_report(
            sub["true_noise"], sub["pred_noise"], zero_division=0
        ))

reports_df.to_csv('../result/%s_fixed_v2.csv'%(dataset), index=False)


--- Prefix 2 ---
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      4462
           1       1.00      0.74      0.85       537

    accuracy                           0.97      4999
   macro avg       0.98      0.87      0.92      4999
weighted avg       0.97      0.97      0.97      4999


--- Prefix 3 ---
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      4549
           1       0.96      0.74      0.83       450

    accuracy                           0.97      4999
   macro avg       0.97      0.87      0.91      4999
weighted avg       0.97      0.97      0.97      4999


--- Prefix 4 ---
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      4528
           1       0.82      0.76      0.79       471

    accuracy                           0.96      4999
   macro avg       0.90      0.87      0.88      4999
weighted avg       0