In [1]:
import sys
sys.path.append('..')
import datetime
import time
from collections import Counter, defaultdict, deque
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
from kneed import KneeLocator
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, log_loss, roc_auc_score
import json
from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

In [2]:
import warnings

# suppress only the “y_pred values do not sum to one” warning
warnings.filterwarnings(
    "ignore",
    message=".*y_pred values do not sum to one.*"
)

In [14]:
def cross_entropy_loss(model, x_test, y_test):
    """
    For each sample i:
      loss_i = -∑_c [1{c = y_true_i} · log P_model(c | x_i)]
    If the true label isn’t in model.classes_, returns a default high loss.
    Works for any len(x_test) >= 1, including the single-class case.
    """
    probs = model.predict_proba(x_test)
    default = log_loss([[1, 0]], [[0, 1]]) + 1  # fallback loss

    losses = []
    for i, true_label in enumerate(y_test):
        sample_probs = probs[i]
        classes = model.classes_

        # if only one class in the model
        if sample_probs.size == 1:
            if classes[0] == true_label:
                losses.append(0.0)  # perfect prediction
            else:
                losses.append(default)
            continue

        # find index of the true label
        idx_arr = np.where(classes == true_label)[0]
        if idx_arr.size == 0:
            losses.append(default)
        else:
            y_true_onehot = np.zeros_like(sample_probs)
            y_true_onehot[idx_arr[0]] = 1

            # normalize just in case
            sample_probs = sample_probs / sample_probs.sum()
            y_true_onehot = y_true_onehot / y_true_onehot.sum()

            loss_i = log_loss([y_true_onehot], [sample_probs])
            losses.append(loss_i)

    return np.array(losses)


In [15]:
def normal_loss(model, x_test, y_test):
    """
    For each sample i:
      loss_i = 1 - P_model(y_true_i | x_i)
    If the true label isn’t in model.classes_, we return 1.1 as before.
    Works for any len(x_test) >= 1.
    """
    # predict_proba returns shape (n_samples, n_classes)
    probs = model.predict_proba(x_test)
    
    losses = []
    for i, true_label in enumerate(y_test):
        sample_probs = probs[i]
        # find index of the true label in model.classes_
        idx_arr = np.where(model.classes_ == true_label)[0]
        if idx_arr.size == 0:
            losses.append(1.1)
        else:
            col_index = idx_arr[0]
            losses.append(1 - sample_probs[col_index])
    
    return np.array(losses)

In [16]:
def safe_transform_target(encoder, targets, unknown_value=-1):
    classes = set(encoder.classes_)
    transformed = []
    for t in targets:
        if t in classes:
            transformed.append(encoder.transform([t])[0])
        else:
            transformed.append(unknown_value)
    return np.array(transformed)

In [17]:
def get_clean_loss(normal_loss_value, cross_entropy_loss_value):
    normal_loss_dist = []
    cross_loss_dist = []
    for pos, prediction in  enumerate(normal_loss_value):
        if prediction != 1:
            cross_loss_dist.append(cross_entropy_loss_value[pos])
            normal_loss_dist.append(prediction)

    return np.array(normal_loss_dist), np.array(cross_loss_dist)

In [18]:
def cleaning_cls_result(classification_result):
    
    for i in classification_result.keys():
        print(i, classification_result[i].keys())

        if '1' not in classification_result[i].keys():
            classification_result[i]['1'] = {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0.0}
    return classification_result

In [26]:
def find_largest_gap(losses):
    if len(losses) ==1:
        return 0, -1
    else:
        y = sorted(losses, reverse=True)
        diffs = abs(np.diff(y))
        idx = np.argmax(diffs) + 1   # +1 because diffs[i] = y[i+1]-y[i]
    return idx, y[idx]

In [62]:
# ----------------------------
# Step 1: Read and Process the Data
# ----------------------------
dataset = '0.049_noise.csv'
df = pd.read_csv("../data/%s" % (dataset))
df = df.sort_values(by='Timestamp')
# Process the 'noise' column:
# - If NaN, assume Normal (0).
# - Otherwise, treat True/1/'True' as anomaly (1); everything else as Normal (0).
df['noise'] = df['noise'].fillna(0).apply(lambda x: 1 if (x == True or x == 1 or x == 'True' or x=='true') else 0)
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
print(dataset)

# ----------------------------
# 2) Prefixes & global window settings
# ----------------------------
prefix_range = range(2, 35)   # prefix lengths 2..15
WINDOW_EVENTS = 2500          # keep the last 2 500 raw events

0.049_noise.csv


In [63]:
# ----------------------------
# 3) Pre-fit encoders
# ----------------------------
all_activities = df["Activity"].unique()
le = LabelEncoder().fit(all_activities)

# build one OHE per prefix length
ohe_dict = {}
for p in prefix_range:
    # we need p-1 columns, each with same category set
    dummy = np.array([[act] * (p - 1) for act in all_activities])
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    ohe.fit(dummy)
    ohe_dict[p] = ohe

In [64]:
# ----------------------------
# 4) Prepare global window & per-prefix buffers
# ----------------------------
# a single sliding window of the last WINDOW_EVENTS raw events
global_events = deque(maxlen=WINDOW_EVENTS)

# per-prefix feature/label/noise buffers (we’ll manage evictions manually)
buffers = {}
for p in prefix_range:
    buffers[p] = {
        "X": deque(),               # feature vectors for prefix length p
        "y": deque(),               # target labels
        "noise": deque(),           # noise/anomaly flags
        "model": None,              
        "filled": False,            
        "update_counter": 0,        
        # retrain every half-window by default; adjust if you like
        "retrain_batch": max(1, WINDOW_EVENTS // 2)
    }

case_events = defaultdict(list)    # to accumulate per-case histories
online_reports = []                # to collect results


In [65]:
# ----------------------------
# 5) Simulate streaming & online learning (with CE‐based anomaly cutoff)
# ----------------------------
total = len(df)

# single sliding window of the last WINDOW_EVENTS raw events
global_update_counter = 0
global_retrain_batch = WINDOW_EVENTS // 2   # 1250

for i, (_, row) in enumerate(df.iterrows(), start=1):
    # progress logging
    if i % 10000 == 0 or i == total:
        pct = i / total * 100
        print(f"Processed {i}/{total} rows ({pct:.1f}%)")

    # 1) Accumulate per-case history
    cid = row["Case ID"]
    case_events[cid].append(row)
    cur_len = len(case_events[cid])

    # 2) Only act when a case just reached a prefix length p
    for p in prefix_range:
        if cur_len != p:
            continue

        # build feature list, target label, and noise flag for this prefix-event
        group = case_events[cid]
        feats      = [e["Activity"] for e in group[: p - 1]]
        target_act = group[p - 1]["Activity"]
        noise_flag = group[p - 1]["noise"]

        # 3) Prequential prediction if the prefix-model is ready
        if buffers[p]["filled"]:
            rf     = buffers[p]["model"]
            cutoff = buffers[p]["cutoff"]

            Xp = ohe_dict[p].transform([feats])
            t_enc = le.transform([target_act])[0]

            # compute cross-entropy loss; normal_loss kept if you want, but here only CE used
            cel = cross_entropy_loss(rf, Xp, [t_enc])

            pred_anom = (cel > cutoff).astype(int)
            online_reports.append({
                "prefix":     p,
                "case_id":    cid,
                "true_noise": noise_flag,
                "pred_noise": int(pred_anom[0])
            })

        # 4) Slide the global window: peek the dropped item if full
        dropped = None
        if len(global_events) == WINDOW_EVENTS:
            dropped = global_events[0]   # will be ejected when we append()

        # transform features & label, then append to global window
        Xp = ohe_dict[p].transform([feats]).ravel()
        yp = le.transform([target_act])[0]
        global_events.append((p, Xp, yp, noise_flag))

        # also add this sample into the prefix-specific buffers
        buffers[p]["X"].append(Xp)
        buffers[p]["y"].append(yp)
        buffers[p]["noise"].append(noise_flag)

        # if we popped an old event from global_events, evict it from its prefix buffer
        if dropped is not None:
            old_p, old_Xp, old_yp, old_noise = dropped
            if buffers[old_p]["X"]:
                buffers[old_p]["X"].popleft()
                buffers[old_p]["y"].popleft()
                buffers[old_p]["noise"].popleft()

        # 5) Initial training for this prefix once we see the first sample
        if buffers[p]["model"] is None:
            # train a fresh RandomForest on whatever is in buffers[p]
            Xw = np.vstack(buffers[p]["X"])
            yw = np.array(buffers[p]["y"])

            rf = RandomForestClassifier(
                n_estimators=50,
                random_state=42,
                n_jobs=-1
            )
            rf.fit(Xw, yw)
            buffers[p]["model"]  = rf
            buffers[p]["filled"] = True

            # compute initial cutoff via cross_entropy_loss on buffered data
            ce_losses = cross_entropy_loss(rf, Xw, yw)
            sorted_losses = sorted(ce_losses, reverse=True)

            # compute knee‐point cutoff
            x = np.arange(1, len(sorted_losses) + 1)
            y = sorted_losses.copy()
            m = max(50, int(0.05 * len(y)))
            while m < len(y) and max(y[:m]) == min(y[:m]):
                m += 1
            knee = KneeLocator(x[:m], y[:m], curve='convex', direction='decreasing', S=1)
            if knee.knee is not None:
                k_idx = knee.knee
                cutoff_knee = y[k_idx - 1]
            else:
                cutoff_knee = y[0]

            # also compute gap‐based cutoff if you have find_largest_gap
            ce_arr = ce_losses
            _, cutoff_gap = find_largest_gap(ce_arr)

            # store whichever cutoff you prefer (here, use gap-based)
            buffers[p]["cutoff"] = cutoff_knee

        # 6) Global retrain trigger: count one for every new prefix-event
        #     (so we retrain after 1250 prefix-events, approximating 1250 raw events)
        global_update_counter += 1
        if global_update_counter >= global_retrain_batch:
            print("=== Global retrain of all prefix models ===")
            # retrain each prefix model on its current buffer, then recompute cutoff
            for q in prefix_range:
                if len(buffers[q]["X"]) == 0:
                    continue

                Xw = np.vstack(buffers[q]["X"])
                yw = np.array(buffers[q]["y"])

                rf = RandomForestClassifier(
                    n_estimators=50,
                    random_state=42,
                    n_jobs=-1
                )
                rf.fit(Xw, yw)
                buffers[q]["model"] = rf

                # recompute CE loss distribution & cutoff
                ce_losses = cross_entropy_loss(rf, Xw, yw)
                sorted_losses = sorted(ce_losses, reverse=True)

                x = np.arange(1, len(sorted_losses) + 1)
                y = sorted_losses.copy()
                m = max(50, int(0.05 * len(y)))
                while m < len(y) and max(y[:m]) == min(y[:m]):
                    m += 1
                knee = KneeLocator(x[:m], y[:m], curve='convex', direction='decreasing', S=1)
                if knee.knee is not None:
                    k_idx = knee.knee
                    cutoff_knee = y[k_idx - 1]
                else:
                    cutoff_knee = y[0]

                # _, cutoff_gap = find_largest_gap(ce_losses)
                buffers[q]["cutoff"] = cutoff_knee

            global_update_counter = 0


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_n

=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


Processed 10000/78504 rows (12.7%)
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  return (a - min(a)) / (max(a) - min(a))
  return (a - min(a)) / (max(a) - min(a))


=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


Processed 20000/78504 rows (25.5%)
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


Processed 30000/78504 rows (38.2%)
=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  return (a - min(a)) / (max(a) - min(a))
  return (a - min(a)) / (max(a) - min(a))
  return (a - min(a)) / (max(a) - min(a))
  return (a - min(a)) / (max(a) - min(a))
  return (a - min(a)) / (max(a) - min(a))


=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))


Processed 40000/78504 rows (51.0%)
=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


Processed 50000/78504 rows (63.7%)
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===
Processed 60000/78504 rows (76.4%)
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


Processed 70000/78504 rows (89.2%)
=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)
  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===


  return (a - min(a)) / (max(a) - min(a))
  self.S * np.abs(np.diff(self.x_normalized).mean())
  ret = ret.dtype.type(ret / rcount)


=== Global retrain of all prefix models ===
=== Global retrain of all prefix models ===
Processed 78504/78504 rows (100.0%)


In [66]:
# 6) Summarize
reports_df = pd.DataFrame(online_reports)
for p in prefix_range:
    sub = reports_df[reports_df["prefix"] == p]
    if not sub.empty:
        print(f"\n--- Prefix {p} ---")
        print(classification_report(
            sub["true_noise"], sub["pred_noise"], zero_division=0
        ))
        
reports_df.to_csv('../result/%s_celoss.csv'%(dataset), index=False)


--- Prefix 2 ---
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      4707
           1       0.54      0.65      0.59       292

    accuracy                           0.95      4999
   macro avg       0.76      0.81      0.78      4999
weighted avg       0.95      0.95      0.95      4999


--- Prefix 3 ---
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      4735
           1       0.67      0.70      0.69       264

    accuracy                           0.97      4999
   macro avg       0.83      0.84      0.83      4999
weighted avg       0.97      0.97      0.97      4999


--- Prefix 4 ---
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      4748
           1       0.65      0.74      0.69       251

    accuracy                           0.97      4999
   macro avg       0.82      0.86      0.84      4999
weighted avg       0