In [1]:
import sys
sys.path.append('..')
import datetime
import time
from collections import Counter
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
from kneed import KneeLocator
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, log_loss, roc_auc_score
import json
from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

In [2]:
def cross_entropy_loss(model, x_test, y_test):
    
    probs = model.predict_proba(x_test)
    
    predicted_probs = []
    for i, true_label in enumerate(y_test):
        idx_arr = np.where(model.classes_ == true_label)[0]
        if len(idx_arr) == 0:
            predicted_probs.append(log_loss(y_true = [1,0], y_pred=[0,1])+1)
        else:
            col_index = idx_arr[0]
            true_label_one_hot = np.zeros_like(probs[i])
            true_label_one_hot[idx_arr] = 1
            predicted_probs.append(log_loss(y_true = true_label_one_hot, y_pred = probs[i]))
            
    return np.array(predicted_probs)

In [3]:
def normal_loss(model, x_test, y_test):
    
    probs = model.predict_proba(x_test)

    predicted_probs = []
    for i, true_label in enumerate(y_test):
        idx_arr = np.where(model.classes_ == true_label)[0]
        if len(idx_arr) == 0:
            predicted_probs.append(1.1)
        else:
            col_index = idx_arr[0]
            
            true_label_one_hot = np.zeros_like(probs[i])
            true_label_one_hot[idx_arr] = 1
            predicted_probs.append(1-probs[i][col_index])
            
    return np.array(predicted_probs)

In [4]:
def safe_transform_target(encoder, targets, unknown_value=-1):
    classes = set(encoder.classes_)
    transformed = []
    for t in targets:
        if t in classes:
            transformed.append(encoder.transform([t])[0])
        else:
            transformed.append(unknown_value)
    return np.array(transformed)

In [5]:
def get_clean_loss(normal_loss_value, cross_entropy_loss_value):
    normal_loss_dist = []
    cross_loss_dist = []
    for pos, prediction in  enumerate(normal_loss_value):
        if prediction != 1:
            cross_loss_dist.append(cross_entropy_loss_value[pos])
            normal_loss_dist.append(prediction)

    return normal_loss_dist, cross_loss_dist

In [6]:
def cleaning_cls_result(classification_result):
    
    for i in classification_result.keys():
        print(i, classification_result[i].keys())

        if '1' not in classification_result[i].keys():
            classification_result[i]['1'] = {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0.0}
    return classification_result

In [7]:
def sample_with_min_anomalies(gt_labels, num_samples=10, min_anomalies=3, random_state=None):
    """
    Randomly sample `num_samples` indices from gt_labels (0/1 array),
    ensuring at least `min_anomalies` true-anomaly (1) indices are included.

    Parameters
    ----------
    gt_labels : array-like, shape (n_samples,)
        Ground-truth labels (0 = normal, 1 = anomaly).
    num_samples : int, default=10
        Total number of indices to sample.
    min_anomalies : int, default=3
        Minimum number of anomaly indices to include.
    random_state : int or None
        Seed for reproducibility.

    Returns
    -------
    selected_indices : ndarray, shape (<= num_samples,)
        Shuffled indices, containing at least `min_anomalies` anomalies
        (or as many as available if fewer exist).
    """
    gt_labels = np.asarray(gt_labels)
    if random_state is not None:
        np.random.seed(random_state)

    # locate anomaly vs normal indices
    anomaly_idx = np.where(gt_labels == 1)[0]
    normal_idx  = np.where(gt_labels == 0)[0]

    # determine how many anomalies we can pick
    n_anom = min(len(anomaly_idx), min_anomalies)
    # pick anomalies without replacement
    picked_anom = np.random.choice(anomaly_idx, n_anom, replace=False) if n_anom > 0 else np.array([], dtype=int)

    # fill the rest from normals
    n_normal = num_samples - n_anom
    n_normal = min(n_normal, len(normal_idx))
    picked_norm = np.random.choice(normal_idx, n_normal, replace=False) if n_normal > 0 else np.array([], dtype=int)

    # combine and shuffle
    selected = np.concatenate([picked_anom, picked_norm])
    np.random.shuffle(selected)

    return selected

In [8]:
# ----------------------------
# Step 1: Read and Process the Data
# ----------------------------
dataset = '0.099_sample.csv'
df = pd.read_csv("../data/%s" % (dataset))
df = df.sort_values(by='Timestamp')
# Process the 'noise' column:
# - If NaN, assume Normal (0).
# - Otherwise, treat True/1/'True' as anomaly (1); everything else as Normal (0).
df['noise'] = df['noise'].fillna(0).apply(lambda x: 1 if (x == True or x == 1 or x == 'True' or x=='true') else 0)
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
print(dataset)
# Calculate the cutoff time (e.g., the median of all timestamps)
cutoff_time = df['Timestamp'].median()

anomaly_f1_list = []
anomaly_support_list = []
prefix_range = range(2, 16)

0.099_sample.csv


In [14]:
training_size = 0.8
print('Training window size: %s' % (training_size))
loss_prefix_dict =dict()
classification_result = dict()

all_x_train = []
all_y_train = []
all_x_test  = []
all_y_test  = []
all_train_anomaly_label = []
all_test_anomaly_label = []

known_anomalies_x = []
known_anomalies_y = []
known_anomalies_label = []
for prefix in prefix_range:    
    # Extract per case:
    # - The first (prefix-1) events (activities) as features.
    # - The prefix-th event's activity as the target.
    # - The prefix-th event's noise flag as the ground truth anomaly.
    case_features = []
    case_targets = []
    ground_truth_anomaly = []

    for case_id, group in df.groupby('ID'):
        group = group.sort_index()  # assuming the order in the file is the event order
        if len(group) >= prefix:
            events = group['Activity'].values  # adjust 'Activity' if needed
            features = events[:prefix-1]
            target_activity = events[prefix-1]  # prefix-th event's activity
            noise_flag = group['noise'].iloc[prefix-1]

            case_features.append(features)
            case_targets.append(target_activity)
            ground_truth_anomaly.append(noise_flag)

    # Convert to numpy arrays
    case_features = np.array(case_features)
    case_targets = np.array(case_targets)
    ground_truth_anomaly = np.array(ground_truth_anomaly)
    print("Total cases with at least %s events:" % (prefix), case_features.shape[0])
    
    n_cases = case_features.shape[0]
    split_index = int(training_size * n_cases)
    test_index = split_index
    X_train = case_features[:split_index]
    X_test = case_features[test_index:]
    y_train = case_targets[:split_index]
    y_test = case_targets[test_index:]
    gt_anomaly_train = ground_truth_anomaly[:split_index]
    gt_anomaly_test = ground_truth_anomaly[test_index:]
    print("Training cases:", X_train.shape[0], "Test cases:", X_test.shape[0])


    expert_anomaly_indices = sample_with_min_anomalies(
    gt_labels=gt_anomaly_train,
    num_samples=20,
    min_anomalies=10,
    random_state=42)

    # Modify training set for anomaly detection classifier
    x_detect_train = []
    y_detect_train = []
    anomaly_detect_train = [] 
    for pos, idx in enumerate(expert_anomaly_indices):
        x_detect_train.append(X_train[idx])
        y_detect_train.append(y_train[idx])
        anomaly_detect_train.append(gt_anomaly_train[idx])
            
    all_x_train.extend(X_train)
    all_y_train.extend(y_train)
    all_x_test.extend(X_test)
    all_y_test.extend(y_test)
    all_train_anomaly_label.extend(gt_anomaly_train)
    all_test_anomaly_label.extend(gt_anomaly_test)

    known_anomalies_x.extend(x_detect_train)
    known_anomalies_y.extend(y_detect_train)
    known_anomalies_label.extend(anomaly_detect_train)

Training window size: 0.8
Total cases with at least 2 events: 500
Training cases: 400 Test cases: 100
Total cases with at least 3 events: 500
Training cases: 400 Test cases: 100
Total cases with at least 4 events: 500
Training cases: 400 Test cases: 100
Total cases with at least 5 events: 500
Training cases: 400 Test cases: 100
Total cases with at least 6 events: 500
Training cases: 400 Test cases: 100
Total cases with at least 7 events: 500
Training cases: 400 Test cases: 100
Total cases with at least 8 events: 500
Training cases: 400 Test cases: 100
Total cases with at least 9 events: 446
Training cases: 356 Test cases: 90
Total cases with at least 10 events: 402
Training cases: 321 Test cases: 81
Total cases with at least 11 events: 373
Training cases: 298 Test cases: 75
Total cases with at least 12 events: 318
Training cases: 254 Test cases: 64
Total cases with at least 13 events: 241
Training cases: 192 Test cases: 49
Total cases with at least 14 events: 201
Training cases: 160 Te

In [16]:
# ----------------------------
# Build NAP Model
# ----------------------------
encoder_features = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_encoded = encoder_features.fit_transform(pd.DataFrame(all_x_train))

target_encoder = LabelEncoder()
y_train_encoded = target_encoder.fit_transform(all_y_train)

rf_model  = RandomForestClassifier(n_estimators=1000, random_state=42)
rf_model.fit(X_train_encoded, y_train_encoded)


In [17]:
pd.DataFrame(known_anomalies_x)
x_detect_train = encoder_features.transform(pd.DataFrame(known_anomalies_x))
x_detect_train = pd.DataFrame(x_detect_train)
ce_loss = cross_entropy_loss(model=rf_model, x_test = x_detect_train, 
                             y_test = known_anomalies_y)


In [18]:
x_detect_train = encoder_features.transform(pd.DataFrame(known_anomalies_x))
nap_probability = pd.DataFrame([rf_model.predict_proba([i])[0] for i in x_detect_train])
ce_loss = pd.DataFrame(cross_entropy_loss(model=rf_model, x_test = x_detect_train, 
                             y_test = known_anomalies_y))

x_detect_train = pd.DataFrame(x_detect_train)
target_label = target_encoder.transform(known_anomalies_y)
target_label = pd.DataFrame(target_label)
x_detect_train = pd.concat([x_detect_train, target_label], axis=1)

x_detect_train = pd.concat([x_detect_train, nap_probability], axis=1)
x_detect_train = pd.concat([x_detect_train, ce_loss], axis=1)

x_detect_train.columns = x_detect_train.columns.astype(str)
print(x_detect_train.shape)
# ----------------------------
# Step 5: Making training set for the anomaly detection classifier 
# ----------------------------
anom_clf = RandomForestClassifier(n_estimators=10, random_state=42)
# anom_clf = SVC(kernel='rbf', probability=True, random_state=42)
anom_clf.fit(x_detect_train, known_anomalies_label)

(280, 278)


In [19]:
all_x_test_2 = dict()
all_y_test_2 = dict()
all_test_anomaly_label_2 = dict()
for pos, i in enumerate(all_x_test):
    if len(i)+1 not in list(all_x_test_2.keys()):
        all_x_test_2[len(i)+1] = []
        all_y_test_2[len(i)+1] = []
        all_test_anomaly_label_2[len(i)+1] = []
    all_x_test_2[len(i)+1].append(i)
    all_y_test_2[len(i)+1].append(all_y_test[pos])
    all_test_anomaly_label_2[len(i)+1].append(all_test_anomaly_label[pos])


for prefix in all_x_test_2.keys():
    x_detect_test = pd.DataFrame()
    test_df_prefix =[]
    for i in all_x_test_2[prefix]:
        s = i.tolist()
        while len(s) < len(prefix_range):
            s.append(None)
        test_df_prefix.append(s)
        
    test_df_prefix = pd.DataFrame(test_df_prefix)
    x_detect_test = encoder_features.transform(test_df_prefix)
    nap_probability = pd.DataFrame([rf_model.predict_proba([i])[0] for i in x_detect_test])
    ce_loss = pd.DataFrame(cross_entropy_loss(model=rf_model, x_test = x_detect_test, 
                                 y_test = all_y_test_2[prefix]))
    
    x_detect_test = pd.DataFrame(x_detect_test)
    print(x_detect_test.shape)
    target_label = target_encoder.transform(pd.DataFrame([i for i in all_y_test_2[prefix]]))
    target_label = pd.DataFrame(target_label)
    x_detect_test = pd.concat([x_detect_test, target_label], axis=1)
    print(x_detect_test.shape)

    # for pos, i in enumerate(all_x_detect_test[prefix]):
    #     while len(i['probability']) < max_prob_length:
    #         all_x_detect_test[prefix][pos]['probability'].append(0)
    x_detect_test = pd.concat([x_detect_test, nap_probability], axis=1)
    print(x_detect_test.shape)
    x_detect_test = pd.concat([x_detect_test, ce_loss], axis=1)
    x_detect_test.columns = x_detect_test.columns.astype(str)
    print(x_detect_test.shape)
    predicted_anomaly = anom_clf.predict(x_detect_test)
    gt_anomaly_test = all_test_anomaly_label_2[prefix]
    # ----------------------------
    # Step 5: Evaluate the Anomaly Detection
    # ----------------------------
    print("\n--- Anomaly Detection (Dynamic Threshold) Classification Report for prefix %s ---" % prefix)
    classification = classification_report(gt_anomaly_test, predicted_anomaly, output_dict=True)
    f1 = classification.get('1', {}).get('f1-score', 0)
    support = classification.get('1', {}).get('support', 0)
    classification_result[prefix] = classification
    print(f"Classification Report: F1-score = {f1}, Support = {support}")
    classification_result[prefix]['ROC AUC'] = roc_auc_score(gt_anomaly_test, anom_clf.predict_proba(x_detect_test)[:,1])
    
revised_cls_result = {}
for i in classification_result.keys():
    revised_cls_result[i] = dict()
    revised_cls_result[i]['Normal precision'] =classification_result[i]['0']['precision']
    revised_cls_result[i]['Normal recall'] =classification_result[i]['0']['recall']
    revised_cls_result[i]['Normal f1-score'] =classification_result[i]['0']['f1-score']
    revised_cls_result[i]['Normal support'] =classification_result[i]['0']['support']

    revised_cls_result[i]['Anomal precision'] =classification_result[i]['1']['precision']
    revised_cls_result[i]['Anomal recall'] =classification_result[i]['1']['recall']
    revised_cls_result[i]['Anomal f1-score'] =classification_result[i]['1']['f1-score']
    revised_cls_result[i]['Anomal support'] =classification_result[i]['1']['support']    

    revised_cls_result[i]['Macro precision'] =classification_result[i]['macro avg']['precision']   
    revised_cls_result[i]['Macro recall'] =classification_result[i]['macro avg']['recall']   
    revised_cls_result[i]['Macro f1-score'] =classification_result[i]['macro avg']['f1-score']   
    revised_cls_result[i]['ROC AUC'] =classification_result[i]['ROC AUC']   


result_df = pd.DataFrame.from_dict(revised_cls_result).T
result_df.index = result_df.index.set_names(['Prefix length'])
result_df = result_df.reset_index(drop=False)
# result_file_title = '../result/%s_cross_entropy_%s_anomal_thr_result.csv'%(dataset, anomaly_thr_method)
# print(result_file_title)
result_df

(100, 258)
(100, 259)
(100, 277)
(100, 278)

--- Anomaly Detection (Dynamic Threshold) Classification Report for prefix 2 ---
Classification Report: F1-score = 1.0, Support = 11.0


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


(100, 258)
(100, 259)
(100, 277)
(100, 278)

--- Anomaly Detection (Dynamic Threshold) Classification Report for prefix 3 ---
Classification Report: F1-score = 0.7272727272727273, Support = 7.0


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


(100, 258)
(100, 259)
(100, 277)
(100, 278)

--- Anomaly Detection (Dynamic Threshold) Classification Report for prefix 4 ---
Classification Report: F1-score = 0.1875, Support = 9.0


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


(100, 258)
(100, 259)
(100, 277)
(100, 278)

--- Anomaly Detection (Dynamic Threshold) Classification Report for prefix 5 ---
Classification Report: F1-score = 0.3870967741935484, Support = 10.0


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


(100, 258)
(100, 259)
(100, 277)
(100, 278)

--- Anomaly Detection (Dynamic Threshold) Classification Report for prefix 6 ---
Classification Report: F1-score = 0.45454545454545453, Support = 12.0


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


(100, 258)
(100, 259)
(100, 277)
(100, 278)

--- Anomaly Detection (Dynamic Threshold) Classification Report for prefix 7 ---
Classification Report: F1-score = 0.26666666666666666, Support = 13.0


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


(100, 258)
(100, 259)
(100, 277)
(100, 278)

--- Anomaly Detection (Dynamic Threshold) Classification Report for prefix 8 ---
Classification Report: F1-score = 0.17391304347826086, Support = 8.0


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


(90, 258)
(90, 259)
(90, 277)
(90, 278)

--- Anomaly Detection (Dynamic Threshold) Classification Report for prefix 9 ---
Classification Report: F1-score = 0.21818181818181817, Support = 9.0


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


(81, 258)
(81, 259)
(81, 277)
(81, 278)

--- Anomaly Detection (Dynamic Threshold) Classification Report for prefix 10 ---
Classification Report: F1-score = 0.3181818181818182, Support = 8.0


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


(75, 258)
(75, 259)
(75, 277)
(75, 278)

--- Anomaly Detection (Dynamic Threshold) Classification Report for prefix 11 ---
Classification Report: F1-score = 0.2702702702702703, Support = 5.0


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


(64, 258)
(64, 259)
(64, 277)
(64, 278)

--- Anomaly Detection (Dynamic Threshold) Classification Report for prefix 12 ---
Classification Report: F1-score = 0.06896551724137931, Support = 5.0


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


(49, 258)
(49, 259)
(49, 277)
(49, 278)

--- Anomaly Detection (Dynamic Threshold) Classification Report for prefix 13 ---
Classification Report: F1-score = 0.23529411764705882, Support = 8.0


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


(41, 258)
(41, 259)
(41, 277)
(41, 278)

--- Anomaly Detection (Dynamic Threshold) Classification Report for prefix 14 ---
Classification Report: F1-score = 0.17391304347826086, Support = 3.0


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


(35, 258)
(35, 259)
(35, 277)
(35, 278)

--- Anomaly Detection (Dynamic Threshold) Classification Report for prefix 15 ---
Classification Report: F1-score = 0.09523809523809523, Support = 1.0


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Unnamed: 0,Prefix length,Normal precision,Normal recall,Normal f1-score,Normal support,Anomal precision,Anomal recall,Anomal f1-score,Anomal support,Macro precision,Macro recall,Macro f1-score,ROC AUC
0,2,1.0,1.0,1.0,89.0,1.0,1.0,1.0,11.0,1.0,1.0,1.0,1.0
1,3,0.96875,1.0,0.984127,93.0,1.0,0.571429,0.727273,7.0,0.984375,0.785714,0.8557,0.655146
2,4,0.922078,0.78022,0.845238,91.0,0.130435,0.333333,0.1875,9.0,0.526256,0.556777,0.516369,0.772894
3,5,0.949367,0.833333,0.887574,90.0,0.285714,0.6,0.387097,10.0,0.617541,0.716667,0.637335,0.746111
4,6,0.970588,0.75,0.846154,88.0,0.3125,0.833333,0.454545,12.0,0.641544,0.791667,0.65035,0.796402
5,7,0.90566,0.551724,0.685714,87.0,0.170213,0.615385,0.266667,13.0,0.537937,0.583554,0.47619,0.68214
6,8,0.935484,0.630435,0.753247,92.0,0.105263,0.5,0.173913,8.0,0.520374,0.565217,0.46358,0.558424
7,9,0.931818,0.506173,0.656,81.0,0.130435,0.666667,0.218182,9.0,0.531126,0.58642,0.437091,0.643347
8,10,0.977778,0.60274,0.745763,73.0,0.194444,0.875,0.318182,8.0,0.586111,0.73887,0.531972,0.744863
9,11,1.0,0.614286,0.761062,70.0,0.15625,1.0,0.27027,5.0,0.578125,0.807143,0.515666,0.942857
