In [1]:
import sys
sys.path.append('..')
import datetime
import time

from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
from kneed import KneeLocator
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, log_loss, roc_auc_score
import json
from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

In [2]:
def cross_entropy_loss(model, x_test, y_test):
    """Score every sample with ``log_loss`` against its true label.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` and ``classes_``.
    x_test : feature rows handed to ``model.predict_proba``.
    y_test : iterable of true labels, one per row of ``x_test``.

    Returns
    -------
    numpy.ndarray
        One loss value per entry of ``y_test``. When the label is found in
        ``model.classes_`` the value is ``log_loss`` of a one-hot vector
        (1 at the matching class column, 0 elsewhere) against that row of
        predicted probabilities. When the label is not in ``model.classes_``
        the value is the sentinel ``log_loss([1, 0], [0, 1]) + 1``.
    """
    class_probabilities = model.predict_proba(x_test)

    def _sample_loss(row, label):
        # Column(s) of the probability matrix that belong to the true label.
        matching_columns = np.where(model.classes_ == label)[0]
        if len(matching_columns) == 0:
            # The model has no column for this label: emit the sentinel.
            return log_loss(y_true = [1,0], y_pred=[0,1])+1
        one_hot = np.zeros_like(class_probabilities[row])
        one_hot[matching_columns] = 1
        return log_loss(y_true=one_hot, y_pred=class_probabilities[row])

    return np.array([_sample_loss(row, label) for row, label in enumerate(y_test)])

In [3]:
def normal_loss(model, x_test, y_test):
    """Return ``1 - P(true label)`` for every sample.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` and ``classes_``.
    x_test : feature rows handed to ``model.predict_proba``.
    y_test : iterable of true labels, one per row of ``x_test``.

    Returns
    -------
    numpy.ndarray
        One value per entry of ``y_test``: one minus the probability the
        model assigns to the true label, or the sentinel ``1.1`` when the
        label does not appear in ``model.classes_``.
    """
    # Score the rows that were passed in, not whatever notebook global
    # happens to be called X_test.
    probs = model.predict_proba(x_test)

    predicted_probs = []
    for i, true_label in enumerate(y_test):
        idx_arr = np.where(model.classes_ == true_label)[0]
        if len(idx_arr) == 0:
            # Label has no probability column; 1.1 lies above any value
            # that 1 - p can take, so callers can filter it out.
            predicted_probs.append(1.1)
        else:
            predicted_probs.append(1 - probs[i][idx_arr[0]])

    return np.array(predicted_probs)

In [4]:
def safe_transform_target(encoder, targets, unknown_value=-1):
    """Encode ``targets`` with ``encoder``, tolerating labels it does not know.

    Parameters
    ----------
    encoder : object exposing ``classes_`` and ``transform``.
    targets : iterable of labels to encode.
    unknown_value : value emitted for labels missing from ``encoder.classes_``.

    Returns
    -------
    numpy.ndarray
        ``encoder.transform([label])[0]`` for each known label and
        ``unknown_value`` for every other one, in input order.
    """
    known_labels = set(encoder.classes_)
    encoded = [
        encoder.transform([label])[0] if label in known_labels else unknown_value
        for label in targets
    ]
    return np.array(encoded)

In [5]:
def get_clean_loss(normal_loss_value, cross_entropy_loss_value):
    """Drop every position whose normal loss equals 1 from both sequences.

    Parameters
    ----------
    normal_loss_value : iterable of normal-loss values.
    cross_entropy_loss_value : sequence indexed by the same positions.

    Returns
    -------
    tuple of (list, list)
        The surviving normal-loss values and the cross-entropy values found
        at the same positions, in their original order.
    """
    surviving_pairs = [
        (value, cross_entropy_loss_value[position])
        for position, value in enumerate(normal_loss_value)
        if value != 1
    ]
    kept_normal = [normal for normal, _ in surviving_pairs]
    kept_cross = [cross for _, cross in surviving_pairs]
    return kept_normal, kept_cross

In [6]:
def cleaning_cls_result(classification_result):
    """Make sure every per-key report contains an entry for class ``'1'``.

    Prints each key together with the keys of its report. Any report that
    lacks a ``'1'`` entry receives an all-zero one. The mapping is modified
    in place and the same object is returned.
    """
    for key, report in classification_result.items():
        print(key, report.keys())

        if '1' in report:
            continue
        report['1'] = {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0.0}
    return classification_result

In [7]:
# ----------------------------
# Step 1: Read and Process the Data
# ----------------------------
dataset = '0.099_sample.csv'
df = pd.read_csv("../data/%s" % (dataset))
# Parse the timestamps BEFORE sorting: if the column was read as text,
# sorting the raw values would order them lexicographically rather than
# chronologically. A stable sort keeps the file order of rows that share
# a timestamp.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df = df.sort_values(by='Timestamp', kind='mergesort')
# Process the 'noise' column:
# - If NaN, assume Normal (0).
# - Otherwise, treat True/1/'True'/'true' as anomaly (1); everything else as Normal (0).
df['noise'] = df['noise'].fillna(0).apply(lambda x: 1 if (x == True or x == 1 or x == 'True' or x=='true') else 0)
print(dataset)
# Calculate the cutoff time (e.g., the median of all timestamps)
cutoff_time = df['Timestamp'].median()

anomaly_f1_list = []
anomaly_support_list = []
# Prefix lengths evaluated by the experiment loop: 2 through 15 inclusive.
prefix_range = range(2, 16)

0.099_sample.csv


In [8]:
# Fraction of the cases (taken in the order they are extracted) used for training.
training_size = 0.8
print('Training window size: %s' % (training_size))
loss_prefix_dict =dict()
classification_result = dict()
# 'diff'  -> adaptive_thr is the loss right after the first dramatic drop
#            in the sorted loss curve, and is recorded in adaptive_thr_dict.
# 'fixed' -> adaptive_thr is overwritten with a constant.
# NOTE(review): the predictions that get evaluated always use the Kneedle
# threshold computed further down; this switch only changes the reported
# adaptive_thr (and the Kneedle fallback when no knee is found).
anomaly_thr_method = 'diff'
adaptive_thr_dict = dict()
# Sentinel that cross_entropy_loss assigns to labels unknown to the model.
# It does not depend on the prefix, so compute it once.
unseen_label_loss = log_loss([1,0], [0,1])+1

for prefix in prefix_range:
    # Extract per case:
    # - The first (prefix-1) events (activities) as features.
    # - The prefix-th event's activity as the target.
    # - The prefix-th event's noise flag as the ground truth anomaly.
    case_features = []
    case_targets = []
    ground_truth_anomaly = []

    for case_id, group in df.groupby('ID'):
        group = group.sort_index()  # assuming the order in the file is the event order
        if len(group) >= prefix:
            events = group['Activity'].values  # adjust 'Activity' if needed
            features = events[:prefix-1]
            target_activity = events[prefix-1]  # prefix-th event's activity
            noise_flag = group['noise'].iloc[prefix-1]

            case_features.append(features)
            case_targets.append(target_activity)
            ground_truth_anomaly.append(noise_flag)

    # Convert to numpy arrays
    case_features = np.array(case_features)
    case_targets = np.array(case_targets)
    ground_truth_anomaly = np.array(ground_truth_anomaly)
    print("Total cases with at least %s events:" % (prefix), case_features.shape[0])

    # ----------------------------
    # Step 2: Encode the Features and Target
    # ----------------------------
    # The feature encoder is fitted on all cases of this prefix (train and test).
    encoder_features = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    X_encoded = encoder_features.fit_transform(case_features)
    print("Encoded feature shape:", X_encoded.shape)

    # IMPORTANT: Fit LabelEncoder on the full set of target activities (all cases)
    target_encoder = LabelEncoder()
    target_encoder.fit(case_targets)
    y_encoded = target_encoder.transform(case_targets)
    full_classes = target_encoder.classes_
    # print("Full set of event classes (for prefix %s):" % prefix, full_classes)

    # ----------------------------
    # Step 3: Ordered Train/Test Split and Next Event Prediction Model (Stage 1)
    # ----------------------------
    # Instead of a random split, take the first `training_size` share of the
    # cases for training and the remainder for testing.
    n_cases = X_encoded.shape[0]
    split_index = int(training_size * n_cases)
    test_index = split_index
    X_train = X_encoded[:split_index]
    X_test = X_encoded[test_index:]
    y_train = y_encoded[:split_index]
    y_test = y_encoded[test_index:]
    gt_anomaly_train = ground_truth_anomaly[:split_index]
    gt_anomaly_test = ground_truth_anomaly[test_index:]
    print("Training cases:", X_train.shape[0], "Test cases:", X_test.shape[0])

    # ----------------------------
    # Step 3: Train model
    # ----------------------------

    # Train a RandomForest classifier with the training set.
    rf_model  = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # ----------------------------
    # Step 4: Anomaly Detection (Stage 2) with Dynamic Thresholding
    # ----------------------------
    # Normal loss on the test set, without the 1.1 "unknown label" sentinel,
    # sorted from largest to smallest.
    normal_loss_dist = normal_loss(rf_model, X_test, y_test)
    sorted_normal_loss_dist = np.array([i for i in normal_loss_dist if i <1.1])
    sorted_normal_loss_dist = sorted(sorted_normal_loss_dist, reverse=True)

    # Cross-entropy loss on the training set, without the sentinel, sorted
    # from largest to smallest.
    cross_entropy_loss_dist = cross_entropy_loss(rf_model, X_train, y_train)
    sorted_cross_entropy_loss_dist = [i for i in cross_entropy_loss_dist if i < unseen_label_loss]
    sorted_cross_entropy_loss_dist = sorted(sorted_cross_entropy_loss_dist, reverse=True)

    # NOTE(review): the two lists come from different splits (test vs. train)
    # and were sorted independently, yet get_clean_loss pairs them by position
    # and keeps only as many cross-entropy values as there are normal-loss
    # values. Confirm this pairing is intended before relying on it.
    sorted_normal_loss_dist,sorted_cross_entropy_loss_dist = get_clean_loss(sorted_normal_loss_dist, sorted_cross_entropy_loss_dist)

    diffs = np.diff(sorted_cross_entropy_loss_dist)  # consecutive differences
    threshold = -0.02  # e.g., define a large negative drop as dramatic
    dramatic_indices = [i for i, d in enumerate(diffs, start=1) if d < threshold]

    if len(dramatic_indices) ==0:
        adaptive_thr =1
        # No dramatic drop for this prefix: make that explicit so a stale
        # index from an earlier prefix is never reused (and the first
        # prefix does not hit an undefined name).
        adaptive_thr_idx = None
    else:
        adaptive_thr = sorted_cross_entropy_loss_dist[dramatic_indices[0]]
        adaptive_thr_idx = sorted_cross_entropy_loss_dist.index(adaptive_thr)
    if anomaly_thr_method == 'fixed':
        adaptive_thr = 0.01
    elif anomaly_thr_method == 'diff':
        adaptive_thr_dict[prefix] = adaptive_thr

    x = np.arange(1, len(sorted_cross_entropy_loss_dist) + 1)
    y = sorted_cross_entropy_loss_dist

    # Look for the knee in the head of the curve only; widen the window
    # while every value in it is identical.
    m = max(50, int(0.05 * len(y)))
    while m < len(y) and max(y[:m]) == min(y[:m]):
        m += 1
    k_idx = None
    if len(y) >= 2:  # fewer than two points cannot form a curve with a knee
        # 'convex' because the curve bends downward
        knee = KneeLocator(x[:m], y[:m], curve='convex', direction='decreasing', S=10)
        k_idx = knee.knee  # rank of knee, 1-based; None when no knee is found

    if k_idx is None:
        # No knee detected: fall back to the drop-based threshold instead of
        # failing on `None - 1`.
        adaptive_thr_kneed = adaptive_thr
        knee_normal_loss = float('nan')
    else:
        adaptive_thr_kneed = y[k_idx - 1]
        knee_normal_loss = sorted_normal_loss_dist[k_idx - 1]
    drop_normal_loss = float('nan') if adaptive_thr_idx is None else sorted_normal_loss_dist[adaptive_thr_idx]
    print(f"Kneedle knee at position {k_idx}: threshold={round(adaptive_thr_kneed,3)} Cross entropy diff={round(adaptive_thr,3)}")
    print(f"Kneedle Normal loss={knee_normal_loss} Drop diff normal loss={drop_normal_loss}")

    # Score the test set once and flag everything above the Kneedle threshold.
    cross_entropy_loss_dist = cross_entropy_loss(rf_model, X_test, y_test)
    predicted_anomaly = (cross_entropy_loss_dist > adaptive_thr_kneed).astype(int)


    # ----------------------------
    # Step 5: Evaluate the Anomaly Detection
    # ----------------------------

    # zero_division=0 yields the same numbers as the default but without the
    # undefined-metric warnings when a class is never predicted.
    classification = classification_report(gt_anomaly_test, predicted_anomaly, output_dict=True, zero_division=0)
    f1 = classification.get('1', {}).get('f1-score', 0)
    support = classification.get('1', {}).get('support', 0)
    classification_result[prefix] = classification
#     classification_result[prefix]['ROC AUC'] = roc_auc_score(gt_anomaly_test, anom_clf.predict_proba(x_detect_test)[:,1])

# Flatten the per-prefix reports into one table row per prefix length.
classification_result = cleaning_cls_result(classification_result)
revised_cls_result = {}
for i in classification_result.keys():
    revised_cls_result[i] = dict()
    revised_cls_result[i]['Normal precision'] =classification_result[i]['0']['precision']
    revised_cls_result[i]['Normal recall'] =classification_result[i]['0']['recall']
    revised_cls_result[i]['Normal f1-score'] =classification_result[i]['0']['f1-score']
    revised_cls_result[i]['Normal support'] =classification_result[i]['0']['support']

    revised_cls_result[i]['Anomal precision'] =classification_result[i]['1']['precision']
    revised_cls_result[i]['Anomal recall'] =classification_result[i]['1']['recall']
    revised_cls_result[i]['Anomal f1-score'] =classification_result[i]['1']['f1-score']
    revised_cls_result[i]['Anomal support'] =classification_result[i]['1']['support']    

    revised_cls_result[i]['Macro precision'] =classification_result[i]['macro avg']['precision']   
    revised_cls_result[i]['Macro recall'] =classification_result[i]['macro avg']['recall']   
    revised_cls_result[i]['Macro f1-score'] =classification_result[i]['macro avg']['f1-score']   
#     revised_cls_result[i]['ROC AUC'] =classification_result[i]['ROC AUC']   
result_df = pd.DataFrame.from_dict(revised_cls_result).T
result_df.index = result_df.index.set_names(['Prefix length'])
result_df = result_df.reset_index(drop=False)
# result_file_title = '../result/%s_cross_entropy_%sfold_%s_anomal_thr_result.csv'%(dataset, fold, anomaly_thr_method)
# print(result_file_title)
result_df
# result_df.to_csv(result_file_title, index=False)

# loss_prefix_title = '../result/%s_cross_entropy_loss_list.json'%(dataset)
# with open(loss_prefix_title, 'w') as f:
#     json.dump(adaptive_thr_dict, f)

Training window size: 0.8
Total cases with at least 2 events: 500
Encoded feature shape: (500, 17)
Training cases: 400 Test cases: 100
Kneedle knee at position 1: threshold=0.496 Cross entropy diff=0.449
Kneedle Normal loss=0.9976414451579514 Drop diff normal loss=0.9947933838517131


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Total cases with at least 3 events: 500
Encoded feature shape: (500, 34)
Training cases: 400 Test cases: 100
Kneedle knee at position 3: threshold=0.373 Cross entropy diff=0.373
Kneedle Normal loss=0.9873638659589311 Drop diff normal loss=0.9873638659589311
Total cases with at least 4 events: 500
Encoded feature shape: (500, 52)
Training cases: 400 Test cases: 100
Kneedle knee at position 1: threshold=0.428 Cross entropy diff=0.377
Kneedle Normal loss=0.995 Drop diff normal loss=0.9198333333333333
Total cases with at least 5 events: 500
Encoded feature shape: (500, 70)
Training cases: 400 Test cases: 100
Kneedle knee at position 1: threshold=0.442 Cross entropy diff=0.376
Kneedle Normal loss=0.970375935243061 Drop diff normal loss=0.714
Total cases with at least 6 events: 500
Encoded feature shape: (500, 88)
Training cases: 400 Test cases: 100
Kneedle knee at position 1: threshold=0.351 Cross entropy diff=0.303
Kneedle Normal loss=0.9917098125131069 Drop diff normal loss=0.800947866023

Unnamed: 0,Prefix length,Normal precision,Normal recall,Normal f1-score,Normal support,Anomal precision,Anomal recall,Anomal f1-score,Anomal support,Macro precision,Macro recall,Macro f1-score
0,2,0.89,1.0,0.941799,89.0,0.0,0.0,0.0,11.0,0.445,0.5,0.470899
1,3,0.958763,1.0,0.978947,93.0,1.0,0.428571,0.6,7.0,0.979381,0.714286,0.789474
2,4,0.978022,0.978022,0.978022,91.0,0.777778,0.777778,0.777778,9.0,0.8779,0.8779,0.8779
3,5,0.978261,1.0,0.989011,90.0,1.0,0.8,0.888889,10.0,0.98913,0.9,0.93895
4,6,0.988764,1.0,0.99435,88.0,1.0,0.916667,0.956522,12.0,0.994382,0.958333,0.975436
5,7,0.988506,0.988506,0.988506,87.0,0.923077,0.923077,0.923077,13.0,0.955791,0.955791,0.955791
6,8,1.0,0.684783,0.812903,92.0,0.216216,1.0,0.355556,8.0,0.608108,0.842391,0.584229
7,9,1.0,0.851852,0.92,81.0,0.428571,1.0,0.6,9.0,0.714286,0.925926,0.76
8,10,1.0,0.684932,0.813008,73.0,0.258065,1.0,0.410256,8.0,0.629032,0.842466,0.611632
9,11,1.0,0.428571,0.6,70.0,0.111111,1.0,0.2,5.0,0.555556,0.714286,0.4
