In [25]:
import sys
sys.path.append('..')
import datetime
import time

from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
from kneed import KneeLocator
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, f1_score, accuracy_score, log_loss, roc_auc_score, make_scorer
import json
from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

In [3]:
def safe_transform_target(encoder, targets, unknown_value=-1):
    classes = set(encoder.classes_)
    transformed = []
    for t in targets:
        if t in classes:
            transformed.append(encoder.transform([t])[0])
        else:
            transformed.append(unknown_value)
    return np.array(transformed)

In [4]:
def get_clean_loss(normal_loss_value, cross_entropy_loss_value):
    normal_loss_dist = []
    cross_loss_dist = []
    for pos, prediction in  enumerate(normal_loss_value):
        if prediction != 1:
            cross_loss_dist.append(cross_entropy_loss_value[pos])
            normal_loss_dist.append(prediction)

    return normal_loss_dist, cross_loss_dist

In [5]:
def cleaning_cls_result(classification_result):
    
    for i in classification_result.keys():
        print(i, classification_result[i].keys())

        if '1' not in classification_result[i].keys():
            classification_result[i]['1'] = {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0.0}
    return classification_result

In [6]:
# ----------------------------
# Step 1: Read and Process the Data
# ----------------------------
dataset = '0.099_sample.csv'
df = pd.read_csv("../data/%s" % (dataset))
df = df.sort_values(by='Timestamp')
# Process the 'noise' column:
# - If NaN, assume Normal (0).
# - Otherwise, treat True/1/'True' as anomaly (1); everything else as Normal (0).
df['noise'] = df['noise'].fillna(0).apply(lambda x: 1 if (x == True or x == 1 or x == 'True' or x=='true') else 0)
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
print(dataset)
# Calculate the cutoff time (e.g., the median of all timestamps)
cutoff_time = df['Timestamp'].median()

anomaly_f1_list = []
anomaly_support_list = []
prefix_range = range(2, 16)

0.099_sample.csv


In [36]:
training_size = 0.8
print('Training window size: %s' % (training_size))
loss_prefix_dict =dict()
classification_result = dict()
anomaly_thr_method = 'diff'
adaptive_thr_dict = dict()
tuned_parameters = {
    'n_estimators': [50, 100, 200],
    'max_samples': ['auto', 128, 256],
    'contamination': [0.25],
    'max_features': [1.0]
}

for prefix in prefix_range:    
    # Extract per case:
    # - The first (prefix-1) events (activities) as features.
    # - The prefix-th event's activity as the target.
    # - The prefix-th event's noise flag as the ground truth anomaly.
    case_features = []
    case_targets = []
    ground_truth_anomaly = []

    for case_id, group in df.groupby('ID'):
        group = group.sort_index()  # assuming the order in the file is the event order
        if len(group) >= prefix:
            events = group['Activity'].values  # adjust 'Activity' if needed
            features = events[:prefix]
            noise_flag = group['noise'].iloc[prefix-1]

            case_features.append(features)
            case_targets.append(target_activity)
            ground_truth_anomaly.append(noise_flag)

    # Convert to numpy arrays
    case_features = np.array(case_features)
    case_targets = np.array(case_targets)
    ground_truth_anomaly = np.array(ground_truth_anomaly)
    print("Total cases with at least %s events:" % (prefix), case_features.shape[0])

    # ----------------------------
    # Step 2: Encode the Features and Target
    # ----------------------------
    encoder_features = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    X_encoded = encoder_features.fit_transform(case_features)
    print("Encoded feature shape:", X_encoded.shape)

    # ----------------------------
    # Step 3: Ordered Train/Test Split and Next Event Prediction Model (Stage 1)
    # ----------------------------
    # Instead of a random split, take the first 80% for training and the remaining 20% for testing.
    n_cases = X_encoded.shape[0]
    split_index = int(training_size * n_cases)
    test_index = split_index
    X_train = X_encoded[:split_index]
    X_test = X_encoded[test_index:]
    gt_anomaly_train = ground_truth_anomaly[:split_index]
    gt_anomaly_test = ground_truth_anomaly[test_index:]
    print("Training cases:", X_train.shape[0], "Test cases:", X_test.shape[0])
    train_feature_df = pd.DataFrame(X_train)   
    test_feature_df = pd.DataFrame(X_test)
    
    # ----------------------------
    # Step 3: Train model
    # ----------------------------

    # Train Isolation Forest with the training set. 
    anom_clf = IsolationForest(random_state=42)   
    
    # Use a stratified split to maintain class balance in folds
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

    # Custom scorer: use ROC AUC on the anomaly scores
    def anomaly_scorer(estimator, X, y):
        # decision_function gives anomaly scores (higher = more normal)
        scores = estimator.decision_function(X)
        return roc_auc_score(y, scores)

    scorer = make_scorer(anomaly_scorer, greater_is_better=True)

    # Custom scorer: macro F1 on the binary predictions
    def iso_anomaly_f1_scorer(estimator, X, y_true):
        # Predict returns +1 for inliers, -1 for outliers
        preds = estimator.predict(X)
        # map to binary: 0=normal, 1=anomaly
        y_pred = np.where(preds == -1, 1, 0)
        # compute binary F1 for anomaly class (pos_label=1)
        return f1_score(y_true, y_pred, pos_label=1)

    scorer = make_scorer(iso_f1_scorer, greater_is_better=True)    
    
    # Grid search
    grid_search = GridSearchCV(
        estimator=anom_clf,
        param_grid=tuned_parameters,
        scoring=scorer,
        cv=cv,
        n_jobs=-1,
        verbose=2
    )

    # Fit on training data
    grid_search.fit(train_feature_df, gt_anomaly_train)

    print("Best parameters found:", grid_search.best_params_)
    print("Best CV ROC AUC:", grid_search.best_score_)

    # ----------------------------
    # Step 4: Anomaly Detection (Stage 2) with Isolation Forest
    # ----------------------------
    
    # Use the best estimator for predictions
    best_iso = grid_search.best_estimator_

    # Predict on test set
    test_scores = best_iso.decision_function(test_feature_df)
    predictions = best_iso.predict(test_feature_df)
    # Convert to binary anomaly labels (1 for anomaly, 0 for normal)
    binary_preds = np.where(predictions == -1, 1, 0)

    test_feature_df['anomaly_score'] = test_scores
    test_feature_df['anomaly'] = binary_preds

    # Evaluate if you have true labels for test set
    test_auc = roc_auc_score(gt_anomaly_test, test_scores)
    print(f"Test ROC AUC: {test_auc:.3f}")

    # Calculate anomaly scores and classify anomalies
    predicted_anomaly = test_feature_df['anomaly']
    
    # ----------------------------
    # Step 5: Evaluate the Anomaly Detection
    # ----------------------------

    classification = classification_report(gt_anomaly_test, predicted_anomaly, output_dict=True)
    f1 = classification.get('1', {}).get('f1-score', 0)
    support = classification.get('1', {}).get('support', 0)
    classification_result[prefix] = classification
#     classification_result[prefix]['ROC AUC'] = roc_auc_score(gt_anomaly_test, anom_clf.predict_proba(x_detect_test)[:,1])

classification_result = cleaning_cls_result(classification_result)
revised_cls_result = {}
for i in classification_result.keys():
    revised_cls_result[i] = dict()
    revised_cls_result[i]['Normal precision'] =classification_result[i]['0']['precision']
    revised_cls_result[i]['Normal recall'] =classification_result[i]['0']['recall']
    revised_cls_result[i]['Normal f1-score'] =classification_result[i]['0']['f1-score']
    revised_cls_result[i]['Normal support'] =classification_result[i]['0']['support']

    revised_cls_result[i]['Anomal precision'] =classification_result[i]['1']['precision']
    revised_cls_result[i]['Anomal recall'] =classification_result[i]['1']['recall']
    revised_cls_result[i]['Anomal f1-score'] =classification_result[i]['1']['f1-score']
    revised_cls_result[i]['Anomal support'] =classification_result[i]['1']['support']    

    revised_cls_result[i]['Macro precision'] =classification_result[i]['macro avg']['precision']   
    revised_cls_result[i]['Macro recall'] =classification_result[i]['macro avg']['recall']   
    revised_cls_result[i]['Macro f1-score'] =classification_result[i]['macro avg']['f1-score']   
#     revised_cls_result[i]['ROC AUC'] =classification_result[i]['ROC AUC']   
result_df = pd.DataFrame.from_dict(revised_cls_result).T
result_df.index = result_df.index.set_names(['Prefix length'])
result_df = result_df.reset_index(drop=False)
# result_file_title = '../result/%s_cross_entropy_%sfold_%s_anomal_thr_result.csv'%(dataset, fold, anomaly_thr_method)
# print(result_file_title)
result_df
# result_df.to_csv(result_file_title, index=False)

# loss_prefix_title = '../result/%s_cross_entropy_loss_list.json'%(dataset)
# with open(loss_prefix_title, 'w') as f:
#     json.dump(adaptive_thr_dict, f)

Training window size: 0.8
Total cases with at least 2 events: 500
Encoded feature shape: (500, 34)
Training cases: 400 Test cases: 100
Fitting 4 folds for each of 9 candidates, totalling 36 fits




Best parameters found: {'contamination': 0.25, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 50}
Best CV ROC AUC: nan
Test ROC AUC: 0.105
Total cases with at least 3 events: 500
Encoded feature shape: (500, 52)
Training cases: 400 Test cases: 100
Fitting 4 folds for each of 9 candidates, totalling 36 fits




Best parameters found: {'contamination': 0.25, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 50}
Best CV ROC AUC: nan
Test ROC AUC: 0.518
Total cases with at least 4 events: 500
Encoded feature shape: (500, 70)
Training cases: 400 Test cases: 100
Fitting 4 folds for each of 9 candidates, totalling 36 fits




Best parameters found: {'contamination': 0.25, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 50}
Best CV ROC AUC: nan
Test ROC AUC: 0.304
Total cases with at least 5 events: 500
Encoded feature shape: (500, 88)
Training cases: 400 Test cases: 100
Fitting 4 folds for each of 9 candidates, totalling 36 fits




Best parameters found: {'contamination': 0.25, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 50}
Best CV ROC AUC: nan
Test ROC AUC: 0.289
Total cases with at least 6 events: 500
Encoded feature shape: (500, 105)
Training cases: 400 Test cases: 100
Fitting 4 folds for each of 9 candidates, totalling 36 fits




Best parameters found: {'contamination': 0.25, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 50}
Best CV ROC AUC: nan
Test ROC AUC: 0.565
Total cases with at least 7 events: 500
Encoded feature shape: (500, 123)
Training cases: 400 Test cases: 100
Fitting 4 folds for each of 9 candidates, totalling 36 fits




Best parameters found: {'contamination': 0.25, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 50}
Best CV ROC AUC: nan
Test ROC AUC: 0.257
Total cases with at least 8 events: 500
Encoded feature shape: (500, 141)
Training cases: 400 Test cases: 100
Fitting 4 folds for each of 9 candidates, totalling 36 fits




Best parameters found: {'contamination': 0.25, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 50}
Best CV ROC AUC: nan
Test ROC AUC: 0.365
Total cases with at least 9 events: 446
Encoded feature shape: (446, 159)
Training cases: 356 Test cases: 90
Fitting 4 folds for each of 9 candidates, totalling 36 fits




Best parameters found: {'contamination': 0.25, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 50}
Best CV ROC AUC: nan
Test ROC AUC: 0.619
Total cases with at least 10 events: 402
Encoded feature shape: (402, 176)
Training cases: 321 Test cases: 81
Fitting 4 folds for each of 9 candidates, totalling 36 fits




Best parameters found: {'contamination': 0.25, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 50}
Best CV ROC AUC: nan
Test ROC AUC: 0.366
Total cases with at least 11 events: 373
Encoded feature shape: (373, 189)
Training cases: 298 Test cases: 75
Fitting 4 folds for each of 9 candidates, totalling 36 fits




Best parameters found: {'contamination': 0.25, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 50}
Best CV ROC AUC: nan
Test ROC AUC: 0.571
Total cases with at least 12 events: 318
Encoded feature shape: (318, 205)
Training cases: 254 Test cases: 64
Fitting 4 folds for each of 9 candidates, totalling 36 fits




Best parameters found: {'contamination': 0.25, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 50}
Best CV ROC AUC: nan
Test ROC AUC: 0.529
Total cases with at least 13 events: 241
Encoded feature shape: (241, 217)
Training cases: 192 Test cases: 49
Fitting 4 folds for each of 9 candidates, totalling 36 fits




Best parameters found: {'contamination': 0.25, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 50}
Best CV ROC AUC: nan
Test ROC AUC: 0.220
Total cases with at least 14 events: 201
Encoded feature shape: (201, 221)
Training cases: 160 Test cases: 41
Fitting 4 folds for each of 9 candidates, totalling 36 fits




Best parameters found: {'contamination': 0.25, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 50}
Best CV ROC AUC: nan
Test ROC AUC: 0.202
Total cases with at least 15 events: 171
Encoded feature shape: (171, 224)
Training cases: 136 Test cases: 35
Fitting 4 folds for each of 9 candidates, totalling 36 fits




Best parameters found: {'contamination': 0.25, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 50}
Best CV ROC AUC: nan
Test ROC AUC: 0.588
2 dict_keys(['0', '1', 'accuracy', 'macro avg', 'weighted avg'])
3 dict_keys(['0', '1', 'accuracy', 'macro avg', 'weighted avg'])
4 dict_keys(['0', '1', 'accuracy', 'macro avg', 'weighted avg'])
5 dict_keys(['0', '1', 'accuracy', 'macro avg', 'weighted avg'])
6 dict_keys(['0', '1', 'accuracy', 'macro avg', 'weighted avg'])
7 dict_keys(['0', '1', 'accuracy', 'macro avg', 'weighted avg'])
8 dict_keys(['0', '1', 'accuracy', 'macro avg', 'weighted avg'])
9 dict_keys(['0', '1', 'accuracy', 'macro avg', 'weighted avg'])
10 dict_keys(['0', '1', 'accuracy', 'macro avg', 'weighted avg'])
11 dict_keys(['0', '1', 'accuracy', 'macro avg', 'weighted avg'])
12 dict_keys(['0', '1', 'accuracy', 'macro avg', 'weighted avg'])
13 dict_keys(['0', '1', 'accuracy', 'macro avg', 'weighted avg'])
14 dict_keys(['0', '1', 'accuracy', 'macro avg', 'weighted avg']

Unnamed: 0,Prefix length,Normal precision,Normal recall,Normal f1-score,Normal support,Anomal precision,Anomal recall,Anomal f1-score,Anomal support,Macro precision,Macro recall,Macro f1-score
0,2,1.0,0.853933,0.921212,89.0,0.458333,1.0,0.628571,11.0,0.729167,0.926966,0.774892
1,3,0.934211,0.763441,0.840237,93.0,0.083333,0.285714,0.129032,7.0,0.508772,0.524578,0.484634
2,4,0.898734,0.78022,0.835294,91.0,0.047619,0.111111,0.066667,9.0,0.473177,0.445665,0.45098
3,5,0.909091,0.777778,0.838323,90.0,0.130435,0.3,0.181818,10.0,0.519763,0.538889,0.510071
4,6,0.883117,0.772727,0.824242,88.0,0.130435,0.25,0.171429,12.0,0.506776,0.511364,0.497835
5,7,0.905405,0.770115,0.832298,87.0,0.230769,0.461538,0.307692,13.0,0.568087,0.615827,0.569995
6,8,0.947368,0.782609,0.857143,92.0,0.166667,0.5,0.25,8.0,0.557018,0.641304,0.553571
7,9,0.893939,0.728395,0.802721,81.0,0.083333,0.222222,0.121212,9.0,0.488636,0.475309,0.461967
8,10,0.918033,0.767123,0.835821,73.0,0.15,0.375,0.214286,8.0,0.534016,0.571062,0.525053
9,11,0.949153,0.8,0.868217,70.0,0.125,0.4,0.190476,5.0,0.537076,0.6,0.529347
