In [5]:
import sys
sys.path.append('..')
import datetime
import time

from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
from kneed import KneeLocator
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from xgboost import XGBClassifier
from sklearn.svm import OneClassSVM
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, f1_score, accuracy_score, log_loss, roc_auc_score, make_scorer
import json
from IPython.display import display, HTML
# Inject a CSS rule that overrides the `--jp-notebook-max-width` variable with
# 100%, so the notebook area is not capped at the front-end's default width.
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

In [6]:
def safe_transform_target(encoder, targets, unknown_value=-1):
    """Encode ``targets`` with a fitted encoder, tolerating unseen labels.

    Parameters
    ----------
    encoder : object
        Fitted encoder exposing ``classes_`` (the known labels) and
        ``transform`` (maps a sequence of known labels to their codes).
    targets : iterable
        Hashable labels to encode.
    unknown_value : any, default -1
        Value emitted for every label that is not in ``encoder.classes_``.

    Returns
    -------
    numpy.ndarray
        One entry per element of ``targets``, in the same order.
    """
    known_labels = list(encoder.classes_)
    # Encode all known classes in ONE transform call and look targets up in a
    # dict afterwards; calling ``transform`` once per target repeats the
    # encoder's per-call overhead for every element.
    codes = encoder.transform(known_labels) if known_labels else []
    code_by_label = dict(zip(known_labels, codes))
    return np.array([code_by_label.get(t, unknown_value) for t in targets])

In [7]:
def cleaning_cls_result(classification_result, labels=('0', '1')):
    """Make sure every per-prefix report has an entry for each class label.

    A report produced for a window in which one class never occurs lacks that
    class key entirely; code that later indexes both ``'0'`` and ``'1'`` would
    then raise ``KeyError``. Missing labels are filled with zeroed metrics.

    Parameters
    ----------
    classification_result : dict
        Maps a key (the prefix length in this notebook) to a report dict whose
        keys include class labels as strings. Modified in place.
    labels : iterable of str, default ('0', '1')
        Class labels that must be present in every report.

    Returns
    -------
    dict
        The same ``classification_result`` object, after back-filling.
    """
    for key, report in classification_result.items():
        # Progress/debug output: which entries the report had before filling.
        print(key, report.keys())

        for label in labels:
            if label not in report:
                # A fresh dict per insertion so reports never share state.
                report[label] = {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0.0}
    return classification_result

In [8]:
# ----------------------------
# Step 1: Read and Process the Data
# ----------------------------
def _noise_to_label(value):
    """Map a raw 'noise' cell to 1 (anomaly) for True/1/'True'/'true', else 0."""
    return 1 if value in (True, 1, 'True', 'true') else 0


dataset = '0.099_sample.csv'
df = pd.read_csv("../data/%s" % (dataset))

# Parse the timestamps BEFORE sorting: sorting the raw column would order the
# rows as strings, which is not chronological for every timestamp format.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
# A stable sort keeps the file order of rows that share a timestamp.
df = df.sort_values(by='Timestamp', kind='mergesort')

# Process the 'noise' column:
# - If NaN, assume Normal (0).
# - Otherwise, treat True/1/'True'/'true' as anomaly (1); everything else as Normal (0).
df['noise'] = df['noise'].fillna(0).apply(_noise_to_label)
print(dataset)

# Calculate the cutoff time (the median of all timestamps)
cutoff_time = df['Timestamp'].median()

anomaly_f1_list = []
anomaly_support_list = []
# Prefix lengths evaluated by the experiment loop: 2..15 inclusive.
prefix_range = range(2, 16)

0.099_sample.csv


In [13]:
# ----------------------------
# Experiment: per-prefix anomaly detection with a One-Class SVM
# ----------------------------
# For every prefix length, each case is represented by the one-hot encoding of
# its first `prefix` activities; the label is the noise flag of the prefix-th
# event. A One-Class SVM is fitted on the first `training_size` share of the
# cases and evaluated on the rest.
training_size = 0.8
print('Training window size: %s' % (training_size))
loss_prefix_dict = dict()
classification_result = dict()
anomaly_thr_method = 'diff'
adaptive_thr_dict = dict()
# NOTE(review): the names below look like an IsolationForest grid; nothing in
# this cell reads `tuned_parameters` (the fitted model is a OneClassSVM).
tuned_parameters = {
    'n_estimators': [50, 100, 200],
    'max_samples': ['auto', 128, 256],
    'contamination': [0.05, 0.1, 0.15],
    'max_features': [1.0]
}

for prefix in prefix_range:
    # Extract per case:
    # - The first `prefix` events (activities) as features.
    # - The prefix-th event's noise flag as the ground truth anomaly.
    case_features = []
    ground_truth_anomaly = []

    # NOTE(review): groupby('ID') yields cases in sorted-ID order, so the
    # "ordered" split below follows case IDs - confirm that this matches the
    # intended chronological order of cases.
    for case_id, group in df.groupby('ID'):
        group = group.sort_index()  # assuming the order in the file is the event order
        if len(group) >= prefix:
            events = group['Activity'].values  # adjust 'Activity' if needed
            case_features.append(events[:prefix])
            ground_truth_anomaly.append(group['noise'].iloc[prefix - 1])

    n_cases = len(case_features)
    print("Total cases with at least %s events:" % (prefix), n_cases)

    # Take the first `training_size` share for training, the rest for testing.
    split_index = int(training_size * n_cases)
    if split_index == 0 or split_index == n_cases:
        # No case is long enough, or one side of the split would be empty:
        # fitting/evaluating would fail, so skip this prefix length.
        print("Skipping prefix %s: not enough cases for a train/test split" % (prefix))
        continue

    # Convert to numpy arrays
    case_features = np.array(case_features)
    ground_truth_anomaly = np.array(ground_truth_anomaly)

    # ----------------------------
    # Step 2: Encode the Features
    # ----------------------------
    # NOTE(review): the encoder is fitted on train and test cases together;
    # handle_unknown='ignore' suggests fitting on the training part only may
    # have been intended - left unchanged to keep the reported numbers.
    encoder_features = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    X_encoded = encoder_features.fit_transform(case_features)
    print("Encoded feature shape:", X_encoded.shape)

    # ----------------------------
    # Step 3: Ordered Train/Test Split
    # ----------------------------
    X_train = X_encoded[:split_index]
    X_test = X_encoded[split_index:]
    gt_anomaly_train = ground_truth_anomaly[:split_index]  # not used for fitting
    gt_anomaly_test = ground_truth_anomaly[split_index:]
    print("Training cases:", X_train.shape[0], "Test cases:", X_test.shape[0])

    # ----------------------------
    # Step 4: Fit the One-Class SVM and predict on the test cases
    # ----------------------------
    anom_clf = OneClassSVM(gamma='auto')
    anom_clf.fit(X_train)
    predictions = anom_clf.predict(X_test)
    # predict() returns -1 for outliers; convert to 1 = anomaly, 0 = normal.
    predicted_anomaly = np.where(predictions == -1, 1, 0)

    # ----------------------------
    # Step 5: Evaluate the Anomaly Detection
    # ----------------------------
    # zero_division=0 yields the same 0.0 scores as the default for undefined
    # metrics, without emitting a warning for every such prefix.
    classification_result[prefix] = classification_report(
        gt_anomaly_test, predicted_anomaly, output_dict=True, zero_division=0
    )
    # ROC AUC is not computed: the earlier attempt relied on predict_proba,
    # which OneClassSVM does not provide (decision_function scores would be
    # the natural input to roc_auc_score).

classification_result = cleaning_cls_result(classification_result)

# Flatten each nested report into one row of "<Section> <metric>" columns.
class_metrics = ('precision', 'recall', 'f1-score', 'support')
report_sections = (
    ('Normal', '0', class_metrics),
    ('Anomal', '1', class_metrics),
    ('Macro', 'macro avg', class_metrics[:3]),  # macro support is not reported
)
revised_cls_result = {}
for prefix_length, report in classification_result.items():
    row = {}
    for column_prefix, report_key, metrics in report_sections:
        for metric in metrics:
            row['%s %s' % (column_prefix, metric)] = report[report_key][metric]
    revised_cls_result[prefix_length] = row

result_df = pd.DataFrame.from_dict(revised_cls_result).T
result_df.index = result_df.index.set_names(['Prefix length'])
result_df = result_df.reset_index(drop=False)
# result_file_title = '../result/%s_cross_entropy_%sfold_%s_anomal_thr_result.csv'%(dataset, fold, anomaly_thr_method)
# print(result_file_title)
# result_df.to_csv(result_file_title, index=False)

# loss_prefix_title = '../result/%s_cross_entropy_loss_list.json'%(dataset)
# with open(loss_prefix_title, 'w') as f:
#     json.dump(adaptive_thr_dict, f)
result_df

Training window size: 0.8
Total cases with at least 2 events: 500
Encoded feature shape: (500, 34)
Training cases: 400 Test cases: 100
Total cases with at least 3 events: 500
Encoded feature shape: (500, 52)
Training cases: 400 Test cases: 100
Total cases with at least 4 events: 500
Encoded feature shape: (500, 70)
Training cases: 400 Test cases: 100
Total cases with at least 5 events: 500
Encoded feature shape: (500, 88)
Training cases: 400 Test cases: 100
Total cases with at least 6 events: 500
Encoded feature shape: (500, 105)
Training cases: 400 Test cases: 100
Total cases with at least 7 events: 500
Encoded feature shape: (500, 123)
Training cases: 400 Test cases: 100
Total cases with at least 8 events: 500
Encoded feature shape: (500, 141)
Training cases: 400 Test cases: 100
Total cases with at least 9 events: 446
Encoded feature shape: (446, 159)
Training cases: 356 Test cases: 90
Total cases with at least 10 events: 402
Encoded feature shape: (402, 176)
Training cases: 321 Test

Unnamed: 0,Prefix length,Normal precision,Normal recall,Normal f1-score,Normal support,Anomal precision,Anomal recall,Anomal f1-score,Anomal support,Macro precision,Macro recall,Macro f1-score
0,2,1.0,0.853933,0.921212,89.0,0.458333,1.0,0.628571,11.0,0.729167,0.926966,0.774892
1,3,1.0,0.602151,0.751678,93.0,0.159091,1.0,0.27451,7.0,0.579545,0.801075,0.513094
2,4,0.857143,0.131868,0.228571,91.0,0.081395,0.777778,0.147368,9.0,0.469269,0.454823,0.18797
3,5,0.965517,0.622222,0.756757,90.0,0.190476,0.8,0.307692,10.0,0.577997,0.711111,0.532225
4,6,0.967742,0.340909,0.504202,88.0,0.15942,0.916667,0.271605,12.0,0.563581,0.628788,0.387903
5,7,0.909091,0.45977,0.610687,87.0,0.160714,0.692308,0.26087,13.0,0.534903,0.576039,0.435778
6,8,0.935484,0.315217,0.471545,92.0,0.086957,0.75,0.155844,8.0,0.51122,0.532609,0.313694
7,9,0.893617,0.518519,0.65625,81.0,0.093023,0.444444,0.153846,9.0,0.49332,0.481481,0.405048
8,10,0.925,0.506849,0.654867,73.0,0.121951,0.625,0.204082,8.0,0.523476,0.565925,0.429474
9,11,0.933333,0.4,0.56,70.0,0.066667,0.6,0.12,5.0,0.5,0.5,0.34
