In [1]:
import pandas as pd
import os
import sys
import numpy as np

# Put the parent directory on the import path (presumably where the
# project-local `tokenizer` and `train` modules imported by later cells live --
# confirm). The membership check keeps re-runs of this cell from appending the
# same entry over and over.
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Read only the four columns this cell consumes. The other columns (LineId,
# Date, Time, NodeRepeat, Type, Component, Level, Content, EventTemplate,
# ParameterList) used to be loaded and dropped on the very next line; skipping
# them at parse time avoids parsing and holding the unused text columns.
df = pd.read_csv(
    "BGL.log_structured.csv",
    usecols=["Label", "Timestamp", "Node", "EventId"],
)

# Encode every distinct EventId as an integer id, numbered from 1 in order of
# first appearance in the file (ids are assigned before the sort below).
unique_events = df['EventId'].unique()
print(f"Unique events: {len(unique_events)}")
events_map = {event: idx for idx, event in enumerate(unique_events, start=1)}
df['event'] = df['EventId'].map(events_map)

# Same 1-based, first-appearance encoding for the Node column.
unique_machines = df['Node'].unique()
print(f"Unique machines: {len(unique_machines)}")
machines_map = {mac: idx for idx, mac in enumerate(unique_machines, start=1)}
df['machine'] = df['Node'].map(machines_map)

# Integer timestamps, ordered chronologically. The sort must be stable: many
# rows share the same timestamp, and pandas' default quicksort does not
# guarantee that tied rows keep their original file order, which would
# scramble the event sequence within a timestamp. 'mergesort' is stable.
df['timestamp'] = df['Timestamp'].astype(int)
df = df.sort_values(by='timestamp', kind='mergesort')

# Shift timestamps so the earliest row sits at 0.
min_timestamp = df['timestamp'].min()
df['timestamp'] = df['timestamp'] - min_timestamp

# Binary target: a Label of '-' marks a non-alert row (0); any other value is
# treated as an alert (1).
df['label'] = np.where(df['Label'] == '-', 0, 1)

# Keep only the encoded columns: event, machine, timestamp, label.
df = df.drop(columns=['Timestamp', 'Label', 'Node', 'EventId'])

df

Unique events: 1848
Unique machines: 69252


Unnamed: 0,event,machine,timestamp,label
0,1,1,0,0
1,1,1,0,0
2,1,1,0,0
3,1,1,0,0
4,1,1,0,0
...,...,...,...,...
4713488,1058,6497,18551835,1
4713489,1058,6244,18551835,1
4713490,1058,6414,18551835,1
4713491,1058,6276,18551835,1


In [2]:
from tokenizer import tokenize

# Run the project-local tokenizer step on the preprocessed frame. Judging by
# the `save_path` argument it presumably persists its result under
# ./numeric_tokenizer, the same path the training cell passes as
# `tokenizer_path` -- confirm against tokenizer.py.
tokenize(
    df,
    save_path="./numeric_tokenizer",
)

Created 1848 priority tokens





In [3]:
from train import train

# Launch training on the preprocessed frame, pointing it at the tokenizer path
# used by the previous cell. The return value is bound to `_` so the notebook
# does not render it as cell output.
# NOTE(review): `num_queries_al` appears to be the number of active-learning
# query rounds (the recorded run prints iterations #0 through #4) -- confirm
# against train.py.
_ = train(
    df,
    tokenizer_path="./numeric_tokenizer",
    debug=True,
    num_queries_al=5,
)


Using device: cuda

Epoch 1/2


MLM Training: 100%|██████████| 9671/9671 [21:09<00:00,  7.62it/s, loss=1.4416, acc=0.6774, lr=2.50e-05]


Train Loss: 1.6821, Train Acc: 0.6579

Epoch 2/2


MLM Training: 100%|██████████| 9671/9671 [21:32<00:00,  7.48it/s, loss=1.1508, acc=0.7391, lr=0.00e+00]


Train Loss: 1.2936, Train Acc: 0.7024
Best loss: 1.2936
loaded 39 tensors
Model saved to ./pretrained_classifier




Train accuracy: 0.96000
Test accuracy: 0.94848
Test F1 score: 0.76030
Confusion Matrix (Test):
[[2145810  104801]
 [  22755  202296]]


100%|██████████| 2475662/2475662 [12:24<00:00, 3324.67it/s]


Iteration #0 (200 samples)

Train accuracy: 0.95000
Test accuracy: 0.95396
Test F1 score: 0.78369
Confusion Matrix (Test):
[[2155191   95420]
 [  18568  206483]]


100%|██████████| 2475662/2475662 [12:40<00:00, 3256.70it/s]


Iteration #1 (300 samples)

Train accuracy: 0.98000
Test accuracy: 0.96516
Test F1 score: 0.83223
Confusion Matrix (Test):
[[2175452   75159]
 [  11103  213948]]


100%|██████████| 2475662/2475662 [12:44<00:00, 3239.85it/s]


Iteration #2 (400 samples)

Train accuracy: 0.98500
Test accuracy: 0.99021
Test F1 score: 0.94669
Confusion Matrix (Test):
[[2236251   14360]
 [   9876  215175]]


100%|██████████| 2475662/2475662 [12:32<00:00, 3288.67it/s] 


Iteration #3 (500 samples)

Train accuracy: 1.00000
Test accuracy: 0.99711
Test F1 score: 0.98396
Confusion Matrix (Test):
[[2249020    1591]
 [   5567  219484]]


100%|██████████| 2475662/2475662 [18:24<00:00, 2241.46it/s] 


Iteration #4 (600 samples)

Train accuracy: 0.99333
Test accuracy: 0.99744
Test F1 score: 0.98579
Confusion Matrix (Test):
[[2249422    1189]
 [   5152  219899]]
