In [1]:
import pandas as pd
import os
import sys
import numpy as np

# Put the parent directory on the module search path
# (presumably where the project modules imported in later cells live — confirm).
sys.path.append(os.path.abspath(".."))

# Load the structured BGL log and discard the columns that are not used below.
# Rebinding instead of inplace=True keeps every step an explicit new frame.
df = pd.read_csv("BGL.log_structured.csv")
df = df.drop(columns=['LineId','Date', 'Time', 'NodeRepeat', 'Type', 'Component', 'Level', 'Content', 'EventTemplate', 'ParameterList'])

# Encode every distinct EventId as a 1-based integer, numbered in order of
# first appearance in the file.
unique_events = df['EventId'].unique()
print(f"Unique events: {len(unique_events)}")
events_map = {event: idx + 1 for idx, event in enumerate(unique_events)}
df['event'] = df['EventId'].map(events_map)

# Same 1-based encoding for the Node column.
unique_machines = df['Node'].unique()
print(f"Unique machines: {len(unique_machines)}")
machines_map = {mac: idx + 1 for idx, mac in enumerate(unique_machines)}
df['machine'] = df['Node'].map(machines_map)

# Integer timestamps, ordered chronologically and rebased so the earliest is 0.
# Many rows share the same timestamp value, and the default sort algorithm
# (quicksort) is not stable, so it may shuffle rows within a tie. A stable
# sort keeps tied rows in their original file order.
df['timestamp'] = df['Timestamp'].astype(int)
df = df.sort_values(by=['timestamp'], kind='mergesort')
min_timestamp = df['timestamp'].min()
df['timestamp'] = df['timestamp'] - min_timestamp

# Binary target derived from the Label column.
df['label'] = np.where(
    df['Label'] == '-',  # a Label of '-' means the line is not an alert
    0,  # not an alert
    1   # is an alert
)

# Only the derived columns (event, machine, timestamp, label) are needed from here on.
df = df.drop(columns=['Timestamp', 'Label', 'Node', 'EventId'])

df

Unique events: 1848
Unique machines: 69252


Unnamed: 0,event,machine,timestamp,label
0,1,1,0,0
1,1,1,0,0
2,1,1,0,0
3,1,1,0,0
4,1,1,0,0
...,...,...,...,...
4713488,1058,6497,18551835,1
4713489,1058,6244,18551835,1
4713490,1058,6414,18551835,1
4713491,1058,6276,18551835,1


In [2]:
from tokenizer import tokenize

# Run the project tokenizer over the preprocessed frame. The save location is
# the same string that the training cell passes to train() as tokenizer_path.
tokenizer_dir = "./numeric_tokenizer"
tokenize(df, save_path=tokenizer_dir)

Created 1848 priority tokens





In [3]:
from train import train

# Launch training on the preprocessed frame. tokenizer_path is the same string
# that the tokenization cell passed to tokenize() as save_path.
# NOTE(review): no random seed is set anywhere in this notebook — confirm
# whether train() seeds internally, otherwise runs are not reproducible.
train_options = dict(
    tokenizer_path="./numeric_tokenizer",
    debug=True,
    num_queries_al=5,
)
# Kept as the last expression so the notebook displays the returned object.
train(df, **train_options)


Using device: cuda
Starting training...

Epoch 1/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00,  6.50it/s, loss=0.7041, acc=0.6250, lr=2.00e-05]



Epoch 1 Summary:
Train Loss: 0.7143, Train Acc: 0.5234
Learning Rate: 2.00e-05

Epoch 2/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 38.13it/s, loss=0.7882, acc=0.2500, lr=1.78e-05]



Epoch 2 Summary:
Train Loss: 0.7118, Train Acc: 0.4648
Learning Rate: 1.78e-05

Epoch 3/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 38.42it/s, loss=0.6844, acc=0.6250, lr=1.56e-05]



Epoch 3 Summary:
Train Loss: 0.6899, Train Acc: 0.5430
Learning Rate: 1.56e-05

Epoch 4/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 38.74it/s, loss=0.6705, acc=0.6250, lr=1.33e-05]



Epoch 4 Summary:
Train Loss: 0.6788, Train Acc: 0.5898
Learning Rate: 1.33e-05

Epoch 5/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 37.14it/s, loss=0.6392, acc=0.8750, lr=1.11e-05]



Epoch 5 Summary:
Train Loss: 0.6730, Train Acc: 0.6289
Learning Rate: 1.11e-05

Epoch 6/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 38.36it/s, loss=0.7169, acc=0.5000, lr=8.89e-06]



Epoch 6 Summary:
Train Loss: 0.6930, Train Acc: 0.5508
Learning Rate: 8.89e-06

Epoch 7/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 37.88it/s, loss=0.6914, acc=0.5000, lr=6.67e-06]



Epoch 7 Summary:
Train Loss: 0.6795, Train Acc: 0.5664
Learning Rate: 6.67e-06

Epoch 8/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 41.06it/s, loss=0.6942, acc=0.5000, lr=4.44e-06]



Epoch 8 Summary:
Train Loss: 0.6816, Train Acc: 0.5586
Learning Rate: 4.44e-06

Epoch 9/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 38.33it/s, loss=0.7138, acc=0.5000, lr=2.22e-06]



Epoch 9 Summary:
Train Loss: 0.6838, Train Acc: 0.5820
Learning Rate: 2.22e-06

Epoch 10/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 40.31it/s, loss=0.7184, acc=0.3750, lr=0.00e+00]



Epoch 10 Summary:
Train Loss: 0.6884, Train Acc: 0.5352
Learning Rate: 0.00e+00

Training completed!
Best training accuracy: 0.6289




Train accuracy: 0.95200
Test accuracy: 0.94652
Test F1 score: 0.95226
Confusion Matrix (Test):
[[107557    333]
 [ 13589 138844]]


100%|██████████| 260124/260124 [00:48<00:00, 5311.89it/s]


Train accuracy: 0.98167
Test accuracy: 0.98232
Test F1 score: 0.98494
Confusion Matrix (Test):
[[105258   2632]
 [  1970 150463]]


100%|██████████| 260124/260124 [00:47<00:00, 5452.74it/s]


Train accuracy: 0.93571
Test accuracy: 0.97208
Test F1 score: 0.97642
Confusion Matrix (Test):
[[102592   5298]
 [  1970 150463]]


100%|██████████| 260124/260124 [00:56<00:00, 4612.93it/s]


Train accuracy: 0.99250
Test accuracy: 0.99327
Test F1 score: 0.99425
Confusion Matrix (Test):
[[106928    962]
 [   791 151642]]


100%|██████████| 260124/260124 [00:53<00:00, 4823.28it/s]


Train accuracy: 0.99222
Test accuracy: 0.99503
Test F1 score: 0.99575
Confusion Matrix (Test):
[[107159    731]
 [   564 151869]]


100%|██████████| 260124/260124 [01:28<00:00, 2930.99it/s]


Train accuracy: 0.99200
Test accuracy: 0.99688
Test F1 score: 0.99734
Confusion Matrix (Test):
[[107539    351]
 [   461 151972]]


<small_text.integrations.transformers.classifiers.classification.TransformerBasedClassification at 0x7c475b9e4860>