In [3]:
import pandas as pd
import numpy as np
# Load only the four columns the pipeline actually uses. The other columns
# (LineId, Date, Admin, Month, Day, Time, EventTemplate, Content) were
# previously parsed and then dropped straight away; skipping them at read time
# yields the same frame without holding them in memory for up to 2M rows.
df = pd.read_csv(
    "tbird.log_structured.csv",
    nrows=2_000_000,
    usecols=['Label', 'Id', 'AdminAddr', 'EventId'],
)

# Encode every distinct EventId as a 1-based integer, numbered in order of
# first appearance in the file.
unique_events = df['EventId'].unique()
print(f"Unique events: {len(unique_events)}")
events_map = {event: idx for idx, event in enumerate(unique_events, start=1)}
df['event'] = df['EventId'].map(events_map)

# Same 1-based, first-appearance encoding for the AdminAddr column.
unique_machines = df['AdminAddr'].unique()
print(f"Unique machines: {len(unique_machines)}")
machines_map = {mac: idx for idx, mac in enumerate(unique_machines, start=1)}
df['machine'] = df['AdminAddr'].map(machines_map)

# 'Id' is used as the event time (presumably epoch seconds -- confirm against
# the dataset description).
df['timestamp'] = df['Id'].astype(int)

# Many rows share the same timestamp, so the sort must be stable: the default
# quicksort gives no guarantee about the relative order of ties, which would
# silently shuffle log lines that fall in the same time unit.
df = df.sort_values(by='timestamp', kind='mergesort')

# Re-base timestamps so the earliest event is at 0.
min_timestamp = df['timestamp'].min()
df['timestamp'] = df['timestamp'] - min_timestamp

# Binary target: a Label of '-' means the line is not an alert (0); any other
# value marks an alert (1).
df['label'] = np.where(df['Label'] == '-', 0, 1)

# Keep only the encoded columns: event, machine, timestamp, label.
df = df.drop(columns=['Id', 'Label', 'AdminAddr', 'EventId'])
df

Unique events: 760
Unique machines: 4533


Unnamed: 0,event,machine,timestamp,label
0,1,1,0,0
1,2,2,570,0
2,2,2,572,0
3,2,2,605,0
4,2,2,606,0
...,...,...,...,...
1999995,283,3458,234564,0
1999996,22,3458,234564,0
1999997,22,3535,234564,0
1999998,22,3480,234564,0


In [4]:
import sys
import os

# The `tokenizer` module is presumably located in the parent of the working
# directory (confirm the repo layout), so that directory has to be on the
# module search path. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from tokenizer import tokenize

# Builds the tokenizer from the encoded frame. save_path is presumably where
# the tokenizer artifacts are written -- verify in tokenizer.py.
tokenize(df, save_path="./numeric_tokenizer")

Created 760 priority tokens





In [6]:
from train import train

# Train on the encoded frame using the tokenizer saved at ./numeric_tokenizer.
# Assigning the result to `_` keeps the return value from being displayed as
# the cell output.
# NOTE(review): num_queries_al presumably sets the number of active-learning
# query rounds, and debug=True presumably selects a reduced/verbose run --
# confirm both in train.py.
_ = train(
    df,
    tokenizer_path="./numeric_tokenizer",
    debug=True,
    num_queries_al=5,
)


Using device: cuda
Starting training...

Epoch 1/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 24.88it/s, loss=0.7053, acc=0.5000, lr=2.00e-05]



Epoch 1 Summary:
Train Loss: 0.6906, Train Acc: 0.5625
Learning Rate: 2.00e-05

Epoch 2/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 16.93it/s, loss=0.6260, acc=0.7500, lr=1.78e-05]



Epoch 2 Summary:
Train Loss: 0.6473, Train Acc: 0.6367
Learning Rate: 1.78e-05

Epoch 3/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 20.69it/s, loss=0.6036, acc=1.0000, lr=1.56e-05]



Epoch 3 Summary:
Train Loss: 0.5955, Train Acc: 0.8242
Learning Rate: 1.56e-05

Epoch 4/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 22.23it/s, loss=0.5508, acc=0.7500, lr=1.33e-05]



Epoch 4 Summary:
Train Loss: 0.5469, Train Acc: 0.8125
Learning Rate: 1.33e-05

Epoch 5/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 20.26it/s, loss=0.5153, acc=0.5000, lr=1.11e-05]



Epoch 5 Summary:
Train Loss: 0.5132, Train Acc: 0.7578
Learning Rate: 1.11e-05

Epoch 6/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 19.45it/s, loss=0.4211, acc=1.0000, lr=8.89e-06]



Epoch 6 Summary:
Train Loss: 0.4694, Train Acc: 0.9141
Learning Rate: 8.89e-06

Epoch 7/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 18.88it/s, loss=0.3698, acc=1.0000, lr=6.67e-06]



Epoch 7 Summary:
Train Loss: 0.4310, Train Acc: 0.9414
Learning Rate: 6.67e-06

Epoch 8/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 21.54it/s, loss=0.4771, acc=0.8750, lr=4.44e-06]



Epoch 8 Summary:
Train Loss: 0.4510, Train Acc: 0.9023
Learning Rate: 4.44e-06

Epoch 9/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 21.31it/s, loss=0.4979, acc=0.8750, lr=2.22e-06]



Epoch 9 Summary:
Train Loss: 0.4339, Train Acc: 0.9023
Learning Rate: 2.22e-06

Epoch 10/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 22.20it/s, loss=0.4810, acc=0.7500, lr=0.00e+00]



Epoch 10 Summary:
Train Loss: 0.4301, Train Acc: 0.8672
Learning Rate: 0.00e+00

Training completed!
Best training accuracy: 0.9414




Train accuracy: 0.99800
Test accuracy: 0.99989
Test F1 score: 0.99600
Confusion Matrix (Test):
[[1277745      20]
 [    125   18074]]


100%|██████████| 1295764/1295764 [10:16<00:00, 2102.07it/s]


Train accuracy: 0.99833
Test accuracy: 0.99963
Test F1 score: 0.98693
Confusion Matrix (Test):
[[1277402     363]
 [    116   18083]]


100%|██████████| 1295764/1295764 [08:46<00:00, 2458.90it/s]


Train accuracy: 0.99857
Test accuracy: 0.99980
Test F1 score: 0.99297
Confusion Matrix (Test):
[[1277627     138]
 [    118   18081]]


100%|██████████| 1295764/1295764 [09:50<00:00, 2195.14it/s]


Train accuracy: 0.99875
Test accuracy: 0.99985
Test F1 score: 0.99456
Confusion Matrix (Test):
[[1277682      83]
 [    115   18084]]


100%|██████████| 1295764/1295764 [08:59<00:00, 2403.08it/s]


Train accuracy: 0.99889
Test accuracy: 0.99988
Test F1 score: 0.99576
Confusion Matrix (Test):
[[1277733      32]
 [    122   18077]]


100%|██████████| 1295764/1295764 [06:07<00:00, 3524.07it/s]


Train accuracy: 0.99900
Test accuracy: 0.99988
Test F1 score: 0.99562
Confusion Matrix (Test):
[[1277723      42]
 [    117   18082]]
