In [1]:
import pandas as pd
import os
import sys

# Put the parent directory on the import path (presumably where the
# project-local `tokenizer` and `train` modules imported further down
# live -- confirm against the repository layout).
parent_dir = os.path.abspath("..")
sys.path.append(parent_dir)

# Load the raw alert table; the path is relative to the notebook's working directory.
df = pd.read_csv("harrison_alerts.csv")
df

Unnamed: 0,time,name,ip,host,short,time_label,event_label
0,1643932807,Suricata: Alert - ET INFO Observed DNS Query t...,192.168.131.109,internal_share,S-Dns-Qry3,false_positive,dnsteal
1,1643932807,Wazuh: First time this IDS alert is generated.,192.168.131.109,internal_share,W-All-Ids,false_positive,-
2,1643932807,Wazuh: First time this IDS alert is generated.,192.168.128.1,inet-firewall,W-All-Ids,false_positive,-
3,1643932807,Suricata: Alert - ET INFO Observed DNS Query t...,192.168.128.1,inet-firewall,S-Dns-Qry3,false_positive,dnsteal
4,1643932819,Suricata: Alert - ET INFO Observed DNS Query t...,192.168.131.109,internal_share,S-Dns-Qry3,false_positive,dnsteal
...,...,...,...,...,...,...,...
593943,1644362611,AMiner: Unusual occurrence frequencies of DNS ...,172.28.255.254,inet-dns,A-Dns-Clc1,false_positive,-
593944,1644363011,AMiner: Unusual occurrence frequencies of DNS ...,192.168.128.1,inet-firewall,A-Dns-Clc1,false_positive,-
593945,1644364024,AMiner: Unusual occurrence frequencies of DNS ...,172.28.255.254,inet-dns,A-Dns-Clc1,false_positive,-
593946,1644364576,AMiner: Unusual occurrence frequencies of DNS ...,172.28.255.254,inet-dns,A-Dns-Clc1,false_positive,-


In [2]:
# Turn the raw alert table into the all-numeric frame used by the later cells:
# columns `timestamp`, `event`, `machine`, `label`.
#
# The result is built on a separate frame and `df` is rebound only on the last
# step. Running this cell on an already-processed `df` therefore fails fast and
# leaves `df` untouched, instead of silently re-mapping the integer codes and
# then crashing half-way through.
required_columns = {'time', 'name', 'host', 'ip', 'short', 'time_label', 'event_label'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(
        f"Raw alert columns missing: {sorted(missing_columns)}. "
        "This cell expects the frame exactly as loaded from the CSV; "
        "re-run the loading cell before re-running this one."
    )

alerts = df.rename(columns={
    'time': 'timestamp',
    'name': 'event',
    'host': 'machine',
})

# Integer ids are assigned in order of first appearance, starting at 1.
unique_events = alerts['event'].unique()
event_map = {evt: idx for idx, evt in enumerate(unique_events, start=1)}
unique_machines = alerts['machine'].unique()
machines_map = {mac: idx for idx, mac in enumerate(unique_machines, start=1)}

alerts = alerts.assign(
    event=alerts['event'].map(event_map),
    machine=alerts['machine'].map(machines_map),
    # Binary label: 0 when `event_label` is '-', 1 for anything else
    # (0=normal, 1=anomalous).
    label=alerts['event_label'].ne('-').astype('int64'),
)

alerts = (
    alerts
    .drop(columns=['event_label', 'time_label', 'short', 'ip'])
    # A stable sort keeps alerts that share a timestamp in their original CSV
    # order; the default quicksort gives no such guarantee.
    .sort_values(by='timestamp', kind='stable')
    .reset_index(drop=True)
)

# Re-base timestamps so the earliest alert is at 0.
min_timestamp = alerts['timestamp'].min()
alerts['timestamp'] = alerts['timestamp'] - min_timestamp

df = alerts
df

Unnamed: 0,timestamp,event,machine,label
0,0,45,7,0
1,0,47,7,0
2,0,48,7,0
3,0,45,7,0
4,0,46,7,0
...,...,...,...,...
593943,431238,5,3,0
593944,431243,5,3,0
593945,431243,5,3,0
593946,431775,64,10,0


In [3]:
from tokenizer import tokenize

# Run the project-local `tokenize` on the preprocessed frame. Its artifacts
# are presumably written under `save_path`; the training cell passes this
# same path as `tokenizer_path`.
tokenizer_dir = "./numeric_tokenizer"
tokenize(df, save_path=tokenizer_dir)

Created 78 priority tokens





In [4]:
from train import train

# `train` returns a classifier object (a small_text
# TransformerBasedClassification in the recorded run). Bind it to a name so
# later cells can use it, rather than leaving it reachable only through
# IPython's `_` / `Out[N]` cache. Assigning also suppresses the
# memory-address repr that the bare call used to echo.
# `tokenizer_path` must match the `save_path` used in the tokenizer cell.
classifier = train(
    df,
    tokenizer_path="./numeric_tokenizer",
    debug=True,
    num_queries_al=5,
)


Using device: cuda
Starting training...

Epoch 1/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00,  6.95it/s, loss=0.6650, acc=0.7500, lr=2.00e-05]



Epoch 1 Summary:
Train Loss: 0.7395, Train Acc: 0.5430
Learning Rate: 2.00e-05

Epoch 2/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 40.31it/s, loss=0.5735, acc=0.8750, lr=1.78e-05]



Epoch 2 Summary:
Train Loss: 0.6347, Train Acc: 0.6875
Learning Rate: 1.78e-05

Epoch 3/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 42.93it/s, loss=0.4693, acc=0.8750, lr=1.56e-05]



Epoch 3 Summary:
Train Loss: 0.5131, Train Acc: 0.9102
Learning Rate: 1.56e-05

Epoch 4/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 42.83it/s, loss=0.5434, acc=0.8750, lr=1.33e-05]



Epoch 4 Summary:
Train Loss: 0.4612, Train Acc: 0.9297
Learning Rate: 1.33e-05

Epoch 5/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 41.14it/s, loss=0.3261, acc=1.0000, lr=1.11e-05]



Epoch 5 Summary:
Train Loss: 0.3660, Train Acc: 0.9688
Learning Rate: 1.11e-05

Epoch 6/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 42.72it/s, loss=0.3463, acc=0.8750, lr=8.89e-06]



Epoch 6 Summary:
Train Loss: 0.3297, Train Acc: 0.9375
Learning Rate: 8.89e-06

Epoch 7/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 42.67it/s, loss=0.2257, acc=1.0000, lr=6.67e-06]



Epoch 7 Summary:
Train Loss: 0.2730, Train Acc: 0.9648
Learning Rate: 6.67e-06

Epoch 8/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 42.65it/s, loss=0.3742, acc=0.8750, lr=4.44e-06]



Epoch 8 Summary:
Train Loss: 0.2897, Train Acc: 0.9375
Learning Rate: 4.44e-06

Epoch 9/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 42.47it/s, loss=0.1827, acc=1.0000, lr=2.22e-06]



Epoch 9 Summary:
Train Loss: 0.2331, Train Acc: 0.9648
Learning Rate: 2.22e-06

Epoch 10/10
------------------------------


Training: 100%|██████████| 4/4 [00:00<00:00, 42.40it/s, loss=0.3652, acc=0.8750, lr=0.00e+00]



Epoch 10 Summary:
Train Loss: 0.2671, Train Acc: 0.9375
Learning Rate: 0.00e+00

Training completed!
Best training accuracy: 0.9688




Train accuracy: 0.99800
Test accuracy: 0.99726
Test F1 score: 0.99848
Confusion Matrix (Test):
[[ 51589    182]
 [  1236 464344]]


100%|██████████| 517151/517151 [02:34<00:00, 3357.58it/s]


Train accuracy: 0.99667
Test accuracy: 0.99826
Test F1 score: 0.99903
Confusion Matrix (Test):
[[ 51488    283]
 [   617 464963]]


100%|██████████| 517151/517151 [03:07<00:00, 2754.13it/s]


Train accuracy: 0.99857
Test accuracy: 0.99890
Test F1 score: 0.99939
Confusion Matrix (Test):
[[ 51477    294]
 [   274 465306]]


100%|██████████| 517151/517151 [01:48<00:00, 4766.85it/s]


Train accuracy: 0.99875
Test accuracy: 0.99911
Test F1 score: 0.99950
Confusion Matrix (Test):
[[ 51540    231]
 [   230 465350]]


100%|██████████| 517151/517151 [03:13<00:00, 2666.33it/s]


Train accuracy: 0.99333
Test accuracy: 0.99910
Test F1 score: 0.99950
Confusion Matrix (Test):
[[ 51419    352]
 [   113 465467]]


100%|██████████| 517151/517151 [01:44<00:00, 4967.26it/s]


Train accuracy: 0.99500
Test accuracy: 0.99920
Test F1 score: 0.99956
Confusion Matrix (Test):
[[ 51451    320]
 [    94 465486]]


<small_text.integrations.transformers.classifiers.classification.TransformerBasedClassification at 0x71db33a8cad0>