In [747]:
import numpy as np
from src.utils import load_data_label_vector

# This notebook helps to identify the best rules
# for distinguishing between attack and sane payloads
# 
# It takes all the activation vectors from a test set
# It will then see how many times each rule has been
# activated for each class (attack and sane)
# It will then calculate the difference between the two
# and sort the rules by the difference
#
# The rules with the highest difference are the best
# rules for distinguishing between the two classes
# (i.e. the rules that are most often activated for one
# class and least often activated for the other class)
#
# Keep in mind that this is a very simple approach and
# might return rules that are very similar to each other
# 
# Thus the omission of one of these rules might not have
# a big impact on the performance of the model, as another
# rule might already cover the same information

# How many of the highest-ranked rules the later cells display.
top = 20

# Labelled activation vectors used for the whole analysis.
# NOTE(review): the notebook header talks about a "test set", but this
# loads train.csv -- confirm which split is intended.
train = load_data_label_vector(
    "/app/wafcraft/data/prepared/2024-05-06_11-25-19_honeydew-tough/train.csv"
)

In [748]:
# find which index has the most 1s in the row vectors

def find_most_common(data):
    """Sum the ``"vector"`` column of ``data`` position by position.

    Despite the name, the result is NOT sorted: entry ``k`` of the returned
    list always describes vector position ``k``, so the list can be indexed
    directly by position.

    Args:
        data: Table-like object whose ``"vector"`` column supports
            ``.to_list()`` and holds equal-length numeric vectors
            (e.g. a pandas DataFrame).

    Returns:
        A list of ``(index, total)`` tuples in ascending index order, where
        ``total`` is the element-wise sum of all vectors at that index.
        An empty list is returned when ``data`` has no rows.
    """
    vectors = data["vector"].to_list()
    if not vectors:
        # With no rows, np.sum(..., axis=0) collapses to a scalar and there
        # is no vector length to report on, so return an empty result
        # instead of failing later with an opaque TypeError.
        return []

    # Element-wise sum over all rows -> one total per vector position.
    totals = np.sum(vectors, axis=0)
    return list(enumerate(totals))


# Split the rows by label (1 -> attacks, 0 -> sanes) and count, per vector
# index, how often that index was set within each class.
train_only_attacks = train[train["label"] == 1]
train_only_sanes = train[train["label"] == 0]
most_common_attacks = find_most_common(train_only_attacks)
most_common_sanes = find_most_common(train_only_sanes)

for title, counts in (
    ("most common attacks", most_common_attacks),
    ("most common sanes", most_common_sanes),
):
    print(title)
    print(counts)

# Absolute gap between the two per-class counts for every index, ranked so
# that the index with the largest gap comes first (ties keep index order).
# Note these are raw counts, not rates, so class sizes influence the gap.
diff = sorted(
    (
        (idx, abs(attack_count - most_common_sanes[idx][1]))
        for idx, (_, attack_count) in enumerate(most_common_attacks)
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

print("diff")
print([idx for idx, _ in diff[:top]])

most common attacks
[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 8254.0), (9, 0.0), (10, 577.0), (11, 4564.0), (12, 2320.0), (13, 4381.0), (14, 681.0), (15, 6372.0), (16, 3289.0), (17, 0.0), (18, 1862.0), (19, 498.0), (20, 9130.0), (21, 3524.0), (22, 7883.0), (23, 3362.0), (24, 0.0), (25, 2265.0), (26, 264.0), (27, 62.0), (28, 209.0), (29, 6984.0), (30, 1187.0), (31, 1292.0), (32, 415.0), (33, 4232.0), (34, 7074.0), (35, 370.0), (36, 0.0), (37, 4956.0), (38, 1851.0), (39, 2600.0), (40, 2723.0), (41, 7.0), (42, 856.0), (43, 8882.0), (44, 6161.0), (45, 3339.0), (46, 4437.0), (47, 6872.0), (48, 0.0), (49, 0.0), (50, 9854.0), (51, 9997.0), (52, 9999.0), (53, 4577.0), (54, 1465.0), (55, 9997.0), (56, 377.0), (57, 8992.0), (58, 5073.0), (59, 0.0), (60, 9950.0), (61, 4202.0), (62, 9035.0), (63, 560.0), (64, 122.0), (65, 4722.0), (66, 2314.0), (67, 0.0), (68, 0.0)]
most common sanes
[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (

In [749]:
# Rule identifiers, ordered so that position k in this list names the rule
# reported at index k by the cells above.
rule_ids = [
    '942011', '942012', '942013', '942014', '942015', '942016', '942017', '942018',
    '942100', '942101', '942110', '942120', '942130', '942131', '942140', '942150',
    '942151', '942152', '942160', '942170', '942180', '942190', '942200', '942210',
    '942220', '942230', '942240', '942250', '942251', '942260', '942270', '942280',
    '942290', '942300', '942310', '942320', '942321', '942330', '942340', '942350',
    '942360', '942361', '942362', '942370', '942380', '942390', '942400', '942410',
    '942420', '942421', '942430', '942431', '942432', '942440', '942450', '942460',
    '942470', '942480', '942490', '942500', '942510', '942511', '942520', '942521',
    '942522', '942530', '942540', '942550', '942560',
]


def covert_index_to_rule_id(index):
    """Return the rule id string stored at position ``index`` of ``rule_ids``.

    Plain list indexing is used, so an out-of-range ``index`` raises
    ``IndexError`` and negative values count from the end of the list.
    """
    return rule_ids[index]

# Report the `top` highest-ranked entries of `diff`: the vector index and its
# gap, the matching rule id, and the per-class counts behind that gap.
# One print per entry; the trailing "" yields the blank separator line.
for i, j in diff[:top]:
    print(
        f"index: {i} with diff {j}",
        f"rule id: {covert_index_to_rule_id(i)}",
        f"attacks: {most_common_attacks[i][1]}",
        f"sanes: {most_common_sanes[i][1]}",
        "",
        sep="\n",
    )

index: 43 with diff 6350.0
rule id: 942370
attacks: 8882.0
sanes: 2532.0

index: 22 with diff 5120.0
rule id: 942200
attacks: 7883.0
sanes: 2763.0

index: 37 with diff 4956.0
rule id: 942330
attacks: 4956.0
sanes: 0.0

index: 50 with diff 4676.0
rule id: 942430
attacks: 9854.0
sanes: 5178.0

index: 53 with diff 4577.0
rule id: 942440
attacks: 4577.0
sanes: 0.0

index: 46 with diff 4437.0
rule id: 942400
attacks: 4437.0
sanes: 0.0

index: 47 with diff 4411.0
rule id: 942410
attacks: 6872.0
sanes: 2461.0

index: 33 with diff 4232.0
rule id: 942300
attacks: 4232.0
sanes: 0.0

index: 15 with diff 3911.0
rule id: 942150
attacks: 6372.0
sanes: 2461.0

index: 34 with diff 3807.0
rule id: 942310
attacks: 7074.0
sanes: 3267.0

index: 8 with diff 3727.0
rule id: 942100
attacks: 8254.0
sanes: 4527.0

index: 65 with diff 3653.0
rule id: 942530
attacks: 4722.0
sanes: 1069.0

index: 20 with diff 3581.0
rule id: 942180
attacks: 9130.0
sanes: 5549.0

index: 21 with diff 3456.0
rule id: 942190
attacks: