In [36]:
import numpy as np
from src.utils import load_data_label_vector

# This notebook helps to identify the best rules
# for distinguishing between attack and sane payloads
# 
# It takes all the activation vectors from a test set
# It will then see how many times each rule has been
# activated for each class (attack and sane)
# It will then calculate the difference between the two
# and sort the rules by the difference
#
# The rules with the highest difference are the best
# rules for distinguishing between the two classes
# (i.e. the rules that are most often activated for one
# class and least often activated for the other class)
#
# Keep in mind that this is a very simple approach and
# might return rules that are very similar to each other
# 
# Thus the omission of one of these rules might not have
# a big impact on the performance of the model, as another
# rule might already cover the same information

# How many of the highest-ranked rules the later cells display.
top = 20
# Alternative input from an earlier prepared dataset, kept for reference:
# train = load_data_label_vector("/app/wafcraft/data/prepared/2024-05-06_11-25-19_honeydew-tough/train.csv")
# NOTE(review): despite the variable name, the file loaded here is a test.csv.
train = load_data_label_vector(
    "/app/wafcraft/data/prepared/2024-06-09_20-11-04_lightslategray-them/test.csv"
)

In [37]:
# count, for every vector index, how many rows have a 1 at that index

def find_most_common(data):
    """Sum the "vector" column element-wise over all rows of ``data``.

    Parameters
    ----------
    data
        Object whose ``data["vector"]`` supports ``.to_list()`` (e.g. a
        pandas DataFrame column). Assumes every entry is a sequence of the
        same length -- TODO confirm against ``load_data_label_vector``.

    Returns
    -------
    list of tuple
        ``(index, total)`` pairs, one per vector position, in index order.
        The list is deliberately NOT sorted by total: callers look entries
        up by position. An empty list is returned when ``data`` has no rows.
    """
    vectors = data["vector"].to_list()
    if not vectors:
        # np.sum over an empty list collapses to a scalar, which has no
        # len(); report "no positions" instead of raising a TypeError.
        return []
    # axis=0 adds the rows together, leaving one total per vector position.
    position_totals = np.sum(vectors, axis=0)
    return list(enumerate(position_totals))


# Split the loaded samples by class label (1 -> attacks, 0 -> sanes).
train_only_attacks = train.loc[train["label"] == 1]
train_only_sanes = train.loc[train["label"] == 0]

# Per-index activation totals for each class, in index order.
most_common_attacks = find_most_common(train_only_attacks)
most_common_sanes = find_most_common(train_only_sanes)

print("most common attacks", most_common_attacks, sep="\n")
print("most common sanes", most_common_sanes, sep="\n")

# For every index, the absolute gap between the attack and sane totals.
diff = [
    (position, abs(attack_total - most_common_sanes[position][1]))
    for position, (_, attack_total) in enumerate(most_common_attacks)
]

print("diff")
# Largest gap first; only the indices of the `top` entries are shown.
diff = sorted(diff, key=lambda entry: entry[1], reverse=True)
print([position for position, _ in diff[:top]])

most common attacks
[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 1673.0), (9, 0.0), (10, 119.0), (11, 908.0), (12, 474.0), (13, 878.0), (14, 144.0), (15, 1257.0), (16, 630.0), (17, 0.0), (18, 352.0), (19, 102.0), (20, 1818.0), (21, 758.0), (22, 1557.0), (23, 655.0), (24, 0.0), (25, 481.0), (26, 49.0), (27, 12.0), (28, 58.0), (29, 1409.0), (30, 236.0), (31, 251.0), (32, 81.0), (33, 814.0), (34, 1435.0), (35, 73.0), (36, 0.0), (37, 991.0), (38, 394.0), (39, 558.0), (40, 556.0), (41, 2.0), (42, 172.0), (43, 1774.0), (44, 1242.0), (45, 679.0), (46, 897.0), (47, 1336.0), (48, 0.0), (49, 0.0), (50, 1970.0), (51, 2000.0), (52, 2000.0), (53, 930.0), (54, 289.0), (55, 2000.0), (56, 83.0), (57, 1778.0), (58, 1051.0), (59, 0.0), (60, 1988.0), (61, 809.0), (62, 1795.0), (63, 97.0), (64, 20.0), (65, 943.0), (66, 460.0), (67, 0.0), (68, 0.0)]
most common sanes
[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 893.0), (9, 0.0)

In [38]:
# Rule ids listed by position; the cells in this notebook use an activation
# vector index as the position into this list.
rule_ids = [
    "942011", "942012", "942013", "942014", "942015", "942016", "942017", "942018",
    "942100", "942101", "942110", "942120", "942130", "942131", "942140", "942150",
    "942151", "942152", "942160", "942170", "942180", "942190", "942200", "942210",
    "942220", "942230", "942240", "942250", "942251", "942260", "942270", "942280",
    "942290", "942300", "942310", "942320", "942321", "942330", "942340", "942350",
    "942360", "942361", "942362", "942370", "942380", "942390", "942400", "942410",
    "942420", "942421", "942430", "942431", "942432", "942440", "942450", "942460",
    "942470", "942480", "942490", "942500", "942510", "942511", "942520", "942521",
    "942522", "942530", "942540", "942550", "942560",
]


def covert_index_to_rule_id(index: int) -> str:
    """Return the entry of ``rule_ids`` stored at position ``index``.

    Plain list indexing is used, so an out-of-range position raises
    ``IndexError`` and a negative position counts from the end of the list.
    """
    rule_id = rule_ids[index]
    return rule_id

# One short report per top-ranked index: its rule id and both class totals.
for rule_index, gap in diff[:top]:
    report_lines = (
        f"index: {rule_index} with diff {gap}",
        f"rule id: {covert_index_to_rule_id(rule_index)}",
        f"attacks: {most_common_attacks[rule_index][1]}",
        f"sanes: {most_common_sanes[rule_index][1]}",
        "",  # trailing blank line separates consecutive reports
    )
    print(*report_lines, sep="\n")

index: 43 with diff 1281.0
rule id: 942370
attacks: 1774.0
sanes: 493.0

index: 37 with diff 991.0
rule id: 942330
attacks: 991.0
sanes: 0.0

index: 22 with diff 989.0
rule id: 942200
attacks: 1557.0
sanes: 568.0

index: 53 with diff 930.0
rule id: 942440
attacks: 930.0
sanes: 0.0

index: 50 with diff 921.0
rule id: 942430
attacks: 1970.0
sanes: 1049.0

index: 46 with diff 897.0
rule id: 942400
attacks: 897.0
sanes: 0.0

index: 47 with diff 826.0
rule id: 942410
attacks: 1336.0
sanes: 510.0

index: 33 with diff 814.0
rule id: 942300
attacks: 814.0
sanes: 0.0

index: 8 with diff 780.0
rule id: 942100
attacks: 1673.0
sanes: 893.0

index: 34 with diff 779.0
rule id: 942310
attacks: 1435.0
sanes: 656.0

index: 21 with diff 751.0
rule id: 942190
attacks: 758.0
sanes: 7.0

index: 15 with diff 747.0
rule id: 942150
attacks: 1257.0
sanes: 510.0

index: 65 with diff 730.0
rule id: 942530
attacks: 943.0
sanes: 213.0

index: 20 with diff 714.0
rule id: 942180
attacks: 1818.0
sanes: 1104.0

index: