In [4]:
import numpy as np
from src.utils import load_data_label_vector

# This notebook helps to identify the best rules
# for distinguishing between attack and sane payloads
# 
# It takes all the activation vectors from a test set
# It will then see how many times each rule has been
# activated for each class (attack and sane)
# It will then calculate the difference between the two
# and sort the rules by the difference
#
# The rules with the highest difference are the best
# rules for distinguishing between the two classes
# (i.e. the rules that are most often activated for one
# class and least often activated for the other class)
#
# Keep in mind that this is a very simple approach and
# might return rules that are very similar to each other
# 
# Thus the omission of one of these rules might not have
# a big impact on the performance of the model, as another
# rule might already cover the same information

# Configuration for this analysis: how many rules to report and which dataset to load.
top = 20 # number of top rules (largest attack/sane activation gap) to display
# Previously used datasets, kept for reference; swap the active line below to re-run on them.
# train = load_data_label_vector("/app/wafcraft/data/prepared/2024-05-06_11-25-19_honeydew-tough/train.csv")
# train = load_data_label_vector("/app/wafcraft/data/prepared/2024-06-09_20-11-04_lightslategray-them/test.csv")
# NOTE(review): despite its name, `train` is loaded from a test.csv file here.
# Later cells read its "label" and "vector" columns.
train = load_data_label_vector("/app/wafcraft/data/prepared/2024-06-17_18-23-01_rosybrown-finish/test.csv")

In [5]:
#rule_ids = ['942011', '942012', '942013', '942014', '942015', '942016', '942017', '942018', '942100', '942101', '942110', '942120', '942130', '942131', '942140', '942150', '942151', '942152', '942160', '942170', '942180', '942190', '942200', '942210', '942220', '942230', '942240', '942250', '942251', '942260', '942270', '942280', '942290', '942300', '942310', '942320', '942321', '942330', '942340', '942350', '942360', '942361', '942362', '942370', '942380', '942390', '942400', '942410', '942420', '942421', '942430', '942431', '942432', '942440', '942450', '942460', '942470', '942480', '942490', '942500', '942510', '942511', '942520', '942521', '942522', '942530', '942540', '942550', '942560']
# Rule ids in vector order: position i in this list names the rule whose
# activation count sits at index i of the summed vectors used further down.
rule_ids = [
    '942011', '942012', '942013', '942014', '942015', '942016', '942017', '942018',
    '942100', '942101', '942110', '942120', '942130', '942140', '942150', '942160',
    '942170', '942180', '942190', '942200', '942210', '942220', '942230', '942240',
    '942250', '942251', '942260', '942270', '942280', '942290', '942300', '942310',
    '942320', '942330', '942340', '942350', '942360', '942361', '942370', '942380',
    '942390', '942400', '942410', '942420', '942421', '942430', '942431', '942432',
    '942440', '942450', '942460', '942470', '942480', '942490', '942500', '942510',
    '942511',
]


def covert_index_to_rule_id(index):
    """Translate a position in the activation vector into its rule id.

    Args:
        index: position to look up in ``rule_ids``.

    Returns:
        The rule id string stored at ``rule_ids[index]``.

    Raises:
        IndexError: if ``index`` lies outside ``rule_ids``.
    """
    rule_id = rule_ids[index]
    return rule_id

In [6]:
# count, for every vector index, how many rows have a 1 at that index

def find_most_common(data):
    """Sum the vectors in ``data["vector"]`` element-wise, per vector index.

    Args:
        data: table-like object whose ``"vector"`` column supports
            ``.to_list()`` and holds one equal-length numeric vector per row
            (presumably a pandas DataFrame; confirm against
            ``load_data_label_vector``).

    Returns:
        A list of ``(index, total)`` tuples, one per vector position, in
        ascending index order. Despite the function name the list is NOT
        sorted by total: callers look entries up by position. An empty list
        is returned when ``data`` has no rows.
    """
    vectors = data["vector"].to_list()
    if len(vectors) == 0:
        # np.sum([], axis=0) collapses to a scalar, so there is nothing to
        # enumerate; report "no indices" instead of failing on len().
        return []
    totals = np.sum(vectors, axis=0)
    return list(enumerate(totals))


# Split the loaded samples by class label (1 = attack, 0 = sane) and
# count the activations per vector index for each class.
is_attack = train["label"] == 1
is_sane = train["label"] == 0
train_only_attacks = train[is_attack]
train_only_sanes = train[is_sane]
most_common_attacks = find_most_common(train_only_attacks)
most_common_sanes = find_most_common(train_only_sanes)

for heading, counts in (
    ("most common attacks", most_common_attacks),
    ("most common sanes", most_common_sanes),
):
    print(heading)
    print(counts)

# Absolute gap between the two classes for every index; indices that never
# fired for either class are reported along the way.
diff = []
for i, attack_entry in enumerate(most_common_attacks):
    attack_total = attack_entry[1]
    sane_total = most_common_sanes[i][1]
    diff.append((i, abs(attack_total - sane_total)))
    if attack_total == 0 and sane_total == 0:
        print("never activated", covert_index_to_rule_id(i))

# Largest gap first; show only the indices of the `top` entries.
print("diff")
diff = sorted(diff, key=lambda entry: entry[1], reverse=True)
print([index for index, _gap in diff[:top]])

most common attacks
[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 1660.0), (9, 0.0), (10, 92.0), (11, 905.0), (12, 1580.0), (13, 136.0), (14, 1374.0), (15, 351.0), (16, 96.0), (17, 1747.0), (18, 704.0), (19, 1582.0), (20, 688.0), (21, 0.0), (22, 51.0), (23, 52.0), (24, 17.0), (25, 51.0), (26, 1708.0), (27, 226.0), (28, 261.0), (29, 0.0), (30, 804.0), (31, 1438.0), (32, 47.0), (33, 997.0), (34, 362.0), (35, 493.0), (36, 147.0), (37, 5.0), (38, 1797.0), (39, 1234.0), (40, 673.0), (41, 925.0), (42, 1374.0), (43, 0.0), (44, 0.0), (45, 1977.0), (46, 2000.0), (47, 2000.0), (48, 928.0), (49, 283.0), (50, 2000.0), (51, 80.0), (52, 1792.0), (53, 1039.0), (54, 0.0), (55, 1988.0), (56, 858.0)]
most common sanes
[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 901.0), (9, 0.0), (10, 0.0), (11, 379.0), (12, 365.0), (13, 0.0), (14, 457.0), (15, 0.0), (16, 0.0), (17, 1147.0), (18, 16.0), (19, 511.0), (20, 16.0), (21, 0.0), (22

In [7]:
# Detailed report for the `top` indices with the largest attack/sane gap:
# one paragraph per index, followed by a blank separator line.
for i, j in diff[:top]:
    print(
        f"index: {i} with diff {j}",
        f"rule id: {covert_index_to_rule_id(i)}",
        f"attacks: {most_common_attacks[i][1]}",
        f"sanes: {most_common_sanes[i][1]}",
        "",
        sep="\n",
    )

index: 38 with diff 1291.0
rule id: 942370
attacks: 1797.0
sanes: 506.0

index: 12 with diff 1215.0
rule id: 942130
attacks: 1580.0
sanes: 365.0

index: 19 with diff 1071.0
rule id: 942200
attacks: 1582.0
sanes: 511.0

index: 33 with diff 997.0
rule id: 942330
attacks: 997.0
sanes: 0.0

index: 45 with diff 966.0
rule id: 942430
attacks: 1977.0
sanes: 1011.0

index: 48 with diff 928.0
rule id: 942440
attacks: 928.0
sanes: 0.0

index: 41 with diff 925.0
rule id: 942400
attacks: 925.0
sanes: 0.0

index: 14 with diff 917.0
rule id: 942150
attacks: 1374.0
sanes: 457.0

index: 42 with diff 917.0
rule id: 942410
attacks: 1374.0
sanes: 457.0

index: 30 with diff 804.0
rule id: 942300
attacks: 804.0
sanes: 0.0

index: 26 with diff 777.0
rule id: 942260
attacks: 1708.0
sanes: 931.0

index: 8 with diff 759.0
rule id: 942100
attacks: 1660.0
sanes: 901.0

index: 31 with diff 732.0
rule id: 942310
attacks: 1438.0
sanes: 706.0

index: 18 with diff 688.0
rule id: 942190
attacks: 704.0
sanes: 16.0

ind