In [1]:
from sklearn.ensemble import RandomForestClassifier

from src.utils import get_rules_list


class Target_Config:
    """Experiment settings for the "Target" run, held as plain class attributes.

    The notebook reads these values through the ``Config`` alias assigned
    right after the class body. Attributes marked "not referenced in the
    visible cells" are presumably consumed by project code outside this
    notebook - TODO confirm.
    """

    # General Settings
    NAME = "Target"
    DESCRIPTION = "full data set size from paper"
    # BATCH_SIZE / MAX_PROCESSES: not referenced in the visible cells.
    BATCH_SIZE = 10
    MAX_PROCESSES = 20
    # Training Settings
    # Raw SQL dumps parsed with read_and_parse_sql; rows from the attack file
    # are labelled 1 and rows from the sane file are labelled 0.
    ATTACK_DATA_PATH = "data/raw/attacks_full.sql"
    SANE_DATA_PATH = "data/raw/sanes_full.sql"
    # Train sizes are only used by the commented-out train_test_split code.
    TRAIN_ATTACKS_SIZE = 10000
    TRAIN_SANES_SIZE = 10000
    # Number of attack / sane rows sampled into the test set.
    TEST_ATTACKS_SIZE = 2000
    TEST_SANES_SIZE = 2000
    # Classifier that the notebook fits on the training vectors and evaluates.
    MODEL = RandomForestClassifier(n_estimators=160, random_state=666)
    # PARANOIA_LEVEL and RULE_IDS are the arguments of the (currently
    # commented-out) add_vec_to_data calls that build the feature vectors.
    PARANOIA_LEVEL = 4
    RULE_IDS = get_rules_list()
    # False-positive rate at which the evaluation cell picks its threshold.
    DESIRED_FPR = 0.01
    # Not referenced in the visible cells.
    OVERLAP_SETTINGS = {
        "use_overlap": False,
        "overlap": None,
        "overlap_path": None,
    }
    # Adversarial Training Settings (not referenced in the visible cells)
    ADVERSARIAL_TRAINING = True
    TRAIN_ADV_SIZE = 5000
    TEST_ADV_SIZE = 2000
    MODEL_ADV = RandomForestClassifier(n_estimators=160, random_state=666)
    ENGINE_SETTINGS = {  # paper unclear about these settings
        "max_rounds": 200,
        "round_size": 10,
        "timeout": 15,
        "threshold": 0.0,  # just go as far as possible
    }
    # Sample Creation Settings (not referenced in the visible cells)
    FIND_SAMPLES = False
    SAMPLE_ATTEMPTS = 10000
    ENGINE_SETTINGS_SAMPLE_CREATION = {
        "max_rounds": 300,
        "round_size": 10,
        "timeout": 30,
        "threshold": 0.0,
    }
# Alias through which the remaining cells read the active configuration.
Config = Target_Config

In [None]:
from src.utils import load_data_label_vector, read_and_parse_sql
# Parse both raw SQL dumps first, then tag every row with its class label
# (1 = attack, 0 = sane).
attacks = read_and_parse_sql(Config.ATTACK_DATA_PATH)
sanes = read_and_parse_sql(Config.SANE_DATA_PATH)

attacks["label"] = 1
sanes["label"] = 0

print("Data loaded")

In [2]:
# Get data

import pandas as pd
from sklearn.model_selection import train_test_split

# Original in-notebook split, kept for reference. The training set is now
# loaded from the cached "train.csv" instead.
# train_attacks, test_attacks = train_test_split(
#         attacks,
#         train_size=Config.TRAIN_ATTACKS_SIZE,
#         test_size=Config.TEST_ATTACKS_SIZE,
#         stratify=attacks["label"],
#     )
# train_sanes, test_sanes = train_test_split(
#         sanes,
#         train_size=Config.TRAIN_SANES_SIZE,
#         test_size=Config.TEST_SANES_SIZE,
#         stratify=sanes["label"],
#     )
# train = (
#     pd.concat([train_attacks, train_sanes]).sample(frac=1).reset_index(drop=True)
# )

train = load_data_label_vector("train.csv")
train_no_vectors = train.drop(columns=["vector"])
# remove the data that is already in the training set
attacks = attacks[~attacks["data"].isin(train["data"])]
sanes = sanes[~sanes["data"].isin(train["data"])]
# now sample test_attacks and test_sanes from the remaining data
test_attacks = attacks.sample(n=Config.TEST_ATTACKS_SIZE, random_state=666)
test_sanes = sanes.sample(n=Config.TEST_SANES_SIZE, random_state=666)
# Seed the shuffle as well: the two draws above are reproducible, so the row
# order of `test` (and of the test.csv written from it) should be too.
test = (
    pd.concat([test_attacks, test_sanes])
    .sample(frac=1, random_state=666)
    .reset_index(drop=True)
)

# free memory of attacks and sanes
# NOTE: this makes the cell single-shot; re-running it requires re-running the
# loading cell first because `attacks` and `sanes` no longer exist.
del attacks
del sanes

In [3]:
# Preview the first rows of the training frame loaded from "train.csv".
print(train.head())

                                                data  label  \
0  SU5TRVJUIElOVE8gYHRhYmAgKCBgY29sMmAgKSBWQUxVRV...      1   
1  aWYgbm90KHN1YnN0cmluZygoc2VsZWN0IEBAdmVyc2lvbi...      1   
2  SU5TRVJUIElOVE8gYHRhYmAgKCBgY29sM2AsIGBjb2wzYC...      0   
3  U0VMRUNUIGBjb2wyYCwgYGNvbDJgLCBgY29sMmAgRlJPTS...      0   
4  REVMRVRFIEZST00gYHRhYmAgV0hFUkUgYGNvbDFgID0gJ3...      1   

                                              vector  
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...  
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...  
2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  
3  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...  
4  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...  


In [4]:
# add vec column with features
# use modsec as feature extraction
import base64
import numpy as np
from tqdm import tqdm
from src.modsec import get_activated_rules, init_modsec


def payload_to_vec(payload_base64, rule_ids, modsec, paranoia_level):
    """Encode which rules a single payload activates as a 0/1 vector.

    Args:
        payload_base64: The payload, forwarded to ``get_activated_rules`` as a
            one-element list (presumably base64-encoded, per the name - confirm).
        rule_ids: Iterable of rule identifiers; each one is converted with
            ``int()`` before being looked up among the activated rules.
        modsec: Handle forwarded unchanged to ``get_activated_rules``.
        paranoia_level: Forwarded unchanged to ``get_activated_rules``.

    Returns:
        numpy.ndarray with one entry per element of ``rule_ids`` (same order):
        1 if that rule id is among the activated rules, otherwise 0.
    """
    matches = get_activated_rules(
        payloads_base64=[payload_base64], modsec=modsec, paranoia_level=paranoia_level
    )
    # Build the lookup set once rather than once per rule id. Besides saving
    # work, this keeps the result correct when `matches` is a one-shot iterable.
    activated = set(matches)
    return np.array([1 if int(rule_id) in activated else 0 for rule_id in rule_ids])


def add_vec_to_data(data_set, rule_ids, paranoia_level, batch_size=100):
    """Return a copy of ``data_set`` with a ``"vector"`` feature column added.

    The rows are processed in batches. Every batch gets its own handle from
    ``init_modsec()`` (presumably to bound per-instance resource usage -
    confirm), and each value of the ``"data"`` column is turned into a vector
    with ``payload_to_vec``.

    Args:
        data_set: DataFrame with a ``"data"`` column holding the payloads.
        rule_ids: Rule identifiers forwarded to ``payload_to_vec``.
        paranoia_level: Forwarded to ``payload_to_vec``.
        batch_size: Approximate number of rows per batch. The number of
            batches is ``len(data_set) // batch_size`` (at least 1).

    Returns:
        A new DataFrame with the rows of ``data_set`` in their original order
        plus the ``"vector"`` column. ``data_set`` itself is not modified.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    n_rows = len(data_set)
    # At least one batch: frames shorter than batch_size would otherwise ask
    # array_split for zero sections, which raises.
    n_batches = max(1, n_rows // batch_size)
    # Split row positions instead of the frame itself, so each batch can be
    # taken as an explicit copy and safely extended with a new column.
    position_chunks = np.array_split(np.arange(n_rows), n_batches)

    vectorised = []
    for i, positions in enumerate(position_chunks):
        # "\r" keeps the progress message on a single line.
        print(f"Processing batch {i+1}/{n_batches}", end="\r")
        modsec = init_modsec()
        batch = data_set.iloc[positions].copy()
        batch["vector"] = batch["data"].apply(
            lambda payload: payload_to_vec(payload, rule_ids, modsec, paranoia_level)
        )
        vectorised.append(batch)

    # Stitch the batches back together in their original order.
    return pd.concat(vectorised)

# train = add_vec_to_data(train, Config.RULE_IDS, Config.PARANOIA_LEVEL)
# train.to_csv("train.csv", index=False)
# test = add_vec_to_data(test, Config.RULE_IDS, Config.PARANOIA_LEVEL)
# test.to_csv("test.csv", index=False)

Processing batch 40/40

In [5]:
# check data
print(train.shape)
print(test.shape)
print(train.head())
print(test.head())

# save as csv
# train.to_csv("train.csv", index=False)
# The add_vec_to_data(test, ...) call in the previous cell is commented out, so
# on a fresh run `test` has no feature vectors yet. Fail here instead of
# silently overwriting test.csv with rows that the later
# load_data_label_vector("test.csv") / test["vector"] cell cannot use.
if "vector" not in test.columns:
    raise RuntimeError(
        "test has no 'vector' column; run add_vec_to_data(test, ...) before saving test.csv"
    )
test.to_csv("test.csv", index=False)

(20000, 3)
(4000, 3)
                                                data  label  \
0  SU5TRVJUIElOVE8gYHRhYmAgKCBgY29sMmAgKSBWQUxVRV...      1   
1  aWYgbm90KHN1YnN0cmluZygoc2VsZWN0IEBAdmVyc2lvbi...      1   
2  SU5TRVJUIElOVE8gYHRhYmAgKCBgY29sM2AsIGBjb2wzYC...      0   
3  U0VMRUNUIGBjb2wyYCwgYGNvbDJgLCBgY29sMmAgRlJPTS...      0   
4  REVMRVRFIEZST00gYHRhYmAgV0hFUkUgYGNvbDFgID0gJ3...      1   

                                              vector  
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...  
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...  
2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  
3  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...  
4  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...  
                                                data  label  \
0  VVBEQVRFIGB0YWJgIFNFVCBgY29sMmAgPSAwIFdIRVJFIG...      0   
1  U0VMRUNUIGBjb2wzYCwgYGNvbDNgLCBgY29sMmAgRlJPTS...      0   
2  REVMRVRFIEZST00gYHRhYmAgV0hFUkUgYGNvbDNgID0gLT...      0   
3  VVBEQVRFIGB0YWJ

In [92]:
# Look at the first 100 training rows and print every vector that contains a 1
# anywhere in its first 61 positions.
entries = range(61)
for i in range(100):
    vec = train["vector"][i]
    # Materialise the inspected slice first, then test it for a 1.
    elements = [vec[position] for position in entries]
    if any(value == 1 for value in elements):
        print("found")
        print(vec)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]
found
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]
found
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [82]:
# Choose 12 distinct random feature indices from 0..59
# (range(0, 60) excludes 60).

from random import Random, sample
import numpy as np

# Draw from a dedicated, seeded generator so that re-running the notebook
# zeroes the same features every time. 666 is the seed already used for the
# models and the test-set sampling in this notebook.
ABLATION_SEED = 666
ranom_numbers = Random(ABLATION_SEED).sample(range(0, 60), 12)

# These indices are set to 0 in every train vector by the loop at the end of
# this cell.
print(ranom_numbers)

def setzero(vector, i):
    """Set ``vector[i]`` to 0 and return ``vector``.

    The assignment happens in place, so the object passed in is modified and
    the very same object is returned (no copy is made).
    """
    vector[i] = 0
    return vector

# Zero each sampled feature index in every training vector, one pass over the
# column per index.
for feature_idx in ranom_numbers:
    train["vector"] = train["vector"].apply(
        lambda vec: setzero(vec, feature_idx)
    )
    


[35, 46, 19, 44, 56, 2, 23, 39, 27, 8, 13, 57]


In [39]:
# Print the feature vector stored in the first training row.
print(train.iloc[0]["vector"])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.
 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0.]


In [34]:
# Fit the configured classifier: one feature vector per row as input,
# the "label" column as target.
X_train = list(train["vector"])
y_train = train["label"]
Config.MODEL.fit(X_train, y_train)

In [35]:
# Alternative evaluation set (prepared test split + adversarial test split),
# currently disabled:
# test1 = load_data_label_vector("data/prepared/2024-04-08_10-49-51_paleturquoise-nor/test.csv")
# test_adv = load_data_label_vector("data/prepared/2024-04-08_10-49-51_paleturquoise-nor/test_adv.csv")
# test = pd.concat([test1, test_adv]).sample(frac=1).reset_index(drop=True)

# Load the cached test split and zero the same sampled feature indices that
# were zeroed in the training vectors.
test = load_data_label_vector("test.csv")
for feature_idx in ranom_numbers:
    test["vector"] = test["vector"].apply(
        lambda vec: setzero(vec, feature_idx)
    )
X_test = list(test["vector"])
y_test = test["label"]



In [36]:
# Report the model's TPR at the ROC operating point whose FPR lies closest to
# Config.DESIRED_FPR.
# NOTE(review): "closest" may land slightly above the desired FPR; confirm
# that this is acceptable for the reported numbers.
from sklearn.metrics import confusion_matrix, roc_curve


# Probability of the positive (attack, label 1) class for every test row.
probabilities = Config.MODEL.predict_proba(X_test)[:, 1]

# ROC operating points; pick the threshold whose FPR is nearest the target.
fpr, tpr, thresholds = roc_curve(y_test, probabilities)
closest_idx = np.abs(fpr - Config.DESIRED_FPR).argmin()
threshold = thresholds[closest_idx]

# Re-classify the test rows with the selected threshold.
adjusted_predictions = (probabilities >= threshold).astype(int)

cm = confusion_matrix(y_test, adjusted_predictions)

# Rows are true labels, columns are predicted labels.
(tn, fp), (fn, tp) = cm

# From here on `tpr` / `fpr` hold the scalar rates at the chosen threshold,
# replacing the ROC arrays of the same name.
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)

print(f"TPR: {tpr}")
print(f"FPR: {fpr}")

TPR: 0.94625
FPR: 0.0085
