In [1]:
# Imports
import pandas as pd  # type: ignore
import numpy as np  # type: ignore
import os
import base64
from utils import (
    get_rules_list,
    create_train_test_split,
    create_model,
    create_adv_train_test_split,
    test_evasion,
    log,
)
from modsec import init_modsec

from sklearn.ensemble import RandomForestClassifier  # type: ignore
from wafamole.evasion import EvasionEngine  # type: ignore

# Announce the start of the run through the project's logging helper.
log("Starting...")
# Rule list from `utils`; it is passed unchanged to every helper call in the cells below.
# NOTE(review): presumably the ModSecurity rule IDs that make up the feature vector — confirm in utils.
rule_ids = get_rules_list()
# Handle returned by the local `modsec` module; shared by all later cells.
modsec = init_modsec()

Starting...


In [2]:
# Set up variables
#
# Everything a reader might want to tune lives in this cell; later cells only
# read these names.

paranoia_level = 4

attack_data_path = "data/raw/attacks_20k.sql"  # raw attack data
sane_data_path = "data/raw/sanes_20k.sql"  # raw sane data
# Preprocessed train/test CSVs are kept in a per-paranoia-level directory.
# Deriving the directory from `paranoia_level` (instead of repeating the literal
# "4") keeps data built at one level from being silently reused at another.
processed_data_path = f"data/preprocessed/{paranoia_level}"  # path to store the preprocessed train and test data

train_attacks_size = 500  # paper uses 10000
train_sanes_size = 500  # paper uses 10000
test_attacks_size = 200  # paper uses 2000
test_sanes_size = 200  # paper uses 2000

train_adv_size = 20  # paper uses 5000 (1/4 of total train set size)
test_adv_size = 10  # paper uses 2000 (1/2 of total test set size)

# Settings intended for the evasion engine when adversarial samples are generated.
# NOTE(review): `timeout` is presumably in seconds — confirm against wafamole.
engine_settings = {
    "max_rounds": 200,
    "round_size": 10,
    "timeout": 10,
}

# One seed and one forest size for both classifiers, so the baseline and the
# adversarially trained model differ only in the data they are fitted on.
random_seed = 666
n_estimators = 160

model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_seed)
model_adv = RandomForestClassifier(n_estimators=n_estimators, random_state=random_seed)

In [3]:
# Create (or load cached) train and test sets, then train the baseline model.
#
# The preprocessed sets are cached as CSV files. They are only rebuilt when a
# file is missing, so Restart & Run All works on a fresh checkout without
# paying the preprocessing cost on every run. Delete the CSVs to force a rebuild.
train_path = f"{processed_data_path}/train_{train_attacks_size + train_sanes_size}.csv"
test_path = f"{processed_data_path}/test_{test_attacks_size + test_sanes_size}.csv"

if not (os.path.exists(train_path) and os.path.exists(test_path)):
    os.makedirs(processed_data_path, exist_ok=True)
    train, test = create_train_test_split(
        attack_file=attack_data_path,
        sane_file=sane_data_path,
        train_attacks_size=train_attacks_size,
        train_sanes_size=train_sanes_size,
        test_attacks_size=test_attacks_size,
        test_sanes_size=test_sanes_size,
        modsec=modsec,
        rule_ids=rule_ids,
        paranoia_level=paranoia_level,
    )
    train.to_csv(train_path, index=False)
    test.to_csv(test_path, index=False)
    log("Train and test sets created", True)

# Always load the train and test sets from disk, even right after building
# them, so the model sees identically parsed data on cached and fresh runs.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
for frame in (train, test):
    # The 'vector' column is read back as a bracketed, space-separated string;
    # drop the first/last character and parse the numbers into an ndarray.
    frame["vector"] = frame["vector"].apply(lambda text: np.fromstring(text[1:-1], sep=" "))

# Fit the baseline classifier; `threshold` is the decision threshold that
# create_model returns for the requested false-positive rate.
wafamole_model, threshold = create_model(
    train=train,
    test=test,
    model=model,
    desired_fpr=0.01,
    modsec=modsec,
    rule_ids=rule_ids,
    paranoia_level=paranoia_level,
)

Model trained successfully!
Evaluating model...
Default threshold: 0.5
              precision    recall  f1-score   support

      attack       0.96      0.95      0.96      1000
        sane       0.96      0.96      0.96      1000

    accuracy                           0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000

Adjusted threshold: 0.9600915385892147
              precision    recall  f1-score   support

      attack       0.83      0.99      0.90      1000
        sane       0.99      0.79      0.88      1000

    accuracy                           0.89      2000
   macro avg       0.91      0.89      0.89      2000
weighted avg       0.91      0.89      0.89      2000



In [4]:
# adversarial training
#
# Adversarial samples are generated against the baseline model and cached as
# CSV files; they are only regenerated when a file is missing. Delete the
# cached files after changing the baseline model or the engine settings,
# otherwise stale adversarial samples will be reused.

engine = EvasionEngine(wafamole_model)

train_adv_path = f"{processed_data_path}/train_adv_{train_adv_size}.csv"
test_adv_path = f"{processed_data_path}/test_adv_{test_adv_size}.csv"

if not (os.path.exists(train_adv_path) and os.path.exists(test_adv_path)):
    os.makedirs(processed_data_path, exist_ok=True)
    train_adv, test_adv = create_adv_train_test_split(
        train=train,
        test=test,
        train_adv_size=train_adv_size,
        test_adv_size=test_adv_size,
        engine=engine,
        engine_settings={
            **engine_settings,
            "threshold": threshold,
        },
        modsec=modsec,
        rule_ids=rule_ids,
        paranoia_level=paranoia_level,
    )
    train_adv.to_csv(train_adv_path, index=False)
    test_adv.to_csv(test_adv_path, index=False)
    log("Adversarial train and test sets created", True)

# Always load the train_adv and test_adv sets from disk so cached and freshly
# generated runs go through the same parsing.
train_adv = pd.read_csv(train_adv_path)
test_adv = pd.read_csv(test_adv_path)
for frame in (train_adv, test_adv):
    # The 'vector' column is read back as a bracketed, space-separated string;
    # drop the first/last character and parse the numbers into an ndarray.
    frame["vector"] = frame["vector"].apply(lambda text: np.fromstring(text[1:-1], sep=" "))

# Seed the shuffle with the classifier's own random_state. An unseeded
# `.sample(frac=1)` changes the row order on every run, which changes the
# fitted forest even though the forest itself is seeded.
shuffle_seed = model_adv.random_state

# train new model with train + train_adv
wafamole_model_adv, threshold_adv = create_model(
    train=pd.concat([train, train_adv]).sample(frac=1, random_state=shuffle_seed).reset_index(drop=True),
    test=pd.concat([test, test_adv]).sample(frac=1, random_state=shuffle_seed).reset_index(drop=True),
    model=model_adv,
    desired_fpr=0.01,
    modsec=modsec,
    rule_ids=rule_ids,
    paranoia_level=paranoia_level,
)

Model trained successfully!
Evaluating model...
Default threshold: 0.5
              precision    recall  f1-score   support

      attack       0.96      0.95      0.96      1045
        sane       0.95      0.96      0.96      1000

    accuracy                           0.96      2045
   macro avg       0.96      0.96      0.96      2045
weighted avg       0.96      0.96      0.96      2045

Adjusted threshold: 0.9593239919384103
              precision    recall  f1-score   support

      attack       0.83      0.99      0.90      1045
        sane       0.99      0.78      0.87      1000

    accuracy                           0.89      2045
   macro avg       0.91      0.89      0.89      2045
weighted avg       0.91      0.89      0.89      2045



In [5]:
# Test the model (without adversarial training)
#
# Run the evasion search against the baseline classifier, using a fresh engine
# wrapped around that same model.
baseline_engine = EvasionEngine(wafamole_model)

# NOTE(review): the engine-level threshold of 0.0 presumably keeps the search
# from stopping early, so it reports the lowest confidence it can reach within
# the budget — confirm against wafamole.
baseline_eval_settings = dict(
    max_rounds=200,
    round_size=10,
    timeout=60,
    threshold=0.0,
)

test_evasion(
    payload='SELECT SLEEP(5)#";',
    threshold=threshold,
    model=wafamole_model,
    engine=baseline_engine,
    engine_eval_settings=baseline_eval_settings,
    modsec=modsec,
    rule_ids=rule_ids,
    paranoia_level=paranoia_level,
)

Payload: SELECT SLEEP(5)#";
Vec: [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0]
Confidence: 0.9888
[!] Max number of iterations reached
Reached confidence 0.9166624730069992
with payload
'SELEcT\tSLEEP(0x5)#";61'
Min payload: b'SELEcT\tSLEEP(0x5)#";61'
Min confidence: 0.91666
Reduced confidence from 0.9888 to 0.91666 (reduction of 0.07214)

Evasion successful


In [6]:
# Test the model (with adversarial training)
#
# The adversarially trained model is judged against its own calibrated
# threshold (`threshold_adv`, returned by create_model for this model), not the
# baseline model's `threshold`: each threshold is tuned to the desired
# false-positive rate of the model it was computed for.
test_evasion(
    payload='SELECT SLEEP(5)#";',
    threshold=threshold_adv,
    model=wafamole_model_adv,
    engine=EvasionEngine(wafamole_model_adv),
    # NOTE(review): the engine-level threshold of 0.0 presumably keeps the
    # search from stopping early — confirm against wafamole.
    engine_eval_settings={
        "max_rounds": 200,
        "round_size": 10,
        "timeout": 60,
        "threshold": 0.0,
    },
    modsec=modsec,
    rule_ids=rule_ids,
    paranoia_level=paranoia_level,
)

Payload: SELECT SLEEP(5)#";
Vec: [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0]
Confidence: 0.97833
[!] Max number of iterations reached
Reached confidence 0.8767173507009958
with payload
'SELECT\xa0SLEEP((SELECT 0x5))#";e'
Min payload: b'SELECT\xc2\xa0SLEEP((SELECT 0x5))#";e'
Min confidence: 0.87672
Reduced confidence from 0.97833 to 0.87672 (reduction of 0.10161)

Evasion successful


: 