In [7]:
# Imports

import base64
import pandas as pd
import sqlparse
from utils import get_rules_list, create_train_test_split, payload_to_vec, predict_vec
from modsec import init_modsec
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from wafamole.models import Model
from wafamole.evasion import EvasionEngine

In [8]:
# Set up variables shared by the cells below.

# Location of the labelled payload CSV; the data cell reads its "payload" and
# "label" columns from this file.
# TODO: handle large files
dataset_path = "/app/httpParamsDataset/payload_full.csv"

# Both objects are created once here and then passed unchanged to the
# payload_to_vec / predict_vec calls further down.
# NOTE(review): presumably rule_ids lists the ModSecurity rule IDs that make up
# the feature vector and modsec is the initialised engine — confirm in
# utils.py / modsec.py.
rule_ids = get_rules_list()
modsec = init_modsec()

In [18]:
# Create train and test datasets from the CSV (loading a cached split from disk is not implemented in this cell)

def add_payload_to_vec(data, rule_ids, modsec):
    """Return a copy of ``data`` with a ``vector`` column added.

    Each entry of ``data["payload"]`` is passed to ``payload_to_vec`` together
    with ``rule_ids`` and ``modsec``; the results are stored row by row in the
    new ``vector`` column. A tqdm progress bar tracks the pass.

    The input frame is left untouched. It is typically a slice produced by
    ``train_test_split``; writing a column into such a slice in place can
    trigger pandas' ``SettingWithCopyWarning`` and alters an object the caller
    may still be holding.

    Args:
        data: DataFrame with a ``payload`` column.
            NOTE(review): the calling cell base64-encodes payloads before this
            function is used, so payload_to_vec presumably expects base64 text.
        rule_ids: forwarded as-is to ``payload_to_vec``.
        modsec: forwarded as-is to ``payload_to_vec``.

    Returns:
        A new DataFrame holding every column of ``data`` plus ``vector``.
    """
    # Registers Series.progress_apply on pandas objects.
    tqdm.pandas(desc="Processing payloads")
    vectors = data["payload"].progress_apply(
        lambda x: payload_to_vec(x, rule_ids, modsec)
    )
    # assign() builds a new frame instead of writing into the caller's object.
    return data.assign(vector=vectors)


# One seed for the shuffle, the split and the forest, so that a fresh
# "Restart & Run All" draws the same sample and prints the same report.
RANDOM_STATE = 666

print("Reading and parsing data...")

data = pd.read_csv(dataset_path)
# payload_to_vec is fed base64 text, so encode every payload up front.
data['payload'] = data['payload'].apply(lambda x: base64.b64encode(x.encode()).decode())
# Rename the dataset's labels to the names shown in the report below.
data['label'] = data['label'].replace({'norm': 'sane', 'anom': 'attack'})

# Shuffle the rows (seeded)
full_data = data.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

print("Full data shape:", full_data.shape)
print("Splitting into train and test...")
# Only a fixed-size, label-stratified subset is used: every selected payload
# has to be vectorised one by one in the next step.
train, test = train_test_split(
    full_data,
    train_size=5000,
    test_size=1000,
    stratify=full_data["label"],
    random_state=RANDOM_STATE,
)
# Add vector for payloads in train and test
print("Creating vectors...")
train = add_payload_to_vec(train, rule_ids, modsec)
test = add_payload_to_vec(test, rule_ids, modsec)
print("Done!")
print(f"Train shape: {train.shape} | Test shape: {test.shape}")


# Create a RF model

X_train = list(train["vector"])
y_train = train["label"]
X_test = list(test["vector"])
y_test = test["label"]

# create and train the Random Forest model
# number of trees is set to 160 for PLs other than PL1 as per the paper
model = RandomForestClassifier(n_estimators=160, random_state=RANDOM_STATE)
model.fit(X_train, y_train)
print("Model trained successfully!")

print(classification_report(y_test, model.predict(X_test)))

# free the memory of unneeded data; only `model` is needed by later cells
del train, test
del X_train, y_train, X_test, y_test

Reading and parsing data...
Full data shape: (31067, 4)
Splitting into train and test...
Creating vectors...


Processing payloads: 100%|██████████| 5000/5000 [01:28<00:00, 56.32it/s] 
Processing payloads: 100%|██████████| 1000/1000 [00:40<00:00, 24.43it/s]


Done!
Train shape: (5000, 5) | Test shape: (1000, 5)
Model trained successfully!
              precision    recall  f1-score   support

      attack       1.00      0.98      0.99       379
        sane       0.99      1.00      0.99       621

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99      1000
weighted avg       0.99      0.99      0.99      1000



In [19]:
# Create WAFamole model
class WAFamoleModel(Model):
    """WAFamole ``Model`` adapter around this notebook's classifier.

    Relies on the notebook-level ``rule_ids``, ``modsec`` and ``model`` objects.
    """

    # TODO: rework predict payload to take vec ?
    def extract_features(self, value: str):
        """Base64-encode ``value`` (UTF-8) and return ``payload_to_vec``'s result for it."""
        raw_bytes = value.encode("utf-8")
        encoded = base64.b64encode(raw_bytes).decode("utf-8")
        return payload_to_vec(payload_base64=encoded, rule_ids=rule_ids, modsec=modsec)

    def classify(self, value: str):
        """Return ``predict_vec``'s output for the feature vector of ``value``.

        NOTE(review): other cells round and print this value as a confidence,
        so it is presumably a float score — confirm in utils.predict_vec.
        """
        features = self.extract_features(value)
        return predict_vec(vec=features, model=model, rule_ids=rule_ids, modsec=modsec)

In [20]:
# Create WAFamole evasion engine
# The model instance is kept under its own name so it can also be called
# directly; the engine wraps it and is used through engine.evaluate().
# NOTE(review): presumably EvasionEngine calls wafamole_model.classify() on
# each mutated payload — confirm against the WAFamole documentation.
wafamole_model = WAFamoleModel()
engine = EvasionEngine(wafamole_model)

In [23]:
# Sample payloads — uncomment one of these to try it instead:
# payload = "UPDATE `tab` SET `col1` = 1 WHERE `col3` >= 1110573056 LIMIT 516358144;" # sane
# payload = "SELECT `col1` FROM `tab` WHERE `col1` LIKE '%'s'%';" # attack
# payload = 'SELECT SLEEP(5)#";'  # attack
payload = 'test'

# Baseline: score the payload exactly as written, before any evasion attempt.
payload_base64 = base64.b64encode(payload.encode("utf-8")).decode("utf-8")
vec = payload_to_vec(payload_base64=payload_base64, rule_ids=rule_ids, modsec=modsec)
is_attack = predict_vec(vec=vec, model=model, rule_ids=rule_ids, modsec=modsec)

# One print call, one line per field.
print(
    f"Payload: {payload}",
    f"Vec: {vec}",
    f"Confidence: {round(is_attack, 5)}",
    sep="\n",
)

Payload: UPDATE `tab` SET `col1` = 1 WHERE `col3` >= 1110573056 LIMIT 516358144;
Vec: [0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0]
Confidence: 1.0


In [22]:
# Try and evade the WAF with WAFamole

# Score the *current* payload here instead of reusing `is_attack` from the
# baseline cell: if `payload` was edited and that cell was not re-run, the
# "Reduced confidence from ..." line would compare against a stale value.
# classify() runs the same payload_to_vec -> predict_vec path as that cell.
initial_confidence = wafamole_model.classify(payload)

# NOTE(review): timeout is presumably in seconds and threshold the confidence
# below which the search stops early — confirm in the WAFamole docs.
min_confidence, min_payload = engine.evaluate(
    payload=payload,
    max_rounds=200,
    round_size=10,
    timeout=60,
    threshold=0.5,
)
print()
print(f"Min payload: {min_payload.encode('utf-8')}")
print(f"Min confidence: {round(min_confidence, 5)}")
print()
print(
    f"Reduced confidence from {round(initial_confidence, 5)} to {round(min_confidence, 5)} "
    f"(reduction of {round(initial_confidence - min_confidence, 5)})"
)

[!] Execution timed out
Reached confidence 1.0
with payload
SELECT SLEEP((SELECT 5)) AND 'W'<>'Wy'#";

Min payload: b'SELECT SLEEP((SELECT 5)) AND \'W\'<>\'Wy\'#";'
Min confidence: 1.0

Reduced confidence from 1.0 to 1.0 (reduction of 0.0)
