In [21]:
# Sanity check: measure model performance without the preprocessing step,
# i.e. do not extract the activation vector; instead, pass the raw
# SQL payloads directly as input to the model.

In [22]:
# load data
import pandas as pd
from src.utils import read_and_parse_sql

def _load_labelled(path, label):
    """Read one SQL dump, print its shape, then tag every row with `label`.

    The shape is printed before the "label" column is added, so the printed
    column count reflects only what `read_and_parse_sql` returned.
    """
    frame = read_and_parse_sql(path)
    print(frame.shape)
    frame["label"] = label
    return frame


# Attack payloads are the positive class (1), sane payloads the negative class (0).
attacks = _load_labelled("data/raw/attacks_20k.sql", 1)
sanes = _load_labelled("data/raw/sanes_20k.sql", 0)

(6483, 1)
(6518, 1)


In [23]:
# sample test set
# Fixed seeds make the sampled test set (and therefore the TPR/FPR computed
# from it) reproducible across kernel restarts. Each draw gets its own seed
# so the three draws do not reuse the same random stream.
SEED = 42
test_attacks = attacks.sample(n=2000, random_state=SEED)
test_sanes = sanes.sample(n=2000, random_state=SEED + 1)

# Combine both classes and shuffle the rows (frac=1 keeps every row), then
# renumber the index so it no longer carries the source-frame positions.
test = (
    pd.concat([test_attacks, test_sanes])
    .sample(frac=1, random_state=SEED + 2)
    .reset_index(drop=True)
)

# Drop the full frames; the cells visible here only use `test` from now on.
# Note: because the names are removed, re-running this cell requires
# re-running the load cell first.
del attacks
del sanes

In [24]:
# Rule IDs (as strings, all in the 942xxx range) that are handed to
# create_wafamole_model together with the model and the ModSecurity instance.
# NOTE(review): presumably the OWASP CRS SQL-injection rule set — confirm.
rule_ids = [
    "942011", "942012", "942013", "942014", "942015", "942016", "942017", "942018",
    "942100", "942101", "942110", "942120", "942130", "942131", "942140", "942150",
    "942151", "942152", "942160", "942170", "942180", "942190", "942200", "942210",
    "942220", "942230", "942240", "942250", "942251", "942260", "942270", "942280",
    "942290", "942300", "942310", "942320", "942321", "942330", "942340", "942350",
    "942360", "942361", "942362", "942370", "942380", "942390", "942400", "942410",
    "942420", "942421", "942430", "942431", "942432", "942440", "942450", "942460",
    "942470", "942480", "942490", "942500", "942510", "942511", "942520", "942521",
    "942522", "942530", "942540", "942550", "942560",
]

In [25]:
# load model and threshold
import joblib
from src.model import create_wafamole_model
from src.modsec import init_modsec

# NOTE(review): absolute, run-specific path; the notebook only works where
# this prepared workspace exists at exactly this location.
workspace = "/app/wafcraft/data/prepared/2024-04-07_18-15-53_brown-lot"

# Read the decision threshold stored next to the model. The context manager
# closes the file handle deterministically instead of leaving it to the
# garbage collector.
with open(f"{workspace}/model/threshold.txt", "r") as threshold_file:
    threshold = float(threshold_file.read())

modsec = init_modsec()
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
model = joblib.load(f"{workspace}/model/model.joblib")
# Wrap the loaded model so it can be called on payload strings via .classify().
# NOTE(review): the meaning of the trailing 4 is not visible from this
# notebook (presumably a ModSecurity paranoia level) — confirm.
wafamole_model = create_wafamole_model(model, modsec, rule_ids, 4)

In [28]:
# get predictions (confidences)
import base64
from tqdm import tqdm

# Score every test payload. Each entry of `preds` is a
# (true label, confidence returned by wafamole_model.classify) pair.
preds = []
labelled_payloads = zip(test["data"], test["label"])
for encoded, label in tqdm(labelled_payloads, total=len(test)):
    # The "data" column is base64-decoded and then decoded as UTF-8 before
    # being passed to the model as text.
    sql_text = base64.b64decode(encoded).decode("utf-8")
    preds.append((label, wafamole_model.classify(sql_text)))

 47%|████▋     | 1867/4000 [02:40<04:06,  8.65it/s]

In [None]:
# evaluate tpr and fpr
from sklearn.metrics import confusion_matrix
import numpy as np

# Split the (label, confidence) pairs into two parallel arrays.
labels, confidences = zip(*preds)
labels = np.array(labels)
confidences = np.array(confidences)
# A payload counts as a predicted attack when its confidence reaches the
# threshold loaded from the model workspace.
predictions = (confidences >= threshold).astype(int)

# Pin the class order to [0, 1] so the matrix is always 2x2. Without it,
# sklearn infers the classes from the data, and a run where only one class
# appears among labels and predictions yields a 1x1 matrix that cannot be
# unpacked into four values.
tn, fp, fn, tp = confusion_matrix(labels, predictions, labels=[0, 1]).ravel()
tpr = tp / (tp + fn)  # true positive rate: attacks correctly flagged
fpr = fp / (fp + tn)  # false positive rate: sane payloads wrongly flagged

print(f"TPR: {tpr}")
print(f"FPR: {fpr}")