In [1]:
# Imports

import base64
import pandas as pd
from utils import get_rules_list, predict_payload, create_train_test_split
from rf_classifier import generate_rf_model
from modsec import init_modsec


In [2]:
# Set up variables shared by the cells below.

# Raw input files for the two classes (attack / sane). The paths are relative,
# so they resolve against the notebook's working directory.
# TODO: handle large files
attack_data_path = "data/attacks_20k.sql"
sane_data_path = "data/sanes_20k.sql"

# Both objects are handed to create_train_test_split() and predict_payload()
# further down; they are built once here and reused.
# NOTE(review): presumably the ModSecurity rule IDs used as features and an
# initialised ModSecurity engine - confirm against utils / modsec.
rule_ids = get_rules_list()
modsec = init_modsec()

In [3]:
# Create train and test datasets from the raw attack/sane files.
# NOTE(review): no load-from-disk / cache step is visible in this cell; confirm
# whether create_train_test_split() caches internally before relying on that.
# In the recorded run, payload processing took ~1.5 min (train) + ~40 s (test).

train, test = create_train_test_split(
    attack_file=attack_data_path,
    sane_file=sane_data_path,
    train_size=5000,  # recorded output reports "Train shape: (5000, 3)"
    test_size=1000,  # recorded output reports "Test shape: (1000, 3)"
    modsec=modsec,
    rule_ids=rule_ids,
)

Reading and parsing data...
Full data shape: (13504, 2)
Splitting into train and test...
Creating vectors...


Processing payloads: 100%|██████████| 5000/5000 [01:33<00:00, 53.32it/s]
Processing payloads: 100%|██████████| 1000/1000 [00:39<00:00, 25.55it/s]

Done!
Train shape: (5000, 3) | Test shape: (1000, 3)





In [10]:
# Create a RF model (presumably a random forest - see rf_classifier; confirm).
# Both splits are passed in; in the recorded run the call printed a
# classification report over the 1000 test samples (attack / sane).
# NOTE(review): the execution counter jumps from 3 to 10 at this cell - re-run
# with Restart Kernel -> Run All to confirm the reported metrics reproduce.

model = generate_rf_model(train_data=train, test_data=test)

Model trained successfully!
              precision    recall  f1-score   support

      attack       0.97      0.95      0.96       516
        sane       0.94      0.97      0.96       484

    accuracy                           0.96      1000
   macro avg       0.96      0.96      0.96      1000
weighted avg       0.96      0.96      0.96      1000



In [12]:
# Test payload
#
# Score one sample SQL query with the trained model. Change `payload_label`
# to choose which sample is sent through predict_payload().

sample_payloads = {
    "sane": "UPDATE `tab` SET `col1` = 1 WHERE `col3` >= 1110573056 LIMIT 516358144;",
    "attack": "SELECT `col1` FROM `tab` WHERE `col1` LIKE '%'s'%';",
}
payload_label = "sane"
payload = sample_payloads[payload_label]

# predict_payload() is given the query as base64 text, so encode the UTF-8
# bytes first and turn the result back into a str.
payload_bytes = payload.encode("utf-8")
payload_base64 = base64.b64encode(payload_bytes).decode("utf-8")

is_attack = predict_payload(
    payload_base64=payload_base64,
    model=model,
    rule_ids=rule_ids,
    modsec=modsec,
)

# Despite its name, `is_attack` is rounded and reported as a confidence value.
print(f"Payload: {payload}")
print(f"Is attack confidence: {round(is_attack, 5)}")

Payload: UPDATE `tab` SET `col1` = 1 WHERE `col3` >= 1110573056 LIMIT 516358144;
Is attack confidence: 0.0
