In [1]:
# Imports

import base64
import pandas as pd
import sqlparse
from utils import get_rules_list, create_train_test_split, payload_to_vec, predict_vec
from modsec import init_modsec
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from wafamole.models import Model
from wafamole.evasion import EvasionEngine

In [2]:
# Set up variables

# TODO: handle large files
dataset_path = "/app/httpParamsDataset/payload_full.csv"

rule_ids = get_rules_list()
modsec = init_modsec()

In [3]:
print(f"rule_ids: {rule_ids}")
print(f"len(rule_ids): {len(rule_ids)}")


rule_ids: ['901001', '901100', '901110', '901111', '901115', '901120', '901125', '901130', '901140', '901141', '901142', '901143', '901160', '901162', '901163', '901164', '901165', '901167', '901168', '901169', '901171', '901200', '901320', '901340', '901350', '901400', '901410', '901450', '901500', '905100', '905110', '911011', '911012', '911013', '911014', '911015', '911016', '911017', '911018', '911100', '913011', '913012', '913013', '913014', '913015', '913016', '913017', '913018', '913100', '920011', '920012', '920013', '920014', '920015', '920016', '920017', '920018', '920100', '920120', '920121', '920160', '920170', '920171', '920180', '920181', '920190', '920200', '920201', '920202', '920210', '920220', '920230', '920240', '920250', '920260', '920270', '920271', '920272', '920273', '920274', '920275', '920280', '920290', '920300', '920310', '920311', '920320', '920330', '920340', '920341', '920350', '920360', '920370', '920380', '920390', '920400', '920410', '920420', '920430',

In [18]:
# Create train and test datasets OR load them from disk

def add_payload_to_vec(data, rule_ids, modsec):
    tqdm.pandas(desc="Processing payloads")
    data["vector"] = data["payload"].progress_apply(
        lambda x: payload_to_vec(x, rule_ids, modsec)
    )
    return data


print("Reading and parsing data...")

data = pd.read_csv(dataset_path)
data['payload'] = data['payload'].apply(lambda x: base64.b64encode(x.encode()).decode())
data['label'] = data['label'].replace({'norm': 'sane', 'anom': 'attack'})

# Concatenate and shuffle
full_data = data.sample(frac=1).reset_index(drop=True)

print("Full data shape:", full_data.shape)
print("Splitting into train and test...")
train, test = train_test_split(
    full_data,
    train_size=500,
    test_size=100,
    stratify=full_data["label"],
)
# Add vector for payloads in train and test
print("Creating vectors...")
train = add_payload_to_vec(train, rule_ids, modsec)
test = add_payload_to_vec(test, rule_ids, modsec)
print("Done!")
print(f"Train shape: {train.shape} | Test shape: {test.shape}")


# Create a RF model

X_train = list(train["vector"])
y_train = train["label"]
X_test = list(test["vector"])
y_test = test["label"]

print("attack count in train:", y_train.value_counts()["attack"])
print("sane count in train:", y_train.value_counts()["sane"])

print("attack count in test:", y_test.value_counts()["attack"])
print("sane count in test:", y_test.value_counts()["sane"])

# create and train the Random Forest model
# number of trees is set to 160 for PLs other than PL1 as per the paper
model = RandomForestClassifier(n_estimators=160, random_state=666)
model.fit(X_train, y_train)
print("Model trained successfully!")

print(classification_report(y_test, model.predict(X_test)))

# free the memory of unneeded data
del train, test
del X_train, y_train, X_test, y_test

Reading and parsing data...
Full data shape: (31067, 4)
Splitting into train and test...
Creating vectors...


Processing payloads: 100%|██████████| 500/500 [01:19<00:00,  6.29it/s]
Processing payloads: 100%|██████████| 100/100 [00:17<00:00,  5.87it/s]


Done!
Train shape: (500, 5) | Test shape: (100, 5)
attack count in train: 189
sane count in train: 311
attack count in test: 38
sane count in test: 62
Model trained successfully!
              precision    recall  f1-score   support

      attack       1.00      1.00      1.00        38
        sane       1.00      1.00      1.00        62

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [19]:
# Create WAFamole model
class WAFamoleModel(Model):
    # TODO: rework predict payload to take vec ?
    def extract_features(self, value: str):
        payload_base64 = base64.b64encode(value.encode("utf-8")).decode("utf-8")
        return payload_to_vec(
            payload_base64=payload_base64, rule_ids=rule_ids, modsec=modsec
        )

    def classify(self, value: str):
        vec = self.extract_features(value)
        return predict_vec(
            vec=vec,
            model=model,
            rule_ids=rule_ids,
            modsec=modsec,
        )

In [20]:
# Create WAFamole evasion engine
wafamole_model = WAFamoleModel()
engine = EvasionEngine(wafamole_model)

In [73]:
# payload = "UPDATE `tab` SET `col1` = 1 WHERE `col3` >= 1110573056 LIMIT 516358144;" # sane
# payload = "SELECT `col1` FROM `tab` WHERE `col1` LIKE '%'s'%';" # attack
# payload = '" OR SELECT SLEEP(5);'  # attack
# payload = 'UPDATE `the` news is not good #bla'
# payload = 'blablbalbalabla<script>alert(1)</script>blablbalbalabla'
# payload = '-1+/*!12345% 55nIoN*//**/(/*!12345%53EleCt*//**/1)+-+'
payload = "12000 un%69on /*blabla*/ select 1"
payload = "uNiOn aLl sElEcT"
payload = "union all select 'bla"


# Test payload without evasion
payload_base64 = base64.b64encode(payload.encode("utf-8")).decode("utf-8")
vec = payload_to_vec(payload_base64, rule_ids, modsec)
is_attack = predict_vec(
    vec=vec,
    model=model,
    rule_ids=rule_ids,
    modsec=modsec,
)
print(f"Payload: {payload}")
#print(f"Vec: {vec}")
print(f"Activaded rules: {len([x for x in vec if x > 0])}")
print(f"Confidence: {round(is_attack, 5)}")

Payload: union all select 'bla
Activaded rules: 6
Confidence: 0.6875


In [33]:
test_payloads = """
/*!50000%55nIoN*/ /*!50000%53eLeCt*/
%55nion(%53elect 1,2,3)-- -
+union+distinct+select+
+union+distinctROW+select+
/**//*!12345UNION SELECT*//**/
/**//*!50000UNION SELECT*//**/
/**/UNION/**//*!50000SELECT*//**/
/*!50000UniON SeLeCt*/
union /*!50000%53elect*/
+ #?uNiOn + #?sEleCt
+ #?1q %0AuNiOn all#qa%0A#%0AsEleCt
/*!%55NiOn*/ /*!%53eLEct*/
/*!u%6eion*/ /*!se%6cect*/
+un/**/ion+se/**/lect
uni%0bon+se%0blect
%2f**%2funion%2f**%2fselect
union%23foo*%2F*bar%0D%0Aselect%23foo%0D%0A
REVERSE(noinu)+REVERSE(tceles)
/*--*/union/*--*/select/*--*/
union (/*!/**/ SeleCT */ 1,2,3)
/*!union*/+/*!select*/
union+/*!select*/
/**/union/**/select/**/
/**/uNIon/**/sEleCt/**/
+%2F**/+Union/*!select*/
/**//*!union*//**//*!select*//**/
/*!uNIOn*/ /*!SelECt*/
+union+distinct+select+
+union+distinctROW+select+
uNiOn aLl sElEcT
UNIunionON+SELselectECT
/**/union/*!50000select*//**/
0%a0union%a0select%09
%0Aunion%0Aselect%0A
%55nion/**/%53elect
uni/*!20000%0d%0aunion*/+/*!20000%0d%0aSelEct*/
%252f%252a*/UNION%252f%252a /SELECT%252f%252a*/
%0A%09UNION%0CSELECT%10NULL%
/*!union*//*--*//*!all*//*--*//*!select*/
union%23foo*%2F*bar%0D%0Aselect%23foo%0D%0A1% 2C2%2C
/*!20000%0d%0aunion*/+/*!20000%0d%0aSelEct*/
+UnIoN/*&a=*/SeLeCT/*&a=*/
union+sel%0bect
+uni*on+sel*ect+
+#1q%0Aunion all#qa%0A#%0Aselect
union(select (1),(2),(3),(4),(5))
UNION(SELECT(column)FROM(table))
%23xyz%0AUnIOn%23xyz%0ASeLecT+
%23xyz%0A%55nIOn%23xyz%0A%53eLecT+
union(select(1),2,3)
union (select 1111,2222,3333)
uNioN (/*!/**/ SeleCT */ 11)
union (select 1111,2222,3333)
+#1q%0AuNiOn all#qa%0A#%0AsEleCt
/**//*U*//*n*//*I*//*o*//*N*//*S*//*e*//*L*//*e*//*c*//*T*/
%0A/**//*!50000%55nIOn*//*yoyu*/all/**/%0A/*!%53eLEct*/%0A/*nnaa*/
+%23sexsexsex%0AUnIOn%23sexsexs ex%0ASeLecT+
+union%23foo*%2F*bar%0D%0Aselect%23foo%0D%0A1% 2C2%2C
/*!f****U%0d%0aunion*/+/*!f****U%0d%0aSelEct*/
+%23blobblobblob%0aUnIOn%23blobblobblob%0aSeLe cT+
/*!blobblobblob%0d%0aunion*/+/*!blobblobblob%0d%0aSelEct*/
/union\sselect/g
/union\s+select/i
/*!UnIoN*/SeLeCT
+UnIoN/*&a=*/SeLeCT/*&a=*/
+uni>on+sel>ect+
+(UnIoN)+(SelECT)+
+(UnI)(oN)+(SeL)(EcT)
+’UnI”On’+'SeL”ECT’
+uni on+sel ect+
+/*!UnIoN*/+/*!SeLeCt*/+
/*!u%6eion*/ /*!se%6cect*/
uni%20union%20/*!select*/%20
union%23aa%0Aselect
/**/union/*!50000select*/
/^.*union.*$/ /^.*select.*$/
/*union*/union/*select*/select+
/*uni X on*/union/*sel X ect*/
+un/**/ion+sel/**/ect+
+UnIOn%0d%0aSeleCt%0d%0a
UNION/*&test=1*/SELECT/*&pwn=2*/
un?+un/**/ion+se/**/lect+
+UNunionION+SEselectLECT+
+uni%0bon+se%0blect+
%252f%252a*/union%252f%252a /select%252f%252a*/
/%2A%2A/union/%2A%2A/select/%2A%2A/
%2f**%2funion%2f**%2fselect%2f**%2f
union%23foo*%2F*bar%0D%0Aselect%23foo%0D%0A
/*!UnIoN*/SeLecT+
"""

# payloads to array
test_payloads = test_payloads.splitlines()
# test payloads with engine
for payload in test_payloads:
    payload_base64 = base64.b64encode(payload.encode("utf-8")).decode("utf-8")
    vec = payload_to_vec(payload_base64, rule_ids, modsec)
    is_attack = predict_vec(
        vec=vec,
        model=model,
        rule_ids=rule_ids,
        modsec=modsec,
    )
    if is_attack < 0.5:
        print("Attack not detected!")
        print(f"Payload: {payload.encode('utf-8')}")
        print(f"Confidence: {round(is_attack, 5)}")

Attack not detected!
Payload: b''
Confidence: 0.0
Attack not detected!
Payload: b'uni%0bon+se%0blect'
Confidence: 0.0875
Attack not detected!
Payload: b'union%23foo*%2F*bar%0D%0Aselect%23foo%0D%0A'
Confidence: 0.4125
Attack not detected!
Payload: b'uNiOn aLl sElEcT'
Confidence: 0.41875
Attack not detected!
Payload: b'UNIunionON+SELselectECT'
Confidence: 0.0
Attack not detected!
Payload: b'union%23foo*%2F*bar%0D%0Aselect%23foo%0D%0A1% 2C2%2C'
Confidence: 0.475
Attack not detected!
Payload: b'union+sel%0bect'
Confidence: 0.0875
Attack not detected!
Payload: b'+uni*on+sel*ect+'
Confidence: 0.0
Attack not detected!
Payload: b'+%23blobblobblob%0aUnIOn%23blobblobblob%0aSeLe cT+'
Confidence: 0.19375
Attack not detected!
Payload: b'/union\\sselect/g'
Confidence: 0.15625
Attack not detected!
Payload: b'/union\\s+select/i'
Confidence: 0.375
Attack not detected!
Payload: b'+uni>on+sel>ect+'
Confidence: 0.01875
Attack not detected!
Payload: b'+(UnIoN)+(SelECT)+'
Confidence: 0.425
Attack not detect

In [37]:
# Try and evade the WAF with WAFamole

min_confidence, min_payload = engine.evaluate(
    payload=payload,
    max_rounds=200,
    round_size=10,
    timeout=60,
    threshold=0.5,
)
print()
print(f"Min payload: {min_payload.encode('utf-8')}")
print(f"Min confidence: {round(min_confidence, 5)}")
print()
print(
    f"Reduced confidence from {round(is_attack, 5)} to {round(min_confidence, 5)} (reduction of {round(is_attack - min_confidence, 5)})"
)

[!] Execution timed out
Reached confidence 0.9
with payload
-0x1/*z*/union SELECt 0x1 ||0-- 

Min payload: b'-0x1/*z*/union\xc2\xa0SELECt 0x1 ||0-- '
Min confidence: 0.9

Reduced confidence from 1.0 to 0.9 (reduction of 0.1)
