<h1>Comparing binary classification model over CSIC 2010 and FWAF datasets</h1>

In [1]:
import pandas as pd
import numpy as np

import urllib.parse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/fwaf-dataset/badqueries.txt
/kaggle/input/fwaf-dataset/goodqueries.txt
/kaggle/input/csic-2010-web-application-attacks/csic_database.csv


<h3><b>FWAF preparing</b></h3>

In [2]:
def loadFile(name):
    directory = str(os.getcwd())
    filepath = os.path.join(directory, name)
    with open(filepath,'r') as f:
        data = f.readlines()
    data = list(set(data))
    result = []
    for d in data:
        d = str(urllib.parse.unquote(d))   #converting url encoded data to simple string
        result.append(d)
    return result

In [3]:
badQueries = loadFile('/kaggle/input/fwaf-dataset/badqueries.txt')
validQueries = loadFile('/kaggle/input/fwaf-dataset/goodqueries.txt')
badQueries = list(set(badQueries))
validQueries = list(set(validQueries))
allQueries = badQueries + validQueries
yBad = [1 for i in range(0, len(badQueries))]  #labels, 1 for malicious and 0 for clean
yGood = [0 for i in range(0, len(validQueries))]

yFWAF = yBad + yGood
queriesFWAF = allQueries

In [4]:
queriesFWAF[0], yFWAF[0]

('/misc/audio.php?recording=../version.inc\n', 1)

In [5]:
queriesFWAF[-1], yFWAF[-1]

('/rfc927/\n', 0)

<h3><b>FWAF tokenization</b></h3>

<h3>Using  chars from two datasets as vocabulary</h3>

In [6]:
vocabulary = ['\x00',  '\x01',  '\x02',  '\x03',  '\x04',  '\x07',  '\x08',  '\x14',  '\x16',  '\x18',  '\x1b', 
              ' ',  '!',  '"',  '#',  '$',  '%',  '&',  "'",  '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/', 
              '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  ':',  ';',  '<',  '=',  '>',  '?',  '@', 
              '[',  '\\',  ']',  '^',  '_',  '`',  'a',  'b',  'c',  'd',  'e',  'f',  'g',  'h',  'i',  'j',  'k',
              'l',  'm',  'n',  'o',  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',  'x',  'y',  'z',  '{',  '|', 
              '}',  '~',  '\x7f',  '\x80',  '\x86',  '\x89',  '\x8e',  '\x8f',  '\x90',  '\x92',  '\x93',  '\x98', 
              '\x9d',  '¢',  '¥',  '§',  '¨',  '°',  '²',  '³',  '¸',  '¼',  '½',  '¾',  '¿',  'â',  'ã',  'å',  
              'æ',  'è',  'í',  'ï',  'ĺ',  'œ',  'ž',  'ſ',  'ʺ',  'ˆ',  'ͥ',  'ч',  'ԯ',  'դ',  '–',  '‘',  '’',  '‚',
              '“',  '”',  '•',  '′',  '″',  '›',  '₨',  '€',  '℅',  '™',  '↓',  '∀',  '⑭',  '⒕',  '⒛',  '〳',  '㰀', 
              '㸀',  '晕',  '萨',  'ﬁ',  '＇',  '＜',  '＞',  '�',  '😯',
             'ｅ',  'ｉ',  'ｎ',  'ｏ',  'ｐ',  'ｒ',  'ｓ',  'ｘ']

In [7]:
vectorizer = TfidfVectorizer(min_df = 0.0, analyzer="char", encoding='utf-8', vocabulary = vocabulary)
XFWAF = vectorizer.fit_transform(queriesFWAF)

In [8]:
vectorizer.get_feature_names_out()

array(['\x00', '\x01', '\x02', '\x03', '\x04', '\x07', '\x08', '\x14',
       '\x16', '\x18', '\x1b', ' ', '!', '"', '#', '$', '%', '&', "'",
       '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4',
       '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[',
       '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
       'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
       'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x7f', '\x80',
       '\x86', '\x89', '\x8e', '\x8f', '\x90', '\x92', '\x93', '\x98',
       '\x9d', '¢', '¥', '§', '¨', '°', '²', '³', '¸', '¼', '½', '¾', '¿',
       'â', 'ã', 'å', 'æ', 'è', 'í', 'ï', 'ĺ', 'œ', 'ž', 'ſ', 'ʺ', 'ˆ',
       'ͥ', 'ч', 'ԯ', 'դ', '–', '‘', '’', '‚', '“', '”', '•', '′', '″',
       '›', '₨', '€', '℅', '™', '↓', '∀', '⑭', '⒕', '⒛', '〳', '㰀', '㸀',
       '晕', '萨', 'ﬁ', '＇', '＜', '＞', '�', '😯', 'ｅ', 'ｉ', 'ｎ', 'ｏ', 'ｐ',
       'ｒ', 'ｓ', 'ｘ'], dtype=object)

<h3><b>Dataset splitting</b></h3>

In [9]:
XFWAF_train, XFWAF_test, yFWAF_train, yFWAF_test = train_test_split(XFWAF, yFWAF, test_size=0.3, random_state=2023)

<h3><b>Logistic Regression model</b></h3>

In [10]:
lgs_fwaf = LogisticRegression(max_iter=500)
lgs_fwaf.fit(XFWAF_train, yFWAF_train)
predictions = lgs_fwaf.predict(XFWAF_test)

In [11]:
print("Accuracy", metrics.accuracy_score(yFWAF_test, predictions))
print("Precision", metrics.precision_score(yFWAF_test, predictions, average='weighted', labels=np.unique(predictions)))
print("Recall", metrics.recall_score(yFWAF_test, predictions, average='weighted', labels=np.unique(predictions)))
print("F1", metrics.f1_score(yFWAF_test, predictions, average='weighted', labels=np.unique(predictions)))
print("ROC AUC", metrics.roc_auc_score(yFWAF_test, predictions, average='weighted', labels=np.unique(predictions)))

Accuracy 0.9911433745726843
Precision 0.9908576345146608
Recall 0.9911433745726843
F1 0.9907582793688448
ROC AUC 0.8921111203618298


<h3><b>Cross Validation on FWAF only</b></h3>

In [13]:
accuracy = cross_val_score(lgs_fwaf, XFWAF, yFWAF, cv=5, scoring='accuracy')
precision = cross_val_score(lgs_fwaf, XFWAF, yFWAF, cv=5, scoring='precision')
recall = cross_val_score(lgs_fwaf, XFWAF, yFWAF, cv=5, scoring='recall')
f1 = cross_val_score(lgs_fwaf, XFWAF, yFWAF, cv=5, scoring='f1')
roc_auc = cross_val_score(lgs_fwaf, XFWAF, yFWAF, cv=5, scoring='roc_auc')

In [14]:
print("Accuracy", accuracy)
print("Precision", precision)
print("Recall", recall)
print("F1", f1)
print("ROC AUC", roc_auc)

Accuracy [0.99101113 0.99093861 0.99112937 0.99120568 0.99080889]
Precision [0.93936955 0.93581532 0.94031848 0.93860465 0.94107264]
Recall [0.78623555 0.78735826 0.78901864 0.79306086 0.77823939]
F1 [0.85600782 0.85519176 0.85804994 0.85971639 0.85194518]
ROC AUC [0.98974828 0.98946906 0.99028437 0.98979104 0.98818331]


<h3><b>Preparing CSIC 2010</b></h3>

In [15]:
csic = pd.read_csv('/kaggle/input/csic-2010-web-application-attacks/csic_database.csv')
csic.head()

Unnamed: 0.1,Unnamed: 0,Method,User-Agent,Pragma,Cache-Control,Accept,Accept-encoding,Accept-charset,language,host,cookie,content-type,connection,lenght,content,classification,URL
0,Normal,GET,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=1F767F17239C9B670A39E9B10C3825F4,,close,,,0,http://localhost:8080/tienda1/index.jsp HTTP/1.1
1,Normal,GET,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=81761ACA043B0E6014CA42A4BCD06AB5,,close,,,0,http://localhost:8080/tienda1/publico/anadir.j...
2,Normal,POST,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=933185092E0B668B90676E0A2B0767AF,application/x-www-form-urlencoded,Connection: close,Content-Length: 68,id=3&nombre=Vino+Rioja&precio=100&cantidad=55&...,0,http://localhost:8080/tienda1/publico/anadir.j...
3,Normal,GET,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=8FA18BA82C5336D03D3A8AFA3E68CBB0,,close,,,0,http://localhost:8080/tienda1/publico/autentic...
4,Normal,POST,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=7104E6C68A6BCF1423DAE990CE49FEE2,application/x-www-form-urlencoded,Connection: close,Content-Length: 63,modo=entrar&login=choong&pwd=d1se3ci%F3n&remem...,0,http://localhost:8080/tienda1/publico/autentic...


<h3>Structure of CSIC 2010 is different, so we have to use only url without host from this dataset for comparing on FWAF with the same model. Because of presence GET, POST and PUT methods, we transform POST and PUT queries to preudo-GET</h3>

In [16]:
pd.set_option('mode.chained_assignment',None) # warnings off

In [17]:
cutted_csic = csic[['Method', 'URL', 'content', 'classification']]
cutted_csic['URL'] = cutted_csic['URL'].apply(lambda x: x.replace("http://localhost:8080", ""))
cutted_csic['URL'] = cutted_csic['URL'].apply(lambda x: x.replace("HTTP/1.1", "").strip())
cutted_csic['subpath'] = cutted_csic['URL'] + cutted_csic['content'].astype(str).apply(lambda x: "" if x == "nan" else '?'+ x)
target_csic = cutted_csic[['subpath', 'classification']]
target_csic.head()

Unnamed: 0,subpath,classification
0,/tienda1/index.jsp,0
1,/tienda1/publico/anadir.jsp?id=3&nombre=Vino+R...,0
2,/tienda1/publico/anadir.jsp?id=3&nombre=Vino+R...,0
3,/tienda1/publico/autenticar.jsp?modo=entrar&lo...,0
4,/tienda1/publico/autenticar.jsp?modo=entrar&lo...,0


In [18]:
target_csic[target_csic['classification'] == 1]['subpath'].iloc[1]

'/tienda1/publico/anadir.jsp?id=2&nombre=Jam%F3n+Ib%E9rico&precio=85&cantidad=%27%3B+DROP+TABLE+usuarios%3B+SELECT+*+FROM+datos+WHERE+nombre+LIKE+%27%25&B1=A%F1adir+al+carrito'

In [19]:
target_csic[target_csic['classification'] == 0]['subpath'].iloc[1]

'/tienda1/publico/anadir.jsp?id=3&nombre=Vino+Rioja&precio=100&cantidad=55&B1=A%F1adir+al+carrito'

In [20]:
queriesCSIC = target_csic['subpath']
yCSIC = target_csic['classification']

<h3><b>The same tokenization for CSIC as for FWAF</b></h3>

In [21]:
XCSIC = vectorizer.fit_transform(queriesCSIC)

In [22]:
XCSIC_train, XCSIC_test, yCSIC_train, yCSIC_test = train_test_split(XCSIC, yCSIC, test_size=0.3, random_state=2023)

<h3><b>Checking predictions on CSIC for fitted on FWAF model</b></h3>

<h3><b>Predict on CSIC with FWAF-trained model</b></h3>

In [23]:
predictions2 = lgs_fwaf.predict(XCSIC_test)

In [24]:
print("Accuracy", metrics.accuracy_score(yCSIC_test, predictions2))
print("Precision", metrics.precision_score(yCSIC_test, predictions2, average='weighted', labels=np.unique(predictions2)))
print("Recall", metrics.recall_score(yCSIC_test, predictions2, average='weighted', labels=np.unique(predictions2)))
print("F1", metrics.f1_score(yCSIC_test, predictions2, average='weighted', labels=np.unique(predictions2)))
print("ROC AUC", metrics.roc_auc_score(yCSIC_test, predictions2, average='weighted', labels=np.unique(predictions2)))

Accuracy 0.6350982532751092
Precision 0.7571480547236381
Recall 0.6350982532751092
F1 0.5371243147125148
ROC AUC 0.5595255296325342


<h3><b>Predict on full CSIC with FWAF-trained model</b></h3>

In [25]:
predictions3 = lgs_fwaf.predict(XCSIC)

In [26]:
print("Accuracy", metrics.accuracy_score(yCSIC, predictions3))
print("Precision", metrics.precision_score(yCSIC, predictions3, average='weighted', labels=np.unique(predictions3)))
print("Recall", metrics.recall_score(yCSIC, predictions3, average='weighted', labels=np.unique(predictions3)))
print("F1", metrics.f1_score(yCSIC, predictions3, average='weighted', labels=np.unique(predictions3)))
print("ROC AUC", metrics.roc_auc_score(yCSIC, predictions3, average='weighted', labels=np.unique(predictions3)))

Accuracy 0.6393678866781298
Precision 0.7617630613082813
Recall 0.6393678866781298
F1 0.5426583480908818
ROC AUC 0.5614171653700379


In [28]:
accuracy = cross_val_score(lgs_fwaf, XCSIC, yCSIC, cv=5, scoring='accuracy')
print("Accuracy", accuracy)

precision = cross_val_score(lgs_fwaf, XCSIC, yCSIC, cv=5, scoring='precision')
print("Precision", precision)

recall = cross_val_score(lgs_fwaf, XCSIC, yCSIC, cv=5, scoring='recall')
print("Recall", recall)

f1 = cross_val_score(lgs_fwaf, XCSIC, yCSIC, cv=5, scoring='f1')
print("F1", f1)

roc_auc = cross_val_score(lgs_fwaf, XCSIC, yCSIC, cv=5, scoring='roc_auc')
print("ROC AUC", roc_auc)

Accuracy [0.84401867 0.84999591 0.85171539 0.85032343 0.85163351]
Precision [0.84502664 0.85273897 0.85625278 0.8513126  0.85813381]
Recall [0.75922601 0.76700578 0.76760423 0.76979852 0.76501097]
F1 [0.79983188 0.80760344 0.80950878 0.80850618 0.80890108]
ROC AUC [0.9318436  0.93045922 0.93228162 0.93326072 0.92857299]


<h3><b>Logistic Regression model trained on CSIC</b></h3>
<h3><b>Immediately trying to predict on FWAF</b></h3>

In [29]:
lgs_csic = LogisticRegression(max_iter=500)
lgs_csic.fit(XCSIC_train, yCSIC_train)
predictions4 = lgs_csic.predict(XFWAF_test)

In [30]:
print("Accuracy", metrics.accuracy_score(yFWAF_test, predictions4))
print("Precision", metrics.precision_score(yFWAF_test, predictions4, average='weighted', labels=np.unique(predictions4)))
print("Recall", metrics.recall_score(yFWAF_test, predictions4, average='weighted', labels=np.unique(predictions4)))
print("F1", metrics.f1_score(yFWAF_test, predictions4, average='weighted', labels=np.unique(predictions4)))
print("ROC AUC", metrics.roc_auc_score(yFWAF_test, predictions4, average='weighted', labels=np.unique(predictions4)))

Accuracy 0.2338586602637148
Precision 0.9497860724462838
Recall 0.2338586602637148
F1 0.3382898001108911
ROC AUC 0.5485297692473338


<h3><b>Very bad scores...it shows some problems with datasets interoperability</b></h3>
<h3><b>lets see scores on 'own' test sample</b></h3>

In [31]:
predictions5 = lgs_csic.predict(XCSIC_test)

In [32]:
print("Accuracy", metrics.accuracy_score(yCSIC_test, predictions5))
print("Precision", metrics.precision_score(yCSIC_test, predictions5, average='weighted', labels=np.unique(predictions5)))
print("Recall", metrics.recall_score(yCSIC_test, predictions5, average='weighted', labels=np.unique(predictions5)))
print("F1", metrics.f1_score(yCSIC_test, predictions5, average='weighted', labels=np.unique(predictions5)))
print("ROC AUC", metrics.roc_auc_score(yCSIC_test, predictions5, average='weighted', labels=np.unique(predictions5)))

Accuracy 0.8447052401746725
Precision 0.8451581005069917
Recall 0.8447052401746725
F1 0.8430745214416991
ROC AUC 0.8317507473012353


<h3>maybe worse</h3>