In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow import keras
from keras import layers
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from preprocessor import classify_features

# https://wikidocs.net/22894
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

pd.set_option("display.max_columns", None)


In [2]:
from datetime import datetime
def ticks(dt):
    return (datetime(dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, dt   .microsecond) - datetime(1, 1, 1)).total_seconds() * 10000000

def tokenize(dataset):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(dataset)
    all_encoded = tokenizer.texts_to_sequences(dataset)
    vocab_size = len(tokenizer.word_index) + 1
    max_len = max(len(s) for s in all_encoded)
    return tokenizer, max_len, vocab_size

def pad_result(tokenizer, data):
    all_encoded = tokenizer.texts_to_sequences(data)
    max_len = max(len(_) for _ in all_encoded)
    return pad_sequences(all_encoded, maxlen=max_len)

In [59]:
all_data = pd.read_csv("Full-Dataset/processed/merged_data.csv", skipinitialspace=True, index_col="index", engine="python")

In [60]:
drop_columns = ["Unnamed: 0", 'http.request.uri.path', 'http.file_data', "0", "http.request.uri.scheme", "http.request.uri.netloc", "http.referer"]
all_data = all_data.drop([_ for _ in drop_columns if _ in all_data.columns], axis="columns")
all_data = all_data[all_data["frame.time"]!='']

In [61]:
all_data.head(5)

Unnamed: 0_level_0,ip.src_host,frame.time,tcp.checksum,http.response,arp.src.proto_ipv4,http.content_length,Attack_type,arp.dst.proto_ipv4,tcp.connection.rst,arp.hw.size,tcp.len,tcp.seq,tcp.options,tcp.connection.syn,ip.dst_host,tcp.flags.ack,tcp.ack_raw,tcp.connection.fin,tcp.dstport,tcp.flags,tcp.ack,tcp.connection.synack,tcp.payload,tcp.srcport,arp.opcode,GET,POST,TRACE,HTTP/1.0,HTTP/1.1,http.data.key,http.data.value
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
0,192.168.0.170,2021 01:45:35.974758000,0x0000016d,0.0,0,0.0,Password,0,0.0,0.0,86.0,1.0,0101080a9a8a02a2d3a23932,0.0,192.168.0.128,1.0,2788267000.0,0.0,80.0,0x00000018,1.0,0.0,474554202f445657412f6c6f67696e2e70687020485454...,37670.0,0.0,True,False,False,True,False,,0
1,192.168.0.170,2021 01:45:35.976362000,0x000067c9,0.0,0,0.0,Password,0,0.0,0.0,86.0,1.0,0101080a9a8a02a3d3a23935,0.0,192.168.0.128,1.0,317324000.0,0.0,80.0,0x00000018,1.0,0.0,474554202f445657412f6c6f67696e2e70687020485454...,37672.0,0.0,True,False,False,True,False,,0
2,192.168.0.170,2021 01:45:36.094926000,0x000057a1,0.0,0,37.0,Password,0,0.0,0.0,260.0,1.0,0101080a9a8a031bd3a239b3,0.0,192.168.0.128,1.0,939340800.0,0.0,80.0,0x00000018,1.0,0.0,504f5354202f445657412f6c6f67696e2e706870204854...,37674.0,0.0,False,True,False,True,False,username,admin
2,192.168.0.170,2021 01:45:36.094926000,0x000057a1,0.0,0,37.0,Password,0,0.0,0.0,260.0,1.0,0101080a9a8a031bd3a239b3,0.0,192.168.0.128,1.0,939340800.0,0.0,80.0,0x00000018,1.0,0.0,504f5354202f445657412f6c6f67696e2e706870204854...,37674.0,0.0,False,True,False,True,False,password,0
2,192.168.0.170,2021 01:45:36.094926000,0x000057a1,0.0,0,37.0,Password,0,0.0,0.0,260.0,1.0,0101080a9a8a031bd3a239b3,0.0,192.168.0.128,1.0,939340800.0,0.0,80.0,0x00000018,1.0,0.0,504f5354202f445657412f6c6f67696e2e706870204854...,37674.0,0.0,False,True,False,True,False,Login,Login


In [62]:
# all_data["frame.time"] = pd.to_datetime(all_data["frame.time"]).apply(ticks)
all_data = all_data.drop(["frame.time"],axis="columns")

In [63]:
all_data["tcp.checksum"] = all_data["tcp.checksum"].apply(int, base=16)
all_data["tcp.flags"] = all_data["tcp.flags"].apply(int, base=16)
all_data = all_data.drop(classify_features(all_data)[1], axis="columns")

In [64]:
all_data.select_dtypes('object').columns
# http.referer => 0.0 || 0 || 127.0.0.1 => need to be dropped
# ip.src_host, ip.dst_host, arp.src.proto_ipv4, arp.dst.proto_ipv4 need to be vectorized

#  Unknown Columns. Needed to be talked about.
# tcp.options => unknown values. Needed to be tokenized.
# tcp.payload => unknown values. Needed to be tokenized.

Index(['ip.src_host', 'arp.src.proto_ipv4', 'Attack_type',
       'arp.dst.proto_ipv4', 'tcp.options', 'ip.dst_host', 'tcp.payload',
       'http.data.key', 'http.data.value'],
      dtype='object')

In [65]:
all_data[["tcp.options", "tcp.payload"]].nunique()

tcp.options    82640
tcp.payload    52832
dtype: int64

In [66]:
all_data.head(5)


Unnamed: 0_level_0,ip.src_host,tcp.checksum,http.response,arp.src.proto_ipv4,http.content_length,Attack_type,arp.dst.proto_ipv4,tcp.connection.rst,arp.hw.size,tcp.len,tcp.seq,tcp.options,tcp.connection.syn,ip.dst_host,tcp.flags.ack,tcp.ack_raw,tcp.connection.fin,tcp.dstport,tcp.flags,tcp.ack,tcp.connection.synack,tcp.payload,tcp.srcport,arp.opcode,GET,POST,TRACE,HTTP/1.0,HTTP/1.1,http.data.key,http.data.value
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
0,192.168.0.170,365,0.0,0,0.0,Password,0,0.0,0.0,86.0,1.0,0101080a9a8a02a2d3a23932,0.0,192.168.0.128,1.0,2788267000.0,0.0,80.0,24,1.0,0.0,474554202f445657412f6c6f67696e2e70687020485454...,37670.0,0.0,True,False,False,True,False,,0
1,192.168.0.170,26569,0.0,0,0.0,Password,0,0.0,0.0,86.0,1.0,0101080a9a8a02a3d3a23935,0.0,192.168.0.128,1.0,317324000.0,0.0,80.0,24,1.0,0.0,474554202f445657412f6c6f67696e2e70687020485454...,37672.0,0.0,True,False,False,True,False,,0
2,192.168.0.170,22433,0.0,0,37.0,Password,0,0.0,0.0,260.0,1.0,0101080a9a8a031bd3a239b3,0.0,192.168.0.128,1.0,939340800.0,0.0,80.0,24,1.0,0.0,504f5354202f445657412f6c6f67696e2e706870204854...,37674.0,0.0,False,True,False,True,False,username,admin
2,192.168.0.170,22433,0.0,0,37.0,Password,0,0.0,0.0,260.0,1.0,0101080a9a8a031bd3a239b3,0.0,192.168.0.128,1.0,939340800.0,0.0,80.0,24,1.0,0.0,504f5354202f445657412f6c6f67696e2e706870204854...,37674.0,0.0,False,True,False,True,False,password,0
2,192.168.0.170,22433,0.0,0,37.0,Password,0,0.0,0.0,260.0,1.0,0101080a9a8a031bd3a239b3,0.0,192.168.0.128,1.0,939340800.0,0.0,80.0,24,1.0,0.0,504f5354202f445657412f6c6f67696e2e706870204854...,37674.0,0.0,False,True,False,True,False,Login,Login


In [2]:
# tcp.payload preprocess : split the string into 2 letter group -> process 16 based int into ascii char -> list join
# lecture comment : MUST BE INCLUDED IN PPT.
payload_into_str = lambda x: ("".join([chr(int(_, 16)) for _ in [x[s:s+2] for s in range(0, len(x), 2)]])).replace("\r\n","\n")
payloads = all_data["tcp.payload"].apply(payload_into_str)

NameError: name 'all_data' is not defined

In [68]:
payloads = payloads.apply(lambda x: x.split("\n"))

In [69]:
def payload_into_list(payload):
    main = payload[0]
    try:
        dic = dict((key.strip(),value.strip()) for key,value in [_.split(":") for _ in payload[1:]])
        
        return (main, dic)
    except:
        return (main, {})

In [70]:
payloads_ = payloads.apply(payload_into_list)

In [79]:
# payloads.apply(lambda x: (x[0], dict(_.split(":") for _ in x[1:])))
payloads.to_csv("test.csv")

In [71]:
payloads_.head(10)

index
0     (GET /DVWA/login.php HTTP/1.0, {})
1     (GET /DVWA/login.php HTTP/1.0, {})
2    (POST /DVWA/login.php HTTP/1.0, {})
2    (POST /DVWA/login.php HTTP/1.0, {})
2    (POST /DVWA/login.php HTTP/1.0, {})
3    (POST /DVWA/login.php HTTP/1.0, {})
3    (POST /DVWA/login.php HTTP/1.0, {})
3    (POST /DVWA/login.php HTTP/1.0, {})
4     (GET /DVWA/login.php HTTP/1.0, {})
5     (GET /DVWA/login.php HTTP/1.0, {})
Name: tcp.payload, dtype: object

In [29]:
options = all_data["tcp.options"].apply(payload_into_str)


In [41]:
list(payloads.unique())

['GET /DVWA/login.php HTTP/1.0\r\nHost: 192.168.0.128\r\nUser-Agent: Mozilla/5.0 (Hydra)\r\n\r\n',
 'POST /DVWA/login.php HTTP/1.0\r\nHost: 192.168.0.128\r\nUser-Agent: Mozilla/5.0 (Hydra)\r\nContent-Length: 37\r\nContent-Type: application/x-www-form-urlencoded\r\nCookie: PHPSESSID=bqbqr18790mcqo2gn5fel10t3o; security=impossible\r\n\r\nusername=admin&password=0&Login=Login',
 'POST /DVWA/login.php HTTP/1.0\r\nHost: 192.168.0.128\r\nUser-Agent: Mozilla/5.0 (Hydra)\r\nContent-Length: 38\r\nContent-Type: application/x-www-form-urlencoded\r\nCookie: PHPSESSID=dd6gt5vmk4migj95uct6mp98ke; security=impossible\r\n\r\nusername=admin&password=00&Login=Login',
 'GET /DVWA/login.php HTTP/1.0\r\nHost: 192.168.0.128\r\nUser-Agent: Mozilla/5.0 (Hydra)\r\nContent-Length: 0\r\nContent-Type: application/x-www-form-urlencoded\r\nCookie: PHPSESSID=bqbqr18790mcqo2gn5fel10t3o; security=impossible\r\n\r\n',
 'GET /DVWA/login.php HTTP/1.0\r\nHost: 192.168.0.128\r\nUser-Agent: Mozilla/5.0 (Hydra)\r\nContent-Le

In [37]:
list(all_data["tcp.options"].unique())

['0101080a9a8a02a2d3a23932',
 '0101080a9a8a02a3d3a23935',
 '0101080a9a8a031bd3a239b3',
 '0101080a9a8a031cd3a239b3',
 '0101080a9a8a038ed3a23a26',
 '0101080a9a8a0737d3a23dc4',
 '0101080a9a8a0738d3a23dc6',
 '0101080a9a8a07abd3a23e3f',
 '0101080a9a8a0826d3a23eba',
 '0101080a9a8a0827d3a23eba',
 '0101080a9a8a0cf4d3a24258',
 '0101080a9a8a0cf4d3a2425b',
 '0101080a9a8a0d6ad3a243fa',
 '0101080a9a8a0d6bd3a243fa',
 '0101080a9a8a0de0d3a2446f',
 '0101080a9a8a11a2d3a24827',
 '0101080a9a8a11a2d3a24828',
 '0101080a9a8a121ad3a248a5',
 '0101080a9a8a1292d3a2491c',
 '0101080a9a8a1295d3a24921',
 '0101080a9a8a1646d3a24cc8',
 '0101080a9a8a1647d3a24cca',
 '0101080a9a8a16bdd3a24d47',
 '0101080a9a8a16bed3a24d48',
 '0101080a9a8a173ad3a24dbe',
 '0101080a9a8a173ad3a24dbf',
 '0101080a9a8a1addd3a2515d',
 '0101080a9a8a1aded3a25160',
 '0101080a9a8a1b4fd3a251d6',
 '0101080a9a8a1bb6d3a2523d',
 '0101080a9a8a1bc8d3a2524a',
 '0101080a9a8a1c34d3a252bb',
 '0101080a9a8a1fccd3a25649',
 '0101080a9a8a1fccd3a2564a',
 '0101080a9a8a

In [35]:
options = all_data["tcp.options"].apply(lambda x: (x[:13], x[13:]))

In [36]:
options[:5]

index
0    (0101080a9a8a0, 2a2d3a23932)
1    (0101080a9a8a0, 2a3d3a23935)
2    (0101080a9a8a0, 31bd3a239b3)
2    (0101080a9a8a0, 31bd3a239b3)
2    (0101080a9a8a0, 31bd3a239b3)
Name: tcp.options, dtype: object

In [30]:
options.unique()

array(['\x01\x01\x08\n\x9a\x8a\x02¢Ó¢92',
       '\x01\x01\x08\n\x9a\x8a\x02£Ó¢95',
       '\x01\x01\x08\n\x9a\x8a\x03\x1bÓ¢9³', ...,
       '\x01\x01\x08\n\x9aÀõú\x9eN\x88¼',
       '\x01\x01\x08\n\x9aÀõþ\x9eN\x88Á',
       '\x01\x01\x08\n\x9aÀö\x01\x9eN\x88Á'], dtype=object)

In [184]:
ip_cols = ["ip.src_host", "ip.dst_host", "arp.src.proto_ipv4", "arp.dst.proto_ipv4"]
for ipcol in ip_cols:
    ip_values = all_data[ipcol].apply(lambda x: x.split("."))
    all_data[f"{ipcol}[0]"] = ip_values.apply(lambda x: int(x[0]))
    all_data[f"{ipcol}[1]"] = ip_values.apply(lambda x: int(x[1]))
    all_data[f"{ipcol}[2]"] = ip_values.apply(lambda x: int(x[2]))
    all_data[f"{ipcol}[3]"] = ip_values.apply(lambda x: int(x[3]))
all_data = all_data.drop(ip_cols, axis="columns")

IndexError: list index out of range

In [185]:
all_data["tcp.options"].unique()

array(['0101080a9a8a02a2d3a23932', '0101080a9a8a02a3d3a23935',
       '0101080a9a8a031bd3a239b3', ..., '0101080a9ac0f5fa9e4e88bc',
       '0101080a9ac0f5fe9e4e88c1', '0101080a9ac0f6019e4e88c1'],
      dtype=object)

In [186]:
all_data.head(3)

Unnamed: 0_level_0,ip.src_host,frame.time,tcp.checksum,http.response,arp.src.proto_ipv4,http.content_length,Attack_type,arp.dst.proto_ipv4,tcp.connection.rst,arp.hw.size,tcp.len,tcp.seq,tcp.options,tcp.connection.syn,ip.dst_host,tcp.flags.ack,tcp.ack_raw,tcp.connection.fin,tcp.dstport,tcp.flags,tcp.ack,tcp.connection.synack,tcp.payload,tcp.srcport,arp.opcode,GET,POST,TRACE,HTTP/1.0,HTTP/1.1,http.data.key,http.data.value,ip.src_host[0]
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
0,192.168.0.170,6.374506e+17,365,0.0,0,0.0,Password,0,0.0,0.0,86.0,1.0,0101080a9a8a02a2d3a23932,0.0,192.168.0.128,1.0,2788267000.0,0.0,80.0,24,1.0,0.0,474554202f445657412f6c6f67696e2e70687020485454...,37670.0,0.0,True,False,False,True,False,,0,192
1,192.168.0.170,6.374506e+17,26569,0.0,0,0.0,Password,0,0.0,0.0,86.0,1.0,0101080a9a8a02a3d3a23935,0.0,192.168.0.128,1.0,317324000.0,0.0,80.0,24,1.0,0.0,474554202f445657412f6c6f67696e2e70687020485454...,37672.0,0.0,True,False,False,True,False,,0,192
2,192.168.0.170,6.374506e+17,22433,0.0,0,37.0,Password,0,0.0,0.0,260.0,1.0,0101080a9a8a031bd3a239b3,0.0,192.168.0.128,1.0,939340800.0,0.0,80.0,24,1.0,0.0,504f5354202f445657412f6c6f67696e2e706870204854...,37674.0,0.0,False,True,False,True,False,username,admin,192


In [187]:
def getRNN_dataset(df):
    columns = ["http.data"]
    columns = [_ for _ in columns if _ in df.columns]
    # return df.loc[:, ["http.data"]].values
    return df[list(columns)]

    
def get_logistic_dataset(df):
    columns = ["http.data"]
    columns = [_ for _ in columns if _ in df.columns]
    return df.drop(columns, axis=1)

In [188]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

# 데이터를 로드합니다.
df = all_data

# LabelEncoder 클래스를 생성합니다.
encoder = LabelEncoder()
df["Attack_type"] = encoder.fit_transform(df["Attack_type"])
df = df.astype({"http.data.key":"string", "http.data.value":"string"})
df["http.data"] = df["http.data.key"]+'='+df["http.data.value"]
df["http.data"] = df["http.data"].fillna('')
# 데이터를 훈련 세트와 테스트 세트로 나눕니다.
splitted_data = train_test_split(df.drop(["Attack_type", "http.data.value", "http.data.key"], axis="columns"), df["Attack_type"], test_size=0.2)
X_train, X_test, y_train, y_test = splitted_data
RNN_X_train, RNN_X_test = [getRNN_dataset(_) for _ in [X_train, X_test]]
logistic_X_train, logistic_X_test = [get_logistic_dataset(_) for _ in [X_train, X_test]]

In [189]:
encoder.classes_

array(['Password', 'SQL_injection', 'Uploading', 'XSS'], dtype=object)

In [190]:
# https://wikidocs.net/22894
tokenizer = Tokenizer()
tokenizer.fit_on_texts(RNN_X_train["http.data"])
RNN_X_train_encoded = tokenizer.texts_to_sequences(RNN_X_train["http.data"])

In [191]:
RNN_X_train_encoded[:5]


[[], [], [], [], []]

In [192]:
word_to_index = tokenizer.word_index
tokenizer.word_index

{'login': 1,
 'username': 2,
 'admin': 3,
 'password': 4,
 '0': 5,
 '00': 6,
 'chr': 7,
 'id': 8,
 '113': 9,
 'submit': 10,
 'submitid': 11,
 'and': 12,
 'char': 13,
 'select': 14,
 'from': 15,
 '5': 16,
 '1': 17,
 'case': 18,
 'when': 19,
 'then': 20,
 'else': 21,
 'end': 22,
 '98': 23,
 '107': 24,
 '58': 25,
 'sleep': 26,
 'dual': 27,
 '120': 28,
 '122': 29,
 '106': 30,
 'order': 31,
 'by': 32,
 '3': 33,
 '8': 34,
 '118': 35,
 '4': 36,
 '54': 37,
 '59': 38,
 '9': 39,
 '7': 40,
 '2': 41,
 '53': 42,
 '55': 43,
 '57': 44,
 'pg': 45,
 'waitfor': 46,
 'delay': 47,
 "'0": 48,
 "5'": 49,
 'dbms': 50,
 'pipe': 51,
 'receive': 52,
 'message': 53,
 '112': 54,
 'concat': 55,
 'cast': 56,
 'text': 57,
 'as': 58,
 'numeric': 59,
 'upper': 60,
 'xmltype': 61,
 '60': 62,
 '62': 63,
 'extractvalue': 64,
 '0x5c': 65,
 'elt': 66,
 'in': 67,
 '49': 68,
 '48': 69,
 "59'": 70,
 "58'": 71,
 "57'": 72,
 "9'": 73,
 "8'": 74,
 "55'": 75,
 "54'": 76,
 "7'": 77,
 "3'": 78,
 "4'": 79,
 "53'": 80,
 "2'": 81,
 "'

In [193]:
threshold = 2
total_cnt = len(word_to_index) # 단어의 수
rare_cnt = 0 # 등장 빈도수가 threshold보다 작은 단어의 개수를 카운트
total_freq = 0 # 훈련 데이터의 전체 단어 빈도수 총 합
rare_freq = 0 # 등장 빈도수가 threshold보다 작은 단어의 등장 빈도수의 총 합

# 단어와 빈도수의 쌍(pair)을 key와 value로 받는다.
for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value

    # 단어의 등장 빈도수가 threshold보다 작으면
    if(value < threshold):
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value

print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
print("단어 집합(vocabulary)에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)


등장 빈도가 1번 이하인 희귀 단어의 수: 3419
단어 집합(vocabulary)에서 희귀 단어의 비율: 73.40060111635896
전체 등장 빈도에서 희귀 단어 등장 빈도 비율: 1.8602550709497692


In [194]:
vocab_size = len(word_to_index) + 1
max_length_of_http_data = max(len(sample) for sample in RNN_X_train_encoded)
avg_length_of_http_data = (sum(map(len, RNN_X_train_encoded))/len(RNN_X_train_encoded))
print('단어 집합의 크기: {}'.format((vocab_size)))

단어 집합의 크기: 4659


In [195]:
RNN_X_train_padded = pad_sequences(RNN_X_train_encoded, maxlen = max_length_of_http_data)
print("훈련 데이터의 크기(shape):", RNN_X_train_padded.shape)

훈련 데이터의 크기(shape): (119478, 51)


In [196]:
def RNN_padding(tokenizer:Tokenizer, series, max_len):
    return pad_sequences(tokenizer.texts_to_sequences(series), maxlen=max_len)

In [197]:
RNN_X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119478 entries, 24551 to 10108
Data columns (total 1 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   http.data  119478 non-null  string
dtypes: string(1)
memory usage: 1.8 MB


In [198]:
RNN_pad = pad_result(tokenizer, RNN_X_train["http.data"])
type(RNN_pad)

numpy.ndarray

In [199]:
# RNN을 사용하여 요청의 URI 키와 값을 분류하는 모델을 학습합니다.
rnn_model = tf.keras.Sequential([
  layers.Embedding(vocab_size, 128),
  layers.SimpleRNN(128),
  layers.Dense(1, activation="sigmoid")
])

In [200]:
rnn_model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 128)         596352    
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 128)               32896     
                                                                 
 dense_13 (Dense)            (None, 1)                 129       
                                                                 
Total params: 629,377
Trainable params: 629,377
Non-trainable params: 0
_________________________________________________________________


In [203]:
rnn_model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy",])
rnn_model.fit(RNN_X_train_padded, y_train, epochs=1)



<keras.callbacks.History at 0x27e04567b10>

In [204]:
rnn_model.evaluate(pad_result(tokenizer, RNN_X_test["http.data"]), y_test)



[0.0, 0.5635085105895996]

In [205]:
tcp_options_tokenizer, tcp_options_maxlen, tcp_options_vocab_size = tokenize(logistic_X_train["tcp.options"])
tcp_payload_tokenizer, tcp_payload_maxlen, tcp_payload_vocab_size = tokenize(logistic_X_train["tcp.payload"])
logistic_X_train["tcp.options"] = pad_result(tcp_options_tokenizer, logistic_X_train["tcp.options"])
logistic_X_test["tcp.options"] = pad_result(tcp_options_tokenizer, logistic_X_test["tcp.options"])
logistic_X_train["tcp.payload"] = pad_result(tcp_payload_tokenizer, logistic_X_train["tcp.payload"])
logistic_X_test["tcp.payload"] = pad_result(tcp_payload_tokenizer, logistic_X_test["tcp.payload"])

In [206]:
logistic_X_train["tcp.options"][:5]

index
24551    25504
23397    25505
51991    25506
5408     25507
88646    25508
Name: tcp.options, dtype: int32

In [207]:
logistic_X_train.head(3)

Unnamed: 0_level_0,ip.src_host,frame.time,tcp.checksum,http.response,arp.src.proto_ipv4,http.content_length,arp.dst.proto_ipv4,tcp.connection.rst,arp.hw.size,tcp.len,tcp.seq,tcp.options,tcp.connection.syn,ip.dst_host,tcp.flags.ack,tcp.ack_raw,tcp.connection.fin,tcp.dstport,tcp.flags,tcp.ack,tcp.connection.synack,tcp.payload,tcp.srcport,arp.opcode,GET,POST,TRACE,HTTP/1.0,HTTP/1.1,ip.src_host[0]
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
24551,192.168.0.170,6.374512e+17,58713,0.0,0,0.0,0,0.0,0.0,221.0,1.0,25504,0.0,192.168.0.128,1.0,1159926000.0,0.0,80.0,24,1.0,0.0,25019,38732.0,0.0,True,False,False,True,False,192
23397,192.168.0.170,6.374512e+17,17080,0.0,0,0.0,0,0.0,0.0,221.0,1.0,25505,0.0,192.168.0.128,1.0,3038745000.0,0.0,80.0,24,1.0,0.0,25020,35452.0,0.0,True,False,False,True,False,192
51991,192.168.0.170,6.374513e+17,58793,0.0,0,0.0,0,0.0,0.0,86.0,1.0,25506,0.0,192.168.0.128,1.0,2666572000.0,0.0,80.0,24,1.0,0.0,1,59996.0,0.0,True,False,False,True,False,192


In [208]:
logistic_X_train

Unnamed: 0_level_0,ip.src_host,frame.time,tcp.checksum,http.response,arp.src.proto_ipv4,http.content_length,arp.dst.proto_ipv4,tcp.connection.rst,arp.hw.size,tcp.len,tcp.seq,tcp.options,tcp.connection.syn,ip.dst_host,tcp.flags.ack,tcp.ack_raw,tcp.connection.fin,tcp.dstport,tcp.flags,tcp.ack,tcp.connection.synack,tcp.payload,tcp.srcport,arp.opcode,GET,POST,TRACE,HTTP/1.0,HTTP/1.1,ip.src_host[0]
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
24551,192.168.0.170,6.374512e+17,58713,0.0,0,0.0,0,0.0,0.0,221.0,1.0,25504,0.0,192.168.0.128,1.0,1.159926e+09,0.0,80.0,24,1.0,0.0,25019,38732.0,0.0,True,False,False,True,False,192
23397,192.168.0.170,6.374512e+17,17080,0.0,0,0.0,0,0.0,0.0,221.0,1.0,25505,0.0,192.168.0.128,1.0,3.038745e+09,0.0,80.0,24,1.0,0.0,25020,35452.0,0.0,True,False,False,True,False,192
51991,192.168.0.170,6.374513e+17,58793,0.0,0,0.0,0,0.0,0.0,86.0,1.0,25506,0.0,192.168.0.128,1.0,2.666572e+09,0.0,80.0,24,1.0,0.0,1,59996.0,0.0,True,False,False,True,False,192
5408,192.168.0.170,6.374512e+17,22786,0.0,0,0.0,0,0.0,0.0,221.0,1.0,25507,0.0,192.168.0.128,1.0,6.861151e+08,0.0,80.0,24,1.0,0.0,25021,40842.0,0.0,True,False,False,True,False,192
88646,192.168.0.170,6.374513e+17,13581,0.0,0,0.0,0,0.0,0.0,0.0,103.0,25508,0.0,192.168.0.128,1.0,2.593838e+08,0.0,80.0,16,441.0,0.0,2,60636.0,0.0,False,False,False,False,False,192
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11010,192.168.0.170,6.374512e+17,7945,0.0,0,38.0,0,0.0,0.0,261.0,1.0,10818,0.0,192.168.0.128,1.0,3.088881e+09,0.0,80.0,24,1.0,0.0,9855,56766.0,0.0,False,True,False,True,False,192
59625,192.168.0.170,6.374513e+17,50847,0.0,0,37.0,0,0.0,0.0,260.0,1.0,902,0.0,192.168.0.128,1.0,1.489381e+09,0.0,80.0,24,1.0,0.0,10293,53538.0,0.0,False,True,False,True,False,192
31853,192.168.0.170,6.374512e+17,62285,0.0,0,37.0,0,0.0,0.0,260.0,1.0,157,0.0,192.168.0.128,1.0,1.918567e+09,0.0,80.0,24,1.0,0.0,4188,59480.0,0.0,False,True,False,True,False,192
16685,192.168.0.170,6.374512e+17,5610,0.0,0,37.0,0,0.0,0.0,260.0,1.0,3356,0.0,192.168.0.128,1.0,3.511579e+09,0.0,80.0,24,1.0,0.0,24893,44642.0,0.0,False,True,False,True,False,192


In [209]:
# 로지스틱 회귀를 사용하여 나머지 데이터를 분류하는 모델을 학습합니다.
logistic_regression_model = tf.keras.Sequential([
  layers.Dense(128, activation="relu"),
  layers.Dense(4, activation="softmax")
])

In [210]:
logistic_regression_model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
logistic_regression_model.fit(X_train, y_train, epochs=1)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [None]:
# 두 모델의 예측을 결합하여 최종 예측을 만듭니다.
predictions = rnn_model.predict(RNN_padding(tokenizer,RNN_X_test,max_length_of_http_data)) * 0.5 + logistic_regression_model.predict(logistic_X_test) * 0.5

# 모델의 성능을 평가합니다.
accuracy = tf.metrics.accuracy(labels=y_test, predictions=predictions)
print("Accuracy:", accuracy)