# 概要

Character-LevelのCNNでWAFを作る。  
論文の内容をKerasで実装してみる。  
原著論文はこちら http://iyatomi-lab.info/sites/default/files/user/CSPA2018%20Proceedings_ito.pdf 


## データの準備

inputはURL decode -> Unicode encodeしたもの

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from tensorflow.keras.metrics import Precision, Recall
import tensorflow as tf
from tensorflow.keras.callbacks import TensorBoard
import keras
from keras.models import Model, load_model
from keras.layers import Input, Embedding, Dense, Dropout, Flatten, Conv1D, MaxPool1D, Add, Reshape, normalization, Concatenate, merge, GlobalMaxPooling1D
from keras.utils import plot_model, to_categorical, np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras import optimizers

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Best-effort request for TensorFlow 2.x: the magic below is Colab-specific,
# so an exception raised while running it is deliberately ignored.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [None]:
# Load the TensorBoard notebook extension (IPython magic; notebook-only).
%load_ext tensorboard

In [None]:
# Allow up to 1024 characters per column when displaying DataFrames
# (presumably so that long URLs are not truncated in the output).
pd.set_option("display.max_colwidth", 1024)

In [None]:
# Change into the project directory; the relative dataset and model paths
# used later in this notebook are resolved against it.
%cd /content/drive/MyDrive/WAffle/

In [None]:
# Load the raw dataset CSV.
df = pd.read_csv('Dataset/cisc_database/csic_database.csv')

In [None]:
# Rename the 'Unnamed: 0' column (presumably a header-less first column in
# the CSV) to 'Target'; it is used as the label source below.
df = df.rename(columns={'Unnamed: 0':'Target'})

In [None]:
# Preview the first rows.
df.head()

### 前処理

目標: 'Target', 'URL'のみのDataFrameにする。
DataFrameをtraining, validation, testに7.5:1.5:1.0で分割し、URLをInput、Targetをlabelとなるようにしていく。

In [None]:
# Keep only the label and URL columns. Take an explicit copy so that the
# column assignments in the following cells modify an independent DataFrame
# rather than a slice of the original (avoids pandas' SettingWithCopyWarning).
df = df[['Target', 'URL']].copy()
df.tail()

In [None]:
# Binarise the Target column: 'Normal' becomes 0, every other value becomes 1.
# The resulting single 0/1 label is trained with the binary_crossentropy loss.
df['Target'] = df['Target'].map(lambda label: int(str(label) != 'Normal'))
df.head()

In [None]:
# Drop the last 8 characters of every URL (the trailing HTTP-version text).
df['URL'] = df['URL'].str.slice(stop=-8)
df.head()

In [None]:
# Training : Validation : Test = 7.5 : 1.5 : 1.0

train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# First split: train is now 75% of the entire data set. The remaining 25% is
# held temporarily in X_test / y_test and is divided again below.
X_train, X_test, y_train, y_test = train_test_split(df['URL'], df['Target'], test_size=1 - train_ratio, random_state=42)

# Second split of the held-out 25%: test_size = 0.10 / (0.10 + 0.15) = 0.4, so
# test is now 10% of the initial data set
# validation is now 15% of the initial data set
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state=42) 

print(X_train, X_val, X_test)

In [None]:
# Report the shape of every split (inputs and labels).
for _label, _split in (
    ('X_train shape', X_train),
    ('y_train shape', y_train),
    ('X_val shape', X_val),
    ('y_val shape', y_val),
    ('X_test shape', X_test),
    ('y_test shape', y_test),
):
    print(_label, _split.shape)

In [None]:
# Join the held-out URLs and their labels column-wise into one DataFrame.
test_data = pd.DataFrame(pd.concat([X_test, y_test], axis=1))
test_data.head()

In [None]:
# Write the test split to a CSV file (without the index column).
test_data.to_csv('Dataset/cisc_database/test_data.csv', index=False)

In [None]:
# URL-decoding helper
import urllib.parse
def url_decode(encoded_URL):
  """Return ``encoded_URL`` with its percent-escapes (``%xx``) decoded."""
  decoded = urllib.parse.unquote(encoded_URL)
  return decoded

In [None]:
def load_data(urls, max_length=1000, vocab_size=0xffff):
    """Encode URLs as fixed-length rows of Unicode code points.

    Each URL is lower-cased, percent-decoded and stripped of surrounding
    whitespace. Its characters are then converted to code points, truncated
    to ``max_length`` and right-padded with zeros.

    Args:
        urls: Iterable of URL strings (e.g. a pandas Series).
        max_length: Number of columns of the returned array.
        vocab_size: Number of distinct ids the consumer accepts. The default
            matches the ``Embedding(0xffff, ...)`` layer of the model in this
            notebook. Code points at or above ``vocab_size`` are clamped to
            ``vocab_size - 1`` so that every id is a valid embedding index.

    Returns:
        Integer ``numpy.ndarray`` of shape ``(len(urls), max_length)``. The
        shape is kept two-dimensional even when ``urls`` is empty.
    """
    urls = list(urls)
    # Pre-allocating zeros provides the padding for free and guarantees a
    # (0, max_length) result for empty input.
    encoded = np.zeros((len(urls), max_length), dtype=int)
    out_of_vocab_id = vocab_size - 1

    for row, url in zip(encoded, urls):
        text = urllib.parse.unquote(url.lower()).strip()
        code_points = [min(ord(ch), out_of_vocab_id) for ch in text[:max_length]]
        # `row` is a view into `encoded`, so this writes in place.
        row[:len(code_points)] = code_points

    return encoded

In [None]:
# Encode the three URL splits, in order, with load_data's default length.
# Note that test_data is rebound here from the DataFrame saved earlier.
train_data, val_data, test_data = (
    load_data(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Show the encoded arrays, one per line group.
print(train_data, val_data, test_data, sep='\n')

In [None]:
def create_label(labels):
  """Collect the items of ``labels`` into a NumPy array, preserving order."""
  return np.array(list(labels))

In [None]:
# Convert the label splits, in order, into NumPy arrays.
train_classes, val_classes, test_classes = (
    create_label(split) for split in (y_train, y_val, y_test)
)

In [None]:
# Show the label arrays.
print(train_classes, val_classes, test_classes, sep='\n')

## アーキテクチャ

1. Input(1000 characters)
2. Embedding(128 dimensions)
3. Conv(kernelsize = K, filter_num = 64) -> RELU
4. Max Pooling(kernelsize = K)
5. Conv(kernelsize = K, filter_num = 64) -> RELU
6. Max Pooling(kernel_size = size of Conv 5's output)
7. concat output & reshape into 256 length vector
8. FCN(64 units) -> RELU
9. Batch normalize
10. Dropout(0.5)
11. FCN(1 unit) -> Sigmoid

(loss: binary_crossentropy)

In [None]:
def create_model(input_max_size, embedding_size, kernel_sizes, dropout):
  """Build the (uncompiled) character-level CNN.

  Args:
    input_max_size: Number of character ids per input sample.
    embedding_size: Dimension of the character embedding.
    kernel_sizes: Iterable of convolution kernel sizes; one parallel
      Conv -> MaxPool -> Conv -> GlobalMaxPool branch is built per entry.
    dropout: Dropout rate applied before the output layer.

  Returns:
    A Keras ``Model`` mapping ``(input_max_size,)`` integer inputs to a single
    sigmoid output.

  Raises:
    ValueError: If ``kernel_sizes`` is empty.
  """
  kernel_sizes = list(kernel_sizes)
  if not kernel_sizes:
    raise ValueError('kernel_sizes must contain at least one kernel size')

  # Number of feature maps produced by every convolution.
  filters = 64

  # Input Layer
  # Each sample is a URL that was URL-decoded, converted to Unicode code
  # points and stored in a numpy.ndarray.
  inputs = Input(shape=(input_max_size,), name='URL_input')

  # Embedding Layer (accepts ids in the range [0, 0xffff))
  x = Embedding(0xffff, embedding_size, name='Embedding')(inputs)
  x = Reshape((input_max_size, embedding_size), name='Reshape_into_128_legnth_vector')(x)

  # Convolution Layers: one branch per kernel size
  convolution_output = []

  for kernel_size in kernel_sizes:
    conv1 = Conv1D(filters, kernel_size, activation='relu', padding='same', strides=1)(x)
    pool1 = MaxPool1D(pool_size=kernel_size, padding='same', strides=1)(conv1)
    conv2 = Conv1D(filters, kernel_size, activation='relu', padding='same', strides=1)(pool1)
    pool2 = GlobalMaxPooling1D()(conv2)
    convolution_output.append(pool2)

  # concat output (a single branch needs no concatenation)
  if len(convolution_output) == 1:
    x = convolution_output[0]
  else:
    x = Concatenate(name='Concat_the_outputs')(convolution_output)

  # Flatten the branch outputs into one vector. Its length is
  # filters * number of branches (256 for four kernel sizes); deriving it
  # keeps the model buildable for any number of kernel sizes. The layer name
  # is left unchanged on purpose.
  x = Reshape((filters * len(kernel_sizes), ), name='Reshape_into_256_length_vector')(x)

  # Fully Connected Layers
  x = Dense(64, activation='relu', name='FullyConnectedLayer')(x)

  # Batch Normalization
  x = normalization.BatchNormalization()(x)

  # DropOut
  x = Dropout(dropout)(x)

  # Output layer: one sigmoid unit
  predictions = Dense(1, activation='sigmoid', name='Prediction')(x)

  model = Model(inputs=inputs, outputs=predictions, name='Character-level_CNN')

  return model

In [None]:
# config
learning_rate = 0.001  # learning rate handed to the Adam optimizer in train()
epochs = 200

input_max_size = 1000  # characters per URL; equals load_data's default max_length
embedding_size = 128
kernel_sizes = [4,5,6,7]  # one convolution branch is built per kernel size
dropout = 0.5

loss = 'binary_crossentropy'
# NOTE(review): train() constructs its own Adam optimizer and does not use
# this string.
optimizer = 'adam'

batch_size = 128

# train() appends '.h5' and '_weight.h5' to this prefix.
model_filepath = 'model_dir/model'
# NOTE(review): absolute path (leading '/'), unlike the relative
# model_filepath above - confirm this is intended.
checkpoint_filepath = '/logs/checkpoint'

In [None]:
def train(learning_rate, input_max_size, embedding_size, kernel_sizes, dropout, loss, optimizer, train_data, train_classes, epochs, batch_size, val_data, val_classes, model_filepath):
    """Build, train and save the character-level CNN.

    Args:
        learning_rate: Initial Adam learning rate. It is decayed linearly to
            1% of this value over ``epochs`` epochs.
        input_max_size, embedding_size, kernel_sizes, dropout: Forwarded to
            ``create_model``.
        loss: Loss passed to ``model.compile``.
        optimizer: Accepted for interface compatibility but not used; an Adam
            optimizer is always constructed from ``learning_rate``.
        train_data, train_classes: Training inputs and labels.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        val_data, val_classes: Validation inputs and labels.
        model_filepath: Path prefix. The full model is written to
            ``model_filepath + '.h5'`` and the weights to
            ``model_filepath + '_weight.h5'``.

    Side effects:
        Writes TensorBoard logs to ``logs``, best-``val_accuracy`` weight
        checkpoints to the module-level ``checkpoint_filepath``, and the two
        files described above.
    """
    import os

    # Decay the learning rate linearly, one value per epoch.
    start = learning_rate
    stop = learning_rate * 0.01
    learning_rates = np.linspace(start, stop, epochs)

    def scheduled_learning_rate(epoch, current_lr=None):
        # Keras passes (epoch, lr); only the epoch index is needed here.
        return float(learning_rates[epoch])

    lr_callback = tf.keras.callbacks.LearningRateScheduler(scheduled_learning_rate)

    # Build and compile the model.
    model = create_model(input_max_size, embedding_size, kernel_sizes, dropout)
    adam = optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss=loss,
                  optimizer=adam,
                  metrics=['accuracy', Precision(), Recall(), 'binary_accuracy', 'categorical_accuracy'])

    tensorboard_callback = TensorBoard(log_dir="logs", histogram_freq=1)
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True
        )

    # Create the output directory up front so that a missing directory cannot
    # make the final save fail after the whole training run.
    output_dir = os.path.dirname(model_filepath)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Train.
    model.fit(train_data, train_classes,
              epochs=epochs,
              batch_size=batch_size,
              verbose=2,
              validation_data=(val_data, val_classes),
              callbacks=[model_checkpoint_callback, tensorboard_callback, lr_callback]
              )

    model.save(model_filepath + '.h5')
    model.save_weights(model_filepath + '_weight.h5')

In [None]:
# Run training with the configuration defined above.
train(
    learning_rate=learning_rate,
    input_max_size=input_max_size,
    embedding_size=embedding_size,
    kernel_sizes=kernel_sizes,
    dropout=dropout,
    loss=loss,
    optimizer=optimizer,
    train_data=train_data,
    train_classes=train_classes,
    epochs=epochs,
    batch_size=batch_size,
    val_data=val_data,
    val_classes=val_classes,
    model_filepath=model_filepath,
)

In [None]:
# Reload the full model from '<model_filepath>.h5'.
model_filepath = 'model_dir/model'
model = load_model('{}.h5'.format(model_filepath))

In [None]:
# model.evaluate returns the loss followed by one value per compiled metric.
# The model is compiled with five metrics (accuracy, precision, recall,
# binary_accuracy, categorical_accuracy), so the result holds six values.
# Only the first four are reported; slice them out rather than unpacking the
# whole list, which would raise "too many values to unpack".
evaluation = model.evaluate(test_data, test_classes, verbose=2)
test_loss, test_accuracy, test_precision, test_recall = evaluation[:4]

print("test_loss: ", test_loss)
print("test_accuracy: ", test_accuracy)
print("test_precision: ", test_precision)
print("test_recall: ", test_recall)