# KDDCup99 10%Data Evaluation
- Downloads the KDDCup99 10% data set over the network and evaluates anomaly-detection performance on it.
- Running this notebook requires Python 3.6, TensorFlow 1.x, pandas, NumPy, and scikit-learn.

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Change into the project folder on the mounted Drive.
# NOTE(review): hard-coded, user-specific path; presumably this is where the
# local `dagmm` package lives -- adjust it to your own Drive layout.
%cd /content/drive/MyDrive/빅분기 과제/DAGMM

/content/drive/MyDrive/빅분기 과제/DAGMM


In [7]:
%tensorflow_version 1.x
import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

from dagmm import DAGMM

## Data Import

In [8]:
# Root location of the KDDCup99 files.
url_base = "http://kdd.ics.uci.edu/databases/kddcup99"

# Column metadata file (column names and column types).
url_info = f"{url_base}/kddcup.names"

# Gzip-compressed 10% subset of the KDDCup99 data.
url_data = f"{url_base}/kddcup.data_10_percent.gz"

In [9]:
# Column metadata: skip the first line of the names file and split every
# remaining line on ":" into a column name and its declared type.
df_info = pd.read_csv(
    url_info,
    sep=":",
    skiprows=1,
    index_col=False,
    names=["colname", "type"],
)

# Feature names followed by the label column, appended here as "status".
colnames = np.append(df_info["colname"].values, ["status"])

# Columns whose declared type mentions "continuous" are parsed as floats;
# every other column, and the appended label column, is kept as a string.
coltypes = np.append(
    np.where(df_info["type"].str.contains("continuous"), "float", "str"),
    ["str"],
)

# Load the data file itself using the names and dtypes derived above.
df = pd.read_csv(
    url_data,
    names=colnames,
    index_col=False,
    dtype=dict(zip(colnames, coltypes)),
)

In [10]:
# One-hot encode the string-typed feature columns. The last column ("status")
# is the label and is excluded from the feature matrix.
# dtype=float keeps the matrix purely numeric: pandas >= 2.0 creates bool
# dummy columns by default, and mixing those with the float features would
# turn `.values` into an object array. On older pandas the result is the same
# float64 matrix as before.
X = pd.get_dummies(df.iloc[:, :-1], dtype=float).values

# Create the target flag.
# Records whose status is "normal." are labelled 1 (treated as the anomaly
# class in this evaluation); every other status is labelled 0.
y = np.where(df.status == "normal.", 1, 0)

In [11]:
# Hold out half of the records for evaluation; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.50, random_state=123
)

# Keep only label-0 (non-anomaly) records for fitting. The features are
# filtered first so the mask is still built from the unfiltered y_train.
X_train = X_train[y_train == 0]
y_train = y_train[y_train == 0]

## Fit Data to DAGMM Model
The following settings differ from the original paper:
- $\lambda_2$ is set to 0.0001 (paper: 0.005).
- A small value ($10^{-6}$) is added to the diagonal elements of the GMM covariance (paper: no additional value).

The input data is standardized with a standard scaler (the default behavior of this DAGMM implementation).

In [12]:
# DAGMM hyper-parameters, one per line so they are easy to tune.
# NOTE(review): comp_* presumably configure the compression network and
# est_* the estimation network -- confirm against the dagmm package.
model = DAGMM(
    comp_hiddens=[60, 30, 10, 1],
    comp_activation=tf.nn.tanh,
    est_hiddens=[10, 4],
    est_dropout_ratio=0.5,
    est_activation=tf.nn.tanh,
    learning_rate=0.0001,
    epoch_size=200,
    minibatch_size=1024,
    random_seed=1111,
)

In [13]:
# Fit the model on the training split, which contains label-0 records only.
model.fit(X_train)




Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.dropout instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



 epoch 100/200 : loss = 80.538
 epoch 200/200 : loss = 72.653





## Apply model to test data

In [21]:
# Score the held-out half of the data. The following cells treat the
# returned values as per-sample energies (higher = more anomalous).
y_pred = model.predict(X_test)

In [22]:
# Energy threshold for anomaly detection = 80th percentile of the test
# energies, so the top 20% of samples by energy will be flagged as anomalies.
anomaly_energy_threshold = np.percentile(y_pred, 80)
print(f"Energy thleshold to detect anomaly : {anomaly_energy_threshold:.3f}")

Energy thleshold to detect anomaly : 6.668


In [23]:
# Flag test samples whose energy reaches the threshold: 1 = anomaly, 0 = not.
# The boolean mask is cast to integers to produce the 0/1 flags.
y_pred_flag = np.asarray(y_pred >= anomaly_energy_threshold).astype(int)

In [24]:
# Compare the predicted flags with the true labels. With average="binary"
# a single precision/recall/F1 triple is reported (presumably for label 1,
# scikit-learn's default positive class -- confirm); the fourth return
# value is discarded.
prec, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred_flag, average="binary")
print(f" Precision = {prec:.3f}")
print(f" Recall    = {recall:.3f}")
print(f" F1-Score  = {fscore:.3f}")

 Precision = 0.932
 Recall    = 0.942
 F1-Score  = 0.937
