# Decision Tree (DT) for multiclass DDoS attack detection. 
In this lab we train a DT with benign network traffic and four classes of DDoS attacks from the CIC-DDoS2019 dataset of the University of New Brunswick. The network traffic has been pre-processed so that packets are grouped into bi-directional traffic flows using the 5-tuple (source IP, destination IP, source port, destination port, protocol). Each flow is represented by 21 packet-header features computed from a maximum of 1000 packets:

| Feature nr.         | Feature Name |
|---------------------|---------------------|
| 00 | timestamp (mean IAT) | 
| 01 | packet_length (mean)| 
| 02 | IP_flags_df (sum) |
| 03 | IP_flags_mf (sum) |
| 04 | IP_flags_rb (sum) | 
| 05 | IP_frag_off (sum) |
| 06 | protocols (mean) |
| 07 | TCP_length (mean) |
| 08 | TCP_flags_ack (sum) |
| 09 | TCP_flags_cwr (sum) |
| 10 | TCP_flags_ece (sum) |
| 11 | TCP_flags_fin (sum) |
| 12 | TCP_flags_push (sum) |
| 13 | TCP_flags_res (sum) |
| 14 | TCP_flags_reset (sum) |
| 15 | TCP_flags_syn (sum) |
| 16 | TCP_flags_urg (sum) |
| 17 | TCP_window_size (mean) |
| 18 | UDP_length (mean) |
| 19 | ICMP_type (mean) |
| 20 | Packets (counter)|

In [None]:
# Author: Roberto Doriguzzi-Corin
# Project: Course on Network Intrusion and Anomaly Detection with Deep Learning
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from IPython.display import Image, display
from util_functions import *

# Path prefix of the exported tree files and folder containing the dataset.
OUTPUT_FILE = "./ddos_tree"
DATASET_FOLDER = "./DOS2019"

# Names used when drawing the tree and when reporting the results.
feature_names = get_feature_names(flatten=True)
# IMPORTANT: when adding new classes, maintain the alphabetical order
target_names = ['benign', 'dns',  'syn', 'udplag', 'webddos']
# We use this list to match class names with the class numbers returned by the DT
target_names_full = [
    'benign', 'dns', 'ldap', 'mssql', 'netbios', 'ntp', 'portmap',
    'snmp', 'ssdp', 'syn', 'tftp', 'udp', 'udplag', 'webddos',
]

# Training split: the "*-train.hdf5" pattern is handed to load_dataset as-is.
X_train, y_train = load_dataset(f"{DATASET_FOLDER}/*-train.hdf5")

X = X_train
# Column index of each entry equal to 1, i.e. from one-hot encoding to class numbers
y = np.where(y_train == 1)[1]

# max_depth bounds the tree; other stopping parameters are
# min_samples_split and min_samples_leaf
tree_clf = DecisionTreeClassifier(max_depth=3)
tree_clf.fit(X, y)

In [None]:
# Write the trained tree to "<OUTPUT_FILE>.dot" in Graphviz format.
export_graphviz(
    tree_clf,
    out_file=f"{OUTPUT_FILE}.dot",
    # Labels shown inside the nodes: feature used for the split and class name.
    # NOTE(review): class_names are presumably matched to the classifier's
    # classes in ascending order, hence the alphabetical order of target_names.
    feature_names=feature_names,
    class_names=target_names,
    # Cosmetics: colored nodes with rounded corners.
    filled=True,
    rounded=True,
)

In [None]:
# Convert the "dot" file into a PNG image with the Graphviz "dot" tool,
# then show the image in the notebook.
import subprocess

# The arguments are passed as a list (no shell involved), so the paths are
# never re-interpreted by a shell. check=True raises CalledProcessError when
# "dot" exits with an error, instead of silently continuing and failing
# later while opening a missing/stale PNG file.
subprocess.run(
    ["dot", "-Tpng", OUTPUT_FILE + ".dot", "-o", OUTPUT_FILE + ".png"],
    check=True,
)
display(Image(filename=OUTPUT_FILE + ".png"))

## Prediction
Now we use the trained decision tree to make predictions on unseen data (the test set).

In [None]:
# Test split: the "*-test.hdf5" pattern is handed to load_dataset as-is.
X_test, y_test = load_dataset(f"{DATASET_FOLDER}/*-test.hdf5")
y_test = np.where(y_test == 1)[1]  # from one-hot encoding to class numbers

y_pred = tree_clf.predict(X_test)

# One line per test sample; wrong predictions are flagged with the class names.
for sample_idx, predicted in enumerate(y_pred):
    expected = y_test[sample_idx]
    if predicted == expected:
        outcome = ""
    else:
        outcome = (
            f" <-- Mistake!!! Predicted {target_names_full[predicted]}"
            f" instead of {target_names_full[expected]}"
        )
    # "!s" forces str(), which is what the labels are printed with.
    print(f"Test sample {sample_idx} - Pred: {predicted!s} Label: {expected!s}{outcome}")


# Debugging the wrong predictions
Decision Trees are intuitive, and their decisions are easy to interpret. Such models are often called *white box models*. In contrast, as we will see, Random Forests or neural networks are generally considered black box models.
Let's check where the DT went wrong!

In [None]:
# Ask NumPy to render floats with two decimals.
# NOTE(review): print options apply to array printing; the scalar values
# printed below may not be affected -- confirm the intended output format.
float_formatter = "{:.2f}".format
np.set_printoptions(formatter={'float_kind': float_formatter})

# Dump every feature of each misclassified test sample.
for sample_idx, predicted in enumerate(y_pred):
    expected = y_test[sample_idx]
    if predicted == expected:
        continue  # correct prediction, nothing to inspect

    print(
        f"Wrong sample n. {sample_idx} (Predicted {target_names_full[predicted]}"
        f" instead of {target_names_full[expected]})"
    )
    for feature_idx, feature_name in enumerate(feature_names):
        # "!s" forces str() so the value is rendered exactly as before.
        print(feature_name + f": {X_test[sample_idx][feature_idx]!s}")
    print("")

# Accuracy metrics
Now we print the confusion matrix and the other accuracy metrics for this multiclass problem.

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
np.set_printoptions(precision=2)

# Plot the confusion matrix twice: first with raw counts, then normalized
# over the true labels (the "true" option of the normalize parameter).
titles_options = [
    ("Confusion matrix, without normalization", None),
    ("Normalized confusion matrix", "true"),
]
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        tree_clf,
        X_test,
        y_test,
        # Pin the rows/columns of the matrix to the classes the tree was
        # trained on, so that display_labels stays aligned with them even
        # if one of those classes does not occur in the test set.
        labels=tree_clf.classes_,
        display_labels=target_names,
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)

    # Also print the matrix values as text
    print(title)
    print(disp.confusion_matrix)

plt.show()

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1-score on the test set.
# The classes learned by the tree are passed explicitly as labels so that
# each row is paired with the right entry of target_names even when a
# class does not occur in the test set (otherwise the number of labels
# found in the data may not match the number of names).
print(
    classification_report(
        y_test,
        y_pred,
        labels=tree_clf.classes_,
        target_names=target_names,
    )
)