# Dimensionality reduction with a Linear Autoencoder
In this notebook, we show how to use a Linear Autoencoder to visualise network flow samples in 1-, 2- and 3-dimensional spaces. To do so, we reduce the dimensionality of the data points from 21 features (see the flow representation below) to 1, 2 and 3 respectively.
We do the same using PCA and we compare the results.

We will use a dataset of benign traffic and various DDoS attacks taken from the CIC-DDoS2019 dataset (https://www.unb.ca/cic/datasets/ddos-2019.html).
The network traffic has been pre-processed so that packets are grouped into bi-directional traffic flows using the 5-tuple (source IP, destination IP, source port, destination port, protocol). Each flow is represented with 21 packet-header features computed from at most 1000 packets:

| Feature nr.         | Feature Name |
|---------------------|---------------------|
| 00 | timestamp (mean IAT) | 
| 01 | packet_length (mean)| 
| 02 | IP_flags_df (sum) |
| 03 | IP_flags_mf (sum) |
| 04 | IP_flags_rb (sum) | 
| 05 | IP_frag_off (sum) |
| 06 | protocols (mean) |
| 07 | TCP_length (mean) |
| 08 | TCP_flags_ack (sum) |
| 09 | TCP_flags_cwr (sum) |
| 10 | TCP_flags_ece (sum) |
| 11 | TCP_flags_fin (sum) |
| 12 | TCP_flags_push (sum) |
| 13 | TCP_flags_res (sum) |
| 14 | TCP_flags_reset (sum) |
| 15 | TCP_flags_syn (sum) |
| 16 | TCP_flags_urg (sum) |
| 17 | TCP_window_size (mean) |
| 18 | UDP_length (mean) |
| 19 | ICMP_type (mean) |
| 20 | Packets (counter)|

In [None]:
# Author: Roberto Doriguzzi-Corin
# Project: Course on Network Intrusion and Anomaly Detection with Machine Learning
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from util_functions import *
# Folder holding the pre-processed flow files used throughout the notebook.
DATASET_FOLDER = "./DOS2019_4_ATTACKS"

# Human-readable class names; plot_components indexes this list with the
# integer class label to build the legend entries.
target_names = ['dns', 'syn', 'tftp', 'webddos']
CLASSES = len(target_names)

# load_dataset is not defined in this notebook (presumably it comes from the
# util_functions star import); it receives a glob pattern for the training files.
data, labels = load_dataset(f"{DATASET_FOLDER}/*-train.hdf5")

In [None]:
def plot_components(data_1d, data_2d, data_3d, axis_prefix='PCA'):
    """Show the 3D, 2D and 1D projections side by side, one colour per class.

    Class membership is read from the notebook-level ``labels`` array, and the
    legend names / number of classes from ``target_names`` and ``CLASSES``.

    Parameters
    ----------
    data_1d : array-like
        One-component projection; indexed as ``data_1d[mask]``.
    data_2d : array-like
        Two-component projection; columns 0 and 1 are plotted.
    data_3d : array-like
        Three-component projection; columns 0, 1 and 2 are plotted.
    axis_prefix : str, optional
        Prefix for the axis labels (``f'{axis_prefix}1'`` etc.). Defaults to
        ``'PCA'``, which reproduces the original labels; pass another prefix
        when the projections do not come from PCA.

    Notes
    -----
    Assumes ``labels`` supports element-wise comparison (``labels == k``
    yielding a boolean mask aligned with the rows of each projection).
    """
    # One colour per class. zip() below stops at the shorter sequence, so at
    # most len(colors) classes are drawn.
    colors = ['blue', 'orange', 'yellow', 'red', 'black']
    class_styles = list(zip(range(CLASSES), colors))

    fig = plt.figure(figsize=(18, 6))

    # Left panel: the 3-component projection.
    ax = fig.add_subplot(131, projection='3d')
    for label, color in class_styles:
        indices = labels == label
        ax.scatter(data_3d[indices, 0], data_3d[indices, 1], data_3d[indices, 2],
                   label=f'{target_names[label]}', marker='o', color=color)
    ax.set_title('3D Projection with Labels')
    ax.set_xlabel(f'{axis_prefix}1')
    ax.set_ylabel(f'{axis_prefix}2')
    ax.set_zlabel(f'{axis_prefix}3')
    ax.legend()
    # Fixed camera angle so successive figures are directly comparable.
    ax.view_init(elev=30, azim=0)

    # Middle panel: the 2-component projection.
    ax = fig.add_subplot(132)
    for label, color in class_styles:
        indices = labels == label
        ax.scatter(data_2d[indices, 0], data_2d[indices, 1],
                   label=f'{target_names[label]}', marker='o', color=color)
    ax.set_title('2D Projection with Labels')
    ax.set_xlabel(f'{axis_prefix}1')
    ax.set_ylabel(f'{axis_prefix}2')
    ax.legend()

    # Right panel: the 1-component projection. Each class is drawn on its own
    # horizontal line (y = class index) so the classes do not overlap.
    ax = fig.add_subplot(133)
    for label, color in class_styles:
        indices = labels == label
        ax.scatter(data_1d[indices], np.zeros_like(data_1d[indices]) + label,
                   label=f'{target_names[label]}', marker='o', color=color)
    ax.set_title('1D Projection with Labels')
    ax.set_xlabel(f'{axis_prefix}1')
    # The y position only separates the classes, so hide its ticks.
    ax.set_yticks([])
    ax.legend()

    plt.tight_layout()
    plt.show()

# Dimensionality reduction with PCA

In [None]:
# One independent PCA model per target dimensionality (3, 2 and 1 components).
# Every model is fitted on the original `data`; no fit reuses another's output.
pca_3d, pca_2d, pca_1d = (PCA(n_components=n_dims) for n_dims in (3, 2, 1))

# Fit each model and project the samples onto its principal components.
pca_data_3d = pca_3d.fit_transform(data)
pca_data_2d = pca_2d.fit_transform(data)
pca_data_1d = pca_1d.fit_transform(data)

# Dimensionality reduction with Linear Autoencoder

| <img src="./autoencoder_reduction.png" width="80%">  |
|--|
| Reduction to 1D, 2D and 3D data representations.|

In [None]:
import numpy as np
from keras.layers import Input, Dense
from keras.models import Model

def train_linear_autoencoder(samples, latent_dim, epochs=100, batch_size=32):
    """Train a single-bottleneck linear autoencoder and encode ``samples`` with it.

    The network is input -> Dense(latent_dim, linear) -> Dense(n_features, linear),
    trained to reconstruct its own input with Adam and mean squared error.

    Parameters
    ----------
    samples : 2-D array
        Training data; ``samples.shape[1]`` sets the input and output width.
    latent_dim : int
        Number of units in the bottleneck, i.e. the reduced dimensionality.
    epochs : int, optional
        Number of training epochs (default 100).
    batch_size : int, optional
        Mini-batch size (default 32).

    Returns
    -------
    tuple
        ``(encoded, encoder, autoencoder)``: the output of
        ``encoder.predict(samples)``, the encoder model (input -> bottleneck)
        and the trained full autoencoder.
    """
    n_features = samples.shape[1]

    input_layer = Input(shape=(n_features,))
    encoded_layer = Dense(latent_dim, activation='linear')(input_layer)
    decoded_layer = Dense(n_features, activation='linear')(encoded_layer)

    autoencoder = Model(input_layer, decoded_layer)
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None").
    autoencoder.summary()

    autoencoder.compile(optimizer='adam', loss='mean_squared_error')
    # Input and target are the same array: the network learns to reconstruct it.
    autoencoder.fit(samples, samples, epochs=epochs, batch_size=batch_size, shuffle=True)

    # The encoder shares the trained layers, so it needs no further training.
    encoder = Model(input_layer, encoded_layer)
    return encoder.predict(samples), encoder, autoencoder


# Train one autoencoder per target dimensionality.
encoded_data_1d, encoder_1d, autoencoder_1d = train_linear_autoencoder(data, 1)
encoded_data_2d, encoder_2d, autoencoder_2d = train_linear_autoencoder(data, 2)
encoded_data_3d, encoder_3d, autoencoder_3d = train_linear_autoencoder(data, 3)

# Keep the generic names bound to the most recently trained (3D) models.
encoder = encoder_3d
autoencoder = autoencoder_3d

# Plot components

In [None]:
# One figure per reduction method: PCA first, then the linear autoencoder.
for projections in (
    (pca_data_1d, pca_data_2d, pca_data_3d),
    (encoded_data_1d, encoded_data_2d, encoded_data_3d),
):
    plot_components(*projections)