In [1]:
#!/usr/bin/env python
# coding: utf-8

# Import packages

import pandas as pd
import numpy as np

import tensorflow as tf
from keras import Sequential
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report, accuracy_score

import matplotlib.pyplot as plt

import pickle

import seaborn as sns

# Global seaborn theme used by the figures drawn in this notebook.
sns.set(style='whitegrid', palette='muted', font_scale=1.5)

  from pandas import MultiIndex, Int64Index


In [2]:
def drop_infs(ds):
    """Remove every row of ``ds`` that holds an infinite or missing value.

    The frame is modified in place: +/-inf entries are first turned into NaN,
    then any row containing at least one NaN is dropped. The original index
    labels of the surviving rows are kept (the index is not reset).

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame to clean. It is mutated by this call.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after cleaning.
    """
    infinities = [np.inf, -np.inf]
    # Treat infinities as missing so one dropna() pass removes both kinds of rows.
    ds.replace(to_replace=infinities, value=np.nan, inplace=True)
    ds.dropna(axis=0, how='any', inplace=True)
    return ds

In [3]:
# Load the raw CSV, preview its first rows, then drop rows holding inf/NaN values.

dataset = pd.read_csv("test_dataset.csv")
print("Dataset:",
      dataset.head(),
      "____________________________________________ \n \n",
      sep="\n")

# drop_infs cleans the frame in place and hands the same object back.
dataset = drop_infs(dataset)

Dataset:
    Destination Port   Flow Duration   Total Fwd Packets  \
0              49188               4                   2   
1              49188               1                   2   
2              49188               1                   2   
3              49188               1                   2   
4              49486               3                   2   

    Total Backward Packets  Total Length of Fwd Packets  \
0                        0                           12   
1                        0                           12   
2                        0                           12   
3                        0                           12   
4                        0                           12   

    Total Length of Bwd Packets   Fwd Packet Length Max  \
0                             0                       6   
1                             0                       6   
2                             0                       6   
3                             0        

In [4]:
# (rows, columns) of the dataset after the inf/NaN rows were removed above.
dataset.shape

(2827876, 79)

In [5]:
# Keep an untouched copy of the textual class labels.
labels = dataset[' Label'].copy()

# Map each class name to an integer code; `le` is reused later to map back.
le = preprocessing.LabelEncoder()
labels_encoded = pd.Series(le.fit_transform(labels), name=' Label')

print("Data Labels:",
      labels.unique(),
      "____________________________________________",
      sep="\n")

print("Encoded Labels:",
      labels_encoded.unique(),
      "____________________________________________ \n \n",
      sep="\n")

Data Labels:
['BENIGN' 'DDoS' 'PortScan' 'Bot' 'Infiltration'
 'Web Attack � Brute Force' 'Web Attack � XSS'
 'Web Attack � Sql Injection' 'FTP-Patator' 'SSH-Patator' 'DoS slowloris'
 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye' 'Heartbleed']
____________________________________________
Encoded Labels:
[ 0  2 10  1  9 12 14 13  7 11  6  5  4  3  8]
____________________________________________ 
 



In [6]:
# Split the dataset into features (x) and target (y)
x = dataset.drop(columns=[' Label'])
y = labels_encoded

In [7]:
# Normalization: standardize every feature column.
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split further down, so held-out rows contribute to the scaling
# statistics.
scaler = preprocessing.StandardScaler()
x_scaled = scaler.fit(x).transform(x)

In [8]:
# Autoencoder input: the scaled features as a float32 tensor.
# Note that the name x_train is rebound later by train_test_split.
x_train = tf.cast(np.asarray(x_scaled, dtype=np.float32), tf.float32)

2022-12-18 08:54:53.743452: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# Symmetric dense autoencoder: n_features -> 32 -> 16 -> 8 -> 16 -> 32 -> n_features.
# The width is read from the scaled feature matrix instead of being hard-coded,
# so the model keeps matching the data if the column count changes.
n_features = x_scaled.shape[1]

autoencoder = Sequential()
autoencoder.add(Dense(32, activation='relu', input_shape=(n_features,)))
autoencoder.add(Dense(16, activation='relu'))
# Bottleneck layer; its name is used later to cut the encoder out of the model.
autoencoder.add(Dense(8, activation='linear', name="Compressed"))
autoencoder.add(Dense(16, activation='relu'))
autoencoder.add(Dense(32, activation='relu'))
# Linear output: the reconstruction targets are StandardScaler outputs, which
# include negative values and values above 1. A sigmoid output is confined to
# (0, 1) and therefore cannot reproduce them.
autoencoder.add(Dense(n_features, activation='linear'))
autoencoder.compile(loss='mean_squared_error', optimizer='adam')

In [10]:
# Fit the autoencoder to reproduce its own input; Keras holds out 20% of the
# samples for validation via validation_split.
print("Training Autoencoder...")
history = autoencoder.fit(
    x=x_train,
    y=x_train,
    epochs=10,
    batch_size=80,
    validation_split=0.2,
    verbose=1,
)
print("Training Complete.")

Training Autoencoder...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Complete.


In [11]:
# Encoder = the autoencoder truncated at its "Compressed" bottleneck layer.
encoder = Model(inputs=autoencoder.input,
                outputs=autoencoder.get_layer(name='Compressed').output)

encoder.compile(optimizer='adam', loss='mean_squared_error')

# Project every sample onto the bottleneck representation.
print("Creating Encoded Data...")
encoded_x = encoder.predict(x_train)
print("Encoding Complete.")

Creating Encoded Data...
Encoding Complete.


In [12]:
# Stratified 75/25 split of the encoded features; the fixed seed keeps the
# partition reproducible. This rebinds x_train to the encoded training rows.
x_train, x_test, y_train, y_test = train_test_split(
    encoded_x,
    y,
    stratify=y,
    random_state=12345,
    test_size=0.25,
)

In [17]:
# XGBoost classifier: 100 depth-1 trees, learning rate 0.2, evaluated with
# the multiclass error rate ('merror').
model_xgb = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=1,
    learning_rate=0.2,
    eval_metric='merror',
    use_label_encoder=False,
    verbosity=1,
)

In [None]:
# Repeated stratified 8-fold cross-validation on the training split.
# error_score='raise' makes a failing fit abort instead of being scored as NaN.
print("XGBoost Cross Validation:")
cv = RepeatedStratifiedKFold(n_splits=8, random_state=12345)
n_scores = cross_val_score(
    estimator=model_xgb,
    X=x_train,
    y=y_train,
    cv=cv,
    scoring='accuracy',
    error_score='raise',
)

XGBoost Cross Validation:


In [None]:
# Mean accuracy over all cross-validation folds, with its standard deviation.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))
print("____________________________________________ \n \n")

In [None]:
# Fit on the training split, then predict the held-out test split.
print("XGBoost Training....")
xgb_preds = model_xgb.fit(x_train, y_train).predict(x_test)
print("Training Complete.")

In [None]:
print("XGBoost Score:")
# score() only returns the accuracy. As a bare expression in the middle of a
# cell its value is discarded, so it has to be printed to appear in the output.
print(model_xgb.score(x_test, y_test))
print("____________________________________________ \n \n")

In [None]:
# First training: per-class metrics and overall accuracy on the test split.
print("Benign - Attack Classification")
print()
print("Classification Report: ")
print(classification_report(y_test, xgb_preds))

print()
print("Accuracy Score: ", accuracy_score(y_test, xgb_preds))
print("____________________________________________ \n \n")

In [None]:
# Class names ordered by encoded value: le.classes_[i] is the name of class i.
# (inverse_transform(y) would instead return one label per dataset row, which
# does not match the rows and columns of the confusion matrix.)
LABELS = le.classes_

# Pin the matrix to the full class list so it stays len(LABELS) x len(LABELS)
# and aligned with the tick labels even if a class is absent from y_test and
# from the predictions.
conf_matrix = confusion_matrix(y_test, xgb_preds, labels=np.arange(len(LABELS)))

plt.figure(figsize=(12, 12))
sns.heatmap(conf_matrix, xticklabels=LABELS,
            yticklabels=LABELS, annot=True, fmt="d")
plt.title("Benign - Attack Classification")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()