Imports

In [34]:
# Imports
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import math

# Tensorflow/keras imports
from keras import Sequential
from keras import layers
from keras import regularizers
from keras.optimizers import SGD
from keras.callbacks import LearningRateScheduler

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.utils import class_weight
from sklearn.compose import ColumnTransformer

Data Import and Preprocessing

In [41]:
# Data files
X_train = pd.read_csv("UNSW_NB15_training-set.csv")
X_test = pd.read_csv("UNSW_NB15_testing-set.csv")

# Create train and test dataframes, dropping the id and attack_cat columns from x
# and setting label as y
X_train = X_train.drop(columns=["id", "attack_cat"])
y_train = X_train.pop("label")
X_test = X_test.drop(columns=["id", "attack_cat"])
y_test = X_test.pop("label")

print(f"Samples in X_train: {len(X_train)}")
print(f"Samples in X_test: {len(X_test)}")

# Encode categorical features using one hot encoding
# Scale numerical features using standard scaler
cat_features = X_train.select_dtypes(exclude=[np.number]).columns.tolist()
num_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
print(f"Categorical features: {cat_features}")
print(f"Numerical features: {num_features}")

# One hot encoder used over label encoder since it's better for non-ordinal data
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", StandardScaler(), num_features),
        ("categorical", OneHotEncoder(handle_unknown="ignore"), cat_features)
    ]
)

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

print(f"Number of features after preprocessing: {X_train.shape[1]}")

Samples in X_train: 175341
Samples in X_test: 82332
Categorical features: ['proto', 'service', 'state']
Numerical features: ['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports']
Number of features after preprocessing: 194


In [42]:
# Use class weights to balance the classes for the training set
class_weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
class_weights = dict(zip(np.unique(y_train), class_weights))

Model structure and training

In [43]:
# Hyperparameters
epochs = 5
batch_size = 32
initial_lr = 0.01

# Function which will decrease the lr by 10% every n epochs
def lr_step_decay(epoch, lr):
    epochs_per_drop = 5
    return initial_lr * math.pow(0.9, math.floor(epoch/epochs_per_drop))

# Model

model = Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(194, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(12, activation='relu'),
    layers.Dense(12, activation='relu'),
    layers.Dense(1),
    layers.Activation("sigmoid")
])

model.summary()

In [44]:
# Compile model
model.compile(
    optimizer = SGD(learning_rate=initial_lr, momentum=0.9),
    loss = "binary_crossentropy",
    metrics = ["accuracy"]
)

# Train model with proper callback
model.fit(X_train, y_train, validation_split=0.2, batch_size=batch_size, epochs=epochs, class_weight=class_weights, verbose=1, callbacks=[LearningRateScheduler(lr_step_decay, verbose=1)])

# Evaluate on test data
print("\nTest data loss/accuracy:")
model.evaluate(X_test, y_test)


Epoch 1: LearningRateScheduler setting learning rate to 0.01.
Epoch 1/5
[1m4384/4384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 13ms/step - accuracy: 0.8804 - loss: 0.2453 - val_accuracy: 0.9855 - val_loss: 0.0396 - learning_rate: 0.0100

Epoch 2: LearningRateScheduler setting learning rate to 0.01.
Epoch 2/5
[1m4384/4384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 11ms/step - accuracy: 0.9092 - loss: 0.1687 - val_accuracy: 0.9756 - val_loss: 0.0513 - learning_rate: 0.0100

Epoch 3: LearningRateScheduler setting learning rate to 0.01.
Epoch 3/5
[1m4384/4384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 11ms/step - accuracy: 0.9097 - loss: 0.1655 - val_accuracy: 0.9884 - val_loss: 0.0374 - learning_rate: 0.0100

Epoch 4: LearningRateScheduler setting learning rate to 0.01.
Epoch 4/5
[1m4384/4384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 12ms/step - accuracy: 0.9128 - loss: 0.1606 - val_accuracy: 0.9677 - val_loss: 0.0551 - learning_rate:

[0.19758661091327667, 0.8975246548652649]

In [45]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter = 1000, class_weight=class_weights)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8356896467958995

In [46]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(class_weight=class_weights)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8722489433027255