# Preprocessing Data

In [None]:
import typing
import pandas as pd
import numpy as np
from sklearn import preprocessing as skp

## First Formatting of the Data

In [None]:
# Load the training and testing partitions from their CSV files (paths are
# relative to the notebook's working directory).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/UNSW_NB15_training-set.csv",
        "data/UNSW_NB15_testing-set.csv",
    )
)

## Some Small Exploration of the Data

In [None]:
# Display pandas' summary statistics for the training frame (cell output).
train_data.describe()

In [None]:
# Print the training frame's column names, non-null counts and dtypes.
train_data.info()

## Removing Unneeded Columns

In [None]:
# Remove the "id" and "attack_cat" columns from both partitions; the "label"
# column is kept because it is extracted as the target later on.
train_data, test_data = (
    frame.drop(columns=["id", "attack_cat"])
    for frame in (train_data, test_data)
)

## Converting Data to Numerical Values

In [None]:
def encode_categoricals(
    data: pd.DataFrame,
    encoder: "skp.OrdinalEncoder | None" = None,
) -> "skp.OrdinalEncoder":
    """Encode the categorical (object-dtype) columns of ``data`` as integers, in place.

    Args:
        data: Frame whose categorical columns are replaced by their integer codes.
        encoder: An encoder previously returned by this function. When given, its
            fitted category -> integer mapping is reused instead of fitting a new
            one, so that several frames share the same encoding. When omitted, a
            new encoder is fitted on ``data``.

    Returns:
        The encoder that was used, so it can be passed to later calls.

    Categories that the encoder did not see while fitting are encoded as -1.
    """
    if encoder is None:
        categorical_cols = [c for c in data if data[c].dtype == "object"]
        encoder = skp.OrdinalEncoder(
            handle_unknown="use_encoded_value", unknown_value=-1
        )
        encoder.fit(data[categorical_cols])
    else:
        # Encode exactly the columns the encoder was fitted on, in the same order.
        categorical_cols = list(encoder.feature_names_in_)
    data[categorical_cols] = encoder.transform(data[categorical_cols])
    return encoder


# Fit the mapping on the training data only and reuse it for the test data;
# fitting a separate encoder per frame could assign different integers to the
# same category in train and test.
categorical_encoder = encode_categoricals(train_data)
encode_categoricals(test_data, encoder=categorical_encoder)

## Normalization

In [None]:
def byterize(x: typing.Collection[float]) -> typing.Collection[float]:
    "Return the number of bits in the nearest byte that can contain x"
    return 2**np.ceil(np.log2(x))

def min_max_scale(x: float|typing.Collection[float], min_val: float, max_val: float) -> float|typing.Collection[float]:
    "Scale x into the range [0, 1] uniformly where x = 1 means before scaling x = max_val, and x = 0 means before scaling x = min_val"
    return (x - min_val) / (max_val - min_val)

def normalize(train_data: pd.DataFrame, test_data: pd.DataFrame):
    """Scale every column of both frames into [0, 1] using training-set bounds only.

    For each column of ``train_data`` the lower bound is the floor of the training
    minimum and the upper bound is the training maximum rounded up by
    ``byterize``. Test values are clipped to these bounds on both sides before
    scaling, so values outside the training range cannot leave [0, 1].

    Both frames are modified in place; nothing is returned.
    """
    for col in train_data:
        col_max = train_data[col].max()
        min_val = np.floor(train_data[col].min())
        # byterize relies on log2, which is only defined for positive values.
        max_val = byterize(col_max) if col_max > 0 else np.ceil(col_max)
        if not max_val > min_val:
            # Constant column: widen the range so scaling gives 0 instead of 0/0.
            max_val = min_val + 1
        test_data[col] = test_data[col].clip(lower=min_val, upper=max_val)
        train_data[col] = min_max_scale(train_data[col], min_val, max_val)
        test_data[col] = min_max_scale(test_data[col], min_val, max_val)
        
# Rescale both frames in place; the bounds are derived from train_data only.
normalize(train_data, test_data)

## Final Formatting

In [None]:
def extract_X_Y(data: pd.DataFrame):
    """Split ``data`` into a feature matrix and a label vector.

    Returns:
        A tuple ``(X, Y)`` of numpy arrays, where ``Y`` holds the values of the
        ``label`` column and ``X`` holds all remaining columns. ``data`` itself
        is left unchanged.
    """
    labels = data.label
    features = data.drop(columns="label")
    return features.to_numpy(), labels.to_numpy()

# Turn both partitions into (features, labels) numpy array pairs.
(X_train, Y_train), (X_test, Y_test) = map(extract_X_Y, (train_data, test_data))

# Supervised Machine Learning

In [None]:
import tensorflow as tf

def create_model(sample_shape):
    """Build and compile a small fully connected binary classifier.

    The network has two ReLU hidden layers (100 and 50 units) followed by a
    single sigmoid output unit. It is compiled with Adam (learning rate 0.01),
    binary cross-entropy loss, and the binary accuracy and AUC metrics.

    Args:
        sample_shape: Shape of one input sample, passed to ``tf.keras.Input``.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    keras = tf.keras
    inputs = keras.Input(sample_shape)

    hidden = inputs
    for units in (100, 50):
        hidden = keras.layers.Dense(units, activation="relu")(hidden)
    outputs = keras.layers.Dense(1, activation="sigmoid")(hidden)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(
        loss="binary_crossentropy",
        optimizer=keras.optimizers.Adam(learning_rate=0.01),
        metrics=["binary_accuracy", "AUC"],
    )
    return model

# Build the network for the per-sample feature shape, train it for three
# epochs on the training split, then score it on the held-out test split.
model = create_model(sample_shape=X_train.shape[1:])
model.fit(X_train, Y_train, epochs=3, batch_size=128)
# evaluate() returns the loss followed by the compiled metrics, in order.
loss, acc, auc = model.evaluate(x=X_test, y=Y_test)
print(f"LOSS: {loss}, Accuracy: {acc:%}, AUC: {auc}")

# Unsupervised Machine Learning

In [None]:
from sklearn import cluster as skc
from sklearn import metrics as skm

# Fit a two-cluster K-Means on the training features; labels are not used here.
model = skc.KMeans(n_clusters=2, n_init='auto').fit(X_train)

# Cluster index (0 or 1) that most training samples with label == 1 fall into.
attack_idx = (Y_train == 1)
attack_cluster = np.round(np.mean(model.predict(X_train[attack_idx])))

In [None]:
def predict(model, X_train, Y_train, X_test, Y_test):
    """Map the cluster assignments of ``X_test`` onto 0/1 labels.

    The cluster that the majority of training samples with ``Y_train == 1`` are
    assigned to is treated as class 1. If that cluster has index 0, the test
    assignments are flipped (``1 - assignment``); otherwise they are returned
    as they are.

    Args:
        model: Fitted object exposing ``predict(X)`` that returns 0/1 cluster
            indices as a numpy array.
        X_train: Training samples.
        Y_train: Training labels, used only to locate the label-1 samples.
        X_test: Samples to predict labels for.
        Y_test: Not used by this function.

    Returns:
        Array of predicted labels for ``X_test``.
    """
    positive_samples = X_train[Y_train == 1]
    attack_cluster = model.predict(positive_samples).mean().round()
    test_clusters = model.predict(X_test)
    if attack_cluster == 0:
        return 1 - test_clusters
    return test_clusters

# Label the test samples using the fitted clustering model.
preds = predict(
    model=model,
    X_train=X_train,
    Y_train=Y_train,
    X_test=X_test,
    Y_test=Y_test,
)

In [None]:
# Score the clustering-based predictions against the test labels.
# NOTE(review): preds appear to be hard 0/1 assignments, while log_loss is
# defined on predicted probabilities - confirm the reported loss is meaningful.
loss, acc, mcc = (
    metric(Y_test, preds)
    for metric in (skm.log_loss, skm.accuracy_score, skm.matthews_corrcoef)
)
print(f"Loss: {loss}, Accuracy: {acc:%}, MCC: {mcc}")