In [4]:
TRAIN_DATA_FILE = "UNSW_NB15_training-set.csv"
TEST_DATA_FILE = "UNSW_NB15_testing-set.csv"

In [11]:
import CS537_Project_Helper as helper

import math
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from category_encoders import BinaryEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

from river import stream, compose, linear_model, metrics, preprocessing, ensemble, tree, evaluate

In [15]:
train_set = pd.read_csv(TRAIN_DATA_FILE).drop(columns=["id"])
test_set = pd.read_csv(TEST_DATA_FILE).drop(columns=["id"])

X_train = train_set.drop(columns=["label"])
y_train = X_train.pop("attack_cat")
X_test = test_set.drop(columns=["label"])
y_test = X_test.pop("attack_cat")

# Encode categorical features using binary encoding
# Scale numerical features using standard scaler
cat_features = X_train.select_dtypes(exclude=[np.number]).columns.tolist()
num_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
print(f"Categorical features: {cat_features}")
print(f"Numerical features: {num_features}")

preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", StandardScaler(), num_features),
        ("categorical", BinaryEncoder(handle_unknown="ignore"), cat_features)
    ], sparse_threshold=0
)
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
print(f"Number of features after preprocessing: {X_train.shape[1]}")

class_labels = np.unique(y_train)
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

metric = metrics.BalancedAccuracy()
model = ensemble.AdaBoostClassifier(
    model=(
        tree.HoeffdingTreeClassifier(
            split_criterion='gini',
            delta=1e-5,
            grace_period=2000
        )
    ),
    n_models=5
)

i = 0        
for xi, yi in stream.iter_array(X_train, y_train):
    y_pred = model.predict_one(xi)
    metric.update(yi, y_pred)
    model.learn_one(xi, yi)
    i += 1
    if i > 30000:
        break
    
print(metric)

evaluate.progressive_val_score(
    model=model,
    dataset=stream.iter_array(X_test, y_test),
    metric=metrics.BalancedAccuracy(),
    print_every=200
)

Categorical features: ['proto', 'service', 'state']
Numerical features: ['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports']
Number of features after preprocessing: 55
BalancedAccuracy: 50.00%
[200] BalancedAccuracy: 100.00%
[400] BalancedAccuracy: 36.33%
[600] BalancedAccuracy: 34.71%
[800] BalancedAccuracy: 35.46%
[1,000] BalancedAccuracy: 42.05%
[1,200] BalancedAccuracy: 39.51%
[1,400] BalancedAccuracy: 39.80%
[1,600] BalancedAccuracy: 39.97%
[1,800] BalancedAccuracy: 41.52%
[2,000] BalancedAccuracy: 41.72%
[2,200] BalancedAccuracy: 41.51%
[2,400] BalancedAccuracy: 42.01%
[2,600] Bala

KeyboardInterrupt: 