In [18]:
# Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# River Imports
from river import feature_extraction
from river import naive_bayes
from river import metrics
from river import feature_extraction as fx
from river import compose, stats
from river import anomaly
from river import stream
from river import preprocessing
from river import linear_model
from river import ensemble
from river import forest

# Data files
train_set = pd.read_csv("UNSW_NB15_training-set.csv")
test_set = pd.read_csv("UNSW_NB15_testing-set.csv")

df = pd.concat([train_set, test_set])
df = df.drop(columns=['proto','service','state','attack_cat','id'])

y = df.pop('label')

X_train, X_test, y_train, y_test = train_test_split(df, y, test_size = 0.3)


Logistic Regression

In [9]:
# Initialize Model pipeline
model = compose.Pipeline(
    preprocessing.MinMaxScaler(),
    linear_model.LogisticRegression()
)
# Initialize metrics
metric = metrics.ROCAUC()
accuracy = metrics.Accuracy()

# Train the model
for x,y in stream.iter_pandas(X_train, y_train):
    y_pred = model.predict_one(x)
    model.learn_one(x, y)
    metric.update(y, y_pred)
    accuracy.update(y, y_pred)

print(f'Training {accuracy}')
# Test on validation set
metric = metrics.ROCAUC() # Reset the metric
accuracy = metrics.Accuracy() # Reset the metric

for x,y in stream.iter_pandas(X_test, y_test):
    y_pred = model.predict_one(x)
    #model.learn_one(x, y)
    metric.update(y, y_pred)
    accuracy.update(y, y_pred)

print(f'Final {accuracy}')

  return a / b if b else 0.0


Training Accuracy: Accuracy: 84.61%
Final Accuracy: Accuracy: 86.46%


Ensemble Bagging Classifier

In [19]:
# Initialize Model pipeline
model = ensemble.BaggingClassifier(
    model=(
        preprocessing.StandardScaler() |
        linear_model.LogisticRegression()
    ),
    n_models=3,
    seed=42
)
# Initialize metrics
metric = metrics.ROCAUC()
accuracy = metrics.Accuracy()

# Train the model
for x,y in stream.iter_pandas(X_train, y_train):
    y_pred = model.predict_one(x)
    model.learn_one(x, y)
    #metric.update(y, y_pred)
    accuracy.update(y, y_pred)

print(f'Training {accuracy}')
# Test on validation set
metric = metrics.ROCAUC() # Reset the metric
accuracy = metrics.Accuracy() # Reset the metric

for x,y in stream.iter_pandas(X_test, y_test):
    y_pred = model.predict_one(x)
    model.learn_one(x, y)
    #metric.update(y, y_pred)
    accuracy.update(y, y_pred)

print(f'Final {accuracy}')

Training Accuracy: 87.70%
Final Accuracy: 88.12%


Ensemble ADWIN Bagging Classifier

In [11]:
# Initialize Model pipeline
model = ensemble.ADWINBaggingClassifier(
    model=(
        preprocessing.StandardScaler() |
        linear_model.LogisticRegression()
    ),
    n_models=3,
    seed=42
)
# Initialize metrics
metric = metrics.ROCAUC()
accuracy = metrics.Accuracy()

# Train the model
for x,y in stream.iter_pandas(X_train, y_train):
    y_pred = model.predict_one(x)
    model.learn_one(x, y)
    #metric.update(y, y_pred)
    accuracy.update(y, y_pred)
print(f'Training {accuracy}')

# Test on validation set
metric = metrics.ROCAUC() # Reset the metric
accuracy = metrics.Accuracy() # Reset the metric

for x,y in stream.iter_pandas(X_test, y_test):
    y_pred = model.predict_one(x)
    model.learn_one(x, y)
    metric.update(y, y_pred)
    accuracy.update(y, y_pred)

print(f'Final {accuracy}')

Training Accuracy: 86.92%
Final Accuracy: 86.81%


Uncombined Data Sets to Test time-based issues

In [27]:
# Binary Classification
train_df = pd.read_csv("UNSW_NB15_training-set.csv")
test_df = pd.read_csv("UNSW_NB15_testing-set.csv")

X_train = train_df.drop(columns=['proto','service','state','attack_cat','id'])
X_test = test_df.drop(columns=['proto','service','state','attack_cat','id'])

y_train = X_train.pop("label")
y_test = X_test.pop("label")

Logistic Regression

In [28]:
# Initialize Model pipeline
model = compose.Pipeline(
    preprocessing.MinMaxScaler(),
    linear_model.LogisticRegression()
)
# Initialize metrics
metric = metrics.ROCAUC()
accuracy = metrics.Accuracy()

# Train the model
for x,y in stream.iter_pandas(X_train, y_train):
    y_pred = model.predict_one(x)
    model.learn_one(x, y)
    metric.update(y, y_pred)
    accuracy.update(y, y_pred)

print(f'Training {accuracy}')
# Test on validation set
metric = metrics.ROCAUC() # Reset the metric
accuracy = metrics.Accuracy() # Reset the metric

for x,y in stream.iter_pandas(X_test, y_test):
    y_pred = model.predict_one(x)
    model.learn_one(x, y)
    metric.update(y, y_pred)
    accuracy.update(y, y_pred)

print(f'Final {accuracy}')

  return a / b if b else 0.0


Training Accuracy: 96.62%
Final Accuracy: 98.63%


Ensemble Bagging Logistic Regression

In [24]:
# Initialize Model pipeline
model = ensemble.BaggingClassifier(
    model=(
        preprocessing.StandardScaler() |
        linear_model.LogisticRegression()
    ),
    n_models=3,
    seed=42
)
# Initialize metrics
metric = metrics.ROCAUC()
accuracy = metrics.Accuracy()

# Train the model
for x,y in stream.iter_pandas(X_train, y_train):
    y_pred = model.predict_one(x)
    model.learn_one(x, y)
    #metric.update(y, y_pred)
    accuracy.update(y, y_pred)

print(f'Training {accuracy}')
# Test on validation set
metric = metrics.ROCAUC() # Reset the metric
accuracy = metrics.Accuracy() # Reset the metric

for x,y in stream.iter_pandas(X_test, y_test):
    y_pred = model.predict_one(x)
    model.learn_one(x, y)
    #metric.update(y, y_pred)
    accuracy.update(y, y_pred)

print(f'Final {accuracy}')

Training Accuracy: 97.92%
Final Accuracy: 99.01%


ADWIN Bagging Classifier

In [17]:
# Initialize Model pipeline
model = ensemble.ADWINBaggingClassifier(
    model=(
        preprocessing.StandardScaler() |
        linear_model.LogisticRegression()
    ),
    n_models=3,
    seed=42
)
# Initialize metrics
metric = metrics.ROCAUC()
accuracy = metrics.Accuracy()

# Train the model
for x,y in stream.iter_pandas(X_train, y_train):
    y_pred = model.predict_one(x)
    model.learn_one(x, y)
    #metric.update(y, y_pred)
    accuracy.update(y, y_pred)
print(f'Training {accuracy}')

# Test on validation set
metric = metrics.ROCAUC() # Reset the metric
accuracy = metrics.Accuracy() # Reset the metric

for x,y in stream.iter_pandas(X_test, y_test):
    y_pred = model.predict_one(x)
    model.learn_one(x, y)
    metric.update(y, y_pred)
    accuracy.update(y, y_pred)

print(f'Final {accuracy}')

Training Accuracy: 98.07%
Final Accuracy: 99.41%
