In [32]:
import pandas as pd
import numpy as np
import time

import matplotlib.pyplot as plt

In [33]:
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from utils import *

In [34]:
types = "short"
normalized = False
binary_classify = True

data_path_unsw_train = "dataset/UNSW_NB15_training-set.csv"
data_path_unsw_test = "dataset/UNSW_NB15_testing-set.csv"

n_compnents = 16
normalized = True
binary_classify = False
label = False  # label=False for Feature Extraction

In [35]:
data_train = preprocessing_data_unsw(data_path=data_path_unsw_train, normalized=normalized,
                                     binary_classify=binary_classify)
y_train = data_train['label']
data_train = data_train.drop(columns=['label'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['service'].replace('-', 'other', inplace=True)


In [36]:
# feature extraction using PCA
time_PCA_start = time.process_time()
X = data_train.to_numpy()
X_mean = np.mean(X, axis=0)
X_hat = X - X_mean

pca = PCA(n_components=n_compnents)
pca.fit(data_train)
U = pca.components_.T

X_train = np.dot(U.T, X_hat.T).T
time_PCA_end = time.process_time()
time_PCA = time_PCA_end - time_PCA_start
print("time_PCA: ", time_PCA)

time_PCA:  30.299115915000016


In [37]:

# create a linear regression classifier for PCA
classifier = LinearRegression()

time_train_LR_start = time.process_time()
classifier.fit(X_train, y_train)
time_train_LR_end = time.process_time()
time_train_LR = time_train_LR_end - time_train_LR_start
print("time_train_LR: ", time_train_LR)


time_train_LR:  0.11452299999996285


In [38]:
# data processing for test dataset PCA
data_test = preprocessing_data_unsw(data_path=data_path_unsw_test, normalized=normalized,
                                    binary_classify=binary_classify)
y_test = data_test['label']
data_test = data_test.drop(columns=['label'])
data_test = align_test_dataset(data_test, data_train)

time_predict_PCA_start = time.process_time()
X_test_PCA = np.dot(U.T, (data_test.to_numpy() - X_mean).T).T
y_pred_PCA = classifier.predict(X_test_PCA)
time_predict_PCA_end = time.process_time()
time_predict = (time_predict_PCA_end - time_predict_PCA_start) / len(y_test)
print(time_predict)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['service'].replace('-', 'other', inplace=True)


7.656903996016134e-05


In [39]:
# results of PCA
display_results(y_test=y_test, y_pred=y_pred_PCA, run_time=time_predict)


Mean Absolute Error -  0.8755769324199583
Precision -  74.39204985910374
Recall -  59.541854928824755
F1 score -  66.14367956466658
Accuracy -  59.541854928824755
Running training -  7.656903996016134e-05


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [40]:
from sklearn.cross_decomposition import PLSRegression

# feature extraction using PLS
time_PLS_start = time.process_time()
pls = PLSRegression(n_components=n_compnents)
pls.fit(data_train, y_train)
X_train_pls = pls.transform(data_train)
time_PLS_end = time.process_time()
time_PLS = time_PLS_end - time_PLS_start
print("time_PLS: ", time_PLS)


time_PLS:  28.752743074000023


In [41]:

# Transform test data using PLS
X_test_PLS = pls.transform(data_test)

time_predict_PLS_start = time.process_time()
# Predict using PLS model
y_pred_PLS = classifier.predict(X_test_PLS)

time_predict_PLS_end = time.process_time()
time_predict_PLS = (time_predict_PLS_end - time_predict_PLS_start) / len(y_test)

print(time_predict_PLS)
display_results(y_test=y_test, y_pred=y_pred_PLS, run_time=time_predict)

2.4414216829393943e-07

Mean Absolute Error -  2.826774522664335
Precision -  44.19662731583867
Recall -  15.037895350531993
F1 score -  22.440435965541226
Accuracy -  15.037895350531993
Running training -  7.656903996016134e-05


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
