In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import balanced_accuracy_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV,ParameterGrid
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold

In [14]:
train = pd.read_csv('data_source/train.csv')
test = pd.read_csv('data_source/test.csv')

# x_train = train.loc[:,['Destination Port', 'NAT Source Port','Packets', 'Elapsed Time (sec)','Bytes Received']]
x_train = train.drop(columns=["Action"])
y_train = train["Action"].map({'allow':0, 'deny':1, 'drop':2, 'reset-both':3})
x_test = test.drop(columns=["Action"])
y_test = test["Action"].map({'allow':0, 'deny':1, 'drop':2, 'reset-both':3})


In [15]:
pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('selector', VarianceThreshold()),
    ('classifier', XGBClassifier())
])

In [12]:
# x_train.loc[:,['Packets','Elapsed Time (sec)','Bytes Received']] = pipe.fit_transform(x_train.loc[:,['Packets','Elapsed Time (sec)','Bytes Received']])
# x_test.loc[:,['Packets','Elapsed Time (sec)','Bytes Received']] = pipe.fit_transform(x_test.loc[:,['Packets','Elapsed Time (sec)','Bytes Received']])
# y_train.replace({'allow':0, 'deny':1, 'drop':2, 'reset-both':3},inplace=True)
# y_test.replace({'allow':0, 'deny':1, 'drop':2, 'reset-both':3},inplace=True)

In [13]:
# grid = ParameterGrid(parameters)
# best_score = 0
# best_params = {}

# for params in grid:
#     clf = Pipeline(**params)
#     clf.fit(x_train, y_train)
#     y_pred = clf.predict(x_train)
#     f1 = f1_score(y_train, y_pred,average='weighted')
#     if f1 > best_score:
#         best_score = f1
#         best_params = params

# print("Best parameters:", best_params)
# print("Best f1 score:", best_score)

In [16]:
pipe.fit(x_train, y_train)

print('Training set score: ' + str(pipe.score(x_train,y_train)))
print('Test set score: ' + str(pipe.score(x_test,y_test)))

Training set score: 0.9993024066968957
Test set score: 0.9986775178026449


In [17]:
score = pipe.score(x_test, y_test)
score

0.9986775178026449

In [8]:
pipe

In [18]:
pipe.fit(x_train, y_train)

train_yhat = pipe.predict(x_train)
train_pipe = pipe.score(x_train,y_train)

test_yhat = pipe.predict(x_test)
test_pipe = pipe.score(x_test,y_test)
print('Train Score\n',classification_report(y_train,train_yhat))
print('Test Score\n',classification_report(y_test, test_yhat))
print(f"AVG F1-Score Train: {train_pipe}\nAVG F1-Score Test: {test_pipe}")

Train Score
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     26348
           1       1.00      1.00      1.00     10491
           2       1.00      1.00      1.00      8995
           3       1.00      1.00      1.00        38

    accuracy                           1.00     45872
   macro avg       1.00      1.00      1.00     45872
weighted avg       1.00      1.00      1.00     45872

Test Score
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     11292
           1       1.00      1.00      1.00      4496
           2       1.00      1.00      1.00      3856
           3       1.00      0.38      0.55        16

    accuracy                           1.00     19660
   macro avg       1.00      0.84      0.89     19660
weighted avg       1.00      1.00      1.00     19660

AVG F1-Score Train: 0.9993024066968957
AVG F1-Score Test: 0.9986775178026449
