In [1]:
import pandas as pd
import numpy as np
import socket
import struct
import pennylane as qml
import base64
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                            ExtraTreesClassifier, GradientBoostingClassifier)
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from pathlib import Path
import json
from collections import defaultdict
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score

In [2]:
df = pd.read_csv(r"CSV\TestbedThuJun17Flows.csv")
df.shape


(397595, 21)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397595 entries, 0 to 397594
Data columns (total 21 columns):
 #   Column                          Non-Null Count   Dtype 
---  ------                          --------------   ----- 
 0   generated                       397595 non-null  object
 1   appName                         397595 non-null  object
 2   totalSourceBytes                397595 non-null  int64 
 3   totalDestinationBytes           397595 non-null  int64 
 4   totalDestinationPackets         397595 non-null  int64 
 5   totalSourcePackets              397595 non-null  int64 
 6   sourcePayloadAsBase64           187817 non-null  object
 7   sourcePayloadAsUTF              187815 non-null  object
 8   destinationPayloadAsBase64      180571 non-null  object
 9   destinationPayloadAsUTF         180567 non-null  object
 10  direction                       397595 non-null  object
 11  sourceTCPFlagsDescription       328198 non-null  object
 12  destinationTCPFlagsDescription

In [4]:
# Drop payload columns
payload_columns = [
    "sourcePayloadAsBase64", "sourcePayloadAsUTF",
    "destinationPayloadAsBase64", "destinationPayloadAsUTF"
]
df.drop(columns=payload_columns, inplace=True)

In [5]:
# Convert labels
df["Label"] = df["Label"].map({"Normal": 0, "Attack": 1})

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397595 entries, 0 to 397594
Data columns (total 17 columns):
 #   Column                          Non-Null Count   Dtype 
---  ------                          --------------   ----- 
 0   generated                       397595 non-null  object
 1   appName                         397595 non-null  object
 2   totalSourceBytes                397595 non-null  int64 
 3   totalDestinationBytes           397595 non-null  int64 
 4   totalDestinationPackets         397595 non-null  int64 
 5   totalSourcePackets              397595 non-null  int64 
 6   direction                       397595 non-null  object
 7   sourceTCPFlagsDescription       328198 non-null  object
 8   destinationTCPFlagsDescription  312094 non-null  object
 9   source                          397595 non-null  object
 10  protocolName                    397595 non-null  object
 11  sourcePort                      397595 non-null  int64 
 12  destination                   

In [7]:
# Select numeric features
numeric_cols = df.select_dtypes(include=['int64']).columns
X = df[numeric_cols].drop(columns="Label")
y = df['Label']


In [8]:
import pennylane as qml
n_features = X.shape[1]
N = int(np.ceil(np.log2(n_features)))
wires = range(N)
dev = qml.device('default.qubit', wires)    

@qml.qnode(dev)
def circuit(f=None):
    qml.AmplitudeEmbedding(f, wires=wires,pad_with=0,normalize=True)
    return qml.state()
X_norm = X.values
X_quantum = circuit(X_norm)
X_real = np.real(np.array(X_quantum))
# Create column names based on index
column_names = [f'feature_{i}' for i in range(X_real.shape[1])]
X_real = pd.DataFrame(X_real, columns=column_names)




In [None]:
# import pennylane as qml
# from pennylane import numpy as np

# N = X.shape[1]
# wires = range(N)
# dev = qml.device("default.qubit", wires)

# @qml.qnode(dev)
# def circuit(val_list):
#     qml.AngleEmbedding(val_list, wires, rotation="Y")
#     return [qml.expval(qml.PauliZ(w)) for w in wires]

# # Function to process DataFrame through quantum circuit
# def quantum_transform(df):
#     # Convert DataFrame to numpy array
#     values = df.values
#     # Process each row through quantum circuit
#     quantum_features = np.array([circuit(row) for row in values])
#     # Remove tensor properties and convert to regular numpy array
#     quantum_features = np.array(quantum_features).astype(float)
#     return quantum_features
# # Transform your data
# X_real = quantum_transform(X)

# quantum_cols = [f'quantum_state_{i}' for i in range(len(X_real[0]))]
# X_real = pd.DataFrame(X_real, columns=quantum_cols)
# X_real.head()


In [10]:
X_real.shape

(397595, 8)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_real, y, test_size=0.3, random_state=42)  

In [12]:
import time
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
# Create a dictionary of models to evaluate
models = {
    # "SVM (Linear)": SVC(kernel="linear", random_state=42),
    # "SVM (Poly)": SVC(kernel="poly", random_state=42),
    # "SVM (RBF)": SVC(kernel="rbf", random_state=42),
    # "SVM (Sigmoid)": SVC(kernel="sigmoid", random_state=42),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBClassifier(random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Function to calculate metrics
def evaluate_model(model, X_train, X_test, y_train, y_test):
    start_time = time.time()  # Track model fitting time
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    # Get performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    cohen_kappa = cohen_kappa_score(y_test, y_pred)
    
    # Running time
    end_time = time.time()
    runtime = end_time - start_time
    
    # Detailed classification report
    class_report = classification_report(y_test, y_pred)
    
    
    # Return all metrics
    return {
        "Model": model.__class__.__name__,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "Cohen’s Kappa": cohen_kappa,
        "Running Time (s)": runtime,
    }

# Evaluating all models and storing results
results = []

for name, model in models.items():
    print(f"Evaluating model: {name}")
    result = evaluate_model(model, X_train, X_test, y_train, y_test)
    results.append(result)

# Convert results into a DataFrame
results_df = pd.DataFrame(results)

# Display all the results
print(results_df)

Evaluating model: KNN
Evaluating model: Random Forest
Evaluating model: AdaBoost




Evaluating model: Extra Trees
Evaluating model: XGBoost
Evaluating model: LightGBM
[LightGBM] [Info] Number of positive: 3640, number of negative: 274676
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001508 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 278316, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.013079 -> initscore=-4.323609
[LightGBM] [Info] Start training from score -4.323609
Evaluating model: Gradient Boosting
                        Model  Accuracy  Precision    Recall  F1 Score  \
0        KNeighborsClassifier  0.999254   0.983766  0.959468  0.971465   
1      RandomForestClassifier  0.999899   0.997460  0.994934  0.996195   
2          AdaBoostClassifier  0.998650   0.955071  0.942369  0.948677   
3        ExtraTreesClassifi

In [13]:
# results_df.to_csv('TestbedThuJun17Flows_ang.csv', index=False)


In [14]:
results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC,Cohen’s Kappa,Running Time (s)
0,KNeighborsClassifier,0.999254,0.983766,0.959468,0.971465,0.99552,0.971087,6.55096
1,RandomForestClassifier,0.999899,0.99746,0.994934,0.996195,0.999999,0.996144,21.969957
2,AdaBoostClassifier,0.99865,0.955071,0.942369,0.948677,0.999893,0.947993,21.219768
3,ExtraTreesClassifier,0.999883,1.0,0.991134,0.995547,0.999999,0.995488,7.423193
4,XGBClassifier,0.999857,0.992434,0.996833,0.994629,0.999997,0.994557,0.717973
5,LGBMClassifier,0.999883,0.996195,0.994934,0.995564,0.999998,0.995505,0.726476
6,GradientBoostingClassifier,0.999422,0.987726,0.968334,0.977934,0.999962,0.977641,68.04063


In [15]:
import numpy as np

# Method 1: Using numpy unique with return_counts
unique_values, counts = np.unique(y, return_counts=True)
print(dict(zip(unique_values, counts)))

# Method 2: Using value_counts if y is a pandas series
print(pd.Series(y).value_counts())



{np.int64(0): np.int64(392376), np.int64(1): np.int64(5219)}
Label
0    392376
1      5219
Name: count, dtype: int64
