In [319]:
import pandas as pd
import numpy as np
import socket
import struct
import pennylane as qml
import base64
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                            ExtraTreesClassifier, GradientBoostingClassifier)
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from pathlib import Path
import json
from collections import defaultdict
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score

In [320]:
df = pd.read_csv(r"data-30s.csv")
df.shape


(14651, 24)

In [321]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14651 entries, 0 to 14650
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   duration            14651 non-null  float64
 1   total_fiat          14651 non-null  float64
 2   total_biat          14651 non-null  float64
 3   min_fiat            14651 non-null  float64
 4   min_biat            14651 non-null  float64
 5   max_fiat            14651 non-null  float64
 6   max_biat            14651 non-null  float64
 7   mean_fiat           14651 non-null  float64
 8   mean_biat           14651 non-null  float64
 9   flowPktsPerSecond   14651 non-null  float64
 10  flowBytesPerSecond  14651 non-null  float64
 11  min_flowiat         14651 non-null  float64
 12  max_flowiat         14651 non-null  float64
 13  mean_flowiat        14651 non-null  float64
 14  std_flowiat         14651 non-null  float64
 15  min_active          14651 non-null  float64
 16  mean

In [322]:
df.isnull().sum()


duration              0
total_fiat            0
total_biat            0
min_fiat              0
min_biat              0
max_fiat              0
max_biat              0
mean_fiat             0
mean_biat             0
flowPktsPerSecond     0
flowBytesPerSecond    0
min_flowiat           0
max_flowiat           0
mean_flowiat          0
std_flowiat           0
min_active            0
mean_active           0
max_active            0
std_active            0
min_idle              0
mean_idle             0
max_idle              0
std_idle              0
class1                0
dtype: int64

In [323]:
le = LabelEncoder()
df["class1"] = le.fit_transform(df["class1"])

In [324]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14651 entries, 0 to 14650
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   duration            14651 non-null  float64
 1   total_fiat          14651 non-null  float64
 2   total_biat          14651 non-null  float64
 3   min_fiat            14651 non-null  float64
 4   min_biat            14651 non-null  float64
 5   max_fiat            14651 non-null  float64
 6   max_biat            14651 non-null  float64
 7   mean_fiat           14651 non-null  float64
 8   mean_biat           14651 non-null  float64
 9   flowPktsPerSecond   14651 non-null  float64
 10  flowBytesPerSecond  14651 non-null  float64
 11  min_flowiat         14651 non-null  float64
 12  max_flowiat         14651 non-null  float64
 13  mean_flowiat        14651 non-null  float64
 14  std_flowiat         14651 non-null  float64
 15  min_active          14651 non-null  float64
 16  mean

In [325]:
X = df.drop(columns=["class1"])  # Features (all columns except 'class1')
y = df["class1"]  # Target label


In [326]:
X.shape

(14651, 23)

In [327]:
# import pennylane as qml
# n_features = X.shape[1]
# N = int(np.ceil(np.log2(n_features)))
# wires = range(N)
# dev = qml.device('default.qubit', wires)    

# @qml.qnode(dev)
# def circuit(f=None):
#     qml.AmplitudeEmbedding(f, wires=wires,pad_with=0,normalize=True)
#     return qml.state()
# X_norm = X.values
# X_quantum = circuit(X_norm)
# X_real = np.real(np.array(X_quantum))
# # Create column names based on index
# column_names = [f'feature_{i}' for i in range(X_real.shape[1])]
# X_real = pd.DataFrame(X_real, columns=column_names)
# X_real.shape



In [None]:
import pennylane as qml
from pennylane import numpy as np

N = X.shape[1]
wires = range(N)
dev = qml.device("default.qubit", wires)

@qml.qnode(dev)
def circuit(val_list):
    qml.AngleEmbedding(val_list, wires, rotation="Y")
    return [qml.expval(qml.PauliZ(w)) for w in wires]

# Function to process DataFrame through quantum circuit
def quantum_transform(df):
    # Convert DataFrame to numpy array
    values = df.values
    # Process each row through quantum circuit
    quantum_features = np.array([circuit(row) for row in values])
    # Remove tensor properties and convert to regular numpy array
    quantum_features = np.array(quantum_features).astype(float)
    return quantum_features
# Transform your data
X_real = quantum_transform(X)

quantum_cols = [f'quantum_state_{i}' for i in range(len(X_real[0]))]
X_real = pd.DataFrame(X_real, columns=quantum_cols)
X_real.head()


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_real, y, test_size=0.3, random_state=42)  

In [330]:
import pennylane as qml

# For X_train
n_features_train = X_train.shape[1]
N_train = int(np.ceil(np.log2(n_features_train)))
wires = range(N_train)
dev = qml.device('default.qubit', wires)

@qml.qnode(dev)
def circuit(f=None):
    qml.AmplitudeEmbedding(f, wires=wires, pad_with=0, normalize=True)
    return qml.state()

X_train_norm = X_train.values
X_train_quantum = circuit(X_train_norm)
X_train = np.real(np.array(X_train_quantum))

# For X_test
n_features_test = X_test.shape[1]
N_test = int(np.ceil(np.log2(n_features_test)))
wires = range(N_test)
dev = qml.device('default.qubit', wires)

@qml.qnode(dev)
def circuit(f=None):
    qml.AmplitudeEmbedding(f, wires=wires, pad_with=0, normalize=True)
    return qml.state()

X_test_norm = X_test.values
X_test_quantum = circuit(X_test_norm)
X_test = np.real(np.array(X_test_quantum))


In [331]:

# 6. Quantile Transformer (transforms to normal or uniform distribution)
from sklearn.preprocessing import QuantileTransformer
qt = QuantileTransformer(output_distribution='normal')
X_train = qt.fit_transform(X_train)
X_test = qt.transform(X_test)

In [332]:
from sklearn.preprocessing import StandardScaler


# Initialize the scaler
scaler = StandardScaler()


# Fit on training data only, then transform both train and test
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to DataFrame with basic column names
X_train = pd.DataFrame(X_train, columns=[f'feature_{i}' for i in range(X_train.shape[1])])
X_test = pd.DataFrame(X_test, columns=[f'feature_{i}' for i in range(X_test.shape[1])])



In [333]:
import time
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
# Create a dictionary of models to evaluate
models = {
    # "SVM (Linear)": SVC(kernel="linear", random_state=42),
    # "SVM (Poly)": SVC(kernel="poly", random_state=42),
    # "SVM (RBF)": SVC(kernel="rbf", random_state=42),
    # "SVM (Sigmoid)": SVC(kernel="sigmoid", random_state=42),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBClassifier(random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Function to calculate metrics
def evaluate_model(model, X_train, X_test, y_train, y_test):
    start_time = time.time()  # Track model fitting time
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    # Get performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    cohen_kappa = cohen_kappa_score(y_test, y_pred)
    
    # Running time
    end_time = time.time()
    runtime = end_time - start_time
    
    # Detailed classification report
    class_report = classification_report(y_test, y_pred)
    
    
    # Return all metrics
    return {
        "Model": model.__class__.__name__,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "Cohen’s Kappa": cohen_kappa,
        "Running Time (s)": runtime,
    }

# Evaluating all models and storing results
results = []

for name, model in models.items():
    print(f"Evaluating model: {name}")
    result = evaluate_model(model, X_train, X_test, y_train, y_test)
    results.append(result)

# Convert results into a DataFrame
results_df = pd.DataFrame(results)

# Display all the results
print(results_df)

Evaluating model: KNN
Evaluating model: Random Forest
Evaluating model: AdaBoost




Evaluating model: Extra Trees
Evaluating model: XGBoost
Evaluating model: LightGBM
[LightGBM] [Info] Number of positive: 5426, number of negative: 4829
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000439 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5865
[LightGBM] [Info] Number of data points in the train set: 10255, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.529108 -> initscore=0.116563
[LightGBM] [Info] Start training from score 0.116563
Evaluating model: Gradient Boosting
                        Model  Accuracy  Precision    Recall  F1 Score  \
0        KNeighborsClassifier  0.869199   0.857910  0.899913  0.878410   
1      RandomForestClassifier  0.899909   0.883730  0.931976  0.907212   
2          AdaBoostClassifier  0.796178   0.779051  0.853986  0.814800   
3        ExtraTreesClassifier  0.897862   0.884886  0.925910  0.904933   
4               XGBCl

In [334]:
results_df.to_csv('data-30s_amp_preprocess.csv', index=False)


In [335]:
results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC,Cohen’s Kappa,Running Time (s)
0,KNeighborsClassifier,0.869199,0.85791,0.899913,0.87841,0.935297,0.737064,0.296496
1,RandomForestClassifier,0.899909,0.88373,0.931976,0.907212,0.964,0.798737,2.309121
2,AdaBoostClassifier,0.796178,0.779051,0.853986,0.8148,0.870523,0.589252,1.133986
3,ExtraTreesClassifier,0.897862,0.884886,0.92591,0.904933,0.960826,0.794709,0.78004
4,XGBClassifier,0.905141,0.889255,0.935875,0.91197,0.968712,0.809279,0.216722
5,LGBMClassifier,0.898772,0.884758,0.928076,0.9059,0.965541,0.79651,0.149221
6,GradientBoostingClassifier,0.863967,0.840909,0.913778,0.875831,0.936009,0.726001,4.89807


In [336]:
import numpy as np

# Method 1: Using numpy unique with return_counts
unique_values, counts = np.unique(y, return_counts=True)
print(dict(zip(unique_values, counts)))

# Method 2: Using value_counts if y is a pandas series
print(pd.Series(y).value_counts())


{np.int64(0): np.int64(6917), np.int64(1): np.int64(7734)}
class1
1    7734
0    6917
Name: count, dtype: int64
