In [69]:
import pandas as pd
import numpy as np
import socket
import struct
import pennylane as qml
import base64
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                            ExtraTreesClassifier, GradientBoostingClassifier)
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from pathlib import Path
import json
from collections import defaultdict
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score

In [70]:
df = pd.read_csv(r"Spam_BestFirst.csv")
df.head()


Unnamed: 0,domain_token_count,tld,ldl_getArg,NumberofDotsinURL,delimeter_path,SymbolCount_Domain,class
0,2,2,0,1,7,1,benign
1,3,3,0,3,8,2,benign
2,2,2,0,1,3,1,benign
3,2,2,0,1,3,1,benign
4,2,2,0,2,4,1,benign


In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14479 entries, 0 to 14478
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   domain_token_count  14479 non-null  int64 
 1   tld                 14479 non-null  int64 
 2   ldl_getArg          14479 non-null  int64 
 3   NumberofDotsinURL   14479 non-null  int64 
 4   delimeter_path      14479 non-null  int64 
 5   SymbolCount_Domain  14479 non-null  int64 
 6   class               14479 non-null  object
dtypes: int64(6), object(1)
memory usage: 791.9+ KB


In [72]:
from sklearn.preprocessing import LabelEncoder

if df['class'].dtype not in ['int64', 'float64']:
    le = LabelEncoder()
    df['class'] = le.fit_transform(df['class'])


In [73]:
df.isnull().sum()[df.isnull().sum() > 0]


Series([], dtype: int64)

In [74]:
df.nunique()

domain_token_count      4
tld                     4
ldl_getArg            131
NumberofDotsinURL      18
delimeter_path         35
SymbolCount_Domain      4
class                   2
dtype: int64

In [75]:
X = df.drop(columns="class")
y = df['class']
X.head()

Unnamed: 0,domain_token_count,tld,ldl_getArg,NumberofDotsinURL,delimeter_path,SymbolCount_Domain
0,2,2,0,1,7,1
1,3,3,0,3,8,2
2,2,2,0,1,3,1
3,2,2,0,1,3,1
4,2,2,0,2,4,1


In [76]:
# import pennylane as qml
# n_features = X.shape[1]
# N = int(np.ceil(np.log2(n_features)))
# wires = range(N)
# dev = qml.device('default.qubit', wires)    

# @qml.qnode(dev)
# def circuit(f=None):
#     qml.AmplitudeEmbedding(f, wires=wires,pad_with=0,normalize=True)
#     return qml.state()
# X_norm = X.values
# X_quantum = circuit(X_norm)
# X_real = np.real(np.array(X_quantum))
# # Create column names based on index
# column_names = [f'feature_{i}' for i in range(X_real.shape[1])]
# X_real = pd.DataFrame(X_real, columns=column_names)



In [77]:
import pennylane as qml
from pennylane import numpy as np
from sklearn.preprocessing import MinMaxScaler

N = X.shape[1]
wires = range(N)
dev = qml.device("default.qubit", wires)

@qml.qnode(dev)
def circuit(data):
    scaler = MinMaxScaler(feature_range=(0, np.pi))
    data = scaler.fit_transform(data.reshape(-1,1))
    data = data.reshape(-1)
    qml.AngleEmbedding(data, wires, rotation="Y")
    return [qml.expval(qml.PauliZ(w)) for w in wires]

# Function to process DataFrame through quantum circuit
def quantum_transform(df):
    # Convert DataFrame to numpy array
    values = df.values
    # Process each row through quantum circuit
    quantum_features = np.array([circuit(row) for row in values])
    # Remove tensor properties and convert to regular numpy array
    quantum_features = np.array(quantum_features).astype(float)
    return quantum_features
# Transform your data
X_real = quantum_transform(X)
quantum_cols = [f'quantum_state_{i}' for i in range(len(X_real[0]))]
X_real = pd.DataFrame(X_real, columns=quantum_cols)
X_real.head()


Unnamed: 0,quantum_state_0,quantum_state_1,quantum_state_2,quantum_state_3,quantum_state_4,quantum_state_5
0,0.62349,0.62349,1.0,0.900969,-1.0,0.900969
1,0.382683,0.382683,1.0,0.382683,-1.0,0.707107
2,-0.5,-0.5,1.0,0.5,-1.0,0.5
3,-0.5,-0.5,1.0,0.5,-1.0,0.5
4,0.0,0.0,1.0,0.0,-1.0,0.707107


In [78]:
# Now use this balanced data for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [79]:
import time
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
# Create a dictionary of models to evaluate
models = {
    # "SVM (Linear)": SVC(kernel="linear", random_state=42),
    # "SVM (Poly)": SVC(kernel="poly", random_state=42),
    # "SVM (RBF)": SVC(kernel="rbf", random_state=42),
    # "SVM (Sigmoid)": SVC(kernel="sigmoid", random_state=42),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBClassifier(random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Function to calculate metrics
def evaluate_model(model, X_train, X_test, y_train, y_test):
    start_time = time.time()  # Track model fitting time
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    # Get performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    cohen_kappa = cohen_kappa_score(y_test, y_pred)
    
    # Running time
    end_time = time.time()
    runtime = end_time - start_time
    
    # Detailed classification report
    class_report = classification_report(y_test, y_pred)
    
    
    # Return all metrics
    return {
        "Model": model.__class__.__name__,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc,
        "Cohen's Kappa": cohen_kappa,
        "Running Time (s)": runtime,
    }

# Evaluating all models and storing results
results = []

for name, model in models.items():
    print(f"Evaluating model: {name}")
    result = evaluate_model(model, X_train, X_test, y_train, y_test)
    results.append(result)

# Convert results into a DataFrame
results_df = pd.DataFrame(results)
results_df.to_csv('results/spam_bestfirst_ang_norm.csv', index=False)

# Display all the results
print(results_df)

Evaluating model: KNN
Evaluating model: Random Forest
Evaluating model: AdaBoost




Evaluating model: Extra Trees
Evaluating model: XGBoost
Evaluating model: LightGBM
[LightGBM] [Info] Number of positive: 5352, number of negative: 6231
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000064 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 159
[LightGBM] [Info] Number of data points in the train set: 11583, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.462056 -> initscore=-0.152067
[LightGBM] [Info] Start training from score -0.152067
Evaluating model: Gradient Boosting
                        Model  Accuracy  Precision    Recall  F1 Score  \
0        KNeighborsClassifier  0.986533   0.985874  0.985141  0.985507   
1      RandomForestClassifier  0.987569   0.980910  0.992571  0.986706   
2          AdaBoostClassifier  0.987224   0.980896  0.991828  0.986332   
3        ExtraTreesClassifier  