In [None]:
import pandas as pd
import numpy as np
import socket
import struct
import pennylane as qml
import base64
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                            ExtraTreesClassifier, GradientBoostingClassifier)
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from pathlib import Path
import json
from collections import defaultdict
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score

In [None]:
# Load the full dataset from the CSV that sits next to the notebook and
# preview its first rows.
df = pd.read_csv("All.csv")
df.head()


In [None]:
# Summarise the loaded frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Count missing values per column and show only the columns that have any.
missing_per_column = df.isnull().sum()
missing_per_column[missing_per_column > 0]


In [None]:
# Remove the NumberRate_Extension column first, then discard every row that
# still has a missing value in any of the remaining columns.
# NOTE: df is rebound here, so running this cell a second time raises a
# KeyError because the column is already gone.
df = df.drop(columns=['NumberRate_Extension']).dropna()

In [None]:
# Number of distinct values in each column of the cleaned frame.
df.nunique()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Predictors: every column except the class label.
X = df.drop("URL_Type_obf_Type", axis=1)

# Target: the class label mapped to integer codes. The fitted encoder is kept
# in `le` so the codes can be mapped back to the original labels later.
le = LabelEncoder()
y = le.fit_transform(df["URL_Type_obf_Type"])

X.head()

In [None]:
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Absolute pairwise correlation above which one feature of the pair is dropped.
CORR_THRESHOLD = 0.8

# First handle high correlations
correlation_matrix = X.corr()

# Mean absolute correlation of each feature with all features. This does not
# change while the pairs are scanned, so it is computed once up front instead
# of once per correlated pair.
mean_abs_corr = correlation_matrix.abs().mean()

high_corr_features = set()  # Keep track of features to drop

# Scan the lower triangle so every unordered pair is visited exactly once.
n_features = len(correlation_matrix.columns)
for i in range(n_features):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > CORR_THRESHOLD:
            # Keep the feature with the lower mean correlation with the other
            # features; the other one of the pair is marked for removal.
            if mean_abs_corr.iloc[i] > mean_abs_corr.iloc[j]:
                high_corr_features.add(correlation_matrix.columns[i])
            else:
                high_corr_features.add(correlation_matrix.columns[j])

# Drop highly correlated features
X_cleaned = X.drop(columns=list(high_corr_features))
print(f"Dropped {len(high_corr_features)} features due to high correlation")

# Now handle VIF
def calculate_vif(data):
    """Build a VIF table for the columns of ``data``.

    The variance inflation factor of every column is computed with
    statsmodels' ``variance_inflation_factor``; infinite results are replaced
    by 1e10.

    Returns a DataFrame with the columns "Feature" and "VIF", sorted from the
    highest VIF to the lowest.
    """
    matrix = data.values
    vif_values = [variance_inflation_factor(matrix, col) for col in range(data.shape[1])]
    table = pd.DataFrame({"Feature": data.columns, "VIF": vif_values})
    table["VIF"] = table["VIF"].replace([np.inf], 1e10)
    return table.sort_values('VIF', ascending=False)

# Iteratively remove high VIF features
VIF_THRESHOLD = 10  # a feature whose VIF exceeds this is removed

# Remove one feature per pass until every VIF is within the threshold.
# The loop also stops once a single column is left: without this bound a run
# in which the threshold is never met would keep dropping columns and finally
# fail on `iloc[0]` of an empty VIF table.
while X_cleaned.shape[1] > 1:
    vif_data = calculate_vif(X_cleaned)
    if vif_data['VIF'].max() <= VIF_THRESHOLD:
        break
    # calculate_vif returns its rows sorted by VIF, highest first.
    worst_row = vif_data.iloc[0]
    worst_feature = worst_row['Feature']
    X_cleaned = X_cleaned.drop(worst_feature, axis=1)
    print(f"Dropped {worst_feature} with VIF: {worst_row['VIF']:.2f}")

print(f"\nOriginal shape: {X.shape}")
print(f"Final shape: {X_cleaned.shape}")
print(f"\nRemaining features:\n{X_cleaned.columns.tolist()}")


In [None]:
# From here on X is the feature matrix left after the correlation and VIF filtering.
# NOTE: X is rebound, so re-running the filtering cell afterwards starts from
# the already-filtered frame rather than from the original features.
X = X_cleaned

In [None]:
# Disabled alternative: amplitude embedding of the features into
# ceil(log2(n_features)) wires. Nothing in this cell executes.
# import pennylane as qml
# n_features = X.shape[1]
# N = int(np.ceil(np.log2(n_features)))
# wires = range(N)
# dev = qml.device('default.qubit', wires)    

# @qml.qnode(dev)
# def circuit(f=None):
#     qml.AmplitudeEmbedding(f, wires=wires,pad_with=0,normalize=True)
#     return qml.state()
# X_norm = X.values
# X_quantum = circuit(X_norm)
# X_real = np.real(np.array(X_quantum))
# # Create column names based on index
# column_names = [f'feature_{i}' for i in range(X_real.shape[1])]
# X_real = pd.DataFrame(X_real, columns=column_names)



In [None]:
import pennylane as qml

# `np` is deliberately NOT re-imported from pennylane here. Doing so would
# rebind the notebook-wide `np` to PennyLane's autograd wrapper for every cell
# that runs afterwards. No gradients are taken in this notebook, so the plain
# numpy imported at the top is all that is needed.

# One wire (qubit) per remaining feature.
# NOTE(review): state-vector simulation cost grows exponentially with the
# number of wires — confirm N is small enough for the simulator.
N = X.shape[1]
wires = range(N)
dev = qml.device("default.qubit", wires=wires)

@qml.qnode(dev)
def circuit(val_list):
    """Angle-embed one sample and measure Pauli-Z on every wire.

    val_list: the values used as Y-rotation angles by ``qml.AngleEmbedding``
        on ``wires``.

    Returns a list holding the Pauli-Z expectation value of each wire, in
    wire order.
    """
    qml.AngleEmbedding(features=val_list, wires=wires, rotation="Y")
    return [qml.expval(qml.PauliZ(wire)) for wire in wires]

# Function to process DataFrame through quantum circuit
def quantum_transform(df):
    """Run every row of ``df`` through ``circuit`` and collect the results.

    df: DataFrame whose rows are the samples to embed.

    Returns a float array with one row per input row, holding the values that
    ``circuit`` returned for that row.
    """
    # Lay the data out as (n_samples, n_values) so each sample is one flat row.
    samples = df.values
    samples = samples.reshape(samples.shape[0], -1)
    # One circuit evaluation per sample.
    measurements = [circuit(sample) for sample in samples]
    # Stack the per-sample results into a single array with a float dtype.
    return np.array(measurements).astype(float)

# Transform your data
X_real = quantum_transform(X)


In [None]:
quantum_cols = [f'quantum_state_{i}' for i in range(len(X_real[0]))]
X_real = pd.DataFrame(X_real, columns=quantum_cols)
X_real.head()

In [None]:
# Hold out 20% of the quantum-transformed features as a test set (fixed seed).
# NOTE(review): no resampling/balancing step is applied in the visible cells,
# so the class proportions are whatever the cleaned data contains; the split
# is also not stratified.
X_train, X_test, y_train, y_test = train_test_split(X_real, y, test_size=0.2, random_state=42)

In [None]:
# Random forest with 100 trees and a fixed seed, fitted on the training split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# XGBoost: 100 estimators, fixed seed, fitted on the training split
xgb_model = XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# CatBoost (disabled; its import above is commented out as well)
# cat_model = CatBoostClassifier(n_estimators=100, random_state=42, verbose=False)
# cat_model.fit(X_train, y_train)

# Decision Tree: default hyperparameters, fixed seed, fitted on the training split
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)


In [None]:
# Disabled single-model report: accuracy, classification report, confusion
# matrix and random-forest feature importance. Nothing in this cell executes;
# it refers to a `y_pred` that the visible cells never define.
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# import seaborn as sns
# import matplotlib.pyplot as plt
# # Print accuracy
# print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# # Print detailed classification report
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))

# # Create confusion matrix visualization
# plt.figure(figsize=(8, 6))
# sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
# plt.title('Confusion Matrix')
# plt.ylabel('True Label')
# plt.xlabel('Predicted Label')
# plt.show()

# # Feature importance
# feature_importance = pd.DataFrame({
#     'feature': X.columns,
#     'importance': rf_model.feature_importances_
# }).sort_values('importance', ascending=False)

# print("\nFeature Importance:")
# print(feature_importance)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import seaborn as sns


# Create a dictionary of all models (unfitted; each one is trained in the loop below)
models = {
    "SVM (Linear)": SVC(kernel="linear", random_state=42),
    "SVM (Poly)": SVC(kernel="poly", random_state=42),
    "SVM (RBF)": SVC(kernel="rbf", random_state=42),
    "SVM (Sigmoid)": SVC(kernel="sigmoid", random_state=42),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Loop through each model: train on the training split, then evaluate on the test split
for name, model in models.items():
    # The estimators above are freshly constructed, so they must be fitted
    # before predict() can be called on them.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(f"\n{'='*50}")
    print(f"{name} Results:")
    print(f"{'='*50}")

    # Accuracy
    print(f"Accuracy: {accuracy_score(y_test, predictions):.4f}")

    # Classification Report
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

    # Confusion Matrix
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(confusion_matrix(y_test, predictions), annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix - {name}')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    plt.show()
    plt.close(fig)  # one figure per model: release it once it has been shown

    # Feature Importance
    # Only some estimators expose `feature_importances_`; SVC and KNN do not,
    # so they are reported as unavailable instead of raising AttributeError.
    if not hasattr(model, "feature_importances_"):
        print("\nFeature Importance: not available for this model")
        continue

    feature_importance = pd.DataFrame({
        'feature': X_real.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nFeature Importance:")
    print(feature_importance)
