In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, f1_score, roc_auc_score,
                             classification_report)
import xgboost as xgb
from sklearn.impute import SimpleImputer  # For handling missing values
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

# 1. Load the data
df = pd.read_csv("PhiUSIIL_Phishing_URL_Dataset.csv")

# 2. Data Preprocessing
# Separate features and target variable
X = df.drop(['label', 'FILENAME', 'URL', 'Domain', 'TLD', 'Title'], axis=1)
y = df['label']

# Handle missing values using imputation
imputer = SimpleImputer(strategy='mean')  # Replace missing values with the mean
X = imputer.fit_transform(X)

# Scale numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing with an 80/20 ratio
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 3. Define Base Models
# Random Forest
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
# Support Vector Machine
svm_clf = SVC(probability=True, random_state=42)  # probability=True for predict_proba
# Gaussian Naive Bayes
gnb_clf = GaussianNB()

# 4. Train Base Models and Make Predictions
rf_clf.fit(X_train, y_train)
svm_clf.fit(X_train, y_train)
gnb_clf.fit(X_train, y_train)

rf_train_pred = rf_clf.predict_proba(X_train)[:, 1]
svm_train_pred = svm_clf.predict_proba(X_train)[:, 1]
gnb_train_pred = gnb_clf.predict_proba(X_train)[:, 1]

rf_test_pred = rf_clf.predict_proba(X_test)[:, 1]
svm_test_pred = svm_clf.predict_proba(X_test)[:, 1]
gnb_test_pred = gnb_clf.predict_proba(X_test)[:, 1]

# 5. Prepare Meta-Learner Training Data
X_train_meta = np.column_stack((rf_train_pred, svm_train_pred, gnb_train_pred))
X_test_meta = np.column_stack((rf_test_pred, svm_test_pred, gnb_test_pred))

# 6. Define and Train the Meta-Learner (XGBoost)
xgb_clf = xgb.XGBClassifier(random_state=42)
xgb_clf.fit(X_train_meta, y_train)

# 7. Make Predictions with the Stacked Ensemble
y_pred_meta = xgb_clf.predict(X_test_meta)

# 8. Evaluate the Stacked Ensemble
accuracy = accuracy_score(y_test, y_pred_meta)
precision = precision_score(y_test, y_pred_meta)
recall = recall_score(y_test, y_pred_meta)
f1 = f1_score(y_test, y_pred_meta)
roc_auc = roc_auc_score(y_test, y_pred_meta)

print("Stacked Ensemble Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_meta))

Stacked Ensemble Performance:
Accuracy: 0.9996
Precision: 1.0000
Recall: 0.9993
F1 Score: 0.9996
ROC AUC: 0.9996

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20124
           1       1.00      1.00      1.00     27035

    accuracy                           1.00     47159
   macro avg       1.00      1.00      1.00     47159
weighted avg       1.00      1.00      1.00     47159

