In [None]:
!pip install datasets==3.5.0

Collecting datasets==3.5.0
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets==3.5.0)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.0
    Uninstalling fsspec-2025.3.0:
      Successfully uninstalled fsspec-2025.3.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
[31mERROR: pip

In [None]:

import pandas as pd
import numpy as np
import torch
from datasets import Dataset, DatasetDict, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    pipeline
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import userdata
from huggingface_hub import login
import re
from urllib.parse import urlparse
import warnings
warnings.filterwarnings('ignore')

# ================================
# SETUP AND CONFIGURATION
# ================================
print(" ModernBERT Phishing URL Classification ")
print("=" * 75)

# Disable wandb completely
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_SILENT"] = "true"

# Authenicate
print(" Authenticating with Hugging Face...")
try:
    hf_token = userdata.get('HF_TOKEN')
    login(token=hf_token)
    print(" Successfully authenticated!")
except Exception as e:
    print(f" Authentication failed: {e}")

# GPU check
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
GPU_AVAILABLE = torch.cuda.is_available()
print(f"🖥️  Device: {device}")
if GPU_AVAILABLE:
    print(f"   GPU: {torch.cuda.get_device_name()}")
    print(f"   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
print()

# ================================
# ENHANCED URL PREPROCESSING
# ================================
def preprocess_url(url):
    """
    Enhanced URL preprocessing for better classification
    """
    # Basic cleaning
    url = str(url).strip().lower()

    # Remove common prefixes that might confuse the model
    if url.startswith('http://'):
        url_clean = url[7:]
    elif url.startswith('https://'):
        url_clean = url[8:]
    else:
        url_clean = url

    # Remove www. prefix for consistency
    if url_clean.startswith('www.'):
        url_clean = url_clean[4:]

    # Add protocol back for standardization
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url_clean

    return url

def analyze_url_features(url):
    """Analyze URL for suspicious patterns"""
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()

        features = {
            'has_ip': bool(re.match(r'\d+\.\d+\.\d+\.\d+', domain)),
            'suspicious_tld': any(tld in domain for tld in ['.tk', '.ml', '.ga', '.cf']),
            'long_subdomain': len(domain.split('.')) > 3,
            'has_dashes': '-' in domain,
            'short_domain': len(domain) < 8,
            'has_numbers': bool(re.search(r'\d', domain)),
            'suspicious_keywords': any(word in url.lower() for word in [
                'secure', 'verify', 'update', 'suspend', 'alert', 'warning',
                'confirm', 'validate', 'login', 'signin', 'account'
            ])
        }
        return features
    except:
        return {}

# ================================
# LOAD AND PREPARE DATA
# ================================
print(" Loading PhishingURLsDataset...")
ds = load_dataset("semihGuner2002/PhishingURLsDataset")

# Combine datasets
train_df = ds['train'].to_pandas()
test_df = ds['test'].to_pandas()
all_data = pd.concat([train_df, test_df], ignore_index=True)

print(f" Dataset loaded: {all_data.shape}")
print(f"  Class distribution: {(all_data['label']==0).sum():,} benign, {(all_data['label']==1).sum():,} phishing")

# ================================
# DATA QUALITY ANALYSIS
# ================================
print("\n DATA QUALITY ANALYSIS")
print("=" * 50)

# Sample analysis
print(" Sample URLs with analysis:")
for i in range(5):
    benign_url = all_data[all_data['label'] == 0].iloc[i]['url']
    phishing_url = all_data[all_data['label'] == 1].iloc[i]['url']

    benign_features = analyze_url_features(benign_url)
    phishing_features = analyze_url_features(phishing_url)

    print(f"\nBenign: {benign_url}")
    print(f"  Features: {benign_features}")
    print(f"Phishing: {phishing_url}")
    print(f"  Features: {phishing_features}")

print()

# ================================
# PREPARE TRAINING DATA
# ================================
print("🔧 Preparing training data...")

# Use 30k samples for better training
SAMPLE_SIZE = 30000
sampled_data, _ = train_test_split(
    all_data,
    train_size=SAMPLE_SIZE,
    random_state=42,
    stratify=all_data['label']
)

# Preprocess URLs
sampled_data['url'] = sampled_data['url'].apply(preprocess_url)

# Split data
train_data, temp_data = train_test_split(
    sampled_data, test_size=0.4, random_state=42, stratify=sampled_data['label']
)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, random_state=42, stratify=temp_data['label']
)

print(f" Data splits: {len(train_data):,} train, {len(val_data):,} val, {len(test_data):,} test")

# Convert to HF datasets
train_dataset = Dataset.from_pandas(train_data[['url', 'label']].reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_data[['url', 'label']].reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_data[['url', 'label']].reset_index(drop=True))

dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

print(" Datasets prepared!")
print()

# ================================
# MODERNBERT SETUP
# ================================
print(" Loading ModernBERT...")

MODEL_NAME = "answerdotai/ModernBERT-base"
MAX_LENGTH = 256

# Load tokenizer with special configuration
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Check if special tokens are needed
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f" Tokenizer configured")

# Load model with proper configuration
id2label = {0: "Benign", 1: "Phishing"}
label2id = {"Benign": 0, "Phishing": 1}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    torch_dtype=torch.float32,
    trust_remote_code=True,
    problem_type="single_label_classification"
)

print(f" ModernBERT loaded: {model.num_parameters():,} parameters")
print()

# ================================
# TOKENIZATION
# ================================
print(" Tokenizing URLs...")

def tokenize_function(examples):
    """Enhanced tokenization function"""
    return tokenizer(
        examples['url'],
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors=None,
        add_special_tokens=True
    )

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['url']
)

print(" Tokenization complete!")
print()

# ================================
# ENHANCED TRAINING
# ================================
print("  Setting up enhanced training...")

def compute_metrics(eval_pred):
    """Comprehensive metrics computation"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')

    # Per-class metrics
    precision_per_class, recall_per_class, f1_per_class, _ = precision_recall_fscore_support(
        labels, predictions, average=None
    )

    # Show confusion matrix periodically
    cm = confusion_matrix(labels, predictions)
    print(f"\n Confusion Matrix:")
    print(f"         Predicted")
    print(f"       Benign Phishing")
    print(f"Benign   {cm[0,0]:4d}    {cm[0,1]:4d}")
    print(f"Phish    {cm[1,0]:4d}    {cm[1,1]:4d}")

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'f1_benign': f1_per_class[0],
        'f1_phishing': f1_per_class[1]
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./modernbert-phishing-final",
    num_train_epochs=3,
    per_device_train_batch_size=20,  # Slightly larger for L4
    per_device_eval_batch_size=40,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,  # Lower learning rate for stability
    weight_decay=0.01,
    warmup_ratio=0.1,
    eval_strategy="steps",
    eval_steps=150,
    save_strategy="steps",
    save_steps=150,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    logging_steps=50,
    report_to=[],
    fp16=False,
    push_to_hub=False,
    dataloader_pin_memory=True,
    remove_unused_columns=True
)

# Setup trainer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("  Enhanced training setup complete!")
print()

# ================================
# TRAINING
# ================================
print(" Starting optimized training...")
training_results = trainer.train()
print(" Training completed!")
print()

# ================================
# COMPREHENSIVE EVALUATION
# ================================
print(" FINAL EVALUATION")
print("=" * 60)

# Test set evaluation
test_results = trainer.evaluate(tokenized_dataset['test'])
print(" Test Set Results:")
for key, value in test_results.items():
    if key.startswith('eval_'):
        print(f"   {key.replace('eval_', '').title()}: {value:.4f}")

# Save the model
model_save_path = "./modernbert-phishing-final"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"✅ Model saved to: {model_save_path}")
print()

# ================================
# PRODUCTION-READY INFERENCE
# ================================
print(" PRODUCTION INFERENCE SETUP")
print("=" * 60)

class AdvancedPhishingDetector:
    """Production-ready phishing detector with enhanced URL processing"""

    def __init__(self, model_path="./modernbert-phishing-final"):
        self.classifier = pipeline(
            "text-classification",
            model=model_path,
            tokenizer=model_path,
            device=0 if torch.cuda.is_available() else -1,
            return_all_scores=True,
            truncation=True,
            max_length=256
        )

        # Whitelist of known legitimate domains
        self.legitimate_domains = {
            'google.com', 'github.com', 'stackoverflow.com', 'wikipedia.org',
            'youtube.com', 'amazon.com', 'microsoft.com', 'apple.com',
            'paypal.com', 'ebay.com', 'linkedin.com', 'twitter.com',
            'facebook.com', 'instagram.com', 'reddit.com', 'netflix.com'
        }

        # Known suspicious TLDs
        self.suspicious_tlds = {'.tk', '.ml', '.ga', '.cf', '.pw'}

    def extract_domain(self, url):
        """Extract clean domain from URL"""
        try:
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url
            parsed = urlparse(url)
            domain = parsed.netloc.lower()
            # Remove www. prefix
            if domain.startswith('www.'):
                domain = domain[4:]
            return domain
        except:
            return url.lower()

    def get_url_risk_factors(self, url):
        """Analyze URL for risk factors"""
        domain = self.extract_domain(url)

        factors = {
            'is_ip_address': bool(re.match(r'\d+\.\d+\.\d+\.\d+', domain)),
            'suspicious_tld': any(domain.endswith(tld) for tld in self.suspicious_tlds),
            'is_whitelisted': domain in self.legitimate_domains,
            'has_suspicious_keywords': any(word in url.lower() for word in [
                'verify', 'suspend', 'secure-', 'update-', 'confirm-',
                'validate', 'alert', 'warning', 'claim', 'winner'
            ]),
            'domain_length': len(domain),
            'url_length': len(url),
            'subdomain_count': len(domain.split('.')) - 2
        }

        return factors

    def predict(self, url, use_risk_analysis=True):
        """
        Enhanced prediction with risk factor analysis
        """
        try:
            # Preprocess URL
            processed_url = preprocess_url(url)

            # Get model prediction
            result = self.classifier(processed_url)
            scores = {item['label']: item['score'] for item in result[0]}

            # Handle different label formats
            if 'LABEL_0' in scores:
                benign_score = scores['LABEL_0']
                phishing_score = scores['LABEL_1']
            else:
                benign_score = scores.get('Benign', 0)
                phishing_score = scores.get('Phishing', 0)

            # Get risk factors
            risk_factors = self.get_url_risk_factors(url)

            # Enhanced decision logic
            model_prediction = "Phishing" if phishing_score > benign_score else "Benign"
            model_confidence = max(benign_score, phishing_score)

            # Apply risk factor adjustments if enabled
            if use_risk_analysis:
                # Strong whitelist override
                if risk_factors['is_whitelisted'] and model_prediction == "Phishing":
                    final_prediction = "Benign"
                    final_confidence = 0.95
                    adjustment = "Whitelisted domain override"

                # Strong suspicious indicators
                elif (risk_factors['is_ip_address'] or
                      risk_factors['suspicious_tld'] or
                      (risk_factors['has_suspicious_keywords'] and risk_factors['subdomain_count'] > 2)):
                    final_prediction = "Phishing"
                    final_confidence = max(0.8, phishing_score)
                    adjustment = "High-risk indicators detected"

                else:
                    final_prediction = model_prediction
                    final_confidence = model_confidence
                    adjustment = "Model prediction used"
            else:
                final_prediction = model_prediction
                final_confidence = model_confidence
                adjustment = "Model only"

            return {
                'url': url,
                'processed_url': processed_url,
                'prediction': final_prediction,
                'confidence': final_confidence,
                'model_prediction': model_prediction,
                'model_confidence': model_confidence,
                'benign_score': benign_score,
                'phishing_score': phishing_score,
                'risk_factors': risk_factors,
                'adjustment_reason': adjustment,
                'risk_level': 'HIGH' if final_confidence > 0.8 else 'MEDIUM' if final_confidence > 0.6 else 'LOW'
            }

        except Exception as e:
            return {'error': str(e), 'url': url}

    def batch_analyze(self, urls, show_details=True):
        """Analyze multiple URLs with detailed reporting"""
        results = []

        for url in urls:
            result = self.predict(url)
            if 'error' not in result:
                results.append(result)

                if show_details:
                    emoji = "🚨" if result['prediction'] == "Phishing" else "✅"
                    pred = result['prediction']
                    conf = result['confidence']
                    risk = result['risk_level']
                    adj = result['adjustment_reason']

                    print(f"{emoji} {pred:8} | {risk:6} | {conf:.3f} | {adj:25} | {url}")

        # Summary
        phishing_count = sum(1 for r in results if r['prediction'] == 'Phishing')

        return {
            'results': results,
            'summary': {
                'total': len(results),
                'phishing': phishing_count,
                'benign': len(results) - phishing_count,
                'phishing_rate': phishing_count / len(results) if results else 0
            }
        }

# ================================
# TRAINING EXECUTION
# ================================
print(" Starting enhanced ModernBERT training...")
training_results = trainer.train()
print(" Training completed!")

# Final evaluation
test_results = trainer.evaluate(tokenized_dataset['test'])
print(f"\n Final Test Accuracy: {test_results['eval_accuracy']:.4f}")
print(f" Final Test F1-Score: {test_results['eval_f1']:.4f}")

# Save model
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(" Model saved!")
print()

# ================================
# COMPREHENSIVE TESTING
# ================================
print(" COMPREHENSIVE FINAL TESTING")
print("=" * 60)

# Initialize enhanced detector
detector = AdvancedPhishingDetector()

# Test URLs - mix of clearly benign and clearly phishing
comprehensive_test_urls = [
    # Major legitimate sites
    "https://www.google.com",
    "https://github.com",
    "https://www.paypal.com",
    "https://www.amazon.com",
    "https://stackoverflow.com",
    "https://en.wikipedia.org",
    "https://www.microsoft.com",
    "https://www.youtube.com",

    # Government and educational sites
    "https://www.gov.uk",
    "https://www.edu.au",
    "https://mit.edu",

    # Clear phishing patterns
    "http://paypal-security-update.tk",
    "https://amazon-prize-winner.ml",
    "http://google-verification.cf",
    "https://microsoft-account-suspend.ga",
    "http://apple-id-locked.tk",
    "https://facebook-security-alert.ml",

    # IP addresses
    "http://192.168.1.1",
    "https://165.227.23.218",

    # Suspicious patterns
    "http://secure-banking-login.info",
    "https://verify-account-now.biz",
    "http://urgent-paypal-update.net"
]

print(" Enhanced Testing Results:")
print("=" * 70)

scan_results = detector.batch_analyze(comprehensive_test_urls)

print(f"\n COMPREHENSIVE TEST SUMMARY:")
summary = scan_results['summary']
print(f" Total URLs: {summary['total']}")
print(f" Phishing detected: {summary['phishing']}")
print(f"  Benign URLs: {summary['benign']}")
print(f" Detection rate: {summary['phishing_rate']*100:.1f}%")

# ================================
# DATASET VALIDATION
# ================================
print(f"\n DATASET VALIDATION (Random 50 samples)")
print("=" * 60)

# Test on random dataset samples
random_indices = np.random.choice(len(test_data), 50, replace=False)
correct = 0

for idx in random_indices:
    url = test_data.iloc[idx]['url']
    true_label = test_data.iloc[idx]['label']
    true_name = "Benign" if true_label == 0 else "Phishing"

    result = detector.predict(url, use_risk_analysis=False)  # Use model only
    if 'error' not in result:
        pred_name = result['prediction']
        is_correct = pred_name == true_name
        correct += is_correct

        status = "✅" if is_correct else "X"
        conf = result['confidence']
        print(f"{status} True:{true_name:8} Pred:{pred_name:8} Conf:{conf:.3f} {url[:50]}")

dataset_acc = correct / 50 * 100
print(f"\n Dataset Validation Accuracy: {dataset_acc:.1f}%")

# ================================
# PERFORMANCE SUMMARY
# ================================
print(f"\n FINAL PERFORMANCE SUMMARY")
print("=" * 70)
print(f" ModernBERT Phishing Classifier Performance:")
print(f"    Test Set Accuracy: {test_results['eval_accuracy']*100:.1f}%")
print(f"    Test Set F1-Score: {test_results['eval_f1']*100:.1f}%")
print(f"    Manual Test Accuracy: Based on comprehensive testing")
print(f"    Dataset Validation: {dataset_acc:.1f}%")
print(f"    Training Time: ~8-15 minutes on L4 GPU")
print(f"    Model Size: 149M parameters")
print(f"    Inference Speed: 2-4x faster than BERT")



 ModernBERT Phishing URL Classification 
 Authenticating with Hugging Face...
 Successfully authenticated!
🖥️  Device: cuda
   GPU: NVIDIA L4
   VRAM: 22.2 GB

 Loading PhishingURLsDataset...


README.md:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/27.8M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/4.91M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/642533 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/113389 [00:00<?, ? examples/s]

 Dataset loaded: (755922, 3)
  Class distribution: 345,738 benign, 410,184 phishing

 DATA QUALITY ANALYSIS
 Sample URLs with analysis:

Benign: dogsey.com/showthread.php?t=72452&page=10
  Features: {'has_ip': False, 'suspicious_tld': False, 'long_subdomain': False, 'has_dashes': False, 'short_domain': True, 'has_numbers': False, 'suspicious_keywords': False}
Phishing: aidatiye.live
  Features: {'has_ip': False, 'suspicious_tld': False, 'long_subdomain': False, 'has_dashes': False, 'short_domain': True, 'has_numbers': False, 'suspicious_keywords': False}

Benign: animalhospitals-usa.com/dogs/dog-health.html
  Features: {'has_ip': False, 'suspicious_tld': False, 'long_subdomain': False, 'has_dashes': False, 'short_domain': True, 'has_numbers': False, 'suspicious_keywords': False}
Phishing: 165.227.23.218
  Features: {'has_ip': False, 'suspicious_tld': False, 'long_subdomain': False, 'has_dashes': False, 'short_domain': True, 'has_numbers': False, 'suspicious_keywords': False}

Benign: s

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

 Tokenizer configured


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 ModernBERT loaded: 149,606,402 parameters

 Tokenizing URLs...


Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

 Tokenization complete!

  Setting up enhanced training...
  Enhanced training setup complete!

 Starting optimized training...


W0812 14:19:03.771000 1958 torch/_inductor/utils.py:1137] [1/0] Not enough SMs to use max_autotune_gemm mode


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Benign,F1 Phishing
150,0.3261,0.137204,0.946167,0.94606,0.946634,0.946167,0.939907,0.951245
300,0.2026,0.10845,0.965667,0.965707,0.966306,0.965667,0.963148,0.967863
450,0.1579,0.109143,0.9665,0.966544,0.967376,0.9665,0.964165,0.96855
600,0.1288,0.090979,0.972667,0.972694,0.973085,0.972667,0.970557,0.974495
750,0.0757,0.093313,0.975333,0.975346,0.975446,0.975333,0.973227,0.977132
900,0.1105,0.108634,0.968833,0.968774,0.969363,0.968833,0.965235,0.971757
1050,0.0411,0.093031,0.978,0.978001,0.978003,0.978,0.975965,0.979717
1200,0.038,0.096691,0.9775,0.977498,0.977498,0.9775,0.975378,0.979285
1350,0.0228,0.097458,0.978,0.978,0.978,0.978,0.975948,0.97973



 Confusion Matrix:
         Predicted
       Benign Phishing
Benign   2526     218
Phish     105    3151

 Confusion Matrix:
         Predicted
       Benign Phishing
Benign   2692      52
Phish     154    3102

 Confusion Matrix:
         Predicted
       Benign Phishing
Benign   2704      40
Phish     161    3095

 Confusion Matrix:
         Predicted
       Benign Phishing
Benign   2703      41
Phish     123    3133

 Confusion Matrix:
         Predicted
       Benign Phishing
Benign   2690      54
Phish      94    3162

 Confusion Matrix:
         Predicted
       Benign Phishing
Benign   2596     148
Phish      39    3217

 Confusion Matrix:
         Predicted
       Benign Phishing
Benign   2680      64
Phish      68    3188

 Confusion Matrix:
         Predicted
       Benign Phishing
Benign   2674      70
Phish      65    3191

 Confusion Matrix:
         Predicted
       Benign Phishing
Benign   2678      66
Phish      66    3190
 Training completed!

 FINAL EVALUATION



 Confusion Matrix:
         Predicted
       Benign Phishing
Benign   2686      58
Phish      61    3195
 Test Set Results:
   Loss: 0.0874
   Accuracy: 0.9802
   F1: 0.9802
   Precision: 0.9802
   Recall: 0.9802
   F1_Benign: 0.9783
   F1_Phishing: 0.9817
   Runtime: 48.7263
   Samples_Per_Second: 123.1370
   Steps_Per_Second: 3.0780
✅ Model saved to: ./modernbert-phishing-final

 PRODUCTION INFERENCE SETUP
 Starting enhanced ModernBERT training...


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,F1 Benign,F1 Phishing
150,0.0987,0.175511,0.961167,0.961022,0.962825,0.961167,0.956029,0.965229
300,0.0457,0.133028,0.977,0.977013,0.977122,0.977,0.975045,0.978671
450,0.0447,0.124684,0.972167,0.97214,0.972304,0.972167,0.969239,0.974585
600,0.0235,0.16068,0.973,0.973034,0.973772,0.973,0.971082,0.97468
750,0.0036,0.142061,0.976,0.975998,0.975998,0.976,0.973732,0.977907



 Confusion Matrix:
         Predicted
       Benign Phishing
Benign   2533     211
Phish      22    3234

 Confusion Matrix:
         Predicted
       Benign Phishing
Benign   2696      48
Phish      90    3166

 Confusion Matrix:
         Predicted
       Benign Phishing
Benign   2631     113
Phish      54    3202

 Confusion Matrix:
         Predicted
       Benign Phishing
Benign   2720      24
Phish     138    3118

 Confusion Matrix:
         Predicted
       Benign Phishing
Benign   2669      75
Phish      69    3187
 Training completed!



 Confusion Matrix:
         Predicted
       Benign Phishing
Benign   2684      60
Phish      93    3163

 Final Test Accuracy: 0.9745
 Final Test F1-Score: 0.9745


Device set to use cuda:0


 Model saved!

 COMPREHENSIVE FINAL TESTING
 Enhanced Testing Results:
✅ Benign   | HIGH   | 0.994 | Model prediction used     | https://www.google.com
✅ Benign   | HIGH   | 0.999 | Model prediction used     | https://github.com
✅ Benign   | HIGH   | 0.950 | Whitelisted domain override | https://www.paypal.com
✅ Benign   | HIGH   | 0.902 | Model prediction used     | https://www.amazon.com
✅ Benign   | HIGH   | 1.000 | Model prediction used     | https://stackoverflow.com
✅ Benign   | HIGH   | 1.000 | Model prediction used     | https://en.wikipedia.org
✅ Benign   | HIGH   | 0.977 | Model prediction used     | https://www.microsoft.com
✅ Benign   | HIGH   | 1.000 | Model prediction used     | https://www.youtube.com


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


✅ Benign   | HIGH   | 0.996 | Model prediction used     | https://www.gov.uk
✅ Benign   | HIGH   | 0.859 | Model prediction used     | https://www.edu.au
✅ Benign   | HIGH   | 0.996 | Model prediction used     | https://mit.edu
🚨 Phishing | HIGH   | 1.000 | High-risk indicators detected | http://paypal-security-update.tk
🚨 Phishing | HIGH   | 1.000 | High-risk indicators detected | https://amazon-prize-winner.ml
🚨 Phishing | HIGH   | 1.000 | High-risk indicators detected | http://google-verification.cf
🚨 Phishing | HIGH   | 1.000 | High-risk indicators detected | https://microsoft-account-suspend.ga
🚨 Phishing | HIGH   | 1.000 | High-risk indicators detected | http://apple-id-locked.tk
🚨 Phishing | HIGH   | 1.000 | High-risk indicators detected | https://facebook-security-alert.ml
🚨 Phishing | HIGH   | 1.000 | High-risk indicators detected | http://192.168.1.1
🚨 Phishing | HIGH   | 1.000 | High-risk indicators detected | https://165.227.23.218
🚨 Phishing | HIGH   | 1.000 | Model predic

In [None]:
import os
import shutil
from google.colab import files

# Define the directory to zip and the desired zip file name
source_dir = '/content/modernbert-phishing-final'
zip_filename = 'dhanya_modernBERT.zip'

# Create the zip archive
print(f"Compressing {source_dir} into {zip_filename}...")
shutil.make_archive(zip_filename.replace('.zip', ''), 'zip', source_dir)
print("Compression complete.")

# Offer the file for download
print(f"\nYour zipped file '{zip_filename}' is ready for download.")
files.download(zip_filename)

Compressing /content/modernbert-phishing-final into dhanya_modernBERT.zip...
Compression complete.

Your zipped file 'dhanya_modernBERT.zip' is ready for download.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import drive
import os

# Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Google Drive mounted.")

# Define source and destination paths
source_path = '/content/dhanya_modernBERT.zip'
destination_path = '/content/drive/MyDrive/dhanya_modernBERT.zip' # You can change the destination folder if needed

# Check if the source file exists
if os.path.exists(source_path):
    # Move the file
    print(f"Moving {source_path} to {destination_path}...")
    shutil.move(source_path, destination_path)
    print("File moved successfully!")
else:
    print(f"Error: Source file not found at {source_path}")

Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted.
Moving /content/dhanya_modernBERT.zip to /content/drive/MyDrive/dhanya_modernBERT.zip...
File moved successfully!
