# Phishing Detection Notebook

This notebook walks through:

- Loading and preprocessing phishing datasets.
- Training and evaluating Machine Learning (ML) and Artificial Intelligence (AI) models.
- Compiling YARA rules and testing them against a sample domain.


In [None]:
# Import necessary libraries
import logging
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from src.detection.yara_rules.phishing_rules_loader import update_phishing_lists, compile_yara_rules
from src.detection.phishing_ml_model import train_ml_model, evaluate_ml_model
from src.detection.phishing_ai_model import build_ai_model, train_ai_model, evaluate_ai_model, plot_training_history
from src.preprocessing.phishing_preprocessing import load_phishing_data, preprocess_domain_data, preprocess_url_data

# Configure the root logger: INFO and above, each record prefixed with a
# timestamp and its level name
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)


In [None]:
# Refresh the phishing datasets and YARA rules via update_phishing_lists().
# Any failure is written to the log and then re-raised, so the error still
# propagates out of this cell.
logging.info("Updating phishing datasets and YARA rules...")
try:
    update_phishing_lists()
except Exception as update_error:
    logging.error(f"Error updating phishing datasets: {update_error}")
    raise


In [None]:
# Input list files (relative paths, resolved against the working directory)
domain_file = "data/phishing/domains.lst"
url_file = "data/phishing/urls.lst"

# Read both lists with a single loader call
logging.info("Loading datasets...")
raw_domains, raw_urls = load_phishing_data(domain_file, url_file)

# Run each raw dataset through its own preprocessing routine; the results are
# what the following cells consume as `domains` and `urls`
logging.info("Preprocessing datasets...")
domains = preprocess_domain_data(raw_domains)
urls = preprocess_url_data(raw_urls)


In [None]:
# Stack the domain and URL rows into one frame (index renumbered by
# ignore_index=True), then shuffle the rows with a fixed seed so the order is
# reproducible between runs
combined_features = shuffle(
    pd.concat([domains, urls], ignore_index=True),
    random_state=42,
)

# Columns selected as model inputs
feature_columns = [
    "length",
    "num_dots",
    "has_suspicious_keywords",
    "contains_ip",
    "num_special_chars",
    "has_encoded_chars",
    "has_uncommon_tld",
]
X = combined_features[feature_columns]

# Placeholder labels: every row is marked 1 (phishing), so the target contains
# a single class. Replace with true labels if available.
y = [1] * X.shape[0]


In [None]:
# Hold out 20% of the rows as the final test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the ML model on the training portion and evaluate it on the test set
logging.info("Training and evaluating the ML model...")
ml_model = train_ml_model(X_train, y_train)
evaluate_ml_model(ml_model, X_test, y_test)

# Train and evaluate the AI model.
# The validation data is carved out of the training portion (25% of it, i.e.
# roughly 20% of all rows) instead of reusing the test set. Passing X_test as
# validation data would expose the test set to the training run, so any
# validation-driven decision made there (e.g. early stopping, if
# train_ai_model uses it) would bias the final evaluation below.
logging.info("Training and evaluating the AI model...")
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)
ai_model = build_ai_model(input_dim=X_train.shape[1])
history = train_ai_model(ai_model, X_fit, y_fit, X_val=X_val, y_val=y_val)
evaluate_ai_model(ai_model, X_test, y_test)

# Plot the history object returned by the AI training run
logging.info("Plotting training history...")
plot_training_history(history)


In [None]:
# Compile the YARA rules and match them against one hard-coded sample domain.
# This cell is best-effort: any exception is logged and NOT re-raised, so the
# notebook continues past a YARA failure.
logging.info("Compiling and evaluating YARA rules...")
sample_domain = "secure-login.bank.com"  # Replace with dynamic input if needed
try:
    yara_rules = compile_yara_rules()
    matches = yara_rules.match(data=sample_domain)
    logging.info(f"YARA Matches for '{sample_domain}': {matches}")
except Exception as yara_error:
    logging.error(f"Error in YARA rule evaluation: {yara_error}")


In [None]:
# Final status message for the notebook run. Note that the YARA cell above
# logs its errors without raising, so this line is reached even if that step failed.
logging.info("Phishing detection pipeline completed.")
