In [None]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
import re
from urllib.parse import urlparse

In [None]:
# Step 2: Load datasets
# Each class is stored in its own CSV, read relative to the working directory.
# Later cells read a 'URL' column from the combined frame.
legitimate_df = pd.read_csv('structured_data_legitimate.csv')
phishing_df = pd.read_csv('structured_data_phishing.csv')

In [None]:
# Step 3: Combine datasets
# The class label is assigned here (0 = legitimate, 1 = phishing); an existing
# 'label' column in either frame would be overwritten by these assignments.
legitimate_df['label'] = 0
phishing_df['label'] = 1
# ignore_index=True renumbers the rows so the combined frame has a fresh 0..n-1 index.
df = pd.concat([legitimate_df, phishing_df], ignore_index=True)

In [None]:
# Step 4: Feature Engineering (extract features from URL)
def extract_features(url):
    """Derive lexical features from a single URL string.

    Args:
        url: The URL to analyse. A missing scheme (e.g. ``example.com/login``)
            is tolerated; the string is then parsed as a network location.

    Returns:
        dict: In insertion order: ``url_length`` (int), ``num_subdomains``
        (int, never negative), ``uses_https``, ``has_ip``,
        ``has_special_chars``, ``has_query``, ``has_fragment`` (bool) and
        ``is_known_domain`` (0 or 1).
    """
    features = {}

    # urlparse() only recognises a host when the URL has a scheme or starts
    # with '//'. Without the prefix, scheme-less input gives hostname=None and
    # the string methods below would raise AttributeError.
    has_scheme = re.match(r'[A-Za-z][A-Za-z0-9+.-]*://', url) is not None
    parsed = urlparse(url if has_scheme else '//' + url)
    domain = parsed.hostname or ''

    # Extracting URL length
    features['url_length'] = len(url)

    # Number of subdomains: dots in the host minus the one before the TLD,
    # clamped so single-label or empty hosts give 0 instead of -1.
    features['num_subdomains'] = max(domain.count('.') - 1, 0)

    # Checking for HTTPS on the parsed scheme, so 'httpsfoo.com' is not
    # mistaken for HTTPS and 'HTTPS://' is recognised.
    features['uses_https'] = parsed.scheme == 'https'

    # Presence of IP address in the URL
    features['has_ip'] = bool(re.search(r'\d+\.\d+\.\d+\.\d+', url))

    # Presence of special characters
    features['has_special_chars'] = bool(re.search(r'[^A-Za-z0-9./:_-]', url))

    # Query and fragment presence (their contents are not inspected)
    features['has_query'] = bool(parsed.query)
    features['has_fragment'] = bool(parsed.fragment)

    # Known legitimate sources. Match on a label boundary so that
    # 'evilgoogle.com' does not count as 'google.com'.
    known_domains = ['google.com', 'facebook.com', 'twitter.com', 'amazon.com']
    features['is_known_domain'] = int(any(
        domain == known_domain or domain.endswith('.' + known_domain)
        for known_domain in known_domains
    ))

    return features


In [None]:
# Apply the URL feature extraction to all URLs
# (one extract_features call per row; each call returns a dict of features)
df_features = df['URL'].apply(extract_features)

# Convert feature dictionaries into a DataFrame
# (one column per feature key, rows in the same order as df)
features_df = pd.DataFrame(df_features.tolist())

In [None]:
# Step 5: Prepare the final feature set and labels
# X and y are row-aligned: features_df was built from df['URL'] in row order.
X = features_df  # Feature matrix
y = df['label']  # Target vector (0 for legitimate, 1 for phishing)

In [None]:
# Step 6: Split data into training and testing sets
# 80/20 split; the fixed random_state makes the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Data Scaling (important for models like SVM, KNN, etc.)
# The scaler is fitted on the training split only and then reused unchanged on
# the test split, so test-set statistics do not influence the transform.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Step 8: Train a Random Forest model (or you can choose other classifiers)
# 100 trees, seeded so repeated runs build the same forest.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Step 9: Evaluate the model
# Predict on the held-out, scaled test split; the metrics are computed from y_pred.
y_pred = model.predict(X_test_scaled)

In [None]:
# Accuracy Score
# Fraction of test rows whose predicted label equals the true label, as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Classification Report
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

# Confusion Matrix
# (scikit-learn convention: rows are true classes, columns are predicted classes)
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))

# Step 10: Cross-validation score
# NOTE(review): this call is given the unscaled X, whereas the fit above used
# X_train_scaled. The printed value is the mean over the 5 folds.
cv_scores = cross_val_score(model, X, y, cv=5)
print(f'\nCross-Validation Accuracy: {cv_scores.mean() * 100:.2f}%')


Accuracy: 99.74%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3265
           1       1.00      0.99      1.00      2052

    accuracy                           1.00      5317
   macro avg       1.00      1.00      1.00      5317
weighted avg       1.00      1.00      1.00      5317


Confusion Matrix:
[[3265    0]
 [  14 2038]]

Cross-Validation Accuracy: 99.71%


In [None]:
# Step 11: Dynamic Prediction Function
def dynamic_predict_url():
    """Interactively classify URLs typed by the user until they enter 'exit'.

    Reads the notebook-level ``extract_features``, ``scaler`` and ``model``
    objects. Blank input is ignored and URLs that cannot be parsed are reported
    and skipped instead of ending the session with a traceback.
    """
    print("Welcome to the URL phishing detection system.")
    print("You can enter a URL to check if it is phishing or legitimate.")
    print("To exit, type 'exit'.")

    while True:
        # Get the URL input from the user; surrounding whitespace is not part of the URL
        url = input("Enter URL: ").strip()

        # Exit condition
        if url.lower() == 'exit':
            print("Exiting the prediction system.")
            break

        # Nothing to classify on an empty line
        if not url:
            continue

        # Extract features from the URL. Parsing can fail on input without a
        # recognisable host (AttributeError) or on a malformed URL (ValueError).
        try:
            features = extract_features(url)
        except (AttributeError, ValueError) as err:
            print(f"Could not extract features from '{url}': {err}")
            continue
        features_df = pd.DataFrame([features])

        # Scale the features using the scaler fitted on the training data
        features_scaled = scaler.transform(features_df)

        # predict() returns an array with one entry per input row
        prediction = model.predict(features_scaled)

        # Display the result for the single row that was submitted
        if prediction[0] == 1:
            print(f"URL: {url} -> Phishing")
        else:
            print(f"URL: {url} -> Legitimate")

# Call the dynamic prediction function
# (the cell waits on input() until the user types 'exit')
dynamic_predict_url()


Welcome to the URL phishing detection system.
You can enter a URL to check if it is phishing or legitimate.
To exit, type 'exit'.
Enter URL: https://colab.research.google.com/drive/1gxXNmPdXW1vhXif1ztyjPGobbn7WHZDh?authuser=2#scrollTo=pEnO5BZ2kuFC
URL: https://colab.research.google.com/drive/1gxXNmPdXW1vhXif1ztyjPGobbn7WHZDh?authuser=2#scrollTo=pEnO5BZ2kuFC -> Phishing
Enter URL: exit
Exiting the prediction system.


# **Approach 02**

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from urllib.parse import urlparse

# Step 1: Load datasets
# One CSV per class, read relative to the working directory.
legitimate_df = pd.read_csv('structured_data_legitimate.csv')
phishing_df = pd.read_csv('structured_data_phishing.csv')

In [None]:

# Step 2: Combine datasets
# Labels are assigned here; they are not read from the CSV files.
legitimate_df['label'] = 0  # Label 0 for legitimate sites
phishing_df['label'] = 1  # Label 1 for phishing sites

# Concatenate the datasets (ignore_index=True gives a fresh 0..n-1 index)
df = pd.concat([legitimate_df, phishing_df], ignore_index=True)

In [None]:
# Step 3: Feature Extraction Function
def extract_features(url):
    """Derive lexical features from a single URL string.

    Args:
        url: The URL to analyse. A missing scheme (e.g. ``example.com/login``)
            is tolerated; the string is then parsed as a network location.

    Returns:
        dict: In insertion order: ``url_length`` (int), ``num_subdomains``
        (int, never negative), ``uses_https``, ``has_ip``,
        ``has_special_chars``, ``has_query``, ``has_fragment``,
        ``has_random_chars`` (bool) and ``is_known_domain`` (0 or 1).
    """
    features = {}

    # urlparse() only recognises a host when the URL has a scheme or starts
    # with '//'. Without the prefix, scheme-less input gives hostname=None and
    # the string methods below would raise AttributeError.
    has_scheme = re.match(r'[A-Za-z][A-Za-z0-9+.-]*://', url) is not None
    parsed = urlparse(url if has_scheme else '//' + url)
    domain = parsed.hostname or ''

    # URL length
    features['url_length'] = len(url)

    # Number of subdomains, clamped so single-label or empty hosts give 0, not -1
    features['num_subdomains'] = max(domain.count('.') - 1, 0)

    # Presence of HTTPS, checked on the parsed scheme rather than a string prefix
    features['uses_https'] = parsed.scheme == 'https'

    # Presence of an IP address in the URL
    features['has_ip'] = bool(re.search(r'\d+\.\d+\.\d+\.\d+', url))

    # Presence of special characters
    features['has_special_chars'] = bool(re.search(r'[^A-Za-z0-9./:_-]', url))

    # Query and fragment analysis
    features['has_query'] = bool(parsed.query)
    features['has_fragment'] = bool(parsed.fragment)

    # True when any dot in the host is followed by five or more lowercase letters
    features['has_random_chars'] = bool(re.search(r'\.[a-z]{5,}', domain))

    # Known legitimate sources. Match on a label boundary so that
    # 'evilgoogle.com' does not count as 'google.com'.
    known_domains = ['google.com', 'facebook.com', 'youtube.com', 'amazon.com', 'colab.research.google.com']
    features['is_known_domain'] = int(any(
        domain == known_domain or domain.endswith('.' + known_domain)
        for known_domain in known_domains
    ))

    return features



In [None]:
# Step 4: Apply feature extraction on all URLs
# (one extract_features call per row; each call returns a dict)
df_features = df['URL'].apply(extract_features)

# Step 5: Convert the extracted features into a DataFrame
features_df = pd.DataFrame(df_features.tolist())

# Step 6: Prepare the feature matrix (X) and target vector (y)
X = features_df  # Features
y = df['label']  # Target labels (0 = legitimate, 1 = phishing)

# Step 7: Split data into training and testing sets
# 80/20 split; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: Feature scaling (important for models that use distance metrics)
# Fitted on the training split only, then reused unchanged on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 9: Train Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Step 10: Evaluate the model
y_pred = model.predict(X_test_scaled)

# Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Classification Report
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

# Confusion Matrix
# (scikit-learn convention: rows are true classes, columns are predicted classes)
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))

# Step 11: Cross-validation
# NOTE(review): this call is given the unscaled X, whereas the fit above used
# X_train_scaled. The printed value is the mean over the 5 folds.
cv_scores = cross_val_score(model, X, y, cv=5)
print(f'\nCross-Validation Accuracy: {cv_scores.mean() * 100:.2f}%')



Accuracy: 99.83%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3265
           1       1.00      1.00      1.00      2052

    accuracy                           1.00      5317
   macro avg       1.00      1.00      1.00      5317
weighted avg       1.00      1.00      1.00      5317


Confusion Matrix:
[[3265    0]
 [   9 2043]]

Cross-Validation Accuracy: 99.76%


In [None]:
# Step 12: Dynamic URL Prediction
def dynamic_predict_url():
    """Interactively classify URLs typed by the user until they enter 'exit'.

    Reads the notebook-level ``extract_features``, ``scaler`` and ``model``
    objects. Blank input is ignored and URLs that cannot be parsed are reported
    and skipped instead of ending the session with a traceback.
    """
    print("Welcome to the URL phishing detection system.")
    print("You can enter a URL to check if it is phishing or legitimate.")
    print("To exit, type 'exit'.")

    while True:
        # Get the URL input from the user; surrounding whitespace is not part of the URL
        url = input("Enter URL: ").strip()

        # Exit condition
        if url.lower() == 'exit':
            print("Exiting the prediction system.")
            break

        # Nothing to classify on an empty line
        if not url:
            continue

        # Extract features from the URL. Parsing can fail on input without a
        # recognisable host (AttributeError) or on a malformed URL (ValueError).
        try:
            features = extract_features(url)
        except (AttributeError, ValueError) as err:
            print(f"Could not extract features from '{url}': {err}")
            continue
        features_df = pd.DataFrame([features])

        # Scale the features using the scaler fitted on the training data
        features_scaled = scaler.transform(features_df)

        # predict() returns an array with one entry per input row
        prediction = model.predict(features_scaled)

        # Display the result for the single row that was submitted
        if prediction[0] == 1:
            print(f"URL: {url} -> Phishing")
        else:
            print(f"URL: {url} -> Legitimate")

# Call the dynamic prediction function. The call is active, so running this cell
# starts the interactive prompt; comment the line out to skip interactive mode.
dynamic_predict_url()


Welcome to the URL phishing detection system.
You can enter a URL to check if it is phishing or legitimate.
To exit, type 'exit'.
Enter URL: https://colab.research.google.com/drive/1gxXNmPdXW1vhXif1ztyjPGobbn7WHZDh?authuser=2#scrollTo=4cRTWF3dnprv
URL: https://colab.research.google.com/drive/1gxXNmPdXW1vhXif1ztyjPGobbn7WHZDh?authuser=2#scrollTo=4cRTWF3dnprv -> Phishing
Enter URL: exit
Exiting the prediction system.


In [None]:
# %pip installs into the running kernel's environment; versions are pinned to
# the ones recorded in this notebook's install output.
%pip install tldextract==5.1.3 python-whois==0.9.5


Collecting python-whois
  Downloading python_whois-0.9.5-py3-none-any.whl.metadata (2.6 kB)
Downloading python_whois-0.9.5-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-whois
Successfully installed python-whois-0.9.5


In [None]:
import pandas as pd
import re

import tldextract
from urllib.parse import urlparse
from math import log2
from datetime import datetime
import whois
import ssl
import socket

# Load the dataset
# (read relative to the working directory; later statements read its 'URL' column)
data = pd.read_csv('combined_data_randomized.csv')

# Define helper functions for feature extraction

# URL-Based Features
def calculate_entropy(url):
    """Return the Shannon entropy, in bits, of the characters in ``url``.

    Each distinct character contributes ``p * log2(p)``, where ``p`` is its
    relative frequency in the string; the negated total is returned. An empty
    string yields 0.
    """
    total_chars = len(url)
    weighted_sum = 0
    for symbol in set(url):
        share = float(url.count(symbol)) / total_chars
        if share > 0:
            weighted_sum += share * log2(share)
    return -weighted_sum

def contains_ip(url):
    """Return 1 if ``url`` contains a dotted group of four 1-3 digit numbers, else 0."""
    dotted_quad = re.search(r'\b\d{1,3}(\.\d{1,3}){3}\b', url)
    if dotted_quad is None:
        return 0
    return 1

def count_special_chars(url):
    """Return how many characters of ``url`` are one of ``@ % # & ? =``."""
    tracked_marks = ('@', '%', '#', '&', '?', '=')
    return sum(url.count(mark) for mark in tracked_marks)

def check_suspicious_tld(url):
    """Return 1 if the URL's public suffix is on the example watch-list, else 0.

    The suffix reported by ``tldextract`` is prefixed with a dot and compared
    against the list as a whole string.
    """
    suspicious_tlds = ['.xyz', '.top', '.info', '.tk', '.ml']  # Example list
    ext = tldextract.extract(url)
    if f".{ext.suffix}" in suspicious_tlds:
        return 1
    return 0

# Domain WHOIS-Based Features
def get_domain_info(url):
    """Get domain age and time to expiration using WHOIS.

    Args:
        url: URL whose registered domain is looked up.

    Returns:
        tuple: ``(domain_age, time_to_expire)`` in whole days. An element is
        ``None`` when the WHOIS record holds no usable datetime for it; both
        are ``None`` when the lookup itself fails.
    """
    def _first_datetime(value):
        # The record may hold a list of dates; the first entry is used.
        # Anything that is not a datetime is treated as missing.
        if isinstance(value, list):
            value = value[0] if value else None
        return value if isinstance(value, datetime) else None

    try:
        domain = tldextract.extract(url).registered_domain
        w = whois.whois(domain)
        creation_date = _first_datetime(w.creation_date)
        expiration_date = _first_datetime(w.expiration_date)

        # datetime.now(tz) is aware when the record's date is aware and naive
        # otherwise, so the subtraction never mixes the two (a TypeError).
        domain_age = (
            (datetime.now(creation_date.tzinfo) - creation_date).days
            if creation_date else None
        )
        time_to_expire = (
            (expiration_date - datetime.now(expiration_date.tzinfo)).days
            if expiration_date else None
        )
        return domain_age, time_to_expire
    except Exception:
        # Best-effort lookup. Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit stop a long batch run.
        return None, None

def check_whois_privacy(url):
    """Check if WHOIS privacy protection is enabled.

    Returns:
        1 if the WHOIS record's ``privacy`` attribute is truthy, 0 if it is
        falsy, and ``None`` if the lookup raises.
    """
    try:
        domain = tldextract.extract(url).registered_domain
        w = whois.whois(domain)
        return 1 if w.privacy else 0
    except Exception:
        # Best-effort lookup. Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit stop a long batch run.
        return None

def get_registrar_info(url):
    """Get domain registrar information.

    Returns:
        The ``registrar`` attribute of the WHOIS record for the URL's
        registered domain, or ``None`` if the lookup raises.
    """
    try:
        domain = tldextract.extract(url).registered_domain
        w = whois.whois(domain)
        return w.registrar
    except Exception:
        # Best-effort lookup. Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit stop a long batch run.
        return None

# SSL Certificate-Based Features
def get_ssl_info(url, timeout=5.0):
    """Get SSL certificate validity period and issuer.

    Args:
        url: URL whose host is contacted on port 443.
        timeout: Seconds to wait for the TCP connection and TLS handshake, so
            an unresponsive host cannot block the caller indefinitely.

    Returns:
        tuple: ``(validity_period, issuer_name)``, where ``validity_period`` is
        the number of days between the certificate's ``notBefore`` and
        ``notAfter`` and ``issuer_name`` is the issuer's organizationName
        (``'Unknown'`` if absent). ``(None, None)`` when the URL has no host
        or the connection, handshake or parsing fails.
    """
    date_format = '%b %d %H:%M:%S %Y GMT'
    try:
        hostname = urlparse(url).hostname
        if not hostname:
            # No host to connect to (e.g. the URL has no scheme).
            return None, None
        ctx = ssl.create_default_context()
        # Both context managers close their socket even if the handshake fails.
        with socket.create_connection((hostname, 443), timeout=timeout) as raw_sock:
            with ctx.wrap_socket(raw_sock, server_hostname=hostname) as s:
                cert = s.getpeercert()
        issuer = dict(x[0] for x in cert['issuer'])
        validity_period = (datetime.strptime(cert['notAfter'], date_format) -
                           datetime.strptime(cert['notBefore'], date_format)).days
        issuer_name = issuer.get('organizationName', 'Unknown')
        return validity_period, issuer_name
    except Exception:
        # Best-effort lookup. Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit stop a long batch run.
        return None, None

# Extract URL-Based Features
# These columns are computed from the URL string alone.
data['URL Length'] = data['URL'].apply(len)
data['URL Entropy'] = data['URL'].apply(calculate_entropy)
data['Contains IP'] = data['URL'].apply(contains_ip)
data['Number of Dots'] = data['URL'].apply(lambda x: x.count('.'))
data['Contains Hyphen'] = data['URL'].apply(lambda x: 1 if '-' in urlparse(x).netloc else 0)
data['Special Characters Count'] = data['URL'].apply(count_special_chars)
data['Suspicious TLD'] = data['URL'].apply(check_suspicious_tld)

# Extract Domain WHOIS-Based Features
# NOTE(review): get_domain_info, check_whois_privacy and get_registrar_info each
# call whois.whois, so every URL triggers three separate WHOIS lookups.
# pd.Series(...) spreads each returned 2-tuple across the two target columns.
data[['Domain Age', 'Time to Expire']] = data['URL'].apply(
    lambda x: pd.Series(get_domain_info(x))
)
data['WHOIS Privacy'] = data['URL'].apply(check_whois_privacy)
data['Registrar'] = data['URL'].apply(get_registrar_info)

# Extract SSL Certificate-Based Features
# One connection to port 443 of each URL's host.
data[['SSL Validity Period', 'SSL Issuer']] = data['URL'].apply(
    lambda x: pd.Series(get_ssl_info(x))
)

# Save the dataset with extracted features
data.to_csv('new_dataset_with_features.csv', index=False)

# Optionally preview the dataset
print(data.head())

ModuleNotFoundError: No module named 'whois'

In [None]:
import pandas as pd

# Load the original dataset
file_path = 'combined_data_randomized.csv'
df = pd.read_csv(file_path)

# Create a new dataframe with only the 'URL' and 'label' columns
# (raises KeyError if either column is missing from the CSV)
new_df = df[['URL', 'label']]

# Save the new dataframe to a CSV file
# (index=False keeps the row index out of the file)
new_file_path = 'filtered_data.csv'
new_df.to_csv(new_file_path, index=False)

print(f"New dataset saved as: {new_file_path}")


New dataset saved as: filtered_data.csv


In [None]:
# %pip installs into the running kernel's environment; the version is pinned to
# the one recorded in this notebook's install output.
%pip install tldextract==5.1.3

Collecting tldextract
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.1.3-py3-none-any.whl (104 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/104.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.9/104.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.1.3


In [None]:
# prompt: find the total number of records in filtered_data.csv
# NOTE: this cell has no import of its own; `pd` must already be defined by an
# earlier cell.

# Load the filtered dataset
filtered_df = pd.read_csv('filtered_data.csv')

# Get the total number of records (rows, excluding the header line)
total_records = len(filtered_df)

# Print the total number of records
print(f"Total number of records in filtered_data.csv: {total_records}")

Total number of records in filtered_data.csv: 26585


In [None]:
import pandas as pd
import tldextract
import re
from urllib.parse import urlparse
import ssl
import socket

# Function to extract URL length
def get_url_length(url):
    """Return the number of characters in ``url``."""
    char_count = len(url)
    return char_count

# Function to extract the number of subdomains
def get_num_subdomains(url):
    """Return how many dot-separated labels make up the URL's subdomain part.

    The subdomain string comes from ``tldextract``; an empty subdomain gives 0.
    """
    subdomain = tldextract.extract(url).subdomain
    if not subdomain:
        return 0
    # n dots separate n + 1 labels
    return subdomain.count('.') + 1

# Function to check if URL uses HTTPS
def is_https(url):
    """Return 1 if the parsed scheme of ``url`` is ``https``, else 0."""
    scheme = urlparse(url).scheme
    return int(scheme == "https")

# Function to extract suspicious keywords in the URL
def contains_suspicious_keywords(url):
    """Return True if ``url`` contains any phishing-related keyword.

    The comparison is case-insensitive, so ``/LOGIN`` and ``/Login`` are
    detected as well as ``/login``.
    """
    suspicious_keywords = ['login', 'verify', 'account', 'update', 'secure', 'banking', 'payment']
    # Keywords are lowercase, so normalise the URL once before searching.
    lowered_url = url.lower()
    return any(keyword in lowered_url for keyword in suspicious_keywords)

# Function to check for IP address in the URL
def contains_ip_address(url):
    """Return 1 if ``url`` contains a dotted group of four 1-3 digit numbers, else 0."""
    pattern = r"\b(?:\d{1,3}\.){3}\d{1,3}\b"
    if re.search(pattern, url) is None:
        return 0
    return 1

# Function to extract the number of hyphens in the domain name
def num_hyphens_in_domain(url):
    """Return the number of '-' characters in the ``domain`` part reported by tldextract."""
    domain_label = tldextract.extract(url).domain
    return sum(1 for ch in domain_label if ch == '-')

# Function to count dots in the URL
def count_dots(url):
    """Return the number of '.' characters anywhere in ``url``."""
    # Splitting on '.' yields one more piece than there are dots.
    return len(url.split('.')) - 1

# Function to check if the URL contains a shortened link (bit.ly, goo.gl, etc.)
def is_url_shortened(url):
    """Return 1 if the URL's host is a known link shortener, else 0.

    Only the host is checked, so a shortener name that merely appears inside
    another host (``rabbit.ly``) or in the path/query (``?u=tinyurl.com``) does
    not count. Subdomains of a shortener (``www.tinyurl.com``) do count.
    """
    shortened_services = ['bit.ly', 'goo.gl', 'tinyurl.com']
    # urlparse() only recognises a host when the URL has a scheme or starts
    # with '//', so scheme-less input such as 'bit.ly/abc' is prefixed first.
    has_scheme = re.match(r'[A-Za-z][A-Za-z0-9+.-]*://', url) is not None
    try:
        host = urlparse(url if has_scheme else '//' + url).hostname or ''
    except ValueError:
        # Malformed network location: treat as not shortened.
        return 0
    matches = any(
        host == service or host.endswith('.' + service)
        for service in shortened_services
    )
    return 1 if matches else 0

# Function to check if URL contains the "@" symbol
def contains_at_symbol(url):
    """Return 1 if ``url`` contains an '@' character, else 0."""
    return int('@' in url)

# Function to check if the SSL certificate is valid (optional, may require network requests)
def ssl_cert_valid(url, timeout=5.0):
    """Return 1 if a verified TLS handshake with the URL's host yields a certificate.

    Args:
        url: URL whose host is contacted on port 443.
        timeout: Seconds to wait for the TCP connection and TLS handshake, so
            an unresponsive host cannot block the caller indefinitely.

    Returns:
        1 when the handshake succeeds and a certificate is returned, otherwise
        0 (no host in the URL, connection failure, or verification failure).
    """
    try:
        hostname = urlparse(url).hostname
        if not hostname:
            # No host to connect to (e.g. the URL has no scheme).
            return 0
        context = ssl.create_default_context()
        # Both context managers close their socket even if the handshake fails.
        with socket.create_connection((hostname, 443), timeout=timeout) as raw_sock:
            with context.wrap_socket(raw_sock, server_hostname=hostname) as connection:
                cert = connection.getpeercert()
        return 1 if cert else 0
    except Exception:
        # Best-effort probe. Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit stop a long batch run.
        return 0

# Function to extract the TLD (Top-Level Domain)
def get_tld(url):
    """Return the ``suffix`` that tldextract reports for ``url``."""
    return tldextract.extract(url).suffix

# Function to extract the number of special characters in the URL
def count_special_characters(url):
    """Return how many characters of ``url`` are not ASCII letters or digits."""
    return sum(1 for _ in re.finditer(r'[^A-Za-z0-9]', url))

# Function to extract URL features
def extract_url_features(url):
    """Collect every per-URL feature into one flat record.

    Returns a dict holding the original URL under ``"url"`` followed by one
    entry per helper, in the order the helpers are called below. Note that
    ``ssl_cert_valid`` opens a network connection.
    """
    record = {"url": url}
    record["url_length"] = get_url_length(url)
    record["num_subdomains"] = get_num_subdomains(url)
    record["is_https"] = is_https(url)
    record["contains_suspicious_keywords"] = contains_suspicious_keywords(url)
    record["contains_ip_address"] = contains_ip_address(url)
    record["num_hyphens_in_domain"] = num_hyphens_in_domain(url)
    record["dot_count"] = count_dots(url)
    record["is_url_shortened"] = is_url_shortened(url)
    record["contains_at_symbol"] = contains_at_symbol(url)
    record["ssl_cert_valid"] = ssl_cert_valid(url)
    record["tld"] = get_tld(url)
    record["special_char_count"] = count_special_characters(url)
    return record

# Load the filtered dataset with URL and label
file_path = 'filtered_data.csv'
df = pd.read_csv(file_path)

# Analyze the first 26585 URLs (head() returns fewer rows if the file is shorter)
df_subset = df.head(26585)

# Extract features from URLs (one extract_url_features call per row)
url_features = df_subset['URL'].apply(extract_url_features)

# Convert the features into a dataframe
features_df = pd.DataFrame(url_features.tolist())

# Add the label column to the features dataframe
features_df['label'] = df_subset['label']

# Save the new dataset with extracted features into a CSV file
output_path = 'extracted_url_features_26585.csv'
features_df.to_csv(output_path, index=False)

# Report the number of rows actually processed rather than a hard-coded count
print(f"Extracted URL features for first {len(df_subset)} URLs saved to: {output_path}")


In [None]:
import pandas as pd
import tldextract
import re
from urllib.parse import urlparse
import ssl
import socket

# Function to extract URL length
def get_url_length(url):
    """Return the number of characters in ``url``."""
    char_count = len(url)
    return char_count

# Function to extract the number of subdomains
def get_num_subdomains(url):
    """Return how many dot-separated labels make up the URL's subdomain part.

    The subdomain string comes from ``tldextract``; an empty subdomain gives 0.
    """
    subdomain = tldextract.extract(url).subdomain
    if not subdomain:
        return 0
    # n dots separate n + 1 labels
    return subdomain.count('.') + 1

# Function to check if URL uses HTTPS
def is_https(url):
    """Return 1 if the parsed scheme of ``url`` is ``https``, else 0."""
    scheme = urlparse(url).scheme
    return int(scheme == "https")

# Function to extract suspicious keywords in the URL
def contains_suspicious_keywords(url):
    """Return True if ``url`` contains any phishing-related keyword.

    The comparison is case-insensitive, so ``/LOGIN`` and ``/Login`` are
    detected as well as ``/login``.
    """
    suspicious_keywords = ['login', 'verify', 'account', 'update', 'secure', 'banking', 'payment']
    # Keywords are lowercase, so normalise the URL once before searching.
    lowered_url = url.lower()
    return any(keyword in lowered_url for keyword in suspicious_keywords)

# Function to check for IP address in the URL
def contains_ip_address(url):
    """Return 1 if ``url`` contains a dotted group of four 1-3 digit numbers, else 0."""
    pattern = r"\b(?:\d{1,3}\.){3}\d{1,3}\b"
    if re.search(pattern, url) is None:
        return 0
    return 1

# Function to extract the number of hyphens in the domain name
def num_hyphens_in_domain(url):
    """Return the number of '-' characters in the ``domain`` part reported by tldextract."""
    domain_label = tldextract.extract(url).domain
    return sum(1 for ch in domain_label if ch == '-')

# Function to count dots in the URL
def count_dots(url):
    """Return the number of '.' characters anywhere in ``url``."""
    # Splitting on '.' yields one more piece than there are dots.
    return len(url.split('.')) - 1

# Function to check if the URL contains a shortened link (bit.ly, goo.gl, etc.)
def is_url_shortened(url):
    """Return 1 if the URL's host is a known link shortener, else 0.

    Only the host is checked, so a shortener name that merely appears inside
    another host (``rabbit.ly``) or in the path/query (``?u=tinyurl.com``) does
    not count. Subdomains of a shortener (``www.tinyurl.com``) do count.
    """
    shortened_services = ['bit.ly', 'goo.gl', 'tinyurl.com']
    # urlparse() only recognises a host when the URL has a scheme or starts
    # with '//', so scheme-less input such as 'bit.ly/abc' is prefixed first.
    has_scheme = re.match(r'[A-Za-z][A-Za-z0-9+.-]*://', url) is not None
    try:
        host = urlparse(url if has_scheme else '//' + url).hostname or ''
    except ValueError:
        # Malformed network location: treat as not shortened.
        return 0
    matches = any(
        host == service or host.endswith('.' + service)
        for service in shortened_services
    )
    return 1 if matches else 0

# Function to check if URL contains the "@" symbol
def contains_at_symbol(url):
    """Return 1 if ``url`` contains an '@' character, else 0."""
    return int('@' in url)

# Function to check if the SSL certificate is valid (optional, may require network requests)
def ssl_cert_valid(url, timeout=5.0):
    """Return 1 if a verified TLS handshake with the URL's host yields a certificate.

    Args:
        url: URL whose host is contacted on port 443.
        timeout: Seconds to wait for the TCP connection and TLS handshake, so
            an unresponsive host cannot block the caller indefinitely.

    Returns:
        1 when the handshake succeeds and a certificate is returned, otherwise
        0 (no host in the URL, connection failure, or verification failure).
    """
    try:
        hostname = urlparse(url).hostname
        if not hostname:
            # No host to connect to (e.g. the URL has no scheme).
            return 0
        context = ssl.create_default_context()
        # Both context managers close their socket even if the handshake fails.
        with socket.create_connection((hostname, 443), timeout=timeout) as raw_sock:
            with context.wrap_socket(raw_sock, server_hostname=hostname) as connection:
                cert = connection.getpeercert()
        return 1 if cert else 0
    except Exception:
        # Best-effort probe. Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit stop a long batch run.
        return 0

# Function to extract the TLD (Top-Level Domain)
def get_tld(url):
    """Return the ``suffix`` that tldextract reports for ``url``."""
    return tldextract.extract(url).suffix

# Function to extract the number of special characters in the URL
def count_special_characters(url):
    """Return how many characters of ``url`` are not ASCII letters or digits."""
    return sum(1 for _ in re.finditer(r'[^A-Za-z0-9]', url))

# Function to extract URL features
def extract_url_features(url):
    """Collect every per-URL feature into one flat record.

    Returns a dict holding the original URL under ``"url"`` followed by one
    entry per helper, in the order the helpers are called below. Note that
    ``ssl_cert_valid`` opens a network connection.
    """
    record = {"url": url}
    record["url_length"] = get_url_length(url)
    record["num_subdomains"] = get_num_subdomains(url)
    record["is_https"] = is_https(url)
    record["contains_suspicious_keywords"] = contains_suspicious_keywords(url)
    record["contains_ip_address"] = contains_ip_address(url)
    record["num_hyphens_in_domain"] = num_hyphens_in_domain(url)
    record["dot_count"] = count_dots(url)
    record["is_url_shortened"] = is_url_shortened(url)
    record["contains_at_symbol"] = contains_at_symbol(url)
    record["ssl_cert_valid"] = ssl_cert_valid(url)
    record["tld"] = get_tld(url)
    record["special_char_count"] = count_special_characters(url)
    return record

# Read the URL/label table produced earlier
file_path = 'filtered_data.csv'
df = pd.read_csv(file_path)

# Prompt for the slice of rows to process in this run
start_index = int(input("Enter the starting record index: "))
end_index = int(input("Enter the ending record index: "))

# Clamp the end of the slice to the table length
if end_index > len(df):
    end_index = len(df)

# Positional slice: start_index is included, end_index is excluded
df_subset = df.iloc[start_index:end_index]

# Progress bookkeeping
total_records = len(df_subset)
completed_records = 0

# Extract features URL by URL, rewriting one progress line in place
url_features = []
for completed_records, record_url in enumerate(df_subset['URL'], start=1):
    url_features.append(extract_url_features(record_url))
    print(f"Processing: {completed_records}/{total_records} records completed", end='\r')

# One row per processed URL
features_df = pd.DataFrame(url_features)

# Attach labels by position (.values drops the subset's original index)
features_df['label'] = df_subset['label'].values

# The output file name records the requested slice bounds
output_file_name = f"extracted_url_features_{start_index}_{end_index}.csv"
features_df.to_csv(output_file_name, index=False)

print(f"\nExtracted URL features for records {start_index} to {end_index} saved to: {output_file_name}")


Enter the starting record index: 201
Enter the ending record index: 220
Processing: 19/19 records completed
Extracted URL features for records 201 to 220 saved to: extracted_url_features_201_220.csv


--------------------------------------------------------------------------------------------------------------------------------------

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from urllib.parse import urlparse

# Step 1: Load datasets
# One CSV per class, read relative to the working directory.
legitimate_df = pd.read_csv('structured_data_legitimate.csv')
phishing_df = pd.read_csv('structured_data_phishing.csv')

# Step 2: Combine datasets
# Labels are assigned here; they are not read from the CSV files.
legitimate_df['label'] = 0  # Label 0 for legitimate sites
phishing_df['label'] = 1  # Label 1 for phishing sites

# Concatenate the datasets (ignore_index=True gives a fresh 0..n-1 index)
df = pd.concat([legitimate_df, phishing_df], ignore_index=True)

# Step 3: Refined Feature Extraction Function (fixing URL complexity issue)
def extract_features(url):
    """Derive lexical features from a single URL string.

    Args:
        url: The URL to analyse. A missing scheme (e.g. ``example.com/login``)
            is tolerated; the string is then parsed as a network location.

    Returns:
        dict: In insertion order: ``url_length``, ``num_subdomains`` (never
        negative), ``uses_https``, ``has_ip``, ``has_special_chars``,
        ``has_query``, ``has_fragment``, ``is_known_domain`` (0 or 1),
        ``query_fragment_penalty`` (0-2), ``has_phishing_keywords`` (0 or 1),
        ``path_length`` and ``has_random_chars``.
    """
    features = {}

    # urlparse() only recognises a host when the URL has a scheme or starts
    # with '//'. Without the prefix, scheme-less input gives hostname=None and
    # the string methods below would raise AttributeError.
    has_scheme = re.match(r'[A-Za-z][A-Za-z0-9+.-]*://', url) is not None
    parsed = urlparse(url if has_scheme else '//' + url)
    domain = parsed.hostname or ''
    path = parsed.path

    # URL length
    features['url_length'] = len(url)

    # Number of subdomains, clamped so single-label or empty hosts give 0, not -1
    features['num_subdomains'] = max(domain.count('.') - 1, 0)

    # Presence of HTTPS, checked on the parsed scheme rather than a string prefix
    features['uses_https'] = parsed.scheme == 'https'

    # Presence of an IP address in the URL
    features['has_ip'] = bool(re.search(r'\d+\.\d+\.\d+\.\d+', url))

    # Presence of special characters
    features['has_special_chars'] = bool(re.search(r'[^A-Za-z0-9./:_-]', url))

    # Handle query parameters and fragments:
    features['has_query'] = bool(parsed.query)
    features['has_fragment'] = bool(parsed.fragment)

    # Trusted domains. Match on a label boundary so that 'evilgoogle.com'
    # does not count as 'google.com'.
    trusted_domains = ['google.com', 'facebook.com', 'youtube.com', 'amazon.com', 'colab.research.google.com']
    features['is_known_domain'] = int(any(
        domain == known_domain or domain.endswith('.' + known_domain)
        for known_domain in trusted_domains
    ))

    # Trusted domains carry no penalty for having a query or fragment;
    # otherwise the penalty is the number of those two parts present.
    if features['is_known_domain']:
        features['query_fragment_penalty'] = 0
    else:
        features['query_fragment_penalty'] = features['has_query'] + features['has_fragment']

    # Phishing-related keywords in the host or path. The parsed host is already
    # lowercase; the path is lowered so '/Login' matches too.
    phishing_keywords = ['login', 'account', 'secure', 'update', 'verify', 'confirm', 'bank']
    lowered_path = path.lower()
    features['has_phishing_keywords'] = int(any(
        keyword in domain or keyword in lowered_path for keyword in phishing_keywords
    ))

    # Path length
    features['path_length'] = len(path)

    # True when any dot in the host is followed by five or more lowercase letters
    features['has_random_chars'] = bool(re.search(r'\.[a-z]{5,}', domain))

    return features

# Step 4: Apply feature extraction on all URLs
# (one extract_features call per row; each call returns a dict)
df_features = df['URL'].apply(extract_features)

# Step 5: Convert the extracted features into a DataFrame
features_df = pd.DataFrame(df_features.tolist())

# Step 6: Prepare the feature matrix (X) and target vector (y)
X = features_df  # Features
y = df['label']  # Target labels (0 = legitimate, 1 = phishing)

# Step 7: Split data into training and testing sets
# 80/20 split; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: Feature scaling (important for models that use distance metrics)
# Fitted on the training split only, then reused unchanged on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 9: Train XGBoost Classifier
# `use_label_encoder` is dropped because the installed XGBoost reports it as
# unused on every fit. The labels are binary (0/1), so the metric is 'logloss'
# rather than the multiclass 'mlogloss'.
model = XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')
model.fit(X_train_scaled, y_train)

# Step 10: Evaluate the model
y_pred = model.predict(X_test_scaled)

# Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Classification Report
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

# Confusion Matrix
# (scikit-learn convention: rows are true classes, columns are predicted classes)
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))

# Step 11: Cross-validation
# NOTE(review): this call is given the unscaled X, whereas the fit above used
# X_train_scaled. The printed value is the mean over the 5 folds.
cv_scores = cross_val_score(model, X, y, cv=5)
print(f'\nCross-Validation Accuracy: {cv_scores.mean() * 100:.2f}%')

# Step 12: Dynamic URL Prediction
def dynamic_predict_url():
    """Interactively classify URLs typed by the user until they enter 'exit'.

    Reads the notebook-level ``extract_features``, ``scaler`` and ``model``
    objects. Blank input is ignored and URLs that cannot be parsed are reported
    and skipped instead of ending the session with a traceback.
    """
    print("Welcome to the URL phishing detection system.")
    print("You can enter a URL to check if it is phishing or legitimate.")
    print("To exit, type 'exit'.")

    while True:
        # Get the URL input from the user; surrounding whitespace is not part of the URL
        url = input("Enter URL: ").strip()

        # Exit condition
        if url.lower() == 'exit':
            print("Exiting the prediction system.")
            break

        # Nothing to classify on an empty line
        if not url:
            continue

        # Extract features from the URL. Parsing can fail on input without a
        # recognisable host (AttributeError) or on a malformed URL (ValueError).
        try:
            features = extract_features(url)
        except (AttributeError, ValueError) as err:
            print(f"Could not extract features from '{url}': {err}")
            continue
        features_df = pd.DataFrame([features])

        # Scale the features using the scaler fitted on the training data
        features_scaled = scaler.transform(features_df)

        # predict() returns an array with one entry per input row
        prediction = model.predict(features_scaled)

        # Display the result for the single row that was submitted
        if prediction[0] == 1:
            print(f"URL: {url} -> Phishing")
        else:
            print(f"URL: {url} -> Legitimate")

# Call the dynamic prediction function. The call is active, so running this cell
# starts the interactive prompt; comment the line out to skip interactive mode.
dynamic_predict_url()


Parameters: { "use_label_encoder" } are not used.



Accuracy: 99.89%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3265
           1       1.00      1.00      1.00      2052

    accuracy                           1.00      5317
   macro avg       1.00      1.00      1.00      5317
weighted avg       1.00      1.00      1.00      5317


Confusion Matrix:
[[3265    0]
 [   6 2046]]


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Cross-Validation Accuracy: 99.85%
Welcome to the URL phishing detection system.
You can enter a URL to check if it is phishing or legitimate.
To exit, type 'exit'.
Enter URL: https://colab.research.google.com/drive/1gxXNmPdXW1vhXif1ztyjPGobbn7WHZDh?authuser=2#scrollTo=phEQZm3foHsu
URL: https://colab.research.google.com/drive/1gxXNmPdXW1vhXif1ztyjPGobbn7WHZDh?authuser=2#scrollTo=phEQZm3foHsu -> Phishing
Enter URL: exit
Exiting the prediction system.
