In [None]:
import pandas as pd

df = pd.read_csv('Phishing_Dataset.csv')
df.head(10)

Unnamed: 0,Type,url_length,number_of_dots_in_url,having_repeated_digits_in_url,number_of_digits_in_url,number_of_special_char_in_url,number_of_hyphens_in_url,number_of_underline_in_url,number_of_slash_in_url,number_of_questionmark_in_url,...,having_digits_in_subdomain,number_of_digits_in_subdomain,having_repeated_digits_in_subdomain,having_path,path_length,having_query,having_fragment,having_anchor,entropy_of_url,entropy_of_domain
0,0,37,2,0,0,8,0,0,5,0,...,0,0,1,0,3,0,0,0,4.010412,2.751629
1,1,70,5,0,0,12,0,0,6,0,...,0,0,1,0,4,0,0,0,4.08947,3.532573
2,0,42,2,0,6,8,0,0,3,1,...,0,0,1,0,1,1,0,0,4.386016,3.344698
3,0,46,2,0,0,7,0,0,4,0,...,0,0,1,0,2,0,0,0,4.221947,3.189898
4,0,51,3,0,0,9,0,0,5,0,...,0,0,1,0,3,0,0,0,4.103538,2.95282
5,0,51,1,0,0,9,2,0,5,0,...,0,0,1,0,3,0,0,0,4.136372,3.454822
6,0,86,3,0,0,14,6,0,4,0,...,0,0,1,0,2,0,0,0,4.299706,3.101881
7,1,64,1,0,1,10,0,0,7,0,...,0,0,1,0,5,0,0,0,4.366479,3.281036
8,0,54,2,0,0,8,0,1,4,0,...,0,0,1,0,2,0,0,0,4.291266,3.854286
9,0,44,2,0,0,8,1,0,4,0,...,0,0,1,0,2,0,0,0,4.362507,3.521641


In [None]:
# Step 1: Data Overview
print("Initial Shape of Dataset:", df.shape)
print("Columns:", df.columns)


Initial Shape of Dataset: (247950, 42)
Columns: Index(['Type', 'url_length', 'number_of_dots_in_url',
       'having_repeated_digits_in_url', 'number_of_digits_in_url',
       'number_of_special_char_in_url', 'number_of_hyphens_in_url',
       'number_of_underline_in_url', 'number_of_slash_in_url',
       'number_of_questionmark_in_url', 'number_of_equal_in_url',
       'number_of_at_in_url', 'number_of_dollar_in_url',
       'number_of_exclamation_in_url', 'number_of_hashtag_in_url',
       'number_of_percent_in_url', 'domain_length', 'number_of_dots_in_domain',
       'number_of_hyphens_in_domain', 'having_special_characters_in_domain',
       'number_of_special_characters_in_domain', 'having_digits_in_domain',
       'number_of_digits_in_domain', 'having_repeated_digits_in_domain',
       'number_of_subdomains', 'having_dot_in_subdomain',
       'having_hyphen_in_subdomain', 'average_subdomain_length',
       'average_number_of_dots_in_subdomain',
       'average_number_of_hyphens_i

In [None]:

# Step 2: Removing Duplicate URLs
df_cleaned = df.drop_duplicates()
print("Shape after removing duplicates:", df_cleaned.shape)

# Step 3: Handling Missing Values
# Replace missing values with median or mean, or drop columns if necessary
df_cleaned = df_cleaned.fillna(df_cleaned.median())
print("Missing values handled")


Shape after removing duplicates: (129778, 42)
Missing values handled


In [None]:

# Step 4: Outlier Detection and Removal
# Assuming `url_length` and `entropy_of_url` could have potential outliers
def remove_outliers(df, column):
    # Define IQR (Interquartile range)
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    # Filter out outliers
    return df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]


# Removing outliers based on `url_length` and `entropy_of_url`
df_cleaned = remove_outliers(df_cleaned, 'url_length')
df_cleaned = remove_outliers(df_cleaned, 'entropy_of_url')
print("Shape after outlier removal:", df_cleaned.shape)


Shape after outlier removal: (120122, 42)


In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
# Step 5: Scaling the Features (optional for certain algorithms)
# Normalize or scale numerical features if needed
scaler = StandardScaler()

# List of features to be scaled
columns_to_scale = ['url_length', 'number_of_dots_in_url', 'entropy_of_url', 'entropy_of_domain']

# Apply scaling
df_cleaned[columns_to_scale] = scaler.fit_transform(df_cleaned[columns_to_scale])
print("Data scaling completed")

# Final Step: Final Data Overview
print("Final Shape of Dataset:", df_cleaned.shape)
print("First few rows of the final dataset:")
print(df_cleaned.head())


Data scaling completed
Final Shape of Dataset: (120122, 42)
First few rows of the final dataset:
   Type  url_length  number_of_dots_in_url  having_repeated_digits_in_url  \
0     0   -0.856990              -0.551060                              0   
1     1    0.744539               1.684308                              0   
2     0   -0.614334              -0.551060                              0   
3     0   -0.420209              -0.551060                              0   
4     0   -0.177554               0.194062                              0   

   number_of_digits_in_url  number_of_special_char_in_url  \
0                        0                              8   
1                        0                             12   
2                        6                              8   
3                        0                              7   
4                        0                              9   

   number_of_hyphens_in_url  number_of_underline_in_url  \
0             

In [None]:
# Saving the cleaned dataset
df_cleaned.to_csv('cleaned_dataset.csv', index=False)
print("Cleaned dataset saved as 'cleaned_dataset.csv'")

Start the process from here,
- **cleaned_dataset:** Consists of a complete cleaned dataset which has filtered the data with respect to model trainning

In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Step 1: Split dataset into features and target
X = df.drop(columns=['Type'])  # Features
y = df['Type']  # Target variable (0 or 1)

In [None]:
# Step 2: Train-test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Step 4: Predict on test data
y_pred = clf.predict(X_test)

# Accuracy metrics of the trainned model Random Forest

In [None]:
# Step 5: Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9673522887678968

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97     25668
           1       0.97      0.96      0.97     23922

    accuracy                           0.97     49590
   macro avg       0.97      0.97      0.97     49590
weighted avg       0.97      0.97      0.97     49590


Confusion Matrix:
 [[25052   616]
 [ 1003 22919]]


2 Parameter tests done:
1. Original Deep Hyperparameter Testing
2. Shortemed Hyperparameter Testing: More Accurate


In [None]:
#This is original hyperparameter but is paused for now, will executer a shorter one
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np
import json

# Assuming X_train and y_train are already defined as the training data
# Step 3: Set up hyperparameter grid for RandomForest
param_dist = {
    'n_estimators': np.arange(50, 200, 50),  # Instead of a wide range, we test a few values
    'max_depth': [10, 20, None],  # Reduced options for max depth
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Step 4: Set up the RandomizedSearchCV (n_iter=50 means it will try 50 different combinations)
rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=50,
                                   cv=3, n_jobs=-1, verbose=2, random_state=42)

# Step 5: Fit the random search to the training data
random_search.fit(X_train, y_train)

# Step 6: Output the best parameters
print(f"Best Parameters: {random_search.best_params_}")

# Step 7: Convert the best parameters to standard Python types
best_params = {key: int(value) if isinstance(value, np.integer) else value for key, value in random_search.best_params_.items()}

# Step 8: Write the parameters to a JSON file
with open('Final_best_hyperparameters.json', 'w') as json_file:
    json.dump(best_params, json_file)

print("Best hyperparameters saved to 'Final_best_hyperparameters.json'")


Fitting 3 folds for each of 50 candidates, totalling 150 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None}
Best hyperparameters saved to 'Final_best_hyperparameters.json'


**Shortened Hyperparameter Testing:**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np
import time
from tqdm import tqdm  # For progress bar

# Assuming X_train and y_train are already defined as the training data

# Step 1: Set up hyperparameter grid for RandomForest
param_dist = {
    'n_estimators': [50, 100],  # Reduced range of n_estimators
    'max_depth': [10, None],    # Only test two values for max depth
}

# Step 2: Set up the RandomizedSearchCV with fewer iterations (e.g., 10 iterations)
n_iter_search = 10  # Limiting the number of combinations to speed up
rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=3, n_jobs=-1, random_state=42, verbose=0)

# Step 3: Track progress and expected time to complete
# Initialize progress bar
pbar = tqdm(total=n_iter_search, desc="Hyperparameter Search Progress")

# Function to manually track progress
start_time = time.time()  # To calculate the elapsed time

# Fit with manual tracking
for i in range(n_iter_search):
    start_iter = time.time()  # Start time for the iteration
    random_search.fit(X_train, y_train)

    # Calculate elapsed time per iteration
    iter_time = time.time() - start_iter
    elapsed_time = time.time() - start_time

    # Estimate remaining time
    remaining_iters = n_iter_search - (i + 1)
    estimated_time_remaining = remaining_iters * iter_time

    # Update progress bar
    pbar.update(1)
    pbar.set_postfix({
        'Elapsed': f"{elapsed_time:.2f} sec",
        'Remaining': f"{estimated_time_remaining:.2f} sec"
    })

pbar.close()

# Step 4: Output the best parameters
print(f"Best Parameters: {random_search.best_params_}")


  _data = np.array(data, dtype=dtype, copy=copy,
Hyperparameter Search Progress: 100%|██████████| 10/10 [15:59<00:00, 95.97s/it, Elapsed=959.73 sec, Remaining=0.00 sec]

Best Parameters: {'n_estimators': 100, 'max_depth': None}





In [None]:
# Step 4: Output the best parameters
print(f"Best Parameters: {random_search.best_params_}")

Best Parameters: {'n_estimators': 100, 'max_depth': None}


In [None]:
import json

# Save best hyperparameters to a JSON file
best_params = random_search.best_params_

# Write the parameters to a JSON file
with open('Shortened_best_hyperparameters.json', 'w') as json_file:
    json.dump(best_params, json_file)

print("Best hyperparameters saved to 'Shortened_best_hyperparameters.json'")


Best hyperparameters saved to 'Shortened_best_hyperparameters.json'


Loading Shortened hyperparameter and monitoring their accuracies

In [None]:
import json

# Load the best hyperparameters from the JSON file
with open('Shortened_best_hyperparameters.json', 'r') as json_file:
    best_params = json.load(json_file)

# Print the loaded hyperparameters
print("Loaded best hyperparameters:", best_params)


Loaded best hyperparameters: {'n_estimators': 100, 'max_depth': None}


In [None]:
# Train the Random Forest model with the best parameters
best_rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
best_rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = best_rf_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Display the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Display the confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9673522887678968

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     25668
           1       0.97      0.96      0.97     23922

    accuracy                           0.97     49590
   macro avg       0.97      0.97      0.97     49590
weighted avg       0.97      0.97      0.97     49590


Confusion Matrix:
[[25052   616]
 [ 1003 22919]]


In [None]:
import pickle

# Save the trained Random Forest model to a file
with open('Shortened_best_random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(best_rf_model, model_file)

print("Trained model saved to 'shortened_best_random_forest_model.pkl'")


Trained model saved to 'shortened_best_random_forest_model.pkl'


In [None]:
import pickle
import numpy as np
from urllib.parse import urlparse
import math

# Function to calculate entropy of a string (used for URL and domain entropy)
def calculate_entropy(s):
    prob = [float(s.count(c)) / len(s) for c in dict.fromkeys(list(s))]
    entropy = -sum([p * math.log2(p) for p in prob])
    return entropy

# Feature extraction function for dynamically entered URL
def extract_features(url):
    parsed_url = urlparse(url)
    features = []

    # Extract the domain from the URL
    domain = parsed_url.netloc

    # Extract subdomain from the domain
    subdomain = domain.split('.')[0] if '.' in domain else domain

    # Feature 1: url_length
    url_length = len(url)

    # Feature 2: number_of_dots_in_url
    number_of_dots_in_url = url.count('.')

    # Feature 3: having_repeated_digits_in_url
    having_repeated_digits_in_url = 1 if any(str(i) * 2 in url for i in range(10)) else 0

    # Feature 4: number_of_digits_in_url
    number_of_digits_in_url = sum(c.isdigit() for c in url)

    # Feature 5: number_of_special_char_in_url
    special_chars = "!@#$%^&*()_+=-{}[]|\\:;\"'<>,.?/~`"
    number_of_special_char_in_url = sum(c in special_chars for c in url)

    # Feature 6: number_of_hyphens_in_url
    number_of_hyphens_in_url = url.count('-')

    # Feature 7: number_of_underline_in_url
    number_of_underline_in_url = url.count('_')

    # Feature 8: number_of_slash_in_url
    number_of_slash_in_url = url.count('/')

    # Feature 9: number_of_questionmark_in_url
    number_of_questionmark_in_url = url.count('?')

    # Feature 10: having_digits_in_subdomain
    having_digits_in_subdomain = 1 if any(char.isdigit() for char in subdomain) else 0

    # Feature 11: number_of_digits_in_subdomain
    number_of_digits_in_subdomain = sum(char.isdigit() for char in subdomain)

    # Feature 12: having_repeated_digits_in_subdomain
    having_repeated_digits_in_subdomain = 1 if any(str(i) * 2 in subdomain for i in range(10)) else 0

    # Feature 13: having_path
    having_path = 1 if parsed_url.path else 0

    # Feature 14: path_length
    path_length = len(parsed_url.path)

    # Feature 15: having_query
    having_query = 1 if parsed_url.query else 0

    # Feature 16: having_fragment
    having_fragment = 1 if parsed_url.fragment else 0

    # Feature 17: having_anchor (simplified to check if '#' is present)
    having_anchor = 1 if '#' in url else 0

    # Feature 18: entropy_of_url
    entropy_of_url = calculate_entropy(url)

    # Feature 19: entropy_of_domain
    entropy_of_domain = calculate_entropy(domain)

    # Additional features based on URL structure
    # Feature 20: domain_length
    domain_length = len(domain)

    # Feature 21: number_of_dots_in_domain
    number_of_dots_in_domain = domain.count('.')

    # Feature 22: number_of_hyphens_in_domain
    number_of_hyphens_in_domain = domain.count('-')

    # Feature 23: number_of_digits_in_domain
    number_of_digits_in_domain = sum(char.isdigit() for char in domain)

    # Feature 24: having_digits_in_domain
    having_digits_in_domain = 1 if any(char.isdigit() for char in domain) else 0

    # Feature 25: having_special_characters_in_domain
    having_special_characters_in_domain = 1 if any(c in special_chars for c in domain) else 0

    # Feature 26: number_of_special_characters_in_domain
    number_of_special_characters_in_domain = sum(c in special_chars for c in domain)

    # Feature 27: number_of_subdomains (split domain into subdomain parts)
    number_of_subdomains = domain.count('.')

    # Feature 28: number_of_equal_in_url
    number_of_equal_in_url = url.count('=')

    # Feature 29: number_of_at_in_url
    number_of_at_in_url = url.count('@')

    # Feature 30: number_of_percent_in_url
    number_of_percent_in_url = url.count('%')

    # Feature 31: number_of_hashtag_in_url
    number_of_hashtag_in_url = url.count('#')

    # Feature 32: number_of_exclamation_in_url
    number_of_exclamation_in_url = url.count('!')

    # Feature 33: number_of_dollar_in_url
    number_of_dollar_in_url = url.count('$')

    # Feature 34: number_of_hyphens_in_subdomain
    number_of_hyphens_in_subdomain = subdomain.count('-')

    # Feature 35: number_of_special_characters_in_subdomain
    number_of_special_characters_in_subdomain = sum(c in special_chars for c in subdomain)

    # Feature 36: having_special_characters_in_subdomain
    having_special_characters_in_subdomain = 1 if any(c in special_chars for c in subdomain) else 0

    # Feature 37: having_dot_in_subdomain
    having_dot_in_subdomain = 1 if '.' in subdomain else 0

    # Feature 38: number_of_underline_in_subdomain
    number_of_underline_in_subdomain = subdomain.count('_')

    # Feature 39: having_hyphen_in_subdomain
    having_hyphen_in_subdomain = 1 if '-' in subdomain else 0

    # Feature 40: number_of_slash_in_subdomain
    number_of_slash_in_subdomain = subdomain.count('/')

    # Feature 41: having_repeated_digits_in_domain
    having_repeated_digits_in_domain = 1 if any(str(i) * 2 in domain for i in range(10)) else 0

    # Append all features to the list
    features = [
        url_length, number_of_dots_in_url, having_repeated_digits_in_url, number_of_digits_in_url,
        number_of_special_char_in_url, number_of_hyphens_in_url, number_of_underline_in_url,
        number_of_slash_in_url, number_of_questionmark_in_url, having_digits_in_subdomain,
        number_of_digits_in_subdomain, having_repeated_digits_in_subdomain, having_path,
        path_length, having_query, having_fragment, having_anchor, entropy_of_url, entropy_of_domain,
        domain_length, number_of_dots_in_domain, number_of_hyphens_in_domain, number_of_digits_in_domain,
        having_digits_in_domain, having_special_characters_in_domain, number_of_special_characters_in_domain,
        number_of_subdomains, number_of_equal_in_url, number_of_at_in_url, number_of_percent_in_url,
        number_of_hashtag_in_url, number_of_exclamation_in_url, number_of_dollar_in_url,
        number_of_hyphens_in_subdomain, number_of_special_characters_in_subdomain,
        having_special_characters_in_subdomain, having_dot_in_subdomain, number_of_underline_in_subdomain,
        having_hyphen_in_subdomain, number_of_slash_in_subdomain, having_repeated_digits_in_domain
    ]

    # Return features as a 2D array for model prediction
    return np.array(features).reshape(1, -1)

# Load the trained model
with open('Shortened_best_random_forest_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Input URL dynamically
url = input("Enter a URL to check if it's phishy or legitimate: ")

# Check the number of features expected by the model
print(f"Model expects {model.n_features_in_} features.")

# Extract features from the URL
url_features = extract_features(url)

# Check the number of features extracted
print(f"Extracted {url_features.shape[1]} features.")

# Predict using the model
prediction = model.predict(url_features)

# Output result
if prediction == 1:
    print(f"The URL '{url}' is predicted as **Phishy**.")
else:
    print(f"The URL '{url}' is predicted as **Legitimate**.")


Similarly, Working on the Original Hyperparameters

In [None]:
import json

# Load the best hyperparameters from the JSON file
with open('Final_Original_best_hyperparameters.json', 'r') as json_file:
    best_params = json.load(json_file)

# Print the loaded hyperparameters
print("Loaded best hyperparameters:", best_params)


Loaded best hyperparameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None}


In [None]:
# Train the Random Forest model with the best parameters
best_rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
best_rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = best_rf_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Display the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Display the confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9673522887678968

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     25668
           1       0.97      0.96      0.97     23922

    accuracy                           0.97     49590
   macro avg       0.97      0.97      0.97     49590
weighted avg       0.97      0.97      0.97     49590


Confusion Matrix:
[[25052   616]
 [ 1003 22919]]


In [None]:
import pickle

# Save the trained Random Forest model to a file
with open('Original_best_random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(best_rf_model, model_file)

print("Trained model saved to 'shortened_best_random_forest_model.pkl'")


Trained model saved to 'shortened_best_random_forest_model.pkl'


In [None]:
import pickle
import numpy as np
from urllib.parse import urlparse
import math

# Function to calculate entropy of a string (used for URL and domain entropy)
def calculate_entropy(s):
    prob = [float(s.count(c)) / len(s) for c in dict.fromkeys(list(s))]
    entropy = -sum([p * math.log2(p) for p in prob])
    return entropy

# Feature extraction function for dynamically entered URL
def extract_features(url):
    parsed_url = urlparse(url)
    features = []

    # Extract the domain from the URL
    domain = parsed_url.netloc

    # Extract subdomain from the domain
    subdomain = domain.split('.')[0] if '.' in domain else domain

    # Feature 1: url_length
    url_length = len(url)

    # Feature 2: number_of_dots_in_url
    number_of_dots_in_url = url.count('.')

    # Feature 3: having_repeated_digits_in_url
    having_repeated_digits_in_url = 1 if any(str(i) * 2 in url for i in range(10)) else 0

    # Feature 4: number_of_digits_in_url
    number_of_digits_in_url = sum(c.isdigit() for c in url)

    # Feature 5: number_of_special_char_in_url
    special_chars = "!@#$%^&*()_+=-{}[]|\\:;\"'<>,.?/~`"
    number_of_special_char_in_url = sum(c in special_chars for c in url)

    # Feature 6: number_of_hyphens_in_url
    number_of_hyphens_in_url = url.count('-')

    # Feature 7: number_of_underline_in_url
    number_of_underline_in_url = url.count('_')

    # Feature 8: number_of_slash_in_url
    number_of_slash_in_url = url.count('/')

    # Feature 9: number_of_questionmark_in_url
    number_of_questionmark_in_url = url.count('?')

    # Feature 10: having_digits_in_subdomain
    having_digits_in_subdomain = 1 if any(char.isdigit() for char in subdomain) else 0

    # Feature 11: number_of_digits_in_subdomain
    number_of_digits_in_subdomain = sum(char.isdigit() for char in subdomain)

    # Feature 12: having_repeated_digits_in_subdomain
    having_repeated_digits_in_subdomain = 1 if any(str(i) * 2 in subdomain for i in range(10)) else 0

    # Feature 13: having_path
    having_path = 1 if parsed_url.path else 0

    # Feature 14: path_length
    path_length = len(parsed_url.path)

    # Feature 15: having_query
    having_query = 1 if parsed_url.query else 0

    # Feature 16: having_fragment
    having_fragment = 1 if parsed_url.fragment else 0

    # Feature 17: having_anchor (simplified to check if '#' is present)
    having_anchor = 1 if '#' in url else 0

    # Feature 18: entropy_of_url
    entropy_of_url = calculate_entropy(url)

    # Feature 19: entropy_of_domain
    entropy_of_domain = calculate_entropy(domain)

    # Additional features based on URL structure
    # Feature 20: domain_length
    domain_length = len(domain)

    # Feature 21: number_of_dots_in_domain
    number_of_dots_in_domain = domain.count('.')

    # Feature 22: number_of_hyphens_in_domain
    number_of_hyphens_in_domain = domain.count('-')

    # Feature 23: number_of_digits_in_domain
    number_of_digits_in_domain = sum(char.isdigit() for char in domain)

    # Feature 24: having_digits_in_domain
    having_digits_in_domain = 1 if any(char.isdigit() for char in domain) else 0

    # Feature 25: having_special_characters_in_domain
    having_special_characters_in_domain = 1 if any(c in special_chars for c in domain) else 0

    # Feature 26: number_of_special_characters_in_domain
    number_of_special_characters_in_domain = sum(c in special_chars for c in domain)

    # Feature 27: number_of_subdomains (split domain into subdomain parts)
    number_of_subdomains = domain.count('.')

    # Feature 28: number_of_equal_in_url
    number_of_equal_in_url = url.count('=')

    # Feature 29: number_of_at_in_url
    number_of_at_in_url = url.count('@')

    # Feature 30: number_of_percent_in_url
    number_of_percent_in_url = url.count('%')

    # Feature 31: number_of_hashtag_in_url
    number_of_hashtag_in_url = url.count('#')

    # Feature 32: number_of_exclamation_in_url
    number_of_exclamation_in_url = url.count('!')

    # Feature 33: number_of_dollar_in_url
    number_of_dollar_in_url = url.count('$')

    # Feature 34: number_of_hyphens_in_subdomain
    number_of_hyphens_in_subdomain = subdomain.count('-')

    # Feature 35: number_of_special_characters_in_subdomain
    number_of_special_characters_in_subdomain = sum(c in special_chars for c in subdomain)

    # Feature 36: having_special_characters_in_subdomain
    having_special_characters_in_subdomain = 1 if any(c in special_chars for c in subdomain) else 0

    # Feature 37: having_dot_in_subdomain
    having_dot_in_subdomain = 1 if '.' in subdomain else 0

    # Feature 38: number_of_underline_in_subdomain
    number_of_underline_in_subdomain = subdomain.count('_')

    # Feature 39: having_hyphen_in_subdomain
    having_hyphen_in_subdomain = 1 if '-' in subdomain else 0

    # Feature 40: number_of_slash_in_subdomain
    number_of_slash_in_subdomain = subdomain.count('/')

    # Feature 41: having_repeated_digits_in_domain
    having_repeated_digits_in_domain = 1 if any(str(i) * 2 in domain for i in range(10)) else 0

    # Append all features to the list
    features = [
        url_length, number_of_dots_in_url, having_repeated_digits_in_url, number_of_digits_in_url,
        number_of_special_char_in_url, number_of_hyphens_in_url, number_of_underline_in_url,
        number_of_slash_in_url, number_of_questionmark_in_url, having_digits_in_subdomain,
        number_of_digits_in_subdomain, having_repeated_digits_in_subdomain, having_path,
        path_length, having_query, having_fragment, having_anchor, entropy_of_url, entropy_of_domain,
        domain_length, number_of_dots_in_domain, number_of_hyphens_in_domain, number_of_digits_in_domain,
        having_digits_in_domain, having_special_characters_in_domain, number_of_special_characters_in_domain,
        number_of_subdomains, number_of_equal_in_url, number_of_at_in_url, number_of_percent_in_url,
        number_of_hashtag_in_url, number_of_exclamation_in_url, number_of_dollar_in_url,
        number_of_hyphens_in_subdomain, number_of_special_characters_in_subdomain,
        having_special_characters_in_subdomain, having_dot_in_subdomain, number_of_underline_in_subdomain,
        having_hyphen_in_subdomain, number_of_slash_in_subdomain, having_repeated_digits_in_domain
    ]

    # Return features as a 2D array for model prediction
    return np.array(features).reshape(1, -1)

# Load the trained model
with open('Shortened_best_random_forest_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Input URL dynamically
url = input("Enter a URL to check if it's phishy or legitimate: ")

# Check the number of features expected by the model
print(f"Model expects {model.n_features_in_} features.")

# Extract features from the URL
url_features = extract_features(url)

# Check the number of features extracted
print(f"Extracted {url_features.shape[1]} features.")

# Predict using the model
prediction = model.predict(url_features)

# Output result
if prediction == 1:
    print(f"The URL '{url}' is predicted as **Phishy**.")
else:
    print(f"The URL '{url}' is predicted as **Legitimate**.")


Enter a URL to check if it's phishy or legitimate: https://app.slack.com/client/T07AE67FCFM/C07AY99EQ6M
Model expects 41 features.
Extracted 41 features.
The URL 'https://app.slack.com/client/T07AE67FCFM/C07AY99EQ6M' is predicted as **Legitimate**.




Org Flow Contd

# Random Forest suits well with 96% accuracy, Now we will check if any other model suits more accurateky with the models or not

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# List of models to test
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': xgb.XGBClassifier()
}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nAccuracy for {model_name}: {accuracy}")

    # Display the classification report
    print(f"\nClassification Report for {model_name}:")
    print(classification_report(y_test, y_pred))

    # Display the confusion matrix
    print(f"\nConfusion Matrix for {model_name}:")
    print(confusion_matrix(y_test, y_pred))
    print("\n" + "="*60 + "\n")


Training Logistic Regression...

Accuracy for Logistic Regression: 0.7898022892819979

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.77      0.88      0.82     13177
           1       0.82      0.68      0.75     10848

    accuracy                           0.79     24025
   macro avg       0.80      0.78      0.78     24025
weighted avg       0.79      0.79      0.79     24025


Confusion Matrix for Logistic Regression:
[[11570  1607]
 [ 3443  7405]]


Training SVM...

Accuracy for SVM: 0.8367533818938606

Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.82      0.90      0.86     13177
           1       0.86      0.76      0.81     10848

    accuracy                           0.84     24025
   macro avg       0.84      0.83      0.83     24025
weighted avg       0.84      0.84      0.84     24025


Confusion Matrix for SVM:
[[11817  1360]
 [ 2562 




Accuracy for AdaBoost: 0.8328824141519251

Classification Report for AdaBoost:
              precision    recall  f1-score   support

           0       0.82      0.89      0.85     13177
           1       0.85      0.76      0.80     10848

    accuracy                           0.83     24025
   macro avg       0.84      0.83      0.83     24025
weighted avg       0.83      0.83      0.83     24025


Confusion Matrix for AdaBoost:
[[11723  1454]
 [ 2561  8287]]


Training XGBoost...

Accuracy for XGBoost: 0.8896566077003122

Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.88      0.93      0.90     13177
           1       0.90      0.84      0.87     10848

    accuracy                           0.89     24025
   macro avg       0.89      0.89      0.89     24025
weighted avg       0.89      0.89      0.89     24025


Confusion Matrix for XGBoost:
[[12213   964]
 [ 1687  9161]]


