In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, f1_score, roc_auc_score,
                             classification_report)
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
import re
from urllib.parse import urlparse
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Load the data, discarding every record whose URL is missing
df = pd.read_csv("PhiUSIIL_Phishing_URL_Dataset.csv").dropna(subset=['URL'])

# Target vector first, then the initial feature matrix: every column except
# the target itself and the columns listed below
y = df['label']
X = df.drop(columns=['label', 'FILENAME', 'URL', 'Domain', 'TLD', 'Title'])

# 2. Feature engineering
# Dotted-quad IPv4 address with each octet limited to 0-255.
_IPV4_HOST_RE = re.compile(
    r'^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}'
    r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'
)


def _as_text(value):
    """Return `value` unchanged when it is a string, otherwise an empty string."""
    return value if isinstance(value, str) else ''


def _parse_url(url):
    """Return ``urlparse(url)``, or ``None`` when the URL is too malformed to parse."""
    try:
        return urlparse(url)
    except ValueError:
        # urlparse rejects e.g. unbalanced IPv6 brackets ("http://[bad")
        return None


def _url_host(url):
    """Return the host part of `url` without port or credentials ('' if none)."""
    parsed = _parse_url(url)
    if parsed is not None and not parsed.netloc and '://' not in url:
        # Scheme-less input such as "10.0.0.1/login": parse it as a network location
        parsed = _parse_url('//' + url)
    if parsed is None:
        return ''
    return parsed.hostname or ''


def _path_length(url):
    """Return the length of the path component of `url` (0 if unparseable)."""
    parsed = _parse_url(url)
    return len(parsed.path) if parsed is not None else 0


def _shannon_entropy(text):
    """Return the Shannon entropy (bits per character) of `text`; 0 when empty."""
    if not text:
        return 0
    size = len(text)
    # Each character's relative frequency is computed once
    probabilities = [text.count(ch) / size for ch in set(text)]
    return -sum(p * np.log2(p) for p in probabilities)


def feature_engineering(df_in):
    """Return a copy of `df_in` extended with lexical features derived from the URL.

    The input must contain the columns 'URL', 'Domain' and 'TLD'. Missing
    values in those columns are replaced by '' in the returned copy, and any
    remaining non-string cell yields 0 for every derived feature. The input
    frame itself is never modified.

    Added columns:
        url_length, domain_length, tld_length: character counts.
        is_ip_address: 1 when the URL's host is an IPv4 literal (port and
            credentials are ignored; scheme-less URLs are supported).
        num_subdomains: number of '.' characters in 'Domain'.
        is_https: 1 when the URL starts with the "https://" scheme
            (case-insensitive).
        path_length: length of the URL's path component.
        num_numerical_chars: number of digit characters in the URL.
        num_special_chars: number of non-alphanumeric characters in the URL.
        entropy: Shannon entropy of the URL's characters.

    Malformed URLs that `urlparse` rejects get is_ip_address = 0 and
    path_length = 0 instead of raising.
    """
    df = df_in.copy()  # work on a copy so the caller's frame stays untouched

    # Handle missing 'URL', 'Domain', and 'TLD' before deriving features
    for column in ('URL', 'Domain', 'TLD'):
        df[column] = df[column].fillna('')

    # Non-string cells are treated as empty text, which yields 0 for each feature
    urls = df['URL'].apply(_as_text)
    domains = df['Domain'].apply(_as_text)
    tlds = df['TLD'].apply(_as_text)

    df['url_length'] = urls.map(len)
    df['domain_length'] = domains.map(len)
    df['tld_length'] = tlds.map(len)
    df['is_ip_address'] = urls.apply(lambda url: 1 if _IPV4_HOST_RE.match(_url_host(url)) else 0)
    df['num_subdomains'] = domains.apply(lambda domain: domain.count('.'))
    df['is_https'] = urls.apply(lambda url: 1 if url.lower().startswith('https://') else 0)
    df['path_length'] = urls.apply(_path_length)
    df['num_numerical_chars'] = urls.apply(lambda url: sum(c.isdigit() for c in url))
    df['num_special_chars'] = urls.apply(lambda url: sum(not c.isalnum() for c in url))
    df['entropy'] = urls.apply(_shannon_entropy)
    return df

# Apply feature engineering to the main DataFrame.
# NOTE(review): X was built before this call, so the engineered columns live
# only in `df` and are not part of the model's feature set.
df = feature_engineering(df)

# 3. Split the data into training, testing, and validation sets (70/15/15).
# The split happens BEFORE any transformer is fitted so that the TF-IDF
# vocabulary, imputation means, feature scores and scaling statistics are
# learned from the training rows only (no leakage into test/validation).
# The URLs are split together with X and y so all three stay row-aligned.
raw_train, raw_temp, url_train, url_temp, y_train, y_temp = train_test_split(
    X, df['URL'], y, test_size=0.3, random_state=42)
raw_test, raw_val, url_test, url_val, y_test, y_val = train_test_split(
    raw_temp, url_temp, y_temp, test_size=0.5, random_state=42)

# 4. TF-IDF Feature Extraction (vocabulary learned from training URLs only)
tfidf_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 5), max_features=500) # Limiting TF-IDF features
tfidf_vectorizer.fit(url_train)
tfidf_columns = tfidf_vectorizer.get_feature_names_out()


def _with_tfidf(features, urls):
    """Return `features` with the TF-IDF columns of `urls` appended.

    The TF-IDF frame reuses the index of `features`; without that,
    `pd.concat(axis=1)` would align on mismatched labels whenever rows were
    dropped or shuffled.
    """
    tfidf_frame = pd.DataFrame(tfidf_vectorizer.transform(urls).toarray(),
                               columns=tfidf_columns,
                               index=features.index)
    return pd.concat([features, tfidf_frame], axis=1)


X_train_frame = _with_tfidf(raw_train, url_train)

# Store the column names (and order) the fitted transformers expect:
original_X_columns = X_train_frame.columns

# Handle missing values using imputation, select features, then scale.
imputer = SimpleImputer(strategy='mean')  # Replace missing values with the training mean
selector = SelectKBest(score_func=f_classif, k=20) # Selecting top 20
scaler = StandardScaler()

# 5. Fit every preprocessing step on the training split only
X_train = scaler.fit_transform(
    selector.fit_transform(imputer.fit_transform(X_train_frame), y_train))


def _preprocess(features, urls):
    """Apply the already-fitted TF-IDF, imputer, selector and scaler to a split."""
    frame = _with_tfidf(features, urls)
    return scaler.transform(selector.transform(imputer.transform(frame)))


X_test = _preprocess(raw_test, url_test)
X_val = _preprocess(raw_val, url_val)

# 6. Define the Model (Regularized Random Forest)
rf_clf = RandomForestClassifier(n_estimators=100,
                                min_samples_split=10,
                                min_samples_leaf=5,
                                class_weight='balanced', # Helps with imbalanced data
                                random_state=42)

# 7. Train the Model
rf_clf.fit(X_train, y_train)

# 8. Test
def evaluate_model(X, y, model, set_name="Test"):
    """Print classification metrics for `model` on one labelled data set.

    Args:
        X: Feature matrix accepted by ``model.predict``.
        y: True binary labels for the rows of `X`.
        model: Fitted classifier exposing ``predict``; ``predict_proba`` or
            ``decision_function`` is used for ROC AUC when available.
        set_name: Label used in the printed headings (e.g. "Test").

    Prints accuracy, precision, recall, F1, ROC AUC and the full
    classification report. Returns nothing.
    """
    y_pred = model.predict(X)

    # ROC AUC ranks samples by a continuous score. Feeding it hard class
    # predictions collapses the curve to a single operating point, so use
    # the positive-class probability (or decision score) when the model has one.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X)
    else:
        y_score = y_pred  # last resort: same value the hard predictions give

    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    roc_auc = roc_auc_score(y, y_score)

    print(f"{set_name} Set Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")

    print(f"\n{set_name} Set Classification Report:")
    print(classification_report(y, y_pred))

# Report held-out performance: the test split first, then the validation split
for banner, split_features, split_labels, split_name in (
    ("Test Set Evaluation:", X_test, y_test, "Test"),
    ("\nValidation Set Evaluation:", X_val, y_val, "Validation"),
):
    print(banner)
    evaluate_model(split_features, split_labels, rf_clf, set_name=split_name)

# Function to preprocess a single URL and predict its class
def predict_url(url, scaler, imputer, selector, tfidf_vectorizer, model, original_X_columns):
    """Classify one URL with the fitted preprocessing chain and model.

    Args:
        url: The URL to classify; non-string input is converted with ``str``
            and ``None`` is treated as an empty URL.
        scaler, imputer, selector, tfidf_vectorizer: Transformers already
            fitted during training, applied here in the same order
            (TF-IDF -> impute -> select -> scale).
        model: Fitted classifier exposing ``predict``.
        original_X_columns: Column names, in order, of the feature frame the
            imputer was fitted on.

    Returns:
        The predicted class label for `url` (first element of ``model.predict``).

    Only URL-derived values are known for a bare URL. Every column of
    `original_X_columns` that cannot be derived here is left as NaN and filled
    by `imputer`.
    """
    url_text = '' if url is None else str(url)

    # Build the one-row frame by column NAME. Writing the URL positionally
    # into "the first column" silently stores it in the wrong field whenever
    # 'URL' is not the first column of the training frame.
    url_data = pd.DataFrame({'URL': [url_text], 'Domain': [''], 'TLD': ['']})

    # Apply the same feature engineering steps as during training
    url_data = feature_engineering(url_data)

    # Extract TF-IDF features with the vocabulary learned during training
    tfidf_matrix = tfidf_vectorizer.transform(url_data['URL'])
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

    # Drop the raw text columns and append the TF-IDF features
    X_url = pd.concat([url_data.drop(columns=['URL', 'Domain', 'TLD']), tfidf_df], axis=1)

    # Match the training layout exactly: same columns, same order. Columns
    # not computable from the URL become NaN and are imputed below; columns
    # the training frame never had are discarded.
    X_url = X_url.reindex(columns=original_X_columns)

    # Impute missing values
    X_url = imputer.transform(X_url)

    # Select features
    X_url = selector.transform(X_url)

    # Scale the features
    X_url_scaled = scaler.transform(X_url)

    # Predict with the model
    prediction = model.predict(X_url_scaled)[0]
    return prediction

# 9. Test with URLs
# Hand-picked sample URLs for a manual smoke test of predict_url. The trailing
# comment on each entry is the author's own category tag for that URL; it is
# not used by the code. Note that one entry ("www4.javhdporn.net") has no scheme.
mixed_urls = [
    "https://c.vialoops.com/CL0/https:%2F%2Fwww.lockedinai.com%2F/1/01000195bdd69a01-55026def-26e3-4605-9be9-4e05660d70d2-000000/X9CXZ41E7n-CJIL2SagqoWkrTN8SAybe89YL72prCAs=397", #advertising
    "https://mrlsexdoll.com/search?q=futa&options%5Bprefix%5D=last&type=product&sca_ref=7814373.LDU7LXVyHU&sca_source=rule34", #porn
    "https://qiyen.top/gcaatUke/1864875617422672262413d214e", #sus
    "https://s.magsrv.com/click.php?d=H4sIAAAAAAAAA01S227iMBD9lUgrHjfyjG_JY6XdipYWEFhpysvK2EkLW0oSAm2t.fh1WLZbTZycc8Zz8cRKS2BSEKaCnvu.OYz41Qiv4_OSht_Nc5.6_S6yQ1zs_ojFdsSvDx5G_AcwzDRDjSNUB49RUX.TnTmPXKgsl.pMxRCgMw2YRd531lU3Pmr7zk.nd.PCjOfzxXi6eBRlGZrubn3b6NIsA2Jc5vjiVPsAcB8absys7fxM2R0CTLtu0sj21pbmwfCiYMcXf68K1jXl8rjzkwjDaxEC6LJQFg1qZRQUeokf2rnSYjAQYozcPd30bwv17BerhxY3Tehu.TI6hgxda8sBT1fGrFbzqXmUoSk_vf_gaoDX219BThY_q33NZ07h8WNrW6uqTVhujmtbhSXyeaXaDyxrs.nf6ser73EgdRxF9b4nYgRaCK0RhKatPe2b6jX.AgJgqRQpIk.Ba_qcPV2GSpJlRAIoKkAgsqgoogiJa_WecRm9GSlE5XO1dhqzao1OKs8y5us6dxmgisGxgy9l2dmSi5K4fXLa.Gqf1A6TpjklQuZCcEh621QJfkuGiniO0RKBM3E.EMZbIGg4XGz4knMwEXcLcYaSeHwTgmKEQ4ahEaL5.O7L_v.Gnwgu30O6s0.H7jTcVxqK0Qlymk1IQq25qK2wtnZcVgioK.Ez7zh3Ocv_APaE0ZICAwAA&cb=e2e_67ff1c332eb8d5.38188977", #spam
    "https://ck.juicyads.com/getjuicy.php?jad=1748824&juser=126401&jfill=8729&jfimg=1744340&data=6508b3d9ce441bfbc0745b1118409128&idty=2ea063dbfecf6be7f8de66f7d1ed1ade&std2=x243v2z21343r26494b4&pubid=126401&zoneid=863031&jmx=undefined&jmy=undefined&jurl=p5y5l4i4m4b424n2p456t484v2p5g5q4p594g4j4o4g4v2k4k5r414", #ads
    "https://trkucibqy.com/794577f1-5d38-4419-bb4c-a1ccc9983102?sors11=17363410&CAMPAIGN_ID=1219789&BANNER_ID=3330443&COUNTRY=PH&OS=Android&OS_VER=10.0.99&medium=ARR_270325PHsb_17363410&Cost=0.060000&externalid=4f465d296ea7148bc07ce52859b5fa4f", #tracking
    "https://go.xlirdr.com/?realDomain=creative.xlirdr.com&campaignId=widget&tag=girls%2Fjapanese&targetDomain=javhdporn.live&userId=a857d671ed2ee3f67e327d7a3d55455ceef35f57922f8f8f71c52e413accec69&sourceId=javhdpornWL&thumbModelId=170009614&thumbUrl=https%3A%2F%2Fimg.doppiocdn.com%2Fthumbs%2F1744771800%2F170009614_webp&onlineModels=_aoi_chan_&filtersMatch=1&modelsInWatchHistoryCount=-1&userType=newuser&modelsCount=1&segment=hls-oldAPI&path=%2F&i=0&landing=WidgetV4Universal&referrer=https%3A%2F%2Fwww4.javhdporn.net%2F&abTest=widgetv4universal_sort030425&abTestVariant=widgetv4universal_sort030425_paidUsers_1&seenAbTest=0&seenDomain=0&seenLanding=0",#porn
    "www4.javhdporn.net", #porn
    "https://www.google.com",  # Legitimate
    "https://www.wikipedia.org",  # Legitimate
    "https://www.youtube.com",  # Legitimate
    "https://www.amazon.com",  # Legitimate
    "https://www.facebook.com", # Legitimate
    "https://www.wellsfargo.com", #Legitimate
    "https://www.bankofamerica.com", #Legitimate
    "https://www.chase.com" #Legitimate
]

print("\nTesting with URLs:")
for candidate in mixed_urls:
    # Run the single URL through the fitted preprocessing chain and the model
    predicted_label = predict_url(candidate, scaler, imputer, selector,
                                  tfidf_vectorizer, rf_clf, original_X_columns)

    # Label 0 is reported as phishing; any other label as legitimate
    if predicted_label == 0:
        verdict = 'Phishing'
    else:
        verdict = 'Legitimate'

    print(f"URL: {candidate}")
    print(f"Prediction: {verdict}")

Test Set Evaluation:
Test Set Performance:
Accuracy: 0.9999
Precision: 0.9999
Recall: 1.0000
F1 Score: 0.9999
ROC AUC: 0.9999

Test Set Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15061
           1       1.00      1.00      1.00     20308

    accuracy                           1.00     35369
   macro avg       1.00      1.00      1.00     35369
weighted avg       1.00      1.00      1.00     35369


Validation Set Evaluation:
Validation Set Performance:
Accuracy: 0.9999
Precision: 0.9999
Recall: 1.0000
F1 Score: 0.9999
ROC AUC: 0.9999

Validation Set Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15090
           1       1.00      1.00      1.00     20280

    accuracy                           1.00     35370
   macro avg       1.00      1.00      1.00     35370
weighted avg       1.00      1.00      1.00     35370


Testing 