# Experimentation of Detecting Phishing Links 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    confusion_matrix
)

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Input,
    Embedding,
    Conv1D,
    GlobalMaxPooling1D,
    Dense,
    Dropout,
)
from tensorflow.keras.models import Model

import os
import requests
import zipfile
import os

2025-03-26 14:28:48.235160: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-26 14:28:48.248264: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742970528.258998    1072 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742970528.262094    1072 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742970528.272376    1072 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

## Datasets

- https://data.mendeley.com/datasets/vfszbj9b36/1
- https://archive.ics.uci.edu/dataset/967/phiusiil+phishing+url+dataset
- https://www.kaggle.com/datasets/harisudhan411/phishing-and-legitimate-urls
- https://www.kaggle.com/datasets/taruntiwarihp/phishing-site-urls
- https://www.kaggle.com/datasets/joebeachcapital/phishing-urls
- https://www.kaggle.com/datasets/shashwatwork/web-page-phishing-detection-dataset

In [2]:
phishing_df = pd.read_csv("dataset/phishing_data_num_1.csv")
phishing_df = phishing_df[["URL", "label"]]


phishing_df2 = pd.read_csv("dataset/dataset_num_2.csv")
phishing_df2["type"] = phishing_df2["type"].apply(
    lambda x: 0 if x == "legitimate" else 1
)
phishing_df2.columns = ['URL', 'label']


phishing_df3 = pd.read_csv("dataset/dataset_num_3.csv")
phishing_df3.columns = ["URL", "label"]


phishing_df4 = pd.read_csv("dataset/phishing_data_num_4.csv")
phishing_df4.columns = ["URL", "label"]
phishing_df4["label"] = phishing_df4["label"].apply(
    lambda x: 1 if x == "bad" else 0
)
phishing_df4_bad = phishing_df4[phishing_df4['label'] == 1]


phishing_df5 = pd.read_csv(
    "dataset/dataset_num 5.csv", encoding="ISO-8859-1", on_bad_lines="skip"
)
phishing_df5 = phishing_df5[['domain', 'label']].dropna()
phishing_df5.columns = ["URL", "label"]
phishing_df5["label"] = phishing_df5["label"].astype(int)


phishing_df6 = pd.read_csv("dataset/dataset_phishing num 6.csv")
phishing_df6 = phishing_df6[['url', 'status']]
phishing_df6.columns = ["URL", "label"]
phishing_df6["label"] = phishing_df6["label"].apply(
    lambda x: 1 if x == "phishing" else 0
)

  phishing_df5 = pd.read_csv(


In [3]:
df = pd.concat(
    [
        phishing_df,
        phishing_df2,
        phishing_df3,
        phishing_df4_bad,
        phishing_df5,
        phishing_df6,
    ],
    ignore_index=True,
)

In [4]:
df = df.drop_duplicates(subset="URL", keep="last")

In [5]:
len(df)

1481551

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1481551 entries, 0 to 1771745
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   URL     1481551 non-null  object
 1   label   1481551 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 33.9+ MB


## Data Exploration

In [7]:
df_label_count = df.groupby(by="label").count()
df_label_count.index = ['Non-Phishing', 'Phishing']

total_count = df_label_count.sum()
check_label_counts_percentage = (df_label_count / total_count) * 100
check_label_counts_percentage.columns = ['Label Percentage']
check_label_counts_percentage.apply(round)

Unnamed: 0,Label Percentage
Non-Phishing,51.0
Phishing,49.0


In [8]:
# How many url contains protocals in the string
df_http = df['URL'].str.contains('://', na=False)
df_http.sum()

contains_http = (df_http.sum() / len(df_http)) * 100
does_not_contain_http = 100 - contains_http

print(f"Num of rows that contains the protocal in the URL is: {contains_http:.0f}%")
print(f"Num of rows that does not contain the protocal in the URL is: {does_not_contain_http:.0f}%")

Num of rows that contains the protocal in the URL is: 59%
Num of rows that does not contain the protocal in the URL is: 41%


In [9]:
protocal_regex = r'^(?=.{0,10}://).*?://'

In [10]:
df[df['URL'].str.contains(protocal_regex, na=False)].head()

Unnamed: 0,URL,label
0,https://www.southbankmosaics.com,1
1,https://www.uni-mainz.de,1
2,https://www.voicefmradio.co.uk,1
3,https://www.sfnmjournal.com,1
4,https://www.rewildingargentina.org,1


In [11]:
df[~df['URL'].str.contains(protocal_regex, na=False)].head()

Unnamed: 0,URL,label
685971,0000111servicehelpdesk.godaddysites.com,0
685972,000011accesswebform.godaddysites.com,0
685973,00003.online,0
685974,0009servicedeskowa.godaddysites.com,0
685975,000n38p.wcomhost.com,0


## Data Cleaning

In [14]:
# Remove the first :// in the string within the first 10 characters of the string
df['URL'] = df['URL'].str.replace(protocal_regex, '', regex=True)

In [16]:
df[df['URL'].str.contains(protocal_regex, na=False)].head()

Unnamed: 0,URL,label


## Train Test Split

In [20]:
X = df['URL']
y = df['label']

In [29]:
def dataset_spliter(experimenting_with_model: bool):
    if experimenting_with_model:
        X_trunc, _ , y_trunc, _ = train_test_split(X, y, 
                                                  test_size=0.84, random_state=4, 
                                                  stratify=y, shuffle=True)
        return train_test_split(X_trunc, y_trunc, test_size=0.3, random_state=4, stratify=y_trunc, shuffle=True)
    else:
        return train_test_split(X, y, test_size=0.84, random_state=4, stratify=y,shuffle=True)

In [36]:
X_train, X_test, y_train, y_test = dataset_spliter(experimenting_with_model=True)

In [37]:
print(f"Training set: {len(X_train)}")
print(f"Testing set: {len(X_test)}")

Training set: 165933
Testing set: 71115


## Data Preprocessing

In [38]:
# Initialize tokenizer with an OOV token.
tokenizer = Tokenizer(char_level=True, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
vocab_size = len(tokenizer.word_index) + 1  # plus one for padding token
sequences = tokenizer.texts_to_sequences(X_train)
max_length = max(len(seq) for seq in sequences)

In [39]:
def convert_url_padding(data):
    # Convert URLs to sequences and pad them
    sequences = tokenizer.texts_to_sequences(data)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding="post")
    return padded_sequences

In [40]:
X_train_encoding = convert_url_padding(X_train)
X_test_encoding = convert_url_padding(X_test)

## Tensorflow Callbacks

In [41]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor="val_loss",  # Monitors validation loss
    patience=2,  # Stops if loss doesn't improve for 5 epochs
    restore_best_weights=True,  # Restores model to best weights when stopped
    verbose=1,
)

## CNN Model

In [42]:
def create_cnn_model(vocab_size, embedding_dim, input_length):
    inputs = Input(shape=(input_length,))
    x = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length, mask_zero=True)(inputs)

    # A 1D convolution layer to capture n-gram features
    x = Conv1D(filters=128, kernel_size=7, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)

    x = Dense(256, activation='relu')(x)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.5)(x)
    # Binary classification output
    output_class = Dense(1, activation='sigmoid', name='binary_class')(x)
    model = Model(inputs, output_class)
    return model

# Create the CNN model
embedding_dim = 256  # You can experiment with this size
cnn_model = create_cnn_model(vocab_size, embedding_dim, input_length=max_length)
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
# opt = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9, nesterov=True)

cnn_model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
# cnn_model.summary()

I0000 00:00:1742971379.068487    1072 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1753 MB memory:  -> device: 0, name: NVIDIA RTX A2000 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


## Training

In [None]:
history_cnn = cnn_model.fit(
    X_train_encoding,
    y_train,
    epochs=5,
    validation_split=0.3,
    batch_size=32,
    callbacks=[early_stopping],
)

Epoch 1/5


I0000 00:00:1742971429.575078    1980 service.cc:152] XLA service 0x7f7358009c60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1742971429.575114    1980 service.cc:160]   StreamExecutor device (0): NVIDIA RTX A2000 Laptop GPU, Compute Capability 8.6
2025-03-26 14:43:49.618542: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1742971429.841114    1980 cuda_dnn.cc:529] Loaded cuDNN version 90300



[1m   2/3630[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:23[0m 89ms/step - accuracy: 0.3984 - loss: 0.6946  

I0000 00:00:1742971438.596170    1980 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m3629/3630[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 84ms/step - accuracy: 0.8496 - loss: 0.3667


2025-03-26 14:49:07.570666: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng11{k2=1,k3=0} for conv %cudnn-conv-bias-activation.3 = (f32[25,128,1,5781]{3,2,1,0}, u8[0]{0}) custom-call(f32[25,256,1,5787]{3,2,1,0} %bitcast.4050, f32[128,256,1,7]{3,2,1,0} %bitcast.4304, f32[128]{0} %bitcast.4452), window={size=1x7}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBiasActivationForward", metadata={op_type="Conv2D" op_name="functional_1/conv1d_1/convolution" source_file="/opt/miniconda/envs/phisherman/lib/python3.10/site-packages/tensorflow/python/framework/ops.py" source_line=1200}, backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"conv_result_scale":1,"activation_mode":"kNone","side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false} is taking a while...
2025-03-26 14:49:07.597919: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.88322596s
Tryi

[1m3630/3630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m369s[0m 99ms/step - accuracy: 0.8496 - loss: 0.3667 - val_accuracy: 0.8960 - val_loss: 0.2740
Epoch 2/5
[1m3345/3630[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m24s[0m 87ms/step - accuracy: 0.9022 - loss: 0.2597

## Evaluation

In [None]:
y_pred = cnn_model.predict(X_test_encoding)

In [None]:
y_pred_int = (y_pred > 0.5).astype(int)

In [None]:
def accuracy_and_confusion_matrix(y_test, y_pred):
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"Precision is: {precision}")
    print(f"Recall is: {recall}")
    print(f"F1 Score is: {f1}")
    print(f"Accuracy is: {accuracy}")
    
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=["Safe", "Phishing"],
        yticklabels=["Safe", "Phishing"],
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

In [None]:
accuracy_and_confusion_matrix(y_test, y_pred_int)

In [None]:
df_errors = pd.DataFrame({
    "URL": X_train,
    "label": y_test,
    "Prediction": y_pred_int
})

df_errors.head()