# Experimentation of Detecting Phishing Links 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
)

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Input,
    Embedding,
    Conv1D,
    GlobalMaxPooling1D,
    Dense,
    Dropout,
)
from tensorflow.keras.models import Model

import os
import requests
import zipfile
import os

## Datasets

- https://data.mendeley.com/datasets/vfszbj9b36/1
- https://archive.ics.uci.edu/dataset/967/phiusiil+phishing+url+dataset
- https://www.kaggle.com/datasets/harisudhan411/phishing-and-legitimate-urls
- https://www.kaggle.com/datasets/taruntiwarihp/phishing-site-urls
- https://www.kaggle.com/datasets/joebeachcapital/phishing-urls
- https://www.kaggle.com/datasets/shashwatwork/web-page-phishing-detection-dataset

In [3]:
phishing_df = pd.read_csv("dataset/phishing_data_num_1.csv")
phishing_df = phishing_df[["URL", "label"]]


phishing_df2 = pd.read_csv("dataset/dataset_num_2.csv")
phishing_df2["type"] = phishing_df2["type"].apply(
    lambda x: 0 if x == "legitimate" else 1
)
phishing_df2.columns = ['URL', 'label']


phishing_df3 = pd.read_csv("dataset/dataset_num_3.csv")
phishing_df3.columns = ["URL", "label"]


phishing_df4 = pd.read_csv("dataset/phishing_data_num_4.csv")
phishing_df4.columns = ["URL", "label"]
phishing_df4["label"] = phishing_df4["label"].apply(
    lambda x: 1 if x == "bad" else 0
)
phishing_df4_bad = phishing_df4[phishing_df4['label'] == 1]


phishing_df5 = pd.read_csv(
    "dataset/dataset_num 5.csv", encoding="ISO-8859-1", on_bad_lines="skip"
)
phishing_df5 = phishing_df5[['domain', 'label']].dropna()
phishing_df5.columns = ["URL", "label"]
phishing_df5["label"] = phishing_df5["label"].astype(int)


phishing_df6 = pd.read_csv("dataset/dataset_phishing num 6.csv")
phishing_df6 = phishing_df6[['url', 'status']]
phishing_df6.columns = ["URL", "label"]
phishing_df6["label"] = phishing_df6["label"].apply(
    lambda x: 1 if x == "phishing" else 0
)

  phishing_df5 = pd.read_csv(


In [7]:
df = pd.concat(
    [
        phishing_df,
        phishing_df2,
        phishing_df3,
        phishing_df4_bad,
        phishing_df5,
        phishing_df6,
    ],
    ignore_index=True,
)

In [8]:
df = df.drop_duplicates(subset="URL", keep="last")

In [9]:
len(df_combined)

1481551

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1481551 entries, 0 to 1771745
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   URL     1481551 non-null  object
 1   label   1481551 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 33.9+ MB


## Data Exploration

In [16]:
df_label_count = df.groupby(by="label").count()
df_label_count.index = ['Non-Phishing', 'Phishing']

total_count = df_label_count.sum()
check_label_counts_percentage = (df_label_count / total_count) * 100
check_label_counts_percentage.columns = ['Label Percentage']
check_label_counts_percentage.apply(round)

Unnamed: 0,Label Percentage
Non-Phishing,51.0
Phishing,49.0


In [33]:
# How many url contains http in the string
df_http = df['URL'].str.contains('http', na=False)
df_http.sum()

contains_http = (df_http.sum() / len(df_http)) * 100
does_not_contain_http = 100 - contains_http

print(f"Num of rows that contains 'http' in the URL is: {contains_http:.0f}%")
print(f"Num of rows that does not contain 'http' in the URL is: {does_not_contain_http:.0f}%")

Num of rows that contains 'http' in the URL is: 59%
Num of rows that does not contain 'http' in the URL is: 41%


In [37]:
df[df['URL'].str.contains('http', na=False)].head()

Unnamed: 0,URL,label
0,https://www.southbankmosaics.com,1
1,https://www.uni-mainz.de,1
2,https://www.voicefmradio.co.uk,1
3,https://www.sfnmjournal.com,1
4,https://www.rewildingargentina.org,1


In [41]:
df[~df['URL'].str.contains('http', na=False)].head()

Unnamed: 0,URL,label
578260,ftp://ftp.cs.utexas.edu/pub/garbage/,0
668929,ftp://me@createkindlebooks.org:Noobasshole@cre...,1
670742,ftp://188.128.111.33/web/sec.htm,1
680939,ftp://188.128.111.33/IPTV/TV1324/view.html,1
683930,ftp://host3-61-static.7-79-b.business.telecomi...,1


## Data Cleaning

In [42]:
# Remove anything that has start of string to :// 
df['URL'] = df['URL'].str.replace(r'^.*?://', '', regex=True)

In [44]:
df[df['URL'].str.contains('http', na=False)].head(10)

Unnamed: 0,URL,label
68,www.httpzen.fairhash.org,0
401,43.134.167.94/servicelogin?passive=1209600&amp...,0
1093,www.google.com/url?q=https://ipfs.io/ipfs/qmpo...,0
1205,43.128.92.128/servicelogin?passive=1209600&amp...,0
1362,43.134.167.94/v3/signin/identifier?dsh=s151927...,0
1683,www.http.jmrmfitym.com,0
1725,cloud.mymailwall.com/n/url?l=okcgntugaaaaaa&am...,0
2481,43.153.207.103/v3/signin/identifier?dsh=s-1633...,0
2530,43.156.0.130/servicelogin?passive=1209600&amp;...,0
4369,saproxy.us.to/servicelogin?passive=1209600&con...,0
