<a href="https://colab.research.google.com/github/gerryfrank10/AI2025/blob/main/DPA_PHISHING_URL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing the necessary libraries
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Downloading the dataset
import io, zipfile
url = 'https://archive.ics.uci.edu/static/public/967/phiusiil+phishing+url+dataset.zip'
r = requests.get(url, headers=None, verify=False)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()



In [4]:
!ls

PhiUSIIL_Phishing_URL_Dataset.csv  sample_data


In [5]:
# Loading and print first 5 rows of data
data = pd.read_csv('PhiUSIIL_Phishing_URL_Dataset.csv')
data.head()

Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,521848.txt,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,100.0,1.0,0.522907,...,0,0,1,34,20,28,119,0,124,1
1,31372.txt,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,100.0,0.666667,0.03265,...,0,0,1,50,9,8,39,0,217,1
2,597387.txt,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,100.0,0.866667,0.028555,...,0,0,1,10,2,7,42,2,5,1
3,554095.txt,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,100.0,1.0,0.522907,...,1,1,1,3,27,15,22,1,31,1
4,151578.txt,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,100.0,1.0,0.079963,...,1,0,1,244,15,34,72,1,85,1


In [14]:
# We have many columns let's see all columns
data.columns, len(data.columns), len(data)

(Index(['FILENAME', 'URL', 'URLLength', 'Domain', 'DomainLength', 'IsDomainIP',
        'TLD', 'URLSimilarityIndex', 'CharContinuationRate',
        'TLDLegitimateProb', 'URLCharProb', 'TLDLength', 'NoOfSubDomain',
        'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio',
        'NoOfLettersInURL', 'LetterRatioInURL', 'NoOfDegitsInURL',
        'DegitRatioInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL',
        'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL',
        'SpacialCharRatioInURL', 'IsHTTPS', 'LineOfCode', 'LargestLineLength',
        'HasTitle', 'Title', 'DomainTitleMatchScore', 'URLTitleMatchScore',
        'HasFavicon', 'Robots', 'IsResponsive', 'NoOfURLRedirect',
        'NoOfSelfRedirect', 'HasDescription', 'NoOfPopup', 'NoOfiFrame',
        'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton',
        'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay', 'Crypto',
        'HasCopyrightInfo', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef',
        'NoOfE

> We have 56 columns (so many ...) we need to perform feature importance on the columns and determine the important features and the categorization using database normalisation

1. We are going to achieve this by grouping similar items together and form the tables our of them

In [16]:
# Sample one row from the dataset
# Create a random index
import random
random.seed(0)
rindex =  np.random.randint(0, len(data))
data.iloc[rindex]

Unnamed: 0,68671
FILENAME,mw69687.txt
URL,http://www.xyi876.com
URLLength,20
Domain,www.xyi876.com
DomainLength,14
IsDomainIP,0
TLD,com
URLSimilarityIndex,61.090909
CharContinuationRate,1.0
TLDLegitimateProb,0.522907


## Exploratory Analysis
We are going to perform exploratory data analysis on the dataset by including the following steps
 - Checking Missing Values
 - Getting Important Features
 - Scaling and Normalization
 - Training and Testing Split
 -

In [19]:
# Getting Null Values
data.isnull().sum()

Unnamed: 0,0
FILENAME,0
URL,0
URLLength,0
Domain,0
DomainLength,0
IsDomainIP,0
TLD,0
URLSimilarityIndex,0
CharContinuationRate,0
TLDLegitimateProb,0


In [20]:
# Getting null Values (Other Method)
data.isna().sum()

Unnamed: 0,0
FILENAME,0
URL,0
URLLength,0
Domain,0
DomainLength,0
IsDomainIP,0
TLD,0
URLSimilarityIndex,0
CharContinuationRate,0
TLDLegitimateProb,0


> There are no null values in the data

In [22]:
# Getting statistics
data.describe()

Unnamed: 0,URLLength,DomainLength,IsDomainIP,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,TLDLength,NoOfSubDomain,HasObfuscation,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
count,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,...,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0
mean,34.573095,21.470396,0.002706,78.430778,0.845508,0.260423,0.055747,2.764456,1.164758,0.002057,...,0.237007,0.023474,0.486775,26.075689,6.333111,10.522305,65.071113,2.377629,49.262516,0.571895
std,41.314153,9.150793,0.051946,28.976055,0.216632,0.251628,0.010587,0.599739,0.600969,0.045306,...,0.425247,0.151403,0.499826,79.411815,74.866296,22.312192,176.687539,17.641097,161.02743,0.494805
min,13.0,4.0,0.0,0.155574,0.0,0.0,0.001083,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,16.0,0.0,57.024793,0.68,0.005977,0.050747,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,27.0,20.0,0.0,100.0,1.0,0.079963,0.05797,3.0,1.0,0.0,...,0.0,0.0,0.0,8.0,2.0,6.0,12.0,0.0,10.0,1.0
75%,34.0,24.0,0.0,100.0,1.0,0.522907,0.062875,3.0,1.0,0.0,...,0.0,0.0,1.0,29.0,8.0,15.0,88.0,1.0,57.0,1.0
max,6097.0,110.0,1.0,100.0,1.0,0.522907,0.090824,13.0,10.0,1.0,...,1.0,1.0,1.0,8956.0,35820.0,6957.0,27397.0,4887.0,27516.0,1.0


In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235795 entries, 0 to 235794
Data columns (total 56 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   FILENAME                    235795 non-null  object 
 1   URL                         235795 non-null  object 
 2   URLLength                   235795 non-null  int64  
 3   Domain                      235795 non-null  object 
 4   DomainLength                235795 non-null  int64  
 5   IsDomainIP                  235795 non-null  int64  
 6   TLD                         235795 non-null  object 
 7   URLSimilarityIndex          235795 non-null  float64
 8   CharContinuationRate        235795 non-null  float64
 9   TLDLegitimateProb           235795 non-null  float64
 10  URLCharProb                 235795 non-null  float64
 11  TLDLength                   235795 non-null  int64  
 12  NoOfSubDomain               235795 non-null  int64  
 13  HasObfuscation

In [26]:
import seaborn as sns
corr = data.corr()
sns.heatmap(corr)

ValueError: could not convert string to float: '521848.txt'