In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
import Levenshtein as lev
from urllib.parse import urlparse

Website Phishing Dataset - Importing

In [2]:
# Load the phishing URL dataset. A forward slash is used because the previous
# backslash formed the invalid escape sequence '\d' and only resolved on Windows.
phis_web = pd.read_csv('Data/dataset_phishing.csv')

In [3]:
# Preview the first rows to confirm the CSV parsed into the expected columns.
phis_web.head()

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


In [4]:
# Summarize column dtypes and non-null counts to check for missing values.
phis_web.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11430 entries, 0 to 11429
Data columns (total 89 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   url                         11430 non-null  object 
 1   length_url                  11430 non-null  int64  
 2   length_hostname             11430 non-null  int64  
 3   ip                          11430 non-null  int64  
 4   nb_dots                     11430 non-null  int64  
 5   nb_hyphens                  11430 non-null  int64  
 6   nb_at                       11430 non-null  int64  
 7   nb_qm                       11430 non-null  int64  
 8   nb_and                      11430 non-null  int64  
 9   nb_or                       11430 non-null  int64  
 10  nb_eq                       11430 non-null  int64  
 11  nb_underscore               11430 non-null  int64  
 12  nb_tilde                    11430 non-null  int64  
 13  nb_percent                  114

- There are no null values in the dataset.
- The dataset has 89 columns, with int, float, and object datatypes.

In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
phis_web.describe()

Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank
count,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,...,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0
mean,61.126684,21.090289,0.150569,2.480752,0.99755,0.022222,0.141207,0.162292,0.0,0.293176,...,0.124759,0.775853,0.439545,0.072878,492.532196,4062.543745,856756.6,0.020122,0.533946,3.185739
std,55.297318,10.777171,0.357644,1.369686,2.087087,0.1555,0.364456,0.821337,0.0,0.998317,...,0.33046,0.417038,0.496353,0.259948,814.769415,3107.7846,1995606.0,0.140425,0.498868,2.536955
min,12.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-1.0,-12.0,0.0,0.0,0.0,0.0
25%,33.0,15.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,84.0,972.25,0.0,0.0,0.0,1.0
50%,47.0,19.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,242.0,3993.0,1651.0,0.0,1.0,3.0
75%,71.0,24.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,449.0,7026.75,373845.5,0.0,1.0,5.0
max,1641.0,214.0,1.0,24.0,43.0,4.0,3.0,19.0,0.0,19.0,...,1.0,1.0,1.0,1.0,29829.0,12874.0,10767990.0,1.0,1.0,10.0


Data Cleaning Step - Extracting Domain Names from the URLs

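- Used the `urlparse` function (from `urllib.parse`) to retrieve the domain name (e.g. www.google.com) from each URL (e.g. https://www.google.com/search/DICTopics/). The URL must include its scheme (`http://` or `https://`); without it, `urlparse` returns an empty domain.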

In [6]:
def extract_domain(url):
    """Return the network-location (``netloc``) component of a URL.

    Parameters
    ----------
    url : str
        URL to parse. It should include its scheme (``http://``/``https://``);
        without the ``//`` authority marker ``urlparse`` yields an empty netloc.

    Returns
    -------
    str
        ``urlparse(url).netloc``, or ``''`` when the value cannot be parsed.
    """
    try:
        return urlparse(url).netloc
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed URL (e.g. unbalanced IPv6 brackets).
        # TypeError / AttributeError: non-string input such as a NaN cell.
        # Listing the exceptions keeps the best-effort '' fallback without
        # swallowing KeyboardInterrupt/SystemExit the way a bare `except:` does.
        return ''

In [7]:
# Derive the host part of every URL and store it in a new 'domain' column.
phis_web['domain'] = phis_web['url'].map(extract_domain)

### Feature Extraction - Typosquatting Detection using Levenshtein Distance <br>
Typosquatting is a technique in which attackers deliberately register misspelled versions of legitimate domain names. <br>

We retrieved the top 100,000 websites from Cloudflare Radar and computed the minimum Levenshtein distance between the current domain and the list of trusted domains. Levenshtein distance measures the number of single-character edits (insertions, deletions, and substitutions) necessary to change one domain name into another.

In [8]:
# List of the top 100k trusted domains from Cloudflare Radar.
# A forward slash is used because the previous backslash formed the invalid
# escape sequence '\d' and only resolved as a path separator on Windows.
trusted_domain_list = list(pd.read_csv('Data/domains_100k.csv')['domain'])

In [9]:
# Computes the minimum Levenshtein distance between a domain and all the trusted domains
def min_lev_dist(domain, trusted_domains):
    """Find the trusted domain closest to ``domain`` by Levenshtein distance.

    Parameters
    ----------
    domain : str
        Domain name to score.
    trusted_domains : iterable of str
        Candidate trusted domain names to compare against.

    Returns
    -------
    tuple
        ``(min_distance, closest_domain)``. When several candidates share the
        minimum distance, the earliest one is returned. For an empty
        ``trusted_domains`` the result is ``(float('inf'), None)``.
    """
    min_distance = float('inf')
    closest_domain = None
    for trusted_domain in trusted_domains:
        distance = lev.distance(domain, trusted_domain)
        if distance < min_distance:
            min_distance = distance
            closest_domain = trusted_domain
            if distance == 0:
                # Exact match: no later candidate can beat 0, so stop scanning
                # instead of comparing against the rest of the list.
                break
    return min_distance, closest_domain

In [10]:
# Score every domain against the trusted list, then unzip the
# (distance, closest match) pairs into two parallel tuples.
Min_Levenshtein_Distance, Closest_Trusted_Domain = zip(
    *(min_lev_dist(host, trusted_domain_list) for host in phis_web['domain'])
)

In [11]:
# Attach the typosquatting features (distance and nearest trusted domain) as new columns.
for column_name, column_values in (
    ('Lavenshtein dist', Min_Levenshtein_Distance),
    ('Closest_dom', Closest_Trusted_Domain),
):
    phis_web[column_name] = column_values

In [32]:
# Persist the enriched dataset. A forward slash is used because the previous
# backslash formed the invalid escape sequence '\d' and only resolved on Windows.
phis_web.to_csv('Data/dataset_phishing_11k.csv', header=True)

EDA Step - Table to summarize the Typosquatting feature

In [31]:
# Phishing rows whose domain is fewer than 10 edits from a trusted domain;
# show positions 10-19 of that subset with only the typosquatting columns.
close_phishing = (phis_web['status'] == 'phishing') & (phis_web['Lavenshtein dist'] < 10)
phis_web.loc[close_phishing, ['domain', 'Lavenshtein dist', 'Closest_dom']].iloc[10:20]

Unnamed: 0,domain,Lavenshtein dist,Closest_dom
51,sura.careervidi.com,8,acallerid.com
55,www.nakamistrad.com,9,aastra.com
58,sites.google.com,4,withgoogle.com
59,docs.google.com,5,9to5google.com
70,baghira-wupperwolf.de,9,uni-wuppertal.de
73,izvilagkft.hu,7,izbirkom.ru
74,sloaneandhyde.com,7,bladeandsoul.com
80,electricitypak.com,3,electricitymaps.com
84,marketinghelper.com.au,6,marketindex.com.au
88,interimmanagement.uk.com,9,servicemanagement.com
