# URL Feature Extraction

In [None]:
pip install python-whois

Collecting python-whois
  Downloading python-whois-0.8.0.tar.gz (109 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/109.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/109.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.6/109.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: python-whois
  Building wheel for python-whois (setup.py) ... [?25l[?25hdone
  Created wheel for python-whois: filename=python_whois-0.8.0-py3-none-any.whl size=103247 sha256=1ed6fb07f217256d82ab5dc6bd71762e8a2ee39d62cdb997cfdf8341615021c2
  Stored in directory: /root/.cache/pip/wheels/10/f1/87/145023b9a206e2e948be6480c61ef3fd3dbb81ef11b6977782
Successfully built python-whois
Installing collected packages: python-whois
Successfully installed pyth

In [None]:
pip install tqdm



In [None]:
import pickle
import re
import pandas as pd
import requests
import whois

from collections import Counter
from datetime import datetime
from math import log
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from tqdm import tqdm

Importing Dataset

In [None]:
# Load the raw URL list for one data source into `phish` and preview it.
# Run this notebook once per data source,
# i.e. malwareOnline.csv, verified_phishing_online.csv and benignDMOZ.csv,
# changing the path below each time.
phish = pd.read_csv("RawDatasets/verified_phishing_online.csv")
phish.head()

Unnamed: 0,id,dateadded,url,url_status,last_online,threat,tags,urlhaus_link,reporter
0,2724907,10/24/23 16:19,http://42.231.251.93:50898/Mozi.m,online,10/24/23 16:19,malware_download,"elf,Mozi",https://urlhaus.abuse.ch/url/2724907/,lrz_urlhaus
1,2724902,10/24/23 15:55,https://www.dropbox.com/scl/fi/y1zf9rt5n2guy0r...,online,10/24/23 16:12,malware_download,,https://urlhaus.abuse.ch/url/2724902/,anonymous
2,2724903,10/24/23 15:55,https://www.dropbox.com/scl/fi/u008gv89ok6tj3u...,online,10/24/23 16:13,malware_download,,https://urlhaus.abuse.ch/url/2724903/,anonymous
3,2724904,10/24/23 15:55,https://www.dropbox.com/scl/fi/96i5e2yrl5b383y...,online,10/24/23 15:55,malware_download,,https://urlhaus.abuse.ch/url/2724904/,anonymous
4,2724905,10/24/23 15:55,https://www.dropbox.com/scl/fi/g362v2hcsb1kly2...,online,10/24/23 15:55,malware_download,,https://urlhaus.abuse.ch/url/2724905/,anonymous


In [None]:
# Cap the dataset at the first 22800 rows; head() returns the whole frame
# unchanged when it holds fewer rows than that.
phish=phish.head(22800)
phish.shape

(4078, 9)

## Lexical Features ⌨

1.	**havingIP**: - If an IP address is present in the URL, the feature is set to 1; otherwise it is set to 0. Most benign sites do not use an IP address as their URL. Using an IP address in place of a domain name suggests that the attacker is trying to steal sensitive information or to confuse users.

In [None]:
# Four dot-separated digit groups (e.g. 42.231.251.93) anywhere in the text.
# Written as a raw string: '\d' and '\.' inside a plain literal are invalid
# escape sequences and trigger a warning on recent Python versions.
_IPV4_LIKE = re.compile(r'\d+\.\d+\.\d+\.\d+')

#check for an IPv4-style address anywhere in the URL
def havingIP(url):
    """Return 1 if the URL contains an IPv4-looking dotted quad, else 0.

    The whole URL string is scanned (host, path and query alike), so any
    run of four dot-separated digit groups counts. The octet range is not
    validated.
    """
    # search() scans the full string, so a leading '.*' is unnecessary and
    # embedded newlines no longer hide an address.
    return 1 if _IPV4_LIKE.search(url) else 0

2.	**haveAtSign**: - If the @ symbol is present in the URL, the feature is set to 1; otherwise it is set to 0. Attackers add the special symbol @ to a URL because it leads the browser to ignore everything preceding the “@” symbol; the real address often follows the “@” symbol.

In [None]:
# Flags URLs that contain the '@' symbol (Have_At)
def haveAtSign(url):
  """Return 1 when '@' occurs anywhere in the URL string, otherwise 0."""
  return int("@" in url)

3.	**countDot**: - Malicious URLs have many dots in the URL. For example, http://shop.fun.amazon.phishing.com, in this URL phishing.com is an actual domain name, whereas use of “amazon” word is to trick users to click on it. So, this function is used to count the number of dots in the URL.

In [None]:
# Number of '.' characters in the URL
def countDot(url):
  """Return how many dots the URL string contains."""
  # Splitting on '.' yields one more piece than there are dots.
  pieces = url.split('.')
  return len(pieces) - 1

4.	**prefixSuffix**:- Checking the presence of '-' in the domain part of URL. The dash symbol is rarely used in legitimate URLs. Phishers tend to add prefixes or suffixes separated by (-) to the domain name so that users feel that they are dealing with a legitimate webpage.If the URL has the ‘-' symbol in the domain part of the URL, the value assigned to this feature is 1 else 0

In [None]:
def prefixSuffix(url):
    """Return 1 if the network-location part of the URL contains '-', else 0.

    Only the netloc reported by urlparse is inspected; dashes in the path
    or query do not count.
    """
    host_part = urlparse(url).netloc
    return int('-' in host_part)

5.	**redirection**: If “//” present in the URL path then the feature is set to 1 else to 0. The existence of “//” within the URL path means that the user will be redirected to another website.

In [None]:
# Detects a '//' that appears after the scheme separator (Redirection)
def redirection(url):
  """Return 1 if the last '//' in the URL starts at index 8 or later, else 0.

  The '//' of 'http://' and 'https://' sits at index 5 or 6, so those (and
  URLs with no '//' at all, where rfind gives -1) yield 0.
  """
  last_double_slash = url.rfind('//')
  return 1 if last_double_slash >= 8 else 0

6.	**httpDomain**: If HTTPS token present in URL then the feature is set to 1 else to 0. Attackers may add the “HTTPS” token to the domain part of a URL in order to trick users. For example, http://https-www-paypal-it-mpp-home.soft-hair.com.

In [None]:
# "https" token inside the domain part of the URL (https_Domain)
def httpDomain(url):
  """Return 1 when the netloc of the URL contains the text 'https', else 0.

  The scheme itself is not part of the netloc, so an ordinary
  'https://host/' URL yields 0.
  """
  return int('https' in urlparse(url).netloc)

7.	**tinyURL**:TinyURL service allows attackers to hide long malicious URL by making it short. The goal is to redirect user to phishing websites. If the URL is crafted using shortening services (like bit.ly) then feature is set to 1 else 0

In [None]:
#listing shortening services
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

# A host counts as a shortener only if it IS one of the listed domains or a
# subdomain of one. Searching the pattern unanchored over the whole URL made
# 'x\.co' fire on "dropbox.com" and 't\.co' on "microsoft.com".
_SHORTENER_HOST = re.compile(r"(?:^|\.)(?:" + shortening_services + r")$", re.IGNORECASE)

#Checking for Shortening Services in URL (Tiny_URL)
def tinyURL(url):
    """Return 1 if the URL's host is a known URL-shortening service, else 0.

    The host is taken from urlparse; for scheme-less input such as
    'bit.ly/abc' the text before the first '/' is treated as the host.
    Unparseable URLs yield 0.
    """
    try:
        host = urlparse(url).hostname
        if host is None:
            # No scheme: urlparse put everything in .path, so re-parse as a
            # network-path reference to recover the host.
            host = urlparse('//' + url).hostname
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. broken IPv6 literals)
        return 0
    if not host:
        return 0
    return 1 if _SHORTENER_HOST.search(host) else 0


8.	**getLength**: Used to calculate the length of the URL. It is observed safe URLs are often shorter than malicious URLs.

In [None]:
# Total number of characters in the URL (URL_Length)
def getLength(url):
  """Return the length of the URL string."""
  url_length = len(url)
  return url_length

9. **getSlash**:- This function counts the non-empty segments of the URL path (the pieces separated by “/”), which serves as a measure of how many slashes the path contains. A higher count indicates a higher chance of the URL being dangerous.

In [None]:

# number of non-empty '/'-separated pieces in the URL path
def getSlash(url):
  """Return how many non-empty segments the URL path has.

  The path reported by urlparse is split on '/', and empty pieces (leading,
  trailing or doubled slashes) are ignored.
  NOTE(review): despite the name this is a segment count, not a raw count
  of '/' characters - confirm that is the intended feature.
  """
  segments = urlparse(url).path.split('/')
  return sum(1 for segment in segments if len(segment) != 0)

10. **numDigits**:- This function returns the number of digits in the URL. Higher number of digits increase the chances of URL being dangerous.

In [None]:
def numDigits(url):
        """Return the number of digit characters (str.isdigit) in the URL."""
        return sum(1 for ch in url if ch.isdigit())

11. **numFragments**: This function returns the number of “#” in the URL

In [None]:
def numFragments(url):
        """Return how many '#' characters the URL string contains."""
        return url.count('#')

12. **numSubDomains**: This function splits the URL and returns the number of subdomains in it.

In [None]:
def numSubDomains(url):
    """Return the number of subdomain labels in the URL's host.

    The host is split on '.', and every label beyond the last two (taken to
    be the registered domain and its TLD) counts as a subdomain, so
    'shop.fun.amazon.phishing.com' gives 3 and 'example.com' gives 0.
    IPv4 hosts, missing hosts and unparseable URLs give 0.

    The previous implementation split on '/' and therefore returned the
    number of slashes after the host rather than anything about subdomains.

    NOTE(review): multi-label public suffixes such as 'co.uk' are counted
    as one extra subdomain; a public-suffix list would be needed to avoid
    that.
    """
    try:
        # Second parse recovers the host of scheme-less input ("a.b.com/x").
        host = urlparse(url).hostname or urlparse('//' + url).hostname or ''
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. broken IPv6 literals)
        return 0
    if re.fullmatch(r'\d+(?:\.\d+){3}', host):
        # A dotted-quad IP address has no domain labels at all.
        return 0
    labels = [label for label in host.split('.') if label]
    return max(len(labels) - 2, 0)

13. **domainExtension**: This function returns the domain extension found at the end of the URL.

In [None]:
def domainExtension(url):
    """Return the top-level label of the URL's host (e.g. 'com'), or ''.

    Only the host is inspected. The previous implementation took whatever
    followed the last '.' in the whole URL, so it returned file extensions
    ('exe' for '.../file.exe'), port-suffixed values ('org:8080') or the
    last octet of an IP address.

    An empty string is returned when the host is missing, has no dot, is an
    IP address, or the URL cannot be parsed.
    """
    try:
        # Second parse recovers the host of scheme-less input ("a.com/x").
        host = urlparse(url).hostname or urlparse('//' + url).hostname or ''
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. broken IPv6 literals)
        return ''
    host = host.rstrip('.')  # tolerate a fully-qualified trailing dot
    if '.' not in host or ':' in host or re.fullmatch(r'\d+(?:\.\d+){3}', host):
        # single-label host, IPv6 literal or IPv4 address: no extension
        return ''
    return host.rsplit('.', 1)[-1]


14. **shannon_entropy**: This function calculates the Shannon Entropy(amount of info present) of a string

In [None]:
# Shannon entropy (in bits per character) of an arbitrary string
def shannon_entropy(string):
    """Return the Shannon entropy of `string`, using base-2 logarithms.

    Each distinct character contributes -p*log2(p), where p is its relative
    frequency. An empty string has no characters and yields 0.
    """
    length = len(string)
    probabilities = [occurrences / length for occurrences in Counter(string).values()]
    return -sum(p * log(p, 2) for p in probabilities)

# Shannon entropy of the complete URL string
def url_shannon_entropy(url):
    """Return shannon_entropy() of the whole URL."""
    entropy = shannon_entropy(url)
    return entropy

# Shannon entropy of the domain (netloc) part
def domain_shannon_entropy(url):
    """Return shannon_entropy() of the netloc that urlparse extracts from the URL."""
    return shannon_entropy(urlparse(url).netloc)

# Shannon entropy of the path part
def path_shannon_entropy(url):
    """Return shannon_entropy() of the path that urlparse extracts from the URL."""
    return shannon_entropy(urlparse(url).path)

# Shannon entropy of the query string
def query_shannon_entropy(url):
    """Return shannon_entropy() of the query string that urlparse extracts from the URL."""
    return shannon_entropy(urlparse(url).query)

# Shannon entropy of path and query taken together
def query_path_shannon_entropy(url):
    """Return shannon_entropy() of '<path>?<query>' built from the parsed URL.

    The '?' separator is always inserted, even when the query is empty.
    """
    parts = urlparse(url)
    return shannon_entropy('?'.join((parts.path, parts.query)))

15. **suspiciousExtension**:  If a suspicious extension is present in the URL then the feature is set to 1 else set to 0. Extensions like '.exe','.pif','.application','.gadget' are suspicious extensions. Suspicious extensions are often found in malicious URLs.


In [None]:
# File extensions treated as dangerous when they appear in the path or query.
# NOTE(review): the original list contained a bare 'htm' (no dot); it is
# assumed to be a typo for '.htm' - confirm whether '.html' should be listed too.
_DANGEROUS_EXTENSIONS = ('.exe','.pif','.application','.gadget',
                         '.msi','.msp','.scr','.hta','.cpl','.msc','.jar',
                         '.bat','.cmd','.vb','.vbs','.vbe','.js','.jse',
                         '.ws','.wsf','.wsc','.wsh','.ps1','.ps1xml','.ps2',
                         '.ps2xml','.psc1','.psc2','.msh','.scf','.lnk',
                         '.inf','.reg','.htm','.sfx','.dll','.tmp','.py')

# One alternation of the escaped extensions. The negative lookahead requires
# the extension to end there, so '.js' does not fire on '.json'.
_DANGEROUS_EXTENSION_RE = re.compile(
    r'(?:' + '|'.join(re.escape(ext) for ext in _DANGEROUS_EXTENSIONS) + r')(?![A-Za-z0-9])',
    re.IGNORECASE,
)

def suspiciousExtension(url):
    """Return 1 if the URL's path or query names a file with a dangerous extension, else 0.

    Compared with the previous per-extension re.search over the whole URL:
      * extensions are matched literally (the '.' used to be a regex
        wildcard, so '.py' matched "happy" and '.ws' matched "news");
      * an extension must end at a non-alphanumeric character or the end of
        the text;
      * the host is excluded, so TLDs such as '.ws' or '.py' do not trigger;
      * matching is case-insensitive ('.EXE').
    """
    try:
        parts = urlparse(url)
        haystack = parts.path + '?' + parts.query
    except ValueError:
        # Unparseable URL: fall back to scanning the raw string.
        haystack = url
    return 1 if _DANGEROUS_EXTENSION_RE.search(haystack) else 0

16. **spacePresent**: If “%20” is present in the URL then the feature is set to 1 else to 0. Benign URLs generally do not have "%20" in their URLs.

In [None]:
# Flags URLs containing an encoded space ('%20')
def spacePresent(url):
    """Return 1 if the literal text '%20' occurs in the URL, otherwise 0."""
    return int('%20' in url)


17. **digitToLetterRatio**: This function calculates the digit to letter ratio of the URL. Higher number of digits present in URL tends to increase the ratio. Benign URLs usually have a smaller ratio.


In [None]:
# Calculating Digit to Letter ratio
def digitToLetterRatio(url):
    """Return (number of digits) / (number of letters) for the URL string.

    Digits are characters for which str.isdigit() is true, letters those
    for which str.isalpha() is true; everything else is ignored.

    When the URL contains no letters at all (e.g. a bare IP address without
    a scheme) the denominator is treated as 1 instead of raising
    ZeroDivisionError, so the digit count itself is returned as a float.
    """
    digits = sum(1 for c in url if c.isdigit())
    letters = sum(1 for c in url if c.isalpha())
    return digits / max(letters, 1)


18. **specialCharacters**: This function counts the number of special characters in the URL. Dangerous URLs tend to have higher number of special characters.

In [None]:
# Count number of special characters in the URL
def specialCharacters(url):
    #print("22->",end=" ")
    special_char= 0
    for c in url:
        if (c.isalpha()):
            continue
        elif (c.isdigit()):
            continue
        else:
            special_char += 1
    return special_char


19. **suspiciousWords**: If suspicious words are present in the URL then the feature is set to 1 else set to 0. Words like 'gift','promo','paypal' are security sensitive words or brand names. Using spoofed URLs is common in phishing attacks. Victims tend to get deceived by the brand names and security sensitive words in the URL. Reference to list of words: Tupsamudre H., Singh A.K., Lodha S. (2019) Everything Is in the Name – A URL Based Approach for Phishing Detection. In: Dolev S., Hendler D., Lodha S., Yung M. (eds) Cyber Security Cryptography and Machine Learning. CSCML 2019. Lecture Notes in Computer Science, vol 11527. Springer, Cham. https://doi.org/10.1007/978-3-030-20951-3_21


In [None]:
# Security-sensitive words and brand names looked for in URLs. Kept exactly as
# originally listed (duplicates included); they are removed when the pattern
# below is built.
_SUSPICIOUS_WORDS = (
    'paypal','ali','safety','verify','google','netflix','instagram','icici','hdfc', 'apple','facebook','amazon','porn','gamble',
    'award','limited','securewebsession','confirmation','signin','protection','access','redirectme','secure',
    'recovery', 'verify','limited','secure','web',      'session',      'confirmation', 'page',      'sign',      'team',      'access',      'protection',      'active',
    'manage',      'redirectme',      'secure',      'customer',      'account',
    'client',      'information',      'recovery',      'verify',      'secured',
    'business',      'refund',      'help',      'safe',      'bank',      'event',
    'promo',      'webservis',      'giveaway',      'card',      'webspace',      'user',
    'notify',      'servico',      'store',      'device',      'payment',      'webnode',
    'drive',      'shop',      'gold',      'violation',      'random',      'upgrade',
    'webapp',      'dispute',      'setting',      'banking',      'activity',      'startup',
    'review',      'email',      'approval',      'admin',      'browser',      'billing',
    'advert',      'protect',      'case',      'temporary',      'alert',      'portal',
    'login',      'servehttp',      'center',      'client',      'restore',      'secure',
    'blob',      'smart',      'fortune',      'gift',      'server',      'security',      'page',
    'confirm',      'notification',      'core',      'host',      'central',      'service',
    'account',      'servise',      'support',      'apps',      'form',      'info',      'compute',
    'verification',      'check',      'storage',      'setting',      'digital',      'update',
    'token',      'required',      'resolution',      'ebay',      'webscr',      'free',      'lucky',      'bonus',
)

# Single alternation compiled once; dict.fromkeys() drops the repeated words
# while preserving order. IGNORECASE so "PayPal" or "LOGIN" is caught too.
_SUSPICIOUS_WORDS_RE = re.compile(
    '|'.join(re.escape(word) for word in dict.fromkeys(_SUSPICIOUS_WORDS)),
    re.IGNORECASE,
)

# Check if suspicious words - (security sensitive words/ brand names) present in the URL
def suspiciousWords(url):
    """Return 1 if any listed suspicious word occurs in the URL, else 0.

    Matching is a case-insensitive substring search over the whole URL, so
    short entries such as 'ali' or 'web' also match inside longer words.
    The word list is built and compiled once at definition time instead of
    being rebuilt and searched word-by-word on every call.
    """
    return 1 if _SUSPICIOUS_WORDS_RE.search(url) else 0

## WhoIs Based Features

20. **domainAge**: The domainAge function refers to the amount of time during which a domain name has existed. It is how old a domain name is. An old domain is less likely to be dangerous and more trustworthy.

In [None]:
def _whois_date(value):
  """Normalise one WHOIS date field to a naive datetime, or None if unusable."""
  if isinstance(value, (list, tuple)):
    # Some records carry several dates; the first one is used.
    value = value[0] if value else None
  if isinstance(value, str):
    try:
      value = datetime.strptime(value, '%Y-%m-%d')
    except ValueError:
      return None
  if not isinstance(value, datetime):
    return None
  # Drop tzinfo so aware and naive values can be subtracted; only whole days
  # are used afterwards, so the offset is immaterial.
  return value.replace(tzinfo=None)


def domainAge(domain_name):
  """Return the span between a domain's creation and expiration dates, in years.

  `domain_name` is a WHOIS record exposing `creation_date` and
  `expiration_date`; each may be a datetime, a 'YYYY-MM-DD' string, a list
  of either, or None.

  Returns 1 when either date is missing or cannot be parsed (the fallback
  value the original code used).

  A leftover `return 1` at the top of the function used to make the whole
  computation unreachable, turning the feature into a constant.

  NOTE(review): this measures expiration - creation (registration span),
  not time elapsed since creation - confirm that is the intended "age".
  """
  created = _whois_date(domain_name.creation_date)
  expires = _whois_date(domain_name.expiration_date)
  if created is None or expires is None:
    return 1
  return abs((expires - created).days) / 365.25

21. **country**: This function returns the country of the domain of the URL.

In [None]:
def country(domain_name):
  """Return the `country` attribute of the given WHOIS record unchanged."""
  whois_country = domain_name.country
  return whois_country


## JavaScript based Features

22. **imgCount**: This function counts the number of images in the content of the URL.

In [None]:
# Get number of images
def imgCount(response):
    """Return the number of <img> tags in the fetched page, or None.

    `response` is either the HTTP response object (its `.content` is
    parsed) or the empty-string sentinel the driver uses for a failed
    download. None is returned for the sentinel and whenever the body
    cannot be parsed.
    """
    if response == "":
        # download failed upstream; nothing to parse
        return None
    try:
        # Name the parser explicitly: without it BeautifulSoup picks whatever
        # is installed (and warns), so counts could differ between machines.
        soup = BeautifulSoup(response.content, "html.parser")
    except Exception:
        # Exception rather than a bare except, so Ctrl-C still interrupts.
        return None
    return len(soup.find_all('img'))

23. **iframe**: -  If “<iframe” or “frameBorder="0"” is present in the HTML of the page returned for the URL, the feature is set to 1; otherwise it is set to 0. Phishers can make use of the “iframe” tag and make it invisible, i.e. without frame borders. Since the border of the inserted webpage is invisible, the user believes that the inserted web page is part of the main web page and may enter sensitive information.

In [None]:
def iframe(response):
  """Return 1 if the page markup contains an iframe marker, else 0.

  `response` is either an object with a `.text` attribute holding the page
  source, or the empty string used when the download failed (yields 0).
  The markers looked for are the literal texts '<iframe ' and
  'frameBorder="0"'.
  """
  if response == "":
      return 0
  html = response.text
  has_iframe = "<iframe " in html or 'frameBorder="0"' in html
  return 1 if has_iframe else 0

24. **mouseOver**:- Checks the effect of mouse over on status bar

In [None]:
# Looks for an onmouseover handler inside a <script> element (Mouse_Over)
def mouseOver(response):
  """Return 1 if a single-line <script>...onmouseover...</script> is found, else 0.

  `response` is either an object with a `.text` attribute holding the page
  source, or the empty string used when the download failed. The failed
  case yields 1.
  NOTE(review): iframe() returns 0 for the same failed-download sentinel -
  confirm the asymmetry is intended.
  """
  if response == "":
    return 1
  # '.' does not match newlines here, so only scripts on one line are detected.
  found = re.search("<script>.+onmouseover.+</script>", response.text)
  return 1 if found else 0

Driver Script For the above functions

In [None]:
def featureExtraction(url,label):
  """Build the full feature row for one URL.

  The returned list holds, in order: the 23 lexical features, the two
  WHOIS features (domain age, country), the three page-content features
  (image count, iframe, mouse-over) and finally `label`.

  Network failures never raise: a failed WHOIS lookup gives age 0 and
  country "None"; a failed download passes the empty-string sentinel to
  the page-content functions.

  NOTE(review): this fetches the URL itself with `requests`; the inputs
  are known-malicious links, so run it only in a sandboxed environment.
  """
  # Lexical features (order must stay in sync with the column names used
  # when the rows are turned into a DataFrame).
  features = [
      havingIP(url),
      haveAtSign(url),
      countDot(url),
      prefixSuffix(url),
      redirection(url),
      httpDomain(url),
      tinyURL(url),
      getLength(url),
      getSlash(url),
      numDigits(url),
      numFragments(url),
      numSubDomains(url),
      domainExtension(url),
      url_shannon_entropy(url),
      domain_shannon_entropy(url),
      path_shannon_entropy(url),
      query_shannon_entropy(url),
      query_path_shannon_entropy(url),
      suspiciousExtension(url),
      spacePresent(url),
      digitToLetterRatio(url),
      specialCharacters(url),
      suspiciousWords(url),
  ]

  # WHOIS based features.
  # `except Exception` (not a bare except) so that interrupting the kernel
  # during a long crawl is not swallowed here.
  # NOTE(review): confirm the installed python-whois accepts `timeout=`; if
  # it does not, the resulting TypeError lands here and every lookup is
  # treated as failed.
  try:
    domain_name = whois.whois(urlparse(url).netloc,timeout=1)
  except Exception:
    domain_name = None
  whois_missing = domain_name is None or domain_name.domain_name is None

  features.append(0 if whois_missing else domainAge(domain_name))
  features.append("None" if whois_missing else country(domain_name))

  # HTML & Javascript based features; "" marks a failed download.
  try:
    response = requests.get(url,timeout=1)
  except Exception:
    response = ""
  features.append(imgCount(response))
  features.append(iframe(response))
  features.append(mouseOver(response))
  features.append(label)

  return features

In [None]:

# Extract one feature row per URL, checkpointing the rows collected so far.
phish_features = []
# Value stored as the last element of every row (featureExtraction appends it).
label = 1
progress_bar=tqdm(phish['url'])
for i,url in enumerate(progress_bar):
  # Every 1000 URLs, pickle the rows gathered so far (the i == 0 checkpoint
  # is therefore an empty list).
  # NOTE(review): the path is under drive/MyDrive, but the cell that mounts
  # Google Drive appears further down - presumably it has to be run first.
  if(i%1000==0):
    name='drive/MyDrive/malTrack/p'+str(i)+'.pickle'
    with open(name, 'wb') as f:
      pickle.dump(phish_features, f)
    #print("Url no",i," ",url)

  # Show the current index and URL on the progress bar.
  progress_bar.set_description(f"Url no {i} {url}")
  phish_features.append(featureExtraction(url,label))


Saving The Extracted Features in CSV Format

In [None]:
# Mount Google Drive at /content/drive (only available inside Google Colab).
# NOTE(review): the extraction loop earlier in the notebook already writes
# under drive/MyDrive, so this cell presumably needs to run before it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Column names for the feature rows; they are applied positionally, so the
# order has to match the order in which featureExtraction appends values.
feature_names = ['havingIP', 'haveAtSign', 'countDot', 'prefixSuffix','redirection',
                'httpDomain', 'tinyURL', 'getLength', 'getSlash','numDigit','numfragments','numsubdomain',
                 'domainExt','URL_Shannon_Entropy','Domain_Shannon_Entropy','Path_Shannon_Entropy','Query_Shannon_Entropy',
                 'QandP_Shannon_Entropy','Susp_ext','space preset','digitToLetter_ratio','Sp_Character','susp_words',
                 'domainAge','domainCountry','num_Imag','iframe', 'Mouse_Over', 'Label',]

# Build the feature table and write it out. to_csv() is called with its
# defaults, so the row index is written as an extra, unnamed first column.
phishing = pd.DataFrame(phish_features, columns= feature_names)
phishing.to_csv('ExtractedFeaturesDataset/phish_Features_Final.csv')

In [None]:
# Also keep the complete list of raw feature rows as a pickle on Google Drive
# (the path is relative to the working directory and needs Drive mounted).
name='drive/MyDrive/malTrack/verified_phishing_online.pickle'
with open(name, 'wb') as f:
      pickle.dump(phish_features, f)