### Import libraries

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import whois
import datetime
import requests
import time

import tldextract
import dns.resolver

from bs4 import BeautifulSoup

import ipaddress

import ssl
import re
from urllib.parse import urlparse
import socket
import validators

### Load the URLs

In [8]:
# Read the collected URL table and pull out its 'url' and 'status' columns.
data = pd.read_csv("/Users/yihongan/Desktop/collected_urls.csv")
urls, status = data['url'], data['status']

### Functions to check the existence of URL

In [9]:
def get_domain(url):
    """Split *url* into its full host name and its registered domain.

    Args:
        url: URL or bare host string to analyse.

    Returns:
        A ``(subdomain, domain)`` tuple. ``domain`` is ``"<domain>.<suffix>"``.
        ``subdomain`` is that same string prefixed with the subdomain part
        when one exists, so despite its name it holds the whole host name.
        Empty parts are skipped, so a host without a suffix does not come
        back with a trailing dot.
    """
    parts = tldextract.extract(url)
    # Read the fields by name rather than unpacking three values: the number
    # of fields (and whether the result is iterable at all) is not stable
    # across tldextract releases.
    domain = ".".join(p for p in (parts.domain, parts.suffix) if p)
    subdomain = ".".join(p for p in (parts.subdomain, domain) if p)
    return subdomain, domain

In [10]:
def validate_url(url):
    """Report whether *url* is accepted by ``validators.url``.

    Returns ``True`` only when the validator itself returns ``True``; any
    other result is reported as ``False``.
    """
    return validators.url(url) is True

In [11]:
def get_request(url):
    """Fetch *url* with a GET request, returning ``None`` on failure.

    Strings that do not validate as a URL get ``http://`` prepended before
    the request is made.

    Args:
        url: URL or bare host/path string.

    Returns:
        The response object from ``requests.get``, or ``None`` when the
        request raised.
    """
    if not validate_url(url):
        url = "http://" + url
    try:
        return requests.get(url, timeout=20)
    except Exception:
        # Best-effort fetch: any request failure maps to None. Catching
        # Exception (not a bare except) keeps Ctrl-C and SystemExit able to
        # stop a long-running crawl.
        return None

In [12]:
def get_soup(response):
    """Parse the text of *response* with BeautifulSoup's ``html.parser``.

    Returns ``None`` when *response* is ``None``.
    """
    return None if response is None else BeautifulSoup(response.text, "html.parser")

### Response features

In [13]:
def get_age(domain):
    """Return the age of *domain* in years from its WHOIS creation date.

    Args:
        domain: Domain (or URL) string handed to ``whois.whois``.

    Returns:
        The age as a float counted in 365-day years, or ``None`` when the
        lookup fails or yields no usable creation date.
    """
    try:
        creation_date = whois.whois(domain).creation_date
    except Exception:
        # Lookup failures are expected for dead domains; Exception (not a
        # bare except) still lets Ctrl-C interrupt the loop.
        return None

    # Some records carry several creation dates; use the first one.
    if isinstance(creation_date, list):
        creation_date = creation_date[0] if creation_date else None
    if not isinstance(creation_date, datetime.datetime):
        return None

    # Build "now" with the same tzinfo as the record so that naive and
    # timezone-aware creation dates can both be subtracted.
    now = datetime.datetime.now(tz=creation_date.tzinfo)
    return (now - creation_date).days / 365

def get_login_time(url):
    """Time how long ``get_request(url)`` takes.

    Despite the name, this measures the duration of the whole request call.

    Args:
        url: URL passed through to ``get_request``.

    Returns:
        A ``(seconds, response)`` tuple; ``response`` is whatever
        ``get_request`` returned.
    """
    # perf_counter is monotonic, so the interval cannot be skewed by system
    # clock adjustments during the request.
    started = time.perf_counter()
    response = get_request(url)
    load_time_in_seconds = time.perf_counter() - started
    return load_time_in_seconds, response

def get_external_link(soup, url):
    """Count ``<a>`` tags in *soup* whose non-empty ``href`` lacks *url*.

    The check is a plain substring test, so any href that does not contain
    the full *url* string (relative links included) is counted.
    """
    return sum(
        1
        for anchor in soup.find_all("a")
        if anchor.get("href") and url not in anchor.get("href")
    )

def get_redirects(response):
    """Return the number of entries in ``response.history``."""
    return len(response.history)

def get_ip_address_reputation(response):
    """Look up a reputation label for the address in ``response.url``.

    Returns:
        ``"safe"`` when the lookup body contains ``"OK"``, ``"malicious"``
        otherwise, and ``None`` when any step fails.

    NOTE(review): ``ipaddress.ip_address`` accepts only a bare IP literal, so
    a ``response.url`` carrying a scheme or path raises ``ValueError`` and the
    result is ``None``.
    NOTE(review): checkip.dyndns.org looks like an IP-echo service rather
    than a reputation feed; confirm the intended endpoint before relying on
    this feature.
    """
    try:
        ip_addr = ipaddress.ip_address(response.url)
        response_ip = requests.get(
            f"http://checkip.dyndns.org/?ip={ip_addr}", timeout=20
        )
        return "safe" if "OK" in response_ip.text else "malicious"
    except Exception:
        # Best-effort feature; Exception (not a bare except) keeps Ctrl-C
        # working while the crawl is running.
        return None

def get_num_image(soup):
    """Return how many ``<img>`` tags *soup* contains."""
    return len(soup.find_all("img"))

def get_num_iframes(soup):
    """Return how many ``<iframe>`` tags *soup* contains."""
    return len(soup.find_all("iframe"))

def get_alexa_rank(soup):
    """Read an integer rank from a ``<meta name="alexa">`` tag in *soup*.

    The ``name`` comparison is case-insensitive. When several matching tags
    exist the last one wins.

    Returns:
        The tag's ``content`` converted to ``int``, or ``None`` when no such
        tag exists or anything goes wrong while reading it (missing
        ``content``, non-numeric value, *soup* is ``None``).
    """
    try:
        alexa_rank = None
        for meta in soup.find_all("meta"):
            if "name" in meta.attrs and meta.attrs["name"].lower() == "alexa":
                alexa_rank = int(meta.attrs["content"])
    except Exception:
        # Parsing problems map to None; Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        alexa_rank = None
    return alexa_rank

def get_page_rank(url):
    """Query the Google toolbar PageRank endpoint for the host of *url*.

    Args:
        url: URL whose host part (text between ``//`` and the next ``/``) is
            looked up.

    Returns:
        The integer after the last ``:`` in the response body when the
        endpoint answers with status 200, otherwise ``None``. Any error also
        yields ``None``.

    NOTE(review): the ``ch`` checksum is built from Python's ``hash()`` of a
    bytes object, which is salted per process unless ``PYTHONHASHSEED`` is
    fixed, so the value is not reproducible between runs. Confirm the
    endpoint and checksum scheme before relying on this feature.
    """
    GOOGLE_PR_CHECK_URL = "http://toolbarqueries.google.com/tbr?client=navclient-auto&features=Rank&ch=%s&q=info:%s"
    try:
        domain = url.split("//")[-1].split("/")[0]
        hsh = hash(domain.encode("utf-8")) & 0xEFFFFFFF
        response = requests.get(GOOGLE_PR_CHECK_URL % (hsh, domain), timeout=20)
        if response.status_code != 200:
            return None
        # Use the decoded text: splitting the bytes payload with a str
        # separator raises TypeError.
        return int(response.text.strip().split(":")[-1])
    except Exception:
        # Best-effort feature; Exception (not a bare except) keeps Ctrl-C
        # working while the crawl is running.
        return None

def get_num_hidden_text(soup):
    """Count elements in *soup* whose inline style sets ``display:none``.

    The match ignores case and whitespace, so ``display: none`` and
    ``DISPLAY:NONE`` are both counted.

    Returns:
        The number of matching elements, or ``None`` when *soup* cannot be
        traversed (for example when it is ``None``).
    """
    try:
        num_hidden_text = 0
        for element in soup.find_all():
            style = element.get("style")
            # Strip all whitespace before matching so spacing variants of
            # the declaration are recognised.
            if style and "display:none" in "".join(style.lower().split()):
                num_hidden_text += 1
    except Exception:
        # Exception (not a bare except) lets KeyboardInterrupt propagate.
        num_hidden_text = None
    return num_hidden_text

def get_ext_tot_ratio(soup, url):
    """Return the ratio of external to internal ``<a>`` links in *soup*.

    A link with a non-empty ``href`` is internal when *url* is a substring of
    that href and external otherwise. Despite the function name, the
    denominator is the internal-link count, not the total.

    Returns:
        ``external / internal`` when at least one internal link exists, the
        raw external count when there are none, or ``None`` when *soup*
        cannot be traversed.
    """
    try:
        num_internal_links = 0
        num_external_links = 0
        for link in soup.find_all("a"):
            href = link.get("href")
            if not href:
                continue
            if url in href:
                num_internal_links += 1
            else:
                num_external_links += 1
        if num_internal_links > 0:
            return num_external_links / num_internal_links
        # No internal links: fall back to the external count to avoid
        # dividing by zero.
        return num_external_links
    except Exception:
        # Exception (not a bare except) lets KeyboardInterrupt propagate.
        return None

In [14]:
def get_response_features(url):
    """Collect the seven response-based features for *url*.

    Returns:
        ``[load time, external links, redirects, images, iframes,
        hidden elements, external/internal ratio]``, or a list of seven
        ``None`` values when no response was obtained.
    """
    elapsed, response = get_login_time(url)
    if response is None:
        return [None] * 7

    soup = get_soup(response)
    return [
        elapsed,
        get_external_link(soup, url),
        get_redirects(response),
        get_num_image(soup),
        get_num_iframes(soup),
        get_num_hidden_text(soup),
        get_ext_tot_ratio(soup, url),
    ]

### URL features

In [15]:
def get_suspicious_words(url):
    """Count how many of the watched keywords occur as substrings of *url*.

    Each keyword contributes at most one to the total, and matching is
    case-sensitive.
    """
    keywords = (
        "login", "password", "verify", "account", "security", "wp", "admin",
        "content", "site", "images", "js", "alibaba", "css", "myaccount",
        "dropbox", "themes", "plugins", "signin", "view",
    )
    return sum(1 for keyword in keywords if keyword in url)

def has_ip_address(url):
    """Return 1 when *url* contains a dotted-quad number pattern, else 0.

    Only the shape is checked (four groups of one to three digits); the
    groups are not range-validated.
    """
    match = re.search(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', url)
    if match:
        return 1
    return 0

def is_url_shortened(url):
    """Return 1 when *url* contains a known shortener host name, else 0.

    The check is a substring test against a fixed list of services.
    """
    for service in ("bit.ly", "tinyurl.com", "goo.gl", "ow.ly"):
        if service in url:
            return 1
    return 0


def get_count_features(url):
    """Compute length and character-count features for *url*.

    Args:
        url: URL string to measure.

    Returns:
        A list of 23 values in this order: URL length, host-length / URL-length
        ratio, then substring counts for ``.``, ``www``, ``.com``, ``http``,
        ``https``, ``//``, ``?``, ``%``, ``=``, ``*``, ``$``, ``_``, spaces
        (``%20`` plus literal space), ``/``, ``-``, ``@``, ``~``, ``|``,
        ``:``, ``;`` and ``,``.
    """
    length = len(url)
    subdomain, _ = get_domain(url)
    # An empty URL has no meaningful ratio; report 0.0 instead of raising
    # ZeroDivisionError.
    subdomain_ratio = len(subdomain) / length if length else 0.0

    # Tokens counted before the combined "space" feature.
    leading_tokens = (".", "www", ".com", "http", "https", "//", "?", "%",
                      "=", "*", "$", "_")
    # Tokens counted after it.
    trailing_tokens = ("/", "-", "@", "~", "|", ":", ";", ",")

    # Encoded and literal spaces are reported as a single feature.
    num_space = url.count("%20") + url.count(" ")

    return [
        length,
        subdomain_ratio,
        *(url.count(token) for token in leading_tokens),
        num_space,
        *(url.count(token) for token in trailing_tokens),
    ]

In [16]:
def get_url_features(url):
    """Return all URL-string features for *url* as one flat list.

    The list starts with the suspicious-word count, the IP-address flag and
    the shortener flag, followed by everything from ``get_count_features``.
    """
    return [
        get_suspicious_words(url),
        has_ip_address(url),
        is_url_shortened(url),
        *get_count_features(url),
    ]

### Get the age

In [None]:
# One get_age result per URL, in the same order as `urls`.
ages = [get_age(url) for url in urls]

In [None]:
# `columns` must be list-like; a bare string is rejected by pandas.
age_data = pd.DataFrame(ages, columns=['age']) # Output this dataframe

### Get response features

In [71]:
# One row of response features per URL; tqdm shows progress over the crawl.
database = [get_response_features(url) for url in tqdm(urls)]

100%|██████████| 22902/22902 [9:20:18<00:00,  1.47s/it]   


In [90]:
# Column names for the rows produced by get_response_features, in order.
response_features = [
    "login_time",
    "num_ex_links",
    "num_redirects",
    "num_img",
    "num_iframe",
    "num_hidden",
    "ext_tot_ratio",
]

In [None]:
# Tabulate the collected response features under their column names.
html_features_data = pd.DataFrame(data=database, columns=response_features)  # Output this dataframe

### Get URL features

In [16]:
# One row of URL-string features per URL, in the same order as `urls`.
dataset = [get_url_features(url) for url in tqdm(urls)]

100%|██████████| 22902/22902 [00:00<00:00, 76962.11it/s]


In [17]:
# Column names for the rows produced by get_url_features, in order.
url_features = [
    # Keyword and pattern flags
    "num_suspicious_words", "has_ip_address", "is_url_shortened",
    # Length features
    "length", "subdomain_ratio",
    # Substring counts
    "num_dots", "num_www", "num_dcom", "num_http", "num_https", "num_2slash",
    "num_quest", "num_prtc", "num_equal", "num_star", "num_dollar",
    "num_under", "num_space", "num_slash", "num_dash", "num_at", "num_tile",
    "num_line", "num_colon", "num_semic", "num_comma",
]

In [18]:
# Tabulate the collected URL-string features under their column names.
url_features_data = pd.DataFrame(data=dataset, columns=url_features)  # Output this dataframe

### Combine all the features to one dataset

In [19]:
# pd.concat takes the objects to join as ONE sequence; passing them as
# separate positional arguments raises TypeError. axis=1 places the pieces
# side by side as columns.
url_features_data = pd.concat(
    [urls, age_data, url_features_data, html_features_data, status],
    axis=1,
)
url_features_data

Unnamed: 0,url,age,num_suspicious_words,has_ip_address,is_url_shortened,length,subdomain_ratio,num_dots,num_www,num_dcom,...,num_space,num_slash,num_dash,num_at,num_tile,num_line,num_colon,num_semic,num_comma,status
0,http://www.crestonwood.com/router.php,14.536986,0,0,0,37,0.513514,3,1,1,...,0,3,0,0,0,0,1,0,0,0
1,http://shadetreetechnology.com/V4/validation/a...,18.419178,0,0,0,77,0.298701,1,0,1,...,0,5,0,0,0,0,1,0,0,1
2,https://support-appleld.com.secureupdate.duila...,13.586301,0,0,0,126,0.396825,4,0,2,...,0,5,1,0,0,0,1,0,0,1
3,http://rgipt.ac.in,15.487671,0,0,0,18,0.611111,2,0,0,...,0,2,0,0,0,0,1,0,0,0
4,http://www.iracing.com/tracks/gateway-motorspo...,25.052055,0,0,0,55,0.272727,2,1,1,...,0,5,2,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22897,www.vee.com/services/costumes.aspx,27.408219,0,0,0,34,0.323529,3,1,1,...,0,2,0,0,0,0,0,0,0,0
22898,www.angelfire.com/art/hdesign/,,0,0,0,30,0.566667,2,1,1,...,0,3,0,0,0,0,0,0,0,0
22899,members.tripod.com/codpiece_28/,28.054795,0,0,0,31,0.580645,2,0,1,...,0,2,0,0,0,0,0,0,0,0
22900,www.freewebs.com/olivia-s-costume/,23.167123,0,0,0,34,0.470588,2,1,1,...,0,2,2,0,0,0,0,0,0,0


In [200]:
# Write the combined feature table to disk with a header row and no index column.
url_features_data.to_csv(
    "/Users/yihongan/Desktop/phishing_data.csv",
    header=True,
    index=False,
)