### Import libraries

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import whois
import datetime
import requests
import time

import tldextract
import dns.resolver

from bs4 import BeautifulSoup

import ipaddress

import ssl
import re
from urllib.parse import urlparse
import socket
import validators

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pickle

In [2]:
# What version of Python do you have?
# Diagnostic cell: prints the platform, interpreter and library versions and
# whether TensorFlow can see a GPU, so recorded results can be tied to an
# environment.
import sys

import tensorflow.keras
import pandas as pd
import sklearn as sk
import tensorflow as tf
import platform

print(f"Python Platform: {platform.platform()}")
print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tensorflow.keras.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
# A non-empty list of physical 'GPU' devices means TensorFlow found a GPU.
gpu = len(tf.config.list_physical_devices('GPU') ) > 0
print("GPU is", "available" if gpu else "NOT AVAILABLE")

Python Platform: macOS-13.0.1-arm64-arm-64bit
Tensor Flow Version: 2.12.0
Keras Version: 2.12.0

Python 3.10.10 | packaged by conda-forge | (main, Mar 24 2023, 20:12:31) [Clang 14.0.6 ]
Pandas 2.0.0
Scikit-Learn 1.2.2
GPU is available


### Functions to extract features

In [3]:
def get_age(domain):
    """Return the age of a domain in years, based on its WHOIS creation date.

    Args:
        domain: Domain name (or URL) passed to ``whois.whois``.

    Returns:
        The number of whole days since the creation date divided by 365, or
        ``0`` when the lookup fails or no usable creation date is available.
    """
    try:
        creation_date = whois.whois(domain).creation_date

        # The lookup may return a list of dates; use the first one.
        if isinstance(creation_date, list):
            creation_date = creation_date[0] if creation_date else None

        # A missing or unparsed (e.g. string) date cannot be subtracted.
        if not isinstance(creation_date, datetime.datetime):
            return 0

        # Build "now" with the same timezone-awareness as the creation date:
        # subtracting a naive datetime from an aware one raises TypeError.
        now = datetime.datetime.now(creation_date.tzinfo)
        return (now - creation_date).days / 365
    except Exception:
        # Best effort: any lookup or parsing failure counts as "age unknown".
        # Unlike a bare ``except:``, KeyboardInterrupt still propagates.
        return 0

def get_domain(url):
    """Split a URL into its full host name and its registered domain.

    Args:
        url: URL or host name to analyse with ``tldextract.extract``.

    Returns:
        A ``(subdomain, domain)`` tuple of strings. ``domain`` is
        ``"<domain>.<suffix>"``; ``subdomain`` is the same string prefixed
        with ``"<sub>."`` when the URL has a subdomain part.
    """
    parts = tldextract.extract(url)
    # Read the fields by name rather than unpacking the result: unpacking
    # only works while the result is a 3-field tuple, whereas the named
    # attributes are available regardless of the result's concrete type.
    sub, name, suffix = parts.subdomain, parts.domain, parts.suffix
    domain = f"{name}.{suffix}"
    subdomain = domain if sub == "" else f"{sub}.{domain}"
    return subdomain, domain

def validate_url(url):
    """Report whether ``validators.url`` accepts ``url``.

    Returns:
        ``True`` only when the validator's result compares equal to ``True``;
        any other result yields ``False``.
    """
    return validators.url(url) == True

def get_request(url):
    """Fetch ``url`` with an HTTP GET, returning ``None`` when that fails.

    Args:
        url: Full URL, or a bare host/path without a scheme.

    Returns:
        The ``requests`` response object, or ``None`` if the request raised.
    """
    # A URL that fails validation only gets "http://" prepended when it has
    # no scheme at all; prepending to "https://..." would yield the
    # unreachable "http://https://...".
    if not validate_url(url) and "://" not in url:
        url = "http://" + url
    try:
        return requests.get(url, timeout=20)
    except Exception:
        # Best effort: connection errors, timeouts and malformed URLs all
        # mean "no response". KeyboardInterrupt is no longer swallowed.
        return None
 
def get_soup(response):
    """Parse a response body into a BeautifulSoup tree.

    Returns:
        ``None`` when ``response`` is ``None``; otherwise the result of
        parsing ``response.text`` with the ``"html.parser"`` backend.
    """
    return None if response is None else BeautifulSoup(response.text, "html.parser")

def get_login_time(url):
    """Fetch ``url`` and measure how long the request took.

    Returns:
        A ``(seconds, response)`` tuple: the wall-clock duration of the
        ``get_request`` call and whatever that call returned.
    """
    started = time.time()
    response = get_request(url)
    elapsed = time.time() - started
    return elapsed, response

def get_external_link(soup, url):
    """Count the ``<a>`` tags whose ``href`` does not contain ``url``.

    Anchors with a missing or empty ``href`` are ignored.
    """
    hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
    return sum(1 for href in hrefs if href and url not in href)

def get_redirects(response):
    """Return the number of entries in ``response.history``."""
    return len(response.history)

def get_ip_address_reputation(response):
    """Look up a reputation label for the address in ``response.url``.

    Args:
        response: Object whose ``url`` attribute is parsed as an IP address.

    Returns:
        ``"safe"`` when the lookup page text contains ``"OK"``,
        ``"malicious"`` when it does not, or ``None`` when ``response.url``
        is not a bare IP literal or the lookup fails.

    NOTE(review): ``response.url`` is handed to ``ipaddress.ip_address``
    unchanged. A full URL (scheme, path) is not a valid IP literal, so for
    such input this returns ``None`` without any lookup; confirm whether
    the host part was meant to be extracted first.
    NOTE(review): confirm that checkip.dyndns.org actually reports
    reputation for the ``ip`` query parameter.
    """
    try:
        ip_addr = ipaddress.ip_address(response.url)
        # Bound the call with a timeout so a stalled server cannot hang
        # feature extraction.
        response_ip = requests.get(f"http://checkip.dyndns.org/?ip={ip_addr}", timeout=20)
        reputation = "safe" if "OK" in response_ip.text else "malicious"
    except Exception:
        # Best effort; unlike a bare ``except:`` this lets KeyboardInterrupt
        # propagate.
        reputation = None
    return reputation

def get_num_image(soup):
    """Return how many ``<img>`` tags ``soup.find_all`` reports."""
    return len(soup.find_all("img"))

def get_num_iframes(soup):
    """Return how many ``<iframe>`` tags ``soup.find_all`` reports."""
    return len(soup.find_all("iframe"))

def get_alexa_rank(soup):
    """Read an integer rank from a ``<meta name="alexa" content="...">`` tag.

    Args:
        soup: Parsed document exposing ``find_all("meta")``.

    Returns:
        The integer ``content`` of the last matching meta tag, or ``None``
        when there is no such tag or any tag fails to parse.
    """
    try:
        alexa_rank = None
        for meta in soup.find_all("meta"):
            attrs = meta.attrs
            if "name" in attrs and attrs["name"].lower() == "alexa":
                # No break: a later matching tag overrides an earlier one.
                alexa_rank = int(attrs["content"])
    except Exception:
        # Missing/non-numeric content or an unusable soup means "unknown".
        # KeyboardInterrupt is no longer swallowed.
        alexa_rank = None
    return alexa_rank

def get_page_rank(url):
    """Query the Google toolbar endpoint for the PageRank of ``url``'s host.

    Args:
        url: URL whose host part (text between ``//`` and the next ``/``)
            is looked up.

    Returns:
        The integer after the last ``:`` in the response body when the
        status code is 200, otherwise ``None`` (also on any error).

    NOTE(review): the ``ch`` value comes from Python's built-in ``hash`` of
    a bytes object, which is salted per process unless PYTHONHASHSEED is
    fixed, so it differs between runs. Confirm what checksum the endpoint
    expects and whether the endpoint still responds.
    """
    try:
        GOOGLE_PR_CHECK_URL = "http://toolbarqueries.google.com/tbr?client=navclient-auto&features=Rank&ch=%s&q=info:%s"
        domain = url.split("//")[-1].split("/")[0]
        hsh = hash(domain.encode("utf-8")) & 0xEFFFFFFF
        response = requests.get(GOOGLE_PR_CHECK_URL % (hsh, domain), timeout=20)
        if response.status_code == 200:
            # Parse the decoded text: ``response.content`` is bytes and
            # cannot be split on a str separator (TypeError).
            page_rank = int(response.text.strip().split(":")[-1])
        else:
            page_rank = None
    except Exception:
        # Best effort; KeyboardInterrupt is no longer swallowed.
        page_rank = None
    return page_rank

def get_num_hidden_text(soup):
    """Count elements whose inline style contains ``display:none``.

    Args:
        soup: Parsed document exposing ``find_all()``.

    Returns:
        The number of elements whose ``style`` attribute, lower-cased,
        contains the exact substring ``"display:none"``, or ``None`` when
        the document cannot be scanned.
    """
    try:
        num_hidden_text = sum(
            1
            for element in soup.find_all()
            if element.get("style") and "display:none" in element.get("style").lower()
        )
    except Exception:
        # Best effort; unlike a bare ``except:`` this lets KeyboardInterrupt
        # propagate.
        num_hidden_text = None
    return num_hidden_text

def get_ext_tot_ratio(soup, url):
    """Return the ratio of external to internal ``<a>`` links.

    A link counts as internal when its ``href`` contains ``url`` and as
    external otherwise; anchors with a missing or empty ``href`` are skipped.

    Args:
        soup: Parsed document exposing ``find_all("a")``.
        url: URL string used for the containment check.

    Returns:
        ``external / internal`` when there is at least one internal link,
        the external count itself when there is none, or ``None`` when the
        document cannot be scanned.
    """
    try:
        hrefs = [link.get("href") for link in soup.find_all("a")]
        hrefs = [href for href in hrefs if href]
        num_internal_links = sum(1 for href in hrefs if url in href)
        num_external_links = len(hrefs) - num_internal_links
        if num_internal_links > 0:
            external_to_internal_ratio = num_external_links / num_internal_links
        else:
            # No internal links: fall back to the raw external count rather
            # than dividing by zero.
            external_to_internal_ratio = num_external_links
    except Exception:
        # Best effort; unlike a bare ``except:`` this lets KeyboardInterrupt
        # propagate.
        external_to_internal_ratio = None
    return external_to_internal_ratio

def get_response_features(url):
    """Fetch ``url`` and build the seven page-level features.

    Returns:
        ``[load_time, num_external_links, num_redirects, num_images,
        num_iframes, num_hidden, ext_ratio]``, or seven zeros when no
        response could be obtained.
    """
    load_seconds, response = get_login_time(url)
    if response is None:
        return [0] * 7

    soup = get_soup(response)
    return [
        load_seconds,
        get_external_link(soup, url),
        get_redirects(response),
        get_num_image(soup),
        get_num_iframes(soup),
        get_num_hidden_text(soup),
        get_ext_tot_ratio(soup, url),
    ]

def get_suspicious_words(url):
    """Count how many of the watch-list keywords occur in ``url``.

    Each keyword contributes at most 1, however often it appears; matching
    is a case-sensitive substring test.
    """
    watch_list = (
        "login", "password", "verify", "account", "security", "wp", "admin",
        "content", "site", "images", "js", "alibaba", "css", "myaccount",
        "dropbox", "themes", "plugins", "signin", "view",
    )
    return sum(1 for word in watch_list if word in url)

def has_ip_address(url):
    """Return 1 if ``url`` contains a dotted-quad number pattern, else 0.

    The pattern is four groups of one to three digits separated by dots; the
    groups are not range-checked.
    """
    match = re.search(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', url)
    return int(match is not None)

def is_url_shortened(url):
    """Return 1 if ``url`` contains a known URL-shortener host name, else 0."""
    for service in ("bit.ly", "tinyurl.com", "goo.gl", "ow.ly"):
        if service in url:
            return 1
    return 0


def get_count_features(url):
    """Build the length and character-count features of a URL string.

    Args:
        url: URL to analyse.

    Returns:
        A list of 23 numbers: ``length``, ``subdomain_ratio`` (length of the
        full host name from ``get_domain`` divided by the URL length, or 0
        for an empty URL), then the occurrence counts for num_dots, num_www,
        num_dcom, num_http, num_https, num_2slash, num_quest, num_prtc,
        num_equal, num_star, num_dollar, num_under, num_space, num_slash,
        num_dash, num_at, num_tile, num_line, num_colon, num_semic and
        num_comma, in that order.
    """
    length = len(url)
    subdomain, _ = get_domain(url)
    # Guard the empty string so the ratio never divides by zero.
    subdomain_ratio = len(subdomain) / length if length else 0

    # One entry per count feature, in output order. Each entry lists the
    # substrings whose occurrences are summed for that feature (num_space
    # counts both the encoded and the literal space).
    counted_tokens = [
        ('.',), ('www',), ('.com',), ('http',), ('https',), ('//',),
        ('?',), ('%',), ('=',), ('*',), ('$',), ('_',),
        ('%20', ' '),
        ('/',), ('-',), ('@',), ('~',), ('|',), (':',), (';',), (',',),
    ]
    counts = [sum(url.count(token) for token in tokens) for tokens in counted_tokens]

    return [length, subdomain_ratio] + counts
    
def get_url_features(url):
    """Concatenate all URL-string features into one flat list.

    Order: suspicious-word count, IP-address flag, shortener flag, then
    every value returned by ``get_count_features``.
    """
    return [
        get_suspicious_words(url),
        has_ip_address(url),
        is_url_shortened(url),
        *get_count_features(url),
    ]

### Load the models, imputer and scaler

In [24]:
# Directory holding the trained artifacts. Set the PHISHING_MODEL_DIR
# environment variable to point somewhere else; the default keeps the
# location this notebook was originally run with.
import os
from pathlib import Path

MODEL_DIR = Path(os.environ.get('PHISHING_MODEL_DIR', '/Users/yihongan/Desktop'))


def _load_pickle(filename):
    """Unpickle ``filename`` from MODEL_DIR and return the loaded object."""
    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # artifacts that you produced yourself or otherwise trust.
    with open(MODEL_DIR / filename, 'rb') as f:
        return pickle.load(f)


# Load the tokenizer
tokenizer = _load_pickle('tokenizer.pkl')

# Load the fitted feature scaler
scaler = _load_pickle('std_sc.pkl')

# Load imputer
imputer = _load_pickle('lr_imputer.pkl')

# Load the CNN
CNN = tf.keras.models.load_model(str(MODEL_DIR / 'CNN.tf'))

# Load the DNN
DNN = tf.keras.models.load_model(str(MODEL_DIR / 'DNN.tf'))

# Load the rf
rf = _load_pickle('RF.pkl')


### Function for single URL classification

In [79]:
# Column names for the numeric feature vector, in the order the vector is
# assembled in predict_single_url: age, URL-string features, response features.
feature_names = [
    # WHOIS feature
    'age',
    # URL keyword / flag features
    'num_suspicious_words', 'has_ip_address', 'is_url_shortened',
    # URL length and character counts
    'length', 'subdomain_ratio',
    'num_dots', 'num_www', 'num_dcom', 'num_http', 'num_https', 'num_2slash',
    'num_quest', 'num_prtc', 'num_equal', 'num_star', 'num_dollar',
    'num_under', 'num_space', 'num_slash', 'num_dash', 'num_at', 'num_tile',
    'num_line', 'num_colon', 'num_semic', 'num_comma',
    # Features derived from the HTTP response / page content
    'login_time', 'num_ex_links', 'num_redirects', 'num_img', 'num_iframe',
    'num_hidden', 'ext_tot_ratio',
]

In [94]:
def predict_single_url(url, model='RF'):
    """Extract features for one URL and classify it as phishing or legitimate.

    Args:
        url: URL to classify.
        model: ``'CNN'`` or ``'DNN'`` selects that network. Any other value,
            including the default, selects the Random Forest (the previous
            list default reached the Random Forest the same way).

    Returns:
        A ``(pred, feature)`` tuple. ``pred`` is ``"Phishing"`` when the
        predicted probability exceeds 0.5 and ``"Legitimate"`` otherwise.
        ``feature`` is a one-row DataFrame of the unscaled numeric features
        (after imputation, if any was needed) with ``feature_names`` columns.
    """
    print(f'Test url: {url}')
    # Numerical features
    age = get_age(url)
    response_features = get_response_features(url)
    url_features = get_url_features(url)

    # get_response_features returns all zeros when the request failed, so a
    # zero load time is used as the "no response" marker.
    if response_features[0] == 0:
        print(f'No response from {url}')

    # dtype=float turns any None (a feature that could not be computed) into
    # NaN, so missing values can be detected in every column, not just `age`.
    X_num = np.array([[age] + url_features + response_features], dtype=float)
    if np.isnan(X_num).any():
        X_num = imputer.transform(X_num)
    X_num_norm = scaler.transform(X_num)
    feature = pd.DataFrame(np.asarray(X_num).reshape(1, -1), columns=feature_names)

    # Textual features: the tokenised URL, padded at the end to 100 tokens.
    text_input_shape = (100,)
    X_text = np.array(pad_sequences(tokenizer.texts_to_sequences([url]), maxlen=text_input_shape[0], padding="post"))

    if model == 'CNN':
        print('Model: CNN')
        # The CNN takes both the token sequence and the scaled numeric vector.
        prob = CNN.predict([X_text, X_num_norm], verbose=0).squeeze()
    elif model == 'DNN':
        print('Model: DNN')
        # verbose=0 matches the CNN call and suppresses the progress bar.
        prob = DNN.predict(X_num_norm, verbose=0).squeeze()
    else:
        print('Model: Random Forest')
        # Column 1 of predict_proba is read as the phishing-class probability.
        prob = rf.predict_proba(X_num_norm)[:, 1].squeeze()

    pred = "Phishing" if prob > 0.5 else "Legitimate"

    print(f"Prediction: {pred}")
    print(f"Predicted probability of being phishing: {prob}")

    return pred, feature

### Testing with phishing websites

In [99]:
# Classify a suspected phishing URL with the CNN model.
test_url = 'https://anazom.co.ip.azddeu.com/'
pred, feature = predict_single_url(test_url, model='CNN')

Test url: https://anazom.co.ip.azddeu.com/
No response from https://anazom.co.ip.azddeu.com/
Model: CNN
Prediction: Phishing
Predicted probability of being phishing: 0.9990572333335876


In [100]:
# Classify a second suspected phishing URL with the Random Forest model.
test_url = 'https://www.aeombamk-co-jp.zanqianjia.com/'
pred, feature = predict_single_url(test_url, model='RF')

Test url: https://www.aeombamk-co-jp.zanqianjia.com/
No response from https://www.aeombamk-co-jp.zanqianjia.com/
Model: Random Forest
Prediction: Phishing
Predicted probability of being phishing: 0.67


In [101]:
# Same URL as the previous cell, this time scored with the CNN model.
test_url = 'https://www.aeombamk-co-jp.zanqianjia.com/'
pred, feature = predict_single_url(test_url, model='CNN')

Test url: https://www.aeombamk-co-jp.zanqianjia.com/
No response from https://www.aeombamk-co-jp.zanqianjia.com/
Model: CNN
Prediction: Phishing
Predicted probability of being phishing: 0.9950657486915588


### Testing with legitimate websites

In [98]:
# Classify a URL expected to be legitimate with the CNN model.
test_url = 'https://www.sta.cuhk.edu.hk/peoples/scpy/'
pred, feature = predict_single_url(test_url, model='CNN')

Test url: https://www.sta.cuhk.edu.hk/peoples/scpy/
Model: CNN
Prediction: Legitimate
Predicted probability of being phishing: 0.004463085439056158


In [97]:
# Classify a URL expected to be legitimate with the Random Forest model.
test_url = 'https://www.google.com.hk'
pred, feature = predict_single_url(test_url, model='RF')

Test url: https://www.google.com.hk
Model: Random Forest
Prediction: Legitimate
Predicted probability of being phishing: 0.06
