# Malicious URL detection
Eli Belkind 208250431                                                               
Koral Elbaz 318477684                                                                           
Itamar Almog 208196600 

In [1]:
# imports
import math
import whois
from pyquery import PyQuery
from requests import get
import numpy as np
import urllib.request as r
import re
from xml.dom import minidom
import dns.resolver
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pandas as pd
import ssl

In [2]:
# the features we extracted from each url
features = ['domain_len','hostname_len','https','num_of_dots','num_of_digits','ip_in_url','max_constant','max_vowel',
       'special_char_domain','entropy','parameters','fragments','php_url','delimiters','encoded','special_char_url',
        'url_depth','sensitive_words_url','A_size','NS_size','MX_size','TXT_size','has_spf','site_is_up','comments','html_size',
        'script_size','body_to_script_r','special_char_html','special_char_to_script_r','special_char_to_body_r','num_of_scripts',
        'num_of_iframes','num_of_images','num_of_links','num_of_href','num_of_titles','php_html','exe_html','sensitive_words_html',
        'malicious_code_count','reach_rank','country_rank','days_since_registration','num_of_registration','days_since_expiration',
        'num_of_expiration','days_since_update','num_of_updates','has_ssl']

In [3]:
# a class that extract data fron the given url
class urlF:

    def __init__(self, url):
        self.timeout = 3
        self.url = url
        self.domain = self.getDomain(url)
        self.today = datetime.now()
        self.dns_res = dns.resolver.Resolver()
        self.dns_res.lifetime = self.dns_res.timeout = 1
        try:
            self.whois = whois.whois(self.domain)
        except:
            self.whois = None
        try:
            if url[:7] == "http://" or url[:8] == "https://":
                link = url
            else:
                link = "https://" + url
            self.page = get(link, timeout=self.timeout).text
            # check for comments before cleaning
            if '<!--' in self.page and '-->' in self.page:
                self.comments = 1
            else:
                self.comments = 0
            self.page = re.sub("(<!--.*?-->)", "", self.page, flags=re.DOTALL)
            self.page = self.page.replace('\n', '')
            self.page = self.page.replace('\t', '')
            self.page = self.page.replace('\r', '')
            self.pq = PyQuery(self.page)
            self.up = True
        except:
            self.page = ""
            self.pq = None
            self.up = False
            self.comments = 0

    def getDomain(self, name):
        if name[:7] == "http://":
            domain = name[7:]
        elif name[:8] == "https://":
            domain = name[8:]
        else:
            domain = name
        if domain[:4] == "www.":
            domain = domain[4:]
        if domain.find('/') != -1:
            domain = domain[:domain.find('/')]
        return domain

    # URL

    def num_of_dots(self):
        count = 0
        for c in self.domain:
            if c == '.':
                count += 1
        return [count]

    def digits_in_dom(self):
        count = 0
        for c in self.domain:
            if c.isdigit():
                count += 1
        return [count]

    def ipInUrl(self):
        regex = r'[0-9]+(?:\.[0-9]+){3}'
        match = re.search(regex,self.url)
        if match:
            return [1]
        else:
            return [0]

    def maxc(self):
        constant = ['q', 'r', 't', 'p', 's', 'd', 'f', 'g', 'j', 'k', 'l', 'z', 'x', 'c', 'v', 'b', 'n', 'm', 'w', 'h']
        maxc = 0
        for i in range(len(self.domain) - 1):
            count = 0
            for j in range(i, len(self.domain)):
                if self.domain[j] in constant:
                    count += 1
                else:
                    break
            if count > maxc:
                maxc = count
        return [maxc]

    def maxv(self):
        vowls = ['a', 'e', 'o', 'u', 'i']
        maxv = 0
        for i in range(len(self.domain) - 1):
            count = 0
            for j in range(i, len(self.domain)):
                if self.domain[j] in vowls:
                    count += 1
                else:
                    break
            if count > maxv:
                maxv = count
        return [maxv]

    def getSpecialCharDom(self):
        count = 0
        for c in self.domain[:self.domain.find('.')]:
            if not c.isalpha() and not c.isdigit():
                count += 1
        return [count]

    def getEntropy(self):
        string = self.domain.strip()
        prob = [float(string.count(c)) / len(string) for c in dict.fromkeys(list(string))]
        entropy = -1 * sum([(p * math.log(p,2)) for p in prob])
        return [entropy]

    def delimeterCount(self):
        common_delimiters = [';', '_', '?', '=', '(', ')', ',', '$', '!', '*']
        count = 0
        for i in common_delimiters:
            count += self.url.count(i)
        return [count]

    def encoded(self):
        return [self.url.count('%')]

    def specialCharUrl(self):
        if self.url[:7] == "http://":
            name = self.url[7:]
        elif self.url[:8] == "https://":
            name = self.url[8:]
        else:
            name = self.url
        special_char = ['//', '@', '-']
        count = 0
        for c in special_char:
            count += name.count(c)
        return [count]

    def sensitiveWordsCount(self):
        sen_words = ['email', 'account', 'login', 'password','server','free']
        count = 0
        for i in sen_words:
            if i in self.url:
                count += 1
        return [count]

    # DNS

    def dnsA(self):
        try:
            result = self.dns_res.query(self.domain, 'A')
            return [len(result)]
        except:
            return [0]

    def dnsNS(self):
        try:
            result = self.dns_res.query(self.domain, 'NS')
            return [len(result)]
        except:
            return [0]

    def dnsMX(self):
        try:
            result = self.dns_res.query(self.domain, 'MX')
            return [len(result)]
        except:
            return [0]

    def dnsTXT(self):
        try:
            result = self.dns_res.query(self.domain, 'TXT')
            length = len(result)
            spf = 0
            for r in result:
                if 'v=spf' in r.to_text():
                    spf = 1
                    break
            return [[length],[spf]]
        except:
            return [[0],[0]]

    # HTML

    def getHtmlLength(self):
        return [len(self.page)]

    def getScriptLength(self):
        if self.up:
            return [len(self.pq.text('script').text())]
        else:
            return [0]

    def scriptToBodyR(self):
        if self.up and not self.getHtmlLength() == 0:
            return [self.getScriptLength()[0]/self.getHtmlLength()[0]]
        else:
            return [0]

    def specialCharHtml(self):
        if self.up:
            count = 0
            for c in self.page:
                if not c.isalpha() and not c.isdigit():
                    count += 1
            return [count]
        else:
            return [0]

    def specialCharToBodyR(self):
        if self.up and not self.getHtmlLength() == 0:
            return [self.specialCharHtml()[0]/self.getHtmlLength()[0]]
        else:
            return [0]

    def specialChatToScript(self):
        if self.up and not self.getScriptLength() == 0:
            return [self.specialCharHtml()[0]/self.getScriptLength()[0]]
        else:
            return [0]

    def linksCount(self):
        links = ['http://', 'https://', 'ftp://', 'gopher://', 'file://', 'mailto://']
        count = 0
        for l in links:
            count += self.page.count(l)
        return [count]

    def refCount(self):
        if self.up:
            return [len([i for i in self.pq('a').items()])]
        else:
            return [0]

    def sensitiveTopics(self):
        sec_sen_words = ['account', 'bank', 'secure', 'password', 'login', 'signin', 'credit', 'pay', 'click', 'mail'
            , 'prize', 'money', 'hack', 'download', 'free', 'now', 'credit', 'buy']
        count = 0
        for word in sec_sen_words:
            if word in self.page:
                count += 1
        return [count]

    def malCode(self):
        mal_code_words = ['excec(', 'eval(', 'escape(', 'link(', 'underescape(', 'search(', '.write', 'fromCharCode',
                          '.innerHTML','.outerHTML']
        count = 0
        for word in mal_code_words:
            if word in self.page:
                count += 1
        return [count]

    # Context

    def siteRank(self):
        f = []
        path = 'https://data.alexa.com/data?cli=10&dat=snbamz&url=' + self.domain
        try:
            res = r.urlopen(path, timeout=self.timeout)
            dom = minidom.parse(res)
            flag = True
            for element in dom.getElementsByTagName('REACH'):
                if element.hasAttribute('RANK'):
                    f.append([element.attributes['RANK'].value])
                    flag = False
            if flag:
                f.append([0])
            flag = True
            for element in dom.getElementsByTagName('COUNTRY'):
                if element.hasAttribute('RANK'):
                    f.append([element.attributes['RANK'].value])
                    flag = False
            if flag:
                f.append([0])

        except:
            f.append([0])
            f.append([0])
        return f

    def daysSinceRegistration(self):
        days = 0
        size = 0
        if self.whois and self.whois.creation_date:
            if type(self.whois.creation_date) is datetime:
                diff = self.today - self.whois.creation_date
                size = 1
                days = diff.days
            elif type(self.whois.creation_date) is list:
                diff = timedelta(days=0)
                for reg in self.whois.creation_date:
                    if type(reg) is str:
                        try:
                            tmp = self.today - datetime.strptime(reg, "%Y-%m-%d %H:%M:%S")
                        except:
                            continue
                    else:
                        tmp = self.today - reg
                    if tmp > diff:
                        diff = tmp
                size = len(self.whois.creation_date)
                days = diff.days
            elif type(self.whois.creation_date) is str:
                try:
                    time = datetime.strptime(self.whois.creation_date, "%Y-%m-%d %H:%M:%S")
                    diff = self.today - time
                    days = diff.days
                except:
                    days = 0
                size = 1
        return [[days], [size]]

    def daysSinceExpiration(self):
        days = 0
        size = 0
        if self.whois and self.whois.expiration_date:
            if type(self.whois.expiration_date) is datetime:
                diff = self.whois.expiration_date - self.today
                size = 1
                days = diff.days
            elif type(self.whois.expiration_date) is list:
                diff = timedelta(days=0)
                for reg in self.whois.expiration_date:
                    if type(reg) is str:
                        try:
                            tmp = datetime.strptime(reg, "%Y-%m-%d %H:%M:%S") - self.today
                        except:
                            continue
                    else:
                        tmp = reg - self.today
                    if tmp > diff:
                        diff = tmp
                size = len(self.whois.expiration_date)
                days = diff.days
            elif type(self.whois.expiration_date) is str:
                try:
                    time = datetime.strptime(self.whois.expiration_date, "%Y-%m-%d %H:%M:%S")
                    diff = self.today - time
                    days = diff.days
                except:
                    days = 0
                size = 1
        return [[days],[size]]

    def daysSinceUpdate(self):
        days = 0
        size = 0
        if self.whois and self.whois.updated_date:
            if type(self.whois.updated_date) is datetime:
                diff = self.today - self.whois.updated_date
                size = 1
                days = diff.days
            elif type(self.whois.updated_date) is list:
                diff = timedelta(days=15380)
                for reg in self.whois.updated_date:
                    if type(reg) is str:
                        try:
                            tmp = self.today - datetime.strptime(reg, "%Y-%m-%d %H:%M:%S")
                        except:
                            continue
                    else:
                        tmp = self.today - reg
                    if tmp < diff:
                        diff = tmp
                size = len(self.whois.updated_date)
                days = diff.days
            elif type(self.whois.updated_date) is str:
                try:
                    time = datetime.strptime(self.whois.updated_date, "%Y-%m-%d %H:%M:%S")
                    diff = self.today - time
                    days = diff.days
                except:
                    days = 0
                size = 1
        return [[days],[size]]

    def hasSSL(self):
        try:
            ssl.get_server_certificate((self.domain, 443))
            return [1]
        except:
            return [0]

    def getFeatures(self):
        f = []

        # URL

        f.append([len(self.domain)])  # hostname length
        f.append([len(self.domain[:self.domain.find('.')])])  # domain length

        if self.url[:8] == "https://":
            f.append([1])
        else:
            f.append([0])

        f.append(self.num_of_dots())
        f.append(self.digits_in_dom())
        f.append(self.ipInUrl())
        f.append(self.maxc())
        f.append(self.maxv())
        f.append(self.getSpecialCharDom())
        f.append(self.getEntropy())

        # parameters and fragments
        f.append([len(self.url.split('&')) - 1])
        f.append([len(self.url.split('#')) - 1])

        # suspicious files
        if '.php' in self.url:
            f.append([1])
        else:
            f.append([0])

        f.append(self.delimeterCount())
        f.append(self.encoded())
        f.append(self.specialCharUrl())
        # url depth
        f.append([self.url.count('/') - 2 * self.url.count('//')])
        f.append(self.sensitiveWordsCount())

        # DNS

        f.append(self.dnsA())
        f.append(self.dnsNS())
        f.append(self.dnsMX())
        res = self.dnsTXT()
        for item in res:
            f.append(item)

        # HTML

        if self.up:
            f.append([1])
        else:
            f.append([0])

        f.append([self.comments])
        f.append(self.getHtmlLength())
        f.append(self.getScriptLength())
        f.append(self.scriptToBodyR())
        f.append(self.specialCharHtml())
        f.append(self.specialCharToBodyR())
        f.append(self.specialChatToScript())

        # unique titles
        f.append([self.page.count('</script>')])
        f.append([self.page.count('</iframe>')])
        f.append([self.page.count('</image>')])

        f.append(self.linksCount())
        f.append(self.refCount())

        # sum of titles
        f.append([self.page.count('</')])

        # references to files
        if '.php' in self.page:
            f.append([1])
        else:
            f.append([0])

        if '.exe' in self.page:
            f.append([1])
        else:
            f.append([0])

        f.append(self.sensitiveTopics())
        f.append(self.malCode())

        # Context

        rank = self.siteRank()
        for i in rank:
            f.append(i)

        reg = self.daysSinceRegistration()
        for i in reg:
            f.append(i)

        exp = self.daysSinceExpiration()
        for i in exp:
            f.append(i)

        update = self.daysSinceUpdate()
        for i in update:
            f.append(i)

        f.append(self.hasSSL())

        return np.array(f).T


In [None]:
# from the data base we read eaxh url and extract the data, and then save it in a csv file
# because we download data from the url and from other tools like alexa and whois, the process can take a while
col = features
df = pd.read_csv('malicious_phish.csv')
url = df['url']
urlDF = pd.DataFrame(urlF(url.iloc[0]).getFeatures(), columns=col)
for i in url.index:
    if i == 0:
        continue
    tmp = pd.DataFrame(urlF(url.iloc[i]).getFeatures(), index=[i], columns=col)
    urlDF = pd.concat([urlDF, tmp], axis=0)
df = pd.concat([df, urlDF], axis=1)
file = 'C:\Users\elobl\PycharmProjects\ml_finale\data\full_phish.csv'
df.to_csv(file, index=False)

In [7]:
# after normalization we put the data in our ML model

df = pd.read_csv('C:\\Users\\elobl\\PycharmProjects\\ml_finale\\data\\full_phish.csv')
label = df['label']
df = df.drop(['label','url'], axis=1)

x = df
y = label

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42, stratify=y)

model = RandomForestClassifier(n_estimators=300, random_state=42)

model.fit(x_train,y_train)
predictions = model.predict(x_test)
true_labels = y_test
cf_matrix = confusion_matrix(true_labels, predictions)
print(cf_matrix)
recall = cf_matrix[0][0] / (cf_matrix[0][0] + cf_matrix[0][1])
precision = cf_matrix[0][0] / (cf_matrix[0][0] + cf_matrix[1][0])
auc = (cf_matrix[0][0] + cf_matrix[1][1]) / (cf_matrix[0][0] + cf_matrix[1][1] + cf_matrix[1][0] + cf_matrix[0][1])
f1 = 2*precision*recall/(precision+recall)
print(recall, precision, auc, f1)

[[1571   38]
 [  71  520]]
0.9763828464885022 0.9567600487210719 0.9504545454545454 0.9664718548139034
