# Feature Engineering
### Create combined dataframe with phishing and legitimate URLs

In [26]:
#Import Libraries
import pandas as pd
import tldextract
import re

In [27]:
#20,000 random phishing URLs
phishing = pd.read_csv('DataSets/phishing.csv')
phish = phishing.sample(n=20000, replace=False)
phish.head()

Unnamed: 0,url,label
7535,http://www.saisonoard.co.jp.ooppox.xyz,1
4336,https://81kw81.myspreadshop.fr/,1
46390,https://tpadrewards.net/pool/1inch.html,1
27623,https://magazine-parceiroluiza.com/589458/9651...,1
17506,https://rivalesports.net,1


In [28]:
#Select 20,000 legitimate URLs to be used. 10,000 are full links, 10,000 are domains.
legitimate1 = pd.read_csv('DataSets/legitimate.csv')
legit1 = legitimate1.sample(n=10000, replace=False)
legitimate2 = pd.read_csv('DataSets/legitimate2.csv')
legit2 = legitimate2.head(10000)
legit = pd.concat([legit1, legit2], axis=0)
legit.head()

Unnamed: 0,url,label
414884,vidraguas.com.br/wordpress/iii7/index.html,0
404141,ciklaorejom.my03.com/was/u.php,0
38919,myspace.com/steadymobbnmusic/music/albums/pre-...,0
300220,washingtonexaminer.com/local/maryland/2011/11/...,0
352188,horsebetting.com/racetrack/?id=FRE&race=1&race...,0


In [29]:
#Combine dataframe and shuffle data
df = pd.concat([phish, legit], axis=0).sample(frac=1)
df.head()

Unnamed: 0,url,label
8722,ioliu.cn,0
5978,http://pay-ovhcloud-fr-dcdc53f53db.querbeet-ei...,1
45068,https://tsai.com.au/mass.org/,1
7549,xiaobaipan.com,0
321611,youtube.com/watch?v=eiv0D-vD28U,0


## URL-Based Features
     Features are extracted from the URL itself.
     Features include: 
        1. Presence of IP Address in the URL
        2. URL Length
        3. The presence of redirection in the URL
        4. Use of URL Shortening Services
        5. The presence of sensitive words
        6. Number of subdomains
        7-18. The count of '!', '@', '#', '%', '&', '+', '-', '=', '.', '/', '\', and '?' in the URL, respectively
        19. The presence of 'https'
        20. The presence of 'http'

In [30]:
def usesIP(url):
    match = re.search(
        '(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.'
        '([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\/)|'  
        '(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.'
        '([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\/)|'  
        '((0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\/)'
        '(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}|'
        '([0-9]+(?:\.[0-9]+){3}:[0-9]+)|'
        '((?:(?:\d|[01]?\d\d|2[0-4]\d|25[0-5])\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d|\d)(?:\/\d{1,2})?)', url)
    if match:
        return 1
    else:
        return 0
df['ip'] = df['url'].apply(lambda x: usesIP(x))
df.head()

Unnamed: 0,url,label,ip
8722,ioliu.cn,0,0
5978,http://pay-ovhcloud-fr-dcdc53f53db.querbeet-ei...,1,0
45068,https://tsai.com.au/mass.org/,1,0
7549,xiaobaipan.com,0,0
321611,youtube.com/watch?v=eiv0D-vD28U,0,0


In [31]:
def url_length(url):
    length = len(url)
    return length
df['len'] = df['url'].apply(lambda x: url_length(x))
df.head()

Unnamed: 0,url,label,ip,len
8722,ioliu.cn,0,0,8
5978,http://pay-ovhcloud-fr-dcdc53f53db.querbeet-ei...,1,0,58
45068,https://tsai.com.au/mass.org/,1,0,29
7549,xiaobaipan.com,0,0,14
321611,youtube.com/watch?v=eiv0D-vD28U,0,0,31


In [32]:
def redirect(url):
    if url[8:].find('//') >= 0:
        return 1
    else:
        return 0
df['redir'] = df['url'].apply(lambda x: redirect(x))
df.head()

Unnamed: 0,url,label,ip,len,redir
8722,ioliu.cn,0,0,8,0
5978,http://pay-ovhcloud-fr-dcdc53f53db.querbeet-ei...,1,0,58,0
45068,https://tsai.com.au/mass.org/,1,0,29,0
7549,xiaobaipan.com,0,0,14,0
321611,youtube.com/watch?v=eiv0D-vD28U,0,0,31,0


In [33]:
def isShort(url):
    shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"
    
    if re.search(shortening_services,url):
        return 1
    else:
        return 0
df['short'] = df['url'].apply(lambda x: isShort(x))
df.head()

Unnamed: 0,url,label,ip,len,redir,short
8722,ioliu.cn,0,0,8,0,0
5978,http://pay-ovhcloud-fr-dcdc53f53db.querbeet-ei...,1,0,58,0,0
45068,https://tsai.com.au/mass.org/,1,0,29,0,0
7549,xiaobaipan.com,0,0,14,0,0
321611,youtube.com/watch?v=eiv0D-vD28U,0,0,31,0,0


In [34]:
def check_sensitive_words(url):
    sensitive_words = r"Password|Account|Login|Verify|Security|Update|Payment|Card|Bank|Alert|Access|" \
                            r"Confirm|Information|Identity|Social Security|Verification|Fraud|Suspended|Limited|" \
                            r"Urgent|Unusual|Unauthorised|Suspicious|Lock|Reset|Expired|Invalid|Disabled|Termination|" \
                            r"Deactivation|Hack|Breach|Compromised|Phishing|Scam|Fake|Spam|Spoof|Spoofing|Impersonation|" \
                            r"Emergency|Critical|Failure|Error|Warning|Alert|Threat|Danger|Attack|Virus"
    if re.search(sensitive_words, url, flags=re.IGNORECASE):
        return 1
    else:
        return 0
df['sensitive'] = df['url'].apply(lambda x: check_sensitive_words(x))
df.head()

Unnamed: 0,url,label,ip,len,redir,short,sensitive
8722,ioliu.cn,0,0,8,0,0,0
5978,http://pay-ovhcloud-fr-dcdc53f53db.querbeet-ei...,1,0,58,0,0,0
45068,https://tsai.com.au/mass.org/,1,0,29,0,0,0
7549,xiaobaipan.com,0,0,14,0,0,0
321611,youtube.com/watch?v=eiv0D-vD28U,0,0,31,0,0,0


In [35]:
def count_subdomains(url):
    subdomains = tldextract.extract(url).subdomain.split('.')
    return len(subdomains)
df['subdomains'] = df['url'].apply(lambda x: count_subdomains(x))
df.head()

Unnamed: 0,url,label,ip,len,redir,short,sensitive,subdomains
8722,ioliu.cn,0,0,8,0,0,0,1
5978,http://pay-ovhcloud-fr-dcdc53f53db.querbeet-ei...,1,0,58,0,0,0,1
45068,https://tsai.com.au/mass.org/,1,0,29,0,0,0,1
7549,xiaobaipan.com,0,0,14,0,0,0,1
321611,youtube.com/watch?v=eiv0D-vD28U,0,0,31,0,0,0,1


In [36]:
features = ['!', '@', '#', '%', '&', '+', '-', '=', '.', '/', '\\', '?']
for symbol in features:
            df[symbol] = df['url'].apply(lambda x: x.count(symbol))
df.head()

Unnamed: 0,url,label,ip,len,redir,short,sensitive,subdomains,!,@,#,%,&,+,-,=,.,/,\,?
8722,ioliu.cn,0,0,8,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
5978,http://pay-ovhcloud-fr-dcdc53f53db.querbeet-ei...,1,0,58,0,0,0,1,0,0,0,0,0,0,4,1,2,4,0,1
45068,https://tsai.com.au/mass.org/,1,0,29,0,0,0,1,0,0,0,0,0,0,0,0,3,4,0,0
7549,xiaobaipan.com,0,0,14,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
321611,youtube.com/watch?v=eiv0D-vD28U,0,0,31,0,0,0,1,0,0,0,0,0,0,1,1,1,1,0,1


In [37]:
def https(url):
    if url[:5] == "https":
        return 1
    else:
        return 0
df['https'] = df['url'].apply(lambda x: https(x))
df.head()

Unnamed: 0,url,label,ip,len,redir,short,sensitive,subdomains,!,@,...,%,&,+,-,=,.,/,\,?,https
8722,ioliu.cn,0,0,8,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
5978,http://pay-ovhcloud-fr-dcdc53f53db.querbeet-ei...,1,0,58,0,0,0,1,0,0,...,0,0,0,4,1,2,4,0,1,0
45068,https://tsai.com.au/mass.org/,1,0,29,0,0,0,1,0,0,...,0,0,0,0,0,3,4,0,0,1
7549,xiaobaipan.com,0,0,14,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
321611,youtube.com/watch?v=eiv0D-vD28U,0,0,31,0,0,0,1,0,0,...,0,0,0,1,1,1,1,0,1,0


In [38]:
def http(url):
    if url[:4] == "http" and url[:5] != 'https' :
        return 1
    else:
        return 0
df['http'] = df['url'].apply(lambda x: http(x))
df.head()

Unnamed: 0,url,label,ip,len,redir,short,sensitive,subdomains,!,@,...,&,+,-,=,.,/,\,?,https,http
8722,ioliu.cn,0,0,8,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
5978,http://pay-ovhcloud-fr-dcdc53f53db.querbeet-ei...,1,0,58,0,0,0,1,0,0,...,0,0,4,1,2,4,0,1,0,1
45068,https://tsai.com.au/mass.org/,1,0,29,0,0,0,1,0,0,...,0,0,0,0,3,4,0,0,1,0
7549,xiaobaipan.com,0,0,14,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
321611,youtube.com/watch?v=eiv0D-vD28U,0,0,31,0,0,0,1,0,0,...,0,0,1,1,1,1,0,1,0,0


In [41]:
#Create a new dataframe and remove url
new_df = df.drop('url', axis=1)
new_df.head()

Unnamed: 0,label,ip,len,redir,short,sensitive,subdomains,!,@,#,...,&,+,-,=,.,/,\,?,https,http
8722,0,0,8,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5978,1,0,58,0,0,0,1,0,0,0,...,0,0,4,1,2,4,0,1,0,1
45068,1,0,29,0,0,0,1,0,0,0,...,0,0,0,0,3,4,0,0,1,0
7549,0,0,14,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
321611,0,0,31,0,0,0,1,0,0,0,...,0,0,1,1,1,1,0,1,0,0


In [42]:
#Export new dataframe as a csv
new_df.to_csv('DataSets/EngineeredData.csv', index=False)