# Feature Engineering
### Create combined dataframe with phishing and legitimate URLs

In [4]:
#Phishing URLs
import pandas as pd

# Load the phishing URL dataset from phishing.csv into a DataFrame.
phish = pd.read_csv('phishing.csv')
# Trailing expression: the notebook displays the (rows, columns) shape.
phish.shape

(51527, 2)

In [5]:
#Select 50,000 legitimate URLs to be used (will shrink down once offline URLs are removed)
# NOTE(review): sample() is called without random_state, so a different subset
# is presumably drawn on every run unless NumPy's global seed is set elsewhere.
legitimate = pd.read_csv('legitimate.csv')
legit = legitimate.sample(n=50000, replace=False)
# Trailing expression: the notebook displays the (rows, columns) shape.
legit.shape

(50000, 2)

In [6]:
#Combine dataframe and shuffle data
# concat stacks the phishing and legitimate rows, sample(frac=1) returns every
# row in random order, and reset_index(drop=True) renumbers the rows from 0.
df = pd.concat([phish, legit], ignore_index=True).sample(frac=1).reset_index(drop=True)
# Trailing expression: the notebook displays the (rows, columns) shape.
df.shape

(101527, 2)

## URL-Based Features
     Features are extracted from the URL itself.
     Features include: 
        1. Presence of IP Address in the URL
        2. Presence of '@' in the URL
        3. URL Length
        4. The presence of redirection in the URL
        5. Use of URL Shortening Services
        6. The presence of sensitive words
        7. Number of subdomains
        8. Having a hyphen '-' in the domain name


### Feature 1: Presence of IP Address in URL
#### This is common in phishing scams

In [7]:
import ipaddress
from urllib.parse import urlparse
def usesIP(url):
    """Return 1 if the given host or URL designates a literal IP address, else 0.

    ``url`` may be a bare host string (e.g. ``"192.168.0.1"`` or ``"::1"``)
    or a full URL (e.g. ``"http://192.168.0.1/login"``), in which case its
    hostname component is checked.  ``None`` and any other value that is
    not an IPv4/IPv6 address yield 0.
    """
    candidates = [url]
    if isinstance(url, str):
        try:
            # For a bare host this is None, which simply fails the check below.
            candidates.append(urlparse(url).hostname)
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. unbalanced IPv6
            # brackets); such input is treated as "not an IP".
            pass

    for candidate in candidates:
        try:
            ipaddress.ip_address(candidate)
        except ValueError:
            # ip_address raises ValueError for anything that is not an address.
            continue
        return 1
    return 0

### Feature 2: Presence of '@' Symbol in URL
#### The '@' symbol causes the browser to ignore all text preceding it and is common in phishing scams

In [8]:
def hasAt(url):
    """Flag URLs that contain an '@' character: 1 when present, 0 otherwise."""
    return int('@' in url)

### Feature 3: URL Length
#### Longer URLs tend to be associated with phishing scams

In [9]:
def url_length(url):
    """Return the number of characters in ``url``."""
    return len(url)

### Feature 4: Redirection
#### The presence of '//' in a URL means that redirection is present. This is common in phishing scams
#### Checks from index 8 onward so that the '//' in 'http://' or 'https://' is not caught

In [10]:
import re
def redirect(url):
    """Return 1 if '//' occurs at or after index 8 of ``url``, else 0.

    The first eight characters are skipped so that the '//' belonging to
    'http://' or 'https://' does not count.
    """
    tail = url[8:]
    return 1 if '//' in tail else 0

### Feature 5: URL Shortening
#### Check for the presence of common URL shortening links which cause a redirection and are common in scams

In [11]:
# Domains of known URL-shortening services.  Entries are lower-case because
# urlparse() lower-cases the hostname it returns.  The original bare "tinyurl"
# alternative is covered by "tinyurl.com".
_SHORTENER_DOMAINS = frozenset({
    "bit.ly", "goo.gl", "shorte.st", "go2l.ink", "x.co", "ow.ly", "t.co",
    "tr.im", "is.gd", "cli.gs", "yfrog.com", "migre.me", "ff.im", "tiny.cc",
    "url4.eu", "twit.ac", "su.pr", "twurl.nl", "snipurl.com", "short.to",
    "budurl.com", "ping.fm", "post.ly", "just.as", "bkite.com", "snipr.com",
    "fic.kr", "loopt.us", "doiop.com", "short.ie", "kl.am", "wp.me",
    "rubyurl.com", "om.ly", "to.ly", "bit.do", "lnkd.in", "db.tt", "qr.ae",
    "adf.ly", "bitly.com", "cur.lv", "tinyurl.com", "ity.im", "q.gs",
    "po.st", "bc.vc", "twitthis.com", "u.to", "j.mp", "buzurl.com",
    "cutt.us", "u.bb", "yourls.org", "prettylinkpro.com", "scrnch.me",
    "filoops.info", "vzturl.com", "qr.net", "1url.com", "tweez.me", "v.gd",
    "link.zip.net",
})


def isShort(url):
    """Return 1 if the host of ``url`` is a known URL-shortening service, else 0.

    Only the hostname is inspected, and it must equal a shortener domain or
    be a sub-domain of one.  Searching the whole URL for substrings would
    flag ordinary sites, e.g. "microsoft.com" contains "t.co" and
    "dropbox.com" contains "x.co".
    """
    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # Scheme-less input such as "bit.ly/abc" is parsed as a bare path;
            # re-parse it as a network location so the host is still found.
            parsed = urlparse('//' + url)
        host = parsed.hostname or ''
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets): no usable host.
        return 0

    labels = host.strip('.').split('.')
    # Test every dot-separated suffix so "preview.tinyurl.com" matches
    # "tinyurl.com" while unrelated hosts never match by accident.
    return int(any(
        '.'.join(labels[start:]) in _SHORTENER_DOMAINS
        for start in range(len(labels))
    ))

### Feature 6: Presence of Sensitive Words
#### Phishing URLs commonly have sensitive words such as those listed below to trick unsuspecting users

In [None]:
def check_sensitive_words(url):
    """Return 1 if ``url`` contains any of the sensitive words, else 0.

    Matching is case-insensitive: the word list is capitalised, whereas
    URLs are usually written in lower case ("/login", "/verify-account"),
    so a case-sensitive search would almost never match.  Words are matched
    as substrings anywhere in the URL.
    """
    sensitive_words = r"Password|Account|Login|Verify|Security|Update|Payment|Card|Bank|Alert|Access|" \
                            r"Confirm|Information|Identity|Social Security|Verification|Fraud|Suspended|Limited|" \
                            r"Urgent|Unusual|Unauthorised|Suspicious|Lock|Reset|Expired|Invalid|Disabled|Termination|" \
                            r"Deactivation|Hack|Breach|Compromised|Phishing|Scam|Fake|Spam|Spoof|Spoofing|Impersonation|" \
                            r"Emergency|Critical|Failure|Error|Warning|Alert|Threat|Danger|Attack|Virus"
    return 1 if re.search(sensitive_words, url, re.IGNORECASE) else 0

### Feature 7: Number of Subdomains
#### Phishing scams commonly have a higher number of subdomains

In [13]:
import tldextract
def count_subdomains(url):
    """Return the number of subdomain labels in ``url``.

    For example "a.b.example.com" has two ("a" and "b").  A URL with no
    subdomain yields 0: splitting an empty string on '.' produces [''],
    which would otherwise be counted as one subdomain and make
    "example.com" indistinguishable from "www.example.com".
    """
    subdomain = tldextract.extract(url).subdomain
    if not subdomain:
        return 0
    return len(subdomain.split('.'))

### Feature 8: Presence of '-' in Domain
#### Hyphens are common in phishing domains because they make a fake domain resemble a legitimate, well-known website

In [14]:
def isHyphen(url):
    """Return 1 if the network-location part of ``url`` contains '-', else 0."""
    netloc = urlparse(url).netloc
    return int('-' in netloc)

## HTML and Javascript-Based Features
     Features are extracted from the domain.
     Features include: 
        1. IFrame redirection
        2. Status bar customization
        3. Disabling right-click
        4. Number of redirects

### Feature 9: HTML Redirection
#### Check for IFrame tags, a form of HTML redirection commonly present in phishing scams

In [None]:
import requests
def htmlredir(url, timeout=10):
    """Return 1 if the page fetched from ``url`` contains an <iframe> tag, else 0.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so that
            one unresponsive host cannot stall a run over many URLs.

    Returns:
        1 when an opening ``<iframe`` tag is found in the response body
        (case-insensitive); 0 when none is found or the page could not be
        fetched.
    """
    try:
        response = requests.get(url, timeout=timeout)
        page = response.text
    except Exception:
        # Best effort: offline, malformed or misbehaving sites count as
        # "no iframe".  Unlike a bare ``except:``, this lets
        # KeyboardInterrupt/SystemExit through so a long crawl can be stopped.
        return 0
    return 1 if re.search(r"<iframe\b", page, re.IGNORECASE) else 0

# Create new DataFrame based off of features

In [None]:
def feature_extraction(url, label):
    """Add one feature column per extractor to the module-level ``df``.

    Every extractor is applied to each value of ``df['url']`` and the
    results are stored in a new column of ``df``, in the order listed.

    NOTE(review): ``url`` and ``label`` are accepted but never read, and
    the returned list is always empty.  Confirm whether per-URL extraction
    (returning the feature values plus the label) was intended.
    """
    extractors = (
        # URL-based features
        ('ip', lambda link: usesIP(urlparse(link).hostname)),
        ('@', lambda link: hasAt(link)),
        ('length', lambda link: url_length(link)),
        ('redirect', lambda link: redirect(link)),
        ('shortened', lambda link: isShort(link)),
        ('SensitiveWord', lambda link: check_sensitive_words(link)),
        ('SubdomainCount', lambda link: count_subdomains(link)),
        ('hyphen', lambda link: isHyphen(link)),
        # HTML and Javascript-based features
        ('htmlredir', lambda link: htmlredir(link)),
    )
    for column, extractor in extractors:
        df[column] = df['url'].apply(extractor)

    return []

# Trailing expression: the notebook displays the DataFrame.
# NOTE(review): no call to feature_extraction() is visible in the preceding
# cells; unless it is invoked elsewhere, the feature columns will not exist here.
df
