In [1]:
import pandas as pd

### Phishing links characteristics

1) Long URL -->
If the length of the URL is greater than or equal to 54 characters, the URL is classified as phishing
(if the length is more than 45 but less than 54 characters, it is classified as suspicious).

2) "@" symbol in the URL

3) Redirecting using "//" (this means the URL sends the user to another page)

4)Having "-" in domain name



5) Sub-domain and multiple sub-domains (the domain has more than 3 parts separated by dots, i.e. subdomains)


6)Having "ngrok" in the address

In [3]:
# Complete URLs; the 'URL' column of this frame feeds the whole-URL checks below.
full_urls = pd.read_csv(filepath_or_buffer='cleaned_link_dataset.csv')

In [4]:
# Preview the first five rows of the full-URL frame.
full_urls.head(n=5)

Unnamed: 0.1,Unnamed: 0,URL
0,0,https://locking-app-adverds.000webhostapp.com/...
1,1,http://www.myhealthcarepharmacy.ca/wp-includes...
2,2,http://code.google.com/p/pylevenshtein/
3,3,http://linkedin.com/
4,4,http://imageshack.com/f/219/cadir2yr3.jpg


In [5]:
# URLs already split into 'protocol', 'domain' and 'address' columns.
sep_urls = pd.read_csv(filepath_or_buffer='full_cleaned.csv')

In [7]:
# Preview the first five rows of the split-URL frame.
sep_urls.head(n=5)

Unnamed: 0.1,Unnamed: 0,protocol,domain,address
0,0,https,locking-app-adverds.000webhostapp.com,payment-update-0.html?fb_source=bookmark_apps&...
1,1,http,www.myhealthcarepharmacy.ca,wp-includes/js/jquery/ini.php
2,2,http,code.google.com,p/pylevenshtein/
3,3,http,linkedin.com,
4,4,http,imageshack.com,f/219/cadir2yr3.jpg


### Adding some necessary columns

In [8]:
# Create every feature/label column up front with a placeholder 0;
# the real values are assigned by the cells further down.
for flag_column in (
    'is_long',
    'has_@',
    'has_//',
    'has-domain',
    'many_subdomains',
    'has_ngrok',
    'phishing',
):
    sep_urls[flag_column] = 0

In [9]:
# First five rows, now including the placeholder columns.
sep_urls.iloc[:5]

Unnamed: 0.1,Unnamed: 0,protocol,domain,address,is_long,has_@,has_//,has-domain,many_subdomains,has_ngrok,phishing
0,0,https,locking-app-adverds.000webhostapp.com,payment-update-0.html?fb_source=bookmark_apps&...,0,0,0,0,0,0,0
1,1,http,www.myhealthcarepharmacy.ca,wp-includes/js/jquery/ini.php,0,0,0,0,0,0,0
2,2,http,code.google.com,p/pylevenshtein/,0,0,0,0,0,0,0
3,3,http,linkedin.com,,0,0,0,0,0,0,0
4,4,http,imageshack.com,f/219/cadir2yr3.jpg,0,0,0,0,0,0,0


In [39]:
def is_long(x):
    """Return 1 if the text form of ``x`` has 54 or more characters, else 0.

    Meant to be applied to the full URL. ``x`` is converted with ``str``
    first, so non-string values are measured by their string representation.
    """
    url_length = len(str(x))
    return int(url_length >= 54)

In [27]:
def has_at(x):
    """Return 1 if the text form of ``x`` contains an '@' character, else 0.

    Meant to be applied to the full URL; ``x`` is converted with ``str`` first.
    """
    return 1 if str(x).find('@') != -1 else 0

In [28]:
def double_slash(x):
    """Return 1 if the text form of ``x`` contains '//', else 0.

    Intended for a URL component that no longer carries the scheme prefix,
    so that the '//' of 'http://' is not what triggers the flag.
    ``x`` is converted with ``str`` first.
    """
    occurrences = str(x).count('//')
    return int(occurrences > 0)

In [53]:
def has_minus(x):
    """Return 1 if the text form of ``x`` contains a '-' character, else 0.

    Meant to be applied to the domain column; ``x`` is converted with ``str`` first.
    """
    return int('-' in str(x))

In [57]:
def many_subd(x):
    """Return 1 if the text form of ``x`` contains more than three dots, else 0.

    Meant to be applied to the domain column; ``x`` is converted with ``str`` first.
    More than three dots is the same as more than four dot-separated parts.

    NOTE(review): the notebook's description of this rule says "more than 3 parts",
    which would correspond to three or more dots - confirm which threshold is intended.
    """
    part_count = len(str(x).split('.'))
    return int(part_count > 4)

In [31]:
def has_ngrok(x):
    """Return 1 if the text form of ``x`` contains 'ngrok' in any letter case, else 0.

    Meant to be applied to the full URL; ``x`` is converted with ``str`` first.
    The comparison is case-insensitive because host names are not case-sensitive,
    so 'NGROK.io' and 'ngrok.io' must be treated alike.
    """
    return 1 if 'ngrok' in str(x).lower() else 0

### Applying the functions to the dataframe

In [43]:
# Length check runs on the complete URL, not on the split parts.
sep_urls['is_long'] = full_urls['URL'].map(is_long)


In [47]:
# '@' check runs on the complete URL.
sep_urls['has_@'] = full_urls['URL'].map(has_at)

In [51]:
# Scan the URL with its scheme prefix ('http://', 'https://') removed instead of the
# 'domain' column: in the preview the domain holds only the host name, so a '//'
# redirect marker cannot show up there and the flag would stay 0 for every row.
sep_urls['has_//'] = full_urls['URL'].apply(
    lambda url: double_slash(str(url).split('://', 1)[-1])
)

In [54]:
# '-' check runs on the domain part only.
sep_urls['has-domain'] = sep_urls['domain'].map(has_minus)

In [58]:
# Sub-domain count check runs on the domain part only.
sep_urls['many_subdomains'] = sep_urls['domain'].map(many_subd)

In [59]:
# 'ngrok' check runs on the complete URL.
sep_urls['has_ngrok'] = full_urls['URL'].map(has_ngrok)

In [60]:
# Display the frame once the feature columns are filled. The 'phishing' label is
# still the placeholder 0 at this point; it is assigned from the feature flags later.
sep_urls

Unnamed: 0.1,Unnamed: 0,protocol,domain,address,is_long,has_@,has_//,has-domain,many_subdomains,has_ngrok,phishing
0,0,https,locking-app-adverds.000webhostapp.com,payment-update-0.html?fb_source=bookmark_apps&...,1,0,0,1,0,0,0
1,1,http,www.myhealthcarepharmacy.ca,wp-includes/js/jquery/ini.php,1,0,0,0,0,0,0
2,2,http,code.google.com,p/pylevenshtein/,0,0,0,0,0,0,0
3,3,http,linkedin.com,,0,0,0,0,0,0,0
4,4,http,imageshack.com,f/219/cadir2yr3.jpg,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1776,1776,https,docs.google.com,document/u/1/,0,0,0,0,0,0,0
1777,1777,http,www.charlestodd.com,wp-includes/pomo/adobe2.html,1,0,0,0,0,0,0
1778,1778,http,tslimpact.com,medsynaptic/wp-content/themes/twentyseventeen/...,1,0,0,0,0,0,0
1779,1779,http,daoudilorin11.mystagingwebsite.com,wp-content/plugins/ubh/acc/dir/68e1c/dir/car.php,1,0,0,0,0,0,0


### Filling the PHISHING column based on the other columns' results

In [61]:
# One boolean mask per feature flag, True where that flag is set.
conditions = [
    sep_urls[flag_column] == 1
    for flag_column in (
        'is_long',
        'has_@',
        'has_//',
        'many_subdomains',
        'has-domain',
        'has_ngrok',
    )
]

In [62]:
import numpy as np

In [68]:
# Every one of the six conditions maps to the same label: 1.
value = [1] * 6

In [69]:
# A row gets the choice of the first condition that is True (always 1 here);
# rows matching no condition fall back to the default of 0.
sep_urls['phishing'] = np.select(condlist=conditions, choicelist=value, default=0)


In [70]:
# Display the frame with the feature flags and the final 'phishing' label.
sep_urls

Unnamed: 0.1,Unnamed: 0,protocol,domain,address,is_long,has_@,has_//,has-domain,many_subdomains,has_ngrok,phishing
0,0,https,locking-app-adverds.000webhostapp.com,payment-update-0.html?fb_source=bookmark_apps&...,1,0,0,1,0,0,1
1,1,http,www.myhealthcarepharmacy.ca,wp-includes/js/jquery/ini.php,1,0,0,0,0,0,1
2,2,http,code.google.com,p/pylevenshtein/,0,0,0,0,0,0,0
3,3,http,linkedin.com,,0,0,0,0,0,0,0
4,4,http,imageshack.com,f/219/cadir2yr3.jpg,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1776,1776,https,docs.google.com,document/u/1/,0,0,0,0,0,0,0
1777,1777,http,www.charlestodd.com,wp-includes/pomo/adobe2.html,1,0,0,0,0,0,1
1778,1778,http,tslimpact.com,medsynaptic/wp-content/themes/twentyseventeen/...,1,0,0,0,0,0,1
1779,1779,http,daoudilorin11.mystagingwebsite.com,wp-content/plugins/ubh/acc/dir/68e1c/dir/car.php,1,0,0,0,0,0,1


In [71]:
# index=False keeps the row index out of the file. Writing the index is what adds an
# extra 'Unnamed: 0' column each time such a CSV is read back with pd.read_csv.
sep_urls.to_csv('final_result.csv', index=False)