In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
data_1 = pd.read_csv("/content/raw_data.csv")

In [3]:
data_1.head()

Unnamed: 0,URL,Target,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,https://locking-app-adverds.000webhostapp.com/...,yes,,,,
1,http://www.myhealthcarepharmacy.ca/wp-includes...,yes,,,,
2,http://code.google.com/p/pylevenshtein/,no,,,,
3,http://linkedin.com/,no,,,,
4,http://imageshack.com/f/219/cadir2yr3.jpg,no,,,,


In [4]:
data_1['URL'].str.split("://").sample(5)

Unnamed: 0,URL
725,"[http, kitapyurdu.com/]"
718,"[http, order.hbonow.com/]"
498,"[http, kartkihaftowane.com/image/lisense/box.c..."
939,"[http, southeasternhotelmanagement.com/art/]"
745,"[http, bank-of-america-online-banking-com.usa...."


In [5]:
protocol_separating = data_1['URL'].str.split("://",expand = True)

In [6]:
protocol_separating.head()

Unnamed: 0,0,1,2,3,4,5,6
0,https,locking-app-adverds.000webhostapp.com/payment-...,,,,,
1,http,www.myhealthcarepharmacy.ca/wp-includes/js/jqu...,,,,,
2,http,code.google.com/p/pylevenshtein/,,,,,
3,http,linkedin.com/,,,,,
4,http,imageshack.com/f/219/cadir2yr3.jpg,,,,,


In [7]:
domain = protocol_separating[1].str.split("/",n = 1,expand = True) #split(seperator,no of splits according to seperator(delimiter),expand)

In [8]:
domain.columns=["domain_name","address"] #renaming columns of data frame

In [9]:
domain.head()

Unnamed: 0,domain_name,address
0,locking-app-adverds.000webhostapp.com,payment-update-0.html?fb_source=bookmark_apps&...
1,www.myhealthcarepharmacy.ca,wp-includes/js/jquery/ini.php
2,code.google.com,p/pylevenshtein/
3,linkedin.com,
4,imageshack.com,f/219/cadir2yr3.jpg


In [10]:
splitted_data = pd.concat([protocol_separating[0],domain],axis=1)

In [11]:
splitted_data.columns = ['protocol','domain_name','address']

In [12]:
splitted_data.head()

Unnamed: 0,protocol,domain_name,address
0,https,locking-app-adverds.000webhostapp.com,payment-update-0.html?fb_source=bookmark_apps&...
1,http,www.myhealthcarepharmacy.ca,wp-includes/js/jquery/ini.php
2,http,code.google.com,p/pylevenshtein/
3,http,linkedin.com,
4,http,imageshack.com,f/219/cadir2yr3.jpg


In [13]:
splitted_data['is_phished'] = pd.Series(data_1['Target'], index=splitted_data.index)

In [14]:
splitted_data

Unnamed: 0,protocol,domain_name,address,is_phished
0,https,locking-app-adverds.000webhostapp.com,payment-update-0.html?fb_source=bookmark_apps&...,yes
1,http,www.myhealthcarepharmacy.ca,wp-includes/js/jquery/ini.php,yes
2,http,code.google.com,p/pylevenshtein/,no
3,http,linkedin.com,,no
4,http,imageshack.com,f/219/cadir2yr3.jpg,no
...,...,...,...,...
1776,https,docs.google.com,document/u/1/,no
1777,http,www.charlestodd.com,wp-includes/pomo/adobe2.html,yes
1778,http,tslimpact.com,medsynaptic/wp-content/themes/twentyseventeen/...,yes
1779,http,daoudilorin11.mystagingwebsite.com,wp-content/plugins/ubh/acc/dir/68e1c/dir/car.php,yes


In [15]:
def long_url(l):
    l= str(l)
    """This function is defined in order to differntiate website based on the length of the URL"""
    if len(l) < 54:
        return 0
    elif len(l) >= 54 and len(l) <= 75:
        return 2
    return 1

In [16]:
#Applying the above defined function in order to divide the websites into 3 categories
splitted_data['long_url'] = data_1['URL'].apply(long_url)

In [17]:
#Will show the results only the websites which are legitimate according to above condition as 0 is legitimate website
splitted_data[splitted_data.long_url == 0]

Unnamed: 0,protocol,domain_name,address,is_phished,long_url
2,http,code.google.com,p/pylevenshtein/,no,0
3,http,linkedin.com,,no,0
4,http,imageshack.com,f/219/cadir2yr3.jpg,no,0
6,http,www.7-zip.org,download.html,no,0
7,http,ebay.com,,no,0
...,...,...,...,...,...
1766,http,malomolk.com,nab/cardinfo.html,yes,0
1774,http,www.dpincsupport.com,,no,0
1775,https,bitcoin.org,en/,no,0
1776,https,docs.google.com,document/u/1/,no,0


In [18]:
def have_at_symbol(l):
    """This function is used to check whether the URL contains @ symbol or not"""
    if "@" in str(l):
        return 1
    return 0

In [19]:
splitted_data['having_@_symbol'] = data_1['URL'].apply(have_at_symbol)

In [20]:
splitted_data

Unnamed: 0,protocol,domain_name,address,is_phished,long_url,having_@_symbol
0,https,locking-app-adverds.000webhostapp.com,payment-update-0.html?fb_source=bookmark_apps&...,yes,1,0
1,http,www.myhealthcarepharmacy.ca,wp-includes/js/jquery/ini.php,yes,2,0
2,http,code.google.com,p/pylevenshtein/,no,0,0
3,http,linkedin.com,,no,0,0
4,http,imageshack.com,f/219/cadir2yr3.jpg,no,0,0
...,...,...,...,...,...,...
1776,https,docs.google.com,document/u/1/,no,0,0
1777,http,www.charlestodd.com,wp-includes/pomo/adobe2.html,yes,2,0
1778,http,tslimpact.com,medsynaptic/wp-content/themes/twentyseventeen/...,yes,1,0
1779,http,daoudilorin11.mystagingwebsite.com,wp-content/plugins/ubh/acc/dir/68e1c/dir/car.php,yes,1,0


In [21]:
def redirection(l):
    """If the url has symbol(//) after protocol then such URL is to be classified as phishing """
    if "//" in str(l):
        return 1
    return 0

In [22]:
splitted_data['redirection_//_symbol'] = protocol_separating[1].apply(redirection)

In [23]:
splitted_data.head()

Unnamed: 0,protocol,domain_name,address,is_phished,long_url,having_@_symbol,redirection_//_symbol
0,https,locking-app-adverds.000webhostapp.com,payment-update-0.html?fb_source=bookmark_apps&...,yes,1,0,0
1,http,www.myhealthcarepharmacy.ca,wp-includes/js/jquery/ini.php,yes,2,0,0
2,http,code.google.com,p/pylevenshtein/,no,0,0,0
3,http,linkedin.com,,no,0,0,0
4,http,imageshack.com,f/219/cadir2yr3.jpg,no,0,0,0


In [24]:
def prefix_suffix_seperation(l):
    if '-' in str(l):
        return 1
    return 0

In [25]:
splitted_data['prefix_suffix_seperation'] = domain['domain_name'].apply(prefix_suffix_seperation)

In [26]:
splitted_data.head()

Unnamed: 0,protocol,domain_name,address,is_phished,long_url,having_@_symbol,redirection_//_symbol,prefix_suffix_seperation
0,https,locking-app-adverds.000webhostapp.com,payment-update-0.html?fb_source=bookmark_apps&...,yes,1,0,0,1
1,http,www.myhealthcarepharmacy.ca,wp-includes/js/jquery/ini.php,yes,2,0,0,0
2,http,code.google.com,p/pylevenshtein/,no,0,0,0,0
3,http,linkedin.com,,no,0,0,0,0
4,http,imageshack.com,f/219/cadir2yr3.jpg,no,0,0,0,0


In [27]:
def sub_domains(l):
    l= str(l)
    if l.count('.') < 3:
        return 0
    elif l.count('.') == 3:
        return 2
    return 1

In [28]:
splitted_data['sub_domains'] = splitted_data['domain_name'].apply(sub_domains)

In [29]:
splitted_data

Unnamed: 0,protocol,domain_name,address,is_phished,long_url,having_@_symbol,redirection_//_symbol,prefix_suffix_seperation,sub_domains
0,https,locking-app-adverds.000webhostapp.com,payment-update-0.html?fb_source=bookmark_apps&...,yes,1,0,0,1,0
1,http,www.myhealthcarepharmacy.ca,wp-includes/js/jquery/ini.php,yes,2,0,0,0,0
2,http,code.google.com,p/pylevenshtein/,no,0,0,0,0,0
3,http,linkedin.com,,no,0,0,0,0,0
4,http,imageshack.com,f/219/cadir2yr3.jpg,no,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
1776,https,docs.google.com,document/u/1/,no,0,0,0,0,0
1777,http,www.charlestodd.com,wp-includes/pomo/adobe2.html,yes,2,0,0,0,0
1778,http,tslimpact.com,medsynaptic/wp-content/themes/twentyseventeen/...,yes,1,0,0,0,0
1779,http,daoudilorin11.mystagingwebsite.com,wp-content/plugins/ubh/acc/dir/68e1c/dir/car.php,yes,1,0,0,0,0


In [30]:
#Features
x = splitted_data.columns[4:9]
x

Index(['long_url', 'having_@_symbol', 'redirection_//_symbol',
       'prefix_suffix_seperation', 'sub_domains'],
      dtype='object')

In [31]:
#variable to be predicted; yes = 0 and no = 1
y = pd.factorize(splitted_data['is_phished'])[0]
y

array([0, 0, 1, ..., 0, 0, 0])

In [32]:
# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = DecisionTreeClassifier()

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(splitted_data[x], y)

In [33]:
test_data = pd.read_csv("/content/test_data.csv")

In [34]:
clf.predict(test_data[x]) #testing the classifier on test data.

array([0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,

In [35]:
preds = test_data.is_phished[clf.predict(test_data[x])] #predicted values

In [36]:
preds.head(10)

Unnamed: 0,is_phished
0,yes
0,yes
1,no
0,yes
1,no
1,no
0,yes
1,no
1,no
1,no


In [37]:
actual = pd.Series(test_data['is_phished']) #actual values

In [38]:
confusion_matrix(actual,preds)

array([[ 92,  82],
       [ 60, 156]])

In [39]:
accuracy_score(actual,preds) #accuracy of classifier

0.6358974358974359

In [40]:
test_data[x]

Unnamed: 0,long_url,having_@_symbol,redirection_//_symbol,prefix_suffix_seperation,sub_domains
0,1,0,0,1,0
1,1,0,0,0,0
2,0,0,0,0,0
3,1,0,0,0,0
4,0,0,0,0,0
...,...,...,...,...,...
385,1,1,0,0,0
386,0,0,0,0,0
387,0,0,0,0,0
388,0,0,0,0,2


In [41]:
link = input("Enter the Link: ")

Enter the Link: https://www.instagram.com/


In [42]:
df = pd.DataFrame([[link]],columns=['URL'])

In [43]:
df

Unnamed: 0,URL
0,https://www.instagram.com/


In [44]:
df['URL'].str.split("://").head() #Here we divided the protocol from the entire URL. but need it to be divided it
                                                 #seperate column

Unnamed: 0,URL
0,"[https, www.instagram.com/]"


In [45]:
separation = df['URL'].str.split("://",expand = True)

In [46]:
separation

Unnamed: 0,0,1
0,https,www.instagram.com/


In [47]:
domain_name = separation[1].str.split("/",n = 1,expand = True)

In [48]:
domain_name.columns=["domain_name","address"]

In [49]:
domain_name

Unnamed: 0,domain_name,address
0,www.instagram.com,


In [50]:
#Concatenation of data frames
split_data = pd.concat([separation[0],domain_name],axis=1)

In [51]:
split_data.columns = ['protocol','domain_name','address']

In [52]:
split_data.head()

Unnamed: 0,protocol,domain_name,address
0,https,www.instagram.com,


In [53]:
def long_url(l):
    l= str(l)
    """This function is defined in order to differntiate website based on the length of the URL"""
    if len(l) < 54:
        return 0
    elif len(l) >= 54 and len(l) <= 75:
        return 2
    return 1

In [54]:
#Applying the above defined function in order to divide the websites into 3 categories
split_data['long_url'] = df['URL'].apply(long_url)

In [55]:
split_data

Unnamed: 0,protocol,domain_name,address,long_url
0,https,www.instagram.com,,0


In [56]:
#Will show the results only the websites which are legitimate according to above condition as 0 is legitimate website
split_data[splitted_data.long_url == 0]

  split_data[splitted_data.long_url == 0]


Unnamed: 0,protocol,domain_name,address,long_url


In [57]:
len(df['URL'][0])

26

In [58]:
def have_at_symbol(l):
    """This function is used to check whether the URL contains @ symbol or not"""
    if "@" in str(l):
        return 1
    return 0

In [59]:
split_data['having_@_symbol'] = df['URL'].apply(have_at_symbol)

In [60]:
split_data

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol
0,https,www.instagram.com,,0,0


In [61]:
def redirection(l):
    """If the url has symbol(//) after protocol then such URL is to be classified as phishing """
    if "//" in str(l):
        return 1
    return 0

In [62]:
split_data['redirection_//_symbol'] = separation[1].apply(redirection)

In [63]:
split_data

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol,redirection_//_symbol
0,https,www.instagram.com,,0,0,0


In [64]:
def prefix_suffix_seperation(l):
    if '-' in str(l):
        return 1
    return 0

In [65]:
split_data['prefix_suffix_seperation'] = domain_name['domain_name'].apply(prefix_suffix_seperation)

In [66]:
split_data

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol,redirection_//_symbol,prefix_suffix_seperation
0,https,www.instagram.com,,0,0,0,0


In [67]:
def sub_domains(l):
    l= str(l)
    if l.count('.') < 3:
        return 0
    elif l.count('.') == 3:
        return 2
    return 1

In [68]:
split_data['sub_domains'] = split_data['domain_name'].apply(sub_domains)

In [69]:
split_data

Unnamed: 0,protocol,domain_name,address,long_url,having_@_symbol,redirection_//_symbol,prefix_suffix_seperation,sub_domains
0,https,www.instagram.com,,0,0,0,0,0


In [70]:
x = split_data.columns[3:9]
x

Index(['long_url', 'having_@_symbol', 'redirection_//_symbol',
       'prefix_suffix_seperation', 'sub_domains'],
      dtype='object')

In [71]:
y = split_data[x]

In [72]:
y

Unnamed: 0,long_url,having_@_symbol,redirection_//_symbol,prefix_suffix_seperation,sub_domains
0,0,0,0,0,0


In [73]:
df_list = y.values.tolist()

In [74]:
df_list = np.array(df_list)

In [75]:
df_list.reshape(-1,1)

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [76]:
result = clf.predict(df_list)



In [77]:
if result == 1:
  print("It's a Normal Link")
else :
  print("It's a Malicious Link")

It's a Normal Link
