In [2]:
#Used data from https://www.kaggle.com/datasets/mohamedouledhamed/phishing-site-urls
#Load raw data
import pandas as pd
import numpy as np
df = pd.read_csv('websites.csv')
print(df.shape)
df.describe()

(549346, 2)


Unnamed: 0,URL,Label
count,549346,549346
unique,507195,2
top,jhomitevd2abj3fk.tor2web.org/,good
freq,52,392924


In [3]:
#Clean data

#Remove duplicates
df.drop_duplicates()

#Check for missing values
print('Number of missing values for each feature:')
print(df.isna().sum())

#Encode categorical features
df['Label'] = df['Label'].replace({'bad': 0, 'good': 1})
df.head()

Number of missing values for each feature:
URL      0
Label    0
dtype: int64


Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,0
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,0
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,0
3,mail.printakid.com/www.online.americanexpress....,0
4,thewhiskeydregs.com/wp-content/themes/widescre...,0


In [4]:
#Add URL length as a feature
df['Length'] = df['URL'].apply(len)
df.head()

Unnamed: 0,URL,Label,Length
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,0,225
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,0,81
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,0,177
3,mail.printakid.com/www.online.americanexpress....,0,60
4,thewhiskeydregs.com/wp-content/themes/widescre...,0,116


In [5]:
#Check for the presence of 'sensitive words'
sensitive_words = ['login', 'password', 'bank', 'account', 'credit', 'card', 'security', 'verification']

# Define function to check if a sensitive word is present in the URL
def check_sensitive_words(url):
    for word in sensitive_words:
        if word in url.lower():
            return 1
    return 0

# Apply the function to each URL in the dataframe and add the result as a new column
df['SensitiveWord'] = df['URL'].apply(check_sensitive_words)
df.head()

Unnamed: 0,URL,Label,Length,SensitiveWord
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,0,225,1
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,0,81,0
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,0,177,0
3,mail.printakid.com/www.online.americanexpress....,0,60,0
4,thewhiskeydregs.com/wp-content/themes/widescre...,0,116,0


In [7]:
#Check number of subdomains
import tldextract

def count_subdomains(url):
    subdomains = tldextract.extract(url).subdomain.split('.')
    return len(subdomains)

df['SubdomainCount'] = df['URL'].apply(count_subdomains)
df.head()

Unnamed: 0,URL,Label,Length,SensitiveWord,SubdomainCount
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,0,225,1,1
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,0,81,0,1
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,0,177,0,1
3,mail.printakid.com/www.online.americanexpress....,0,60,0,1
4,thewhiskeydregs.com/wp-content/themes/widescre...,0,116,0,1


In [8]:
#See if an ip address is used instead of a domain
import ipaddress

def is_ip_address(url):
    try:
        ipaddress.ip_address(url)
        return 1
    except ValueError:
        return 0
df['IpAddress'] = df['URL'].apply(is_ip_address)
df.head()


Unnamed: 0,URL,Label,Length,SensitiveWord,SubdomainCount,IpAddress
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,0,225,1,1,0
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,0,81,0,1,0
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,0,177,0,1,0
3,mail.printakid.com/www.online.americanexpress....,0,60,0,1,0
4,thewhiskeydregs.com/wp-content/themes/widescre...,0,116,0,1,0


In [9]:
#Check for the presence of a shortening service
import re

def check_shortening(url):
    shortening_services = ['bit.ly', 'goo.gl', 'ow.ly', 'tinyurl.com']
    pattern = r'^https?://(?:www\.)?(' + '|'.join(shortening_services) + ')/[a-zA-Z0-9]+$'
    if re.match(pattern, url):
        return 1
    else:
        return 0
df['Shortened'] = df['URL'].apply(is_ip_address)
df.head()

Unnamed: 0,URL,Label,Length,SensitiveWord,SubdomainCount,IpAddress,Shortened
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,0,225,1,1,0,0
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,0,81,0,1,0,0
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,0,177,0,1,0,0
3,mail.printakid.com/www.online.americanexpress....,0,60,0,1,0,0
4,thewhiskeydregs.com/wp-content/themes/widescre...,0,116,0,1,0,0


In [10]:
#Split the label feature from the other features
#Drop URL as it is a string and not encoded
y = df['Label']
X = df.drop(['URL', 'Label'], axis=1)

In [12]:
#Split training and testing set
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, 
                                                    test_size = 0.20, random_state = 6)

In [13]:
#Train the Decision Tree model
from sklearn.tree import DecisionTreeClassifier

# instantiate the model 
tree = DecisionTreeClassifier(max_depth = 5)
# fit the model 
tree.fit(X_train_val, y_train_val)
y_test_tree = tree.predict(X_test)

In [15]:
from sklearn.metrics import accuracy_score

print("Decision Tree accuracy is {:.4f}".format(accuracy_score(y_test, y_test_tree)))

Decision Tree accuracy is 0.7671


In [16]:
#Save model
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(tree, f)

In [18]:
print(df.nunique())

URL               507195
Label                  2
Length               605
SensitiveWord          2
SubdomainCount        26
IpAddress              2
Shortened              2
dtype: int64


In [22]:
print(df['Label'].value_counts())
print('**' * 20)
print(df['SensitiveWord'].value_counts())
print('**' * 20)
print(df['SubdomainCount'].value_counts())
print('**' * 20)
print(df['IpAddress'].value_counts())
print('**' * 20)
print(df['Shortened'].value_counts())
print('**' * 20)

Label
1    392924
0    156422
Name: count, dtype: int64
****************************************
SensitiveWord
0    517923
1     31423
Name: count, dtype: int64
****************************************
SubdomainCount
1     526366
2      14585
3       2567
4       1653
5       1084
11       488
7        452
9        401
10       384
8        328
12       262
6        214
17       174
13       129
14        78
15        49
16        45
18        24
22        19
19        15
20        10
21         9
23         6
25         2
33         1
26         1
Name: count, dtype: int64
****************************************
IpAddress
0    549260
1        86
Name: count, dtype: int64
****************************************
Shortened
0    549260
1        86
Name: count, dtype: int64
****************************************
