In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 1. Import the dataset

In [2]:
path = '../input/phishing-site-urls/phishing_site_urls.csv'
data = pd.read_csv(path)
phishing_data = data.sample(n=10000, random_state =42).copy() #Sample of n websites
phishing_data = phishing_data.reset_index(drop=True)
phishing_data.head()

Unnamed: 0,URL,Label
0,tubevector.com/search/?q=erika,good
1,classmates.com/directory/school/Marian%20Chris...,good
2,isgsi.com/office/adb/c58c94eccbfb951bd4ba5f2a6...,bad
3,manufacturersnews.com/executives.asp?start=CEN,good
4,'9d345009-a-62cb3a1a-s-sites.googlegroups.com/...,bad


# 2. Feature extraction

In [3]:
pip install tldextract

Collecting tldextract
  Obtaining dependency information for tldextract from https://files.pythonhosted.org/packages/d0/de/3f37b2568115c7ebeae39508dc1092f04f3dc286f22ef30171baca9c9cf2/tldextract-5.1.1-py3-none-any.whl.metadata
  Downloading tldextract-5.1.1-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Downloading tldextract-5.1.1-py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.7/97.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: requests-file, tldextract
Successfully installed requests-file-1.5.1 tldextract-5.1.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
import tldextract
from urllib.parse import urlparse

def extract_features(url):
    parsed_url = urlparse(url)
    # Extract domain features using tldextract
    domain_extract = tldextract.extract(url)
    domain = domain_extract.domain
    suffix = domain_extract.suffix

    # Features to extract
    features = {
        'url_length': len(url), #Length of URL
        'domain_length': len(domain), #Length of domain
        'dot_count': domain.count('.'), #Number of dots in the domain
        'is_ip_address': domain.replace('.', '').isdigit(), #IP address in the domain
        'special_chars_in_domain': any(char.isnumeric() or not char.isalnum() for char in domain), #Presence of special characters in the domain
        'tld_length': len(suffix), #Length of the top-level domain (e.g., '.com', '.org')
        'hyphen_in_domain': '-' in domain, #Presence of hyphen in the domain
        'at_symbol': '@' in parsed_url.netloc #Presence of '@' in the URL
    }

    return features

In [5]:
y_labels = phishing_data["Label"] 
phishing_data['Features'] = phishing_data["URL"].apply(extract_features) 
data_features = pd.json_normalize(phishing_data['Features'])
phishing_data = pd.concat([phishing_data, data_features], axis=1)
x_features = phishing_data.drop(["Label","URL","Features"], axis=1)

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x_features, y_labels, test_size = 0.2, random_state = 42)

### Function for storing the data

In [7]:
models = []
accuracy_train = []
accuracy_test = []

def save_results(model, acc_tr, acc_te):
    models.append(model)
    accuracy_train.append(round(acc_tr, 3))
    accuracy_test.append(round(acc_te, 3))

# **Decision Tree**

In [8]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(max_depth = 5)
tree.fit(x_train, y_train)

In [9]:
tree_x_train = tree.predict(x_train)
tree_x_test = tree.predict(x_test)

In [10]:
tree_acc_train = accuracy_score(y_train, tree_x_train)
tree_acc_test = accuracy_score(y_test, tree_x_test)
save_results('Decision Tree', tree_acc_train, tree_acc_test)

# **Logistic Regression**

In [11]:
from sklearn.linear_model import LogisticRegression

lre = LogisticRegression()
lre.fit(x_train, y_train)

In [12]:
lre_x_train = lre.predict(x_train)
lre_x_test = lre.predict(x_test)

In [13]:
lre_acc_train = accuracy_score(y_train, lre_x_train)
lre_acc_test = accuracy_score(y_test, lre_x_test)
save_results('Logistic Regression', lre_acc_train, lre_acc_test)

# **Random Forest**

In [14]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(max_depth=5)
forest.fit(x_train, y_train)

In [15]:
forest_x_train = forest.predict(x_train)
forest_x_test = forest.predict(x_test)

In [16]:
forest_acc_train = accuracy_score(y_train, forest_x_train)
forest_acc_test = accuracy_score(y_test, forest_x_test)
save_results("Random Forest", forest_acc_train, forest_acc_test)

# **Support Vector Machine**

In [17]:
from sklearn.svm import SVC

svm = SVC(kernel='linear')
svm.fit(x_train, y_train)

In [18]:
svm_x_train = svm.predict(x_train)
svm_x_test = svm.predict(x_test)

In [19]:
svm_acc_train = accuracy_score(y_train, svm_x_train)
svm_acc_test = accuracy_score(y_test, svm_x_test)
save_results("Support Vector Machine", svm_acc_train, svm_acc_test)

# **MLP Classifier**

In [20]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(alpha=0.001, hidden_layer_sizes = [100,100,100])
mlp.fit(x_train, y_train)

In [21]:
mlp_x_train = mlp.predict(x_train)
mlp_x_test = mlp.predict(x_test)

In [22]:
mlp_acc_train = accuracy_score(y_train, mlp_x_train)
mlp_acc_test = accuracy_score(y_test, mlp_x_test)
save_results("MLP Classifier", mlp_acc_train, mlp_acc_test)

# **Model Comparison**

In [23]:
table = pd.DataFrame({'Machine Learning Model': models,
                     'Train Accuracy': accuracy_train,
                     'Test Accuracy': accuracy_test})
table.sort_values(by=['Test Accuracy', 'Train Accuracy'], ascending=False)

Unnamed: 0,Machine Learning Model,Train Accuracy,Test Accuracy
4,MLP Classifier,0.808,0.818
0,Decision Tree,0.796,0.806
2,Random Forest,0.784,0.8
1,Logistic Regression,0.745,0.749
3,Support Vector Machine,0.731,0.74


In [24]:
new_urls = ['norell.ty']
features = [extract_features(url) for url in new_urls]
url_df = pd.DataFrame(features)
print(forest.predict(url_df))

['bad']


In [25]:
import pickle
with open('./logistic_regression.h5','wb') as file:
    pickle.dump(lre, file)
with open('./decision_tree.h5','wb') as file:
    pickle.dump(tree, file)
with open('./support_vector_machine.h5','wb') as file:
    pickle.dump(svm, file)
with open('./random_forest.h5','wb') as file:
    pickle.dump(forest, file)
with open('./mlp.h5','wb') as file:
    pickle.dump(mlp, file)