In [92]:
import numpy as np
import sklearn
import torch


In [93]:
# Source: https://archive.ics.uci.edu/dataset/327/phishing+websites
from ucimlrepo import fetch_ucirepo

# Download the UCI "Phishing Websites" dataset (repository id 327).
phishing_websites = fetch_ucirepo(id=327)

# Pull the feature table and the target table (both pandas DataFrames)
# out of the fetched dataset object.
dataset = phishing_websites.data
X, y = dataset.features, dataset.targets

# Print the dataset-level metadata first, then the per-variable information.
for description in (phishing_websites.metadata, phishing_websites.variables):
    print(description)



{'uci_id': 327, 'name': 'Phishing Websites', 'repository_url': 'https://archive.ics.uci.edu/dataset/327/phishing+websites', 'data_url': 'https://archive.ics.uci.edu/static/public/327/data.csv', 'abstract': 'This dataset collected mainly from: PhishTank archive, MillerSmiles archive, Googleâ€™s searching operators.', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 11055, 'num_features': 30, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['result'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2012, 'last_updated': 'Tue Mar 05 2024', 'dataset_doi': '10.24432/C51W2X', 'creators': ['Rami Mohammad', 'Lee McCluskey'], 'intro_paper': {'ID': 396, 'type': 'NATIVE', 'title': 'An assessment of features related to phishing websites using an automated technique', 'authors': 'R. Mohammad, F. Thabtah, L. Mccluskey', 'venue': 'International Conference for Internet Tec

In [94]:
# Display the labels of the feature columns held in X.
X.columns

Index(['having_ip_address', 'url_length', 'shortining_service',
       'having_at_symbol', 'double_slash_redirecting', 'prefix_suffix',
       'having_sub_domain', 'sslfinal_state', 'domain_registration_length',
       'favicon', 'port', 'https_token', 'request_url', 'url_of_anchor',
       'links_in_tags', 'sfh', 'submitting_to_email', 'abnormal_url',
       'redirect', 'on_mouseover', 'rightclick', 'popupwindow', 'iframe',
       'age_of_domain', 'dnsrecord', 'web_traffic', 'page_rank',
       'google_index', 'links_pointing_to_page', 'statistical_report'],
      dtype='object')

In [95]:
# Flag every row that has at least one missing feature value, then keep only
# the complete rows in both the feature table and the matching target table.
bad = X.isna().any(axis=1)
X, y = X.loc[~bad], y.loc[~bad]

In [96]:
# Explicit, ordered list of the feature columns that are fed to the model.
feature_columns = [
    'having_ip_address',
    'url_length',
    'shortining_service',
    'having_at_symbol',
    'double_slash_redirecting',
    'prefix_suffix',
    'having_sub_domain',
    'sslfinal_state',
    'domain_registration_length',
    'favicon',
    'port',
    'https_token',
    'request_url',
    'url_of_anchor',
    'links_in_tags',
    'sfh',
    'submitting_to_email',
    'abnormal_url',
    'redirect',
    'on_mouseover',
    'rightclick',
    'popupwindow',
    'iframe',
    'age_of_domain',
    'dnsrecord',
    'web_traffic',
    'page_rank',
    'google_index',
    'links_pointing_to_page',
    'statistical_report',
]
X = X.loc[:, feature_columns]

In [97]:
# Targets: replace the -1 label with 0 and store the result as a flat
# int64 tensor of class indices.
y = y.values
y = torch.tensor(np.where(y == -1, 0, y).ravel(), dtype=torch.long)

# Features: cast to float64, centre every column on zero mean, then store
# the result as a float32 tensor.
# NOTE(review): the column means are taken over all rows, before the
# train/test split performed in a later cell, so the held-out rows
# contribute to the centring statistics.
X = X.values.astype('float64')
X = torch.tensor(X - X.mean(axis=0), dtype=torch.float32)


In [98]:
# 1. Divide data into train and test splits (75% train / 25% test, fixed seed).
# The splitter is imported from its submodule explicitly: a bare
# `import sklearn` is not guaranteed to load `sklearn.model_selection`, so
# reaching it through the top-level package can fail with AttributeError.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=0
)

Now that the training data is prepared, it's time to define our model.
This is the initial model I will use as a baseline to see how well it does :D

In [99]:
# Seed PyTorch's global RNG so the weight initialisation below (and any later
# shuffling that draws from this RNG) is reproducible when the notebook is
# re-run from a fresh kernel. 0 matches the random_state of the data split.
torch.manual_seed(0)

# Baseline MLP: 30 input features -> one hidden layer of 100 ReLU units
# -> 2 output scores (one per class).
mlp_model = torch.nn.Sequential(
    torch.nn.Linear(30, 100),  # 30 inputs, 1 hidden layer of size 100
    torch.nn.ReLU(),           # hidden activation function
    torch.nn.Linear(100, 2),   # 100 hidden units, 2 outputs
)

# Cross-entropy loss on the model outputs, optimised with plain
# stochastic gradient descent (SGD).
loss_fn = torch.nn.CrossEntropyLoss()
lr = 1e-4
opt = torch.optim.SGD(mlp_model.parameters(), lr=lr)

In [100]:
batch = 32

# Wrap the train/test tensors in datasets; only the training loader shuffles.
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
test_dataset = torch.utils.data.TensorDataset(X_test, y_test)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch, shuffle=False)

epochs = 100

for epoch_number in range(1, epochs + 1):
    batch_losses = []

    for features, labels in train_dataloader:
        logits = mlp_model(features)          # forward pass
        batch_loss = loss_fn(logits, labels)  # loss for this mini-batch

        opt.zero_grad()        # clear gradients left over from the previous step
        batch_loss.backward()  # backpropagation
        opt.step()             # apply the gradients

        batch_losses.append(batch_loss.item())

    # Report the mean of the per-batch losses for this epoch.
    mean_loss = sum(batch_losses) / len(train_dataloader)
    print(f"Epoch {epoch_number}/{epochs}, Loss: {mean_loss:.4f}")



Epoch 1/100, Loss: 0.6811
Epoch 2/100, Loss: 0.6761
Epoch 3/100, Loss: 0.6713
Epoch 4/100, Loss: 0.6667
Epoch 5/100, Loss: 0.6621
Epoch 6/100, Loss: 0.6576
Epoch 7/100, Loss: 0.6533
Epoch 8/100, Loss: 0.6490
Epoch 9/100, Loss: 0.6439
Epoch 10/100, Loss: 0.6399
Epoch 11/100, Loss: 0.6353
Epoch 12/100, Loss: 0.6311
Epoch 13/100, Loss: 0.6281
Epoch 14/100, Loss: 0.6234
Epoch 15/100, Loss: 0.6194
Epoch 16/100, Loss: 0.6155
Epoch 17/100, Loss: 0.6116
Epoch 18/100, Loss: 0.6077
Epoch 19/100, Loss: 0.6041
Epoch 20/100, Loss: 0.6005
Epoch 21/100, Loss: 0.5966
Epoch 22/100, Loss: 0.5927
Epoch 23/100, Loss: 0.5890
Epoch 24/100, Loss: 0.5858
Epoch 25/100, Loss: 0.5820
Epoch 26/100, Loss: 0.5787
Epoch 27/100, Loss: 0.5749
Epoch 28/100, Loss: 0.5712
Epoch 29/100, Loss: 0.5678
Epoch 30/100, Loss: 0.5645
Epoch 31/100, Loss: 0.5609
Epoch 32/100, Loss: 0.5581
Epoch 33/100, Loss: 0.5545
Epoch 34/100, Loss: 0.5513
Epoch 35/100, Loss: 0.5476
Epoch 36/100, Loss: 0.5445
Epoch 37/100, Loss: 0.5411
Epoch 38/1

In [101]:

def accuracy(model, dataloader):
    """Return the fraction of samples in `dataloader` that `model` classifies correctly.

    The predicted class of each sample is the index of the largest score along
    dimension 1 of the model output.

    Args:
        model: a torch.nn.Module that maps a batch of inputs to per-class
            scores of shape (batch, num_classes).
        dataloader: an iterable yielding (inputs, labels) batches.

    Returns:
        The accuracy as a float in [0, 1], or 0.0 when the dataloader yields
        no samples (instead of raising ZeroDivisionError).
    """
    # Remember the current mode so evaluation does not silently leave the
    # model in eval mode for any training that follows.
    was_training = model.training
    model.eval()
    correct = 0
    total = 0

    try:
        with torch.no_grad():
            for batch_X, batch_y in dataloader:
                predicted_labels = model(batch_X).argmax(dim=1)
                correct += (predicted_labels == batch_y).sum().item()
                total += batch_y.size(0)
    finally:
        model.train(was_training)

    return correct / total if total else 0.0
# Score the trained model on both splits, then report each result.
train_accuracy = accuracy(mlp_model, train_dataloader)
test_accuracy = accuracy(mlp_model, test_dataloader)
print(f"MLP Train Accuracy: {train_accuracy}")
print(f"MLP Test Accuracy: {test_accuracy}")



MLP Train Accuracy: 0.91038475455313
MLP Test Accuracy: 0.8990593342981187


So my accuracy is about 91% on the training set and about 90% on the test set.
I have now made some modifications in an attempt to improve the model's training and test accuracy.
The main changes are:
- Raising learning rate from 1e-4 to 1e-3
- Using the Adaptive Moment Estimation (Adam) optimizer
- Including a weight decay of 1e-4 to my optimizer
- Trying 1000 epochs. However, even with only 100 or 200 epochs the change in accuracy was insignificant, so the run below uses 100 epochs.

In [102]:
# Seed PyTorch's global RNG so the weight initialisation below (and any later
# shuffling that draws from this RNG) is reproducible when the notebook is
# re-run from a fresh kernel.
torch.manual_seed(0)

# Same architecture as the baseline: 30 input features -> one hidden layer of
# 100 ReLU units -> 2 output scores (one per class).
mlp_model = torch.nn.Sequential(
    torch.nn.Linear(30, 100),  # 30 inputs, 1 hidden layer of size 100
    torch.nn.ReLU(),           # hidden activation function
    torch.nn.Linear(100, 2),   # 100 hidden units, 2 outputs
)

# Create a cross-entropy loss function and an Adam optimizer with a higher
# learning rate (1e-3) and a weight decay of 1e-4.
loss_fn = torch.nn.CrossEntropyLoss()
lr = 1e-3
opt = torch.optim.Adam(mlp_model.parameters(), lr=lr, weight_decay=1e-4)

In [103]:
batch = 32

# Rebuild the datasets and loaders; the training loader reshuffles every
# epoch, the test loader keeps a fixed order.
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch, shuffle=True
)

test_dataset = torch.utils.data.TensorDataset(X_test, y_test)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch, shuffle=False
)

epochs = 100

for epoch_index in range(epochs):
    running_loss = 0.0

    for inputs, targets in train_dataloader:
        # Forward pass and loss for this mini-batch.
        outputs = mlp_model(inputs)
        step_loss = loss_fn(outputs, targets)

        # Reset gradients, backpropagate, then update the weights.
        opt.zero_grad()
        step_loss.backward()
        opt.step()

        running_loss += step_loss.item()

    # Average of the per-batch losses over the epoch.
    epoch_loss = running_loss / len(train_dataloader)
    print(f"Epoch {epoch_index + 1}/{epochs}, Loss: {epoch_loss:.4f}")



Epoch 1/100, Loss: 0.2772
Epoch 2/100, Loss: 0.1630
Epoch 3/100, Loss: 0.1483
Epoch 4/100, Loss: 0.1376
Epoch 5/100, Loss: 0.1275
Epoch 6/100, Loss: 0.1190
Epoch 7/100, Loss: 0.1121
Epoch 8/100, Loss: 0.1095
Epoch 9/100, Loss: 0.1028
Epoch 10/100, Loss: 0.0967
Epoch 11/100, Loss: 0.0930
Epoch 12/100, Loss: 0.0886
Epoch 13/100, Loss: 0.0872
Epoch 14/100, Loss: 0.0864
Epoch 15/100, Loss: 0.0806
Epoch 16/100, Loss: 0.0778
Epoch 17/100, Loss: 0.0749
Epoch 18/100, Loss: 0.0732
Epoch 19/100, Loss: 0.0718
Epoch 20/100, Loss: 0.0706
Epoch 21/100, Loss: 0.0685
Epoch 22/100, Loss: 0.0660
Epoch 23/100, Loss: 0.0651
Epoch 24/100, Loss: 0.0635
Epoch 25/100, Loss: 0.0621
Epoch 26/100, Loss: 0.0604
Epoch 27/100, Loss: 0.0592
Epoch 28/100, Loss: 0.0576
Epoch 29/100, Loss: 0.0566
Epoch 30/100, Loss: 0.0548
Epoch 31/100, Loss: 0.0556
Epoch 32/100, Loss: 0.0550
Epoch 33/100, Loss: 0.0521
Epoch 34/100, Loss: 0.0527
Epoch 35/100, Loss: 0.0512
Epoch 36/100, Loss: 0.0511
Epoch 37/100, Loss: 0.0517
Epoch 38/1

In [104]:
# Evaluate the retrained model on the training split first, then the test split.
scores = [accuracy(mlp_model, loader) for loader in (train_dataloader, test_dataloader)]
print(f"MLP Train Accuracy: {scores[0]}")
print(f"MLP Test Accuracy: {scores[1]}")

MLP Train Accuracy: 0.988421179592329
MLP Test Accuracy: 0.9663531114327062


I learned that the Adam optimizer greatly improved my training and test accuracies, to about 99% and 97% respectively. I also tried adjusting my model to have more layers. Including 2 or 3 layers lowered my training and test accuracy from 90% to 80%. Including 7 or more layers lowered the training and test accuracy to 55%.