In [1]:
# Load CSV file containing the database into Python
import pandas as pd

# CSV file that holds the URL dataset
filename = 'malicious_phish.csv'

# Read the whole CSV into a DataFrame, announcing progress before and after
print('Loading the dataset...')
dataset = pd.read_csv(filepath_or_buffer=filename)
print('Dataset Loaded Successfully!')

# Show the first five rows as a quick sanity check of the loaded columns
print(dataset.head(n=5))

Loading the dataset...
Dataset Loaded Successfully!
                                                 url        type
0                                   br-icloud.com.br    phishing
1                mp3raid.com/music/krizz_kaliko.html      benign
2                    bopsecrets.org/rexroth/cr/1.htm      benign
3  http://www.garage-pirenne.be/index.php?option=...  defacement
4  http://adventure-nicaragua.net/index.php?optio...  defacement


In [2]:
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

# Fixed token length of every encoded URL. The tokenizer truncates to this
# length and the padding step pads up to it, so the width of X_tensor no longer
# depends on the longest URL that happens to be in the data. It must match the
# sequence length the classifier's first linear layer is sized for (512 tokens).
MAX_SEQ_LEN = 512

# Encode the 'type' column into numerical format.
# The raw string labels are stashed in 'type_label' the first time this cell
# runs, so re-running the cell re-fits the encoder on the original class names
# instead of on already-encoded integers (which would replace
# label_encoder.classes_ with plain integers and lose the class names).
if 'type_label' not in dataset.columns:
    dataset['type_label'] = dataset['type']

label_encoder = LabelEncoder()
dataset['type'] = label_encoder.fit_transform(dataset['type_label'])

# Split the dataset into features (URLs) and labels (types)
X = dataset['url']
y = dataset['type']

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the URLs (one list of token ids per URL, special tokens included)
X_tokenized = []

print("Tokenizing URLs...")

for url in X:
    tokens = tokenizer.encode(
        url,
        add_special_tokens=True,
        truncation=True,
        max_length=MAX_SEQ_LEN,  # explicit instead of relying on the tokenizer default
    )
    X_tokenized.append(tokens)

print("Tokenization complete!")

# Pad (with id 0, at the end) every sequence to exactly MAX_SEQ_LEN tokens
X_padded = pad_sequences(
    X_tokenized,
    maxlen=MAX_SEQ_LEN,
    padding='post',
    truncating='post',
    dtype='long',
    value=0,
)

# Convert the padded sequences and labels into PyTorch tensors
X_tensor = torch.tensor(X_padded, dtype=torch.long)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# Split the dataset into training and testing sets (60% / 40%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.4, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


  from .autonotebook import tqdm as notebook_tqdm







Tokenizing URLs...
Tokenization complete!
X_train shape: torch.Size([390714, 512])
X_test shape: torch.Size([260477, 512])
y_train shape: torch.Size([390714])
y_test shape: torch.Size([260477])


In [3]:
from torch.utils.data import DataLoader, TensorDataset

# Number of samples per mini-batch
batch_size = 64

# Pair each training sequence with its label so they are indexed together
train_dataset = TensorDataset(X_train, y_train)

# Serve the training pairs in reshuffled mini-batches
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [5]:
import torch.nn as nn

class URLClassifier(nn.Module):
    """1D-CNN classifier over fixed-length sequences of token ids.

    Pipeline: embedding -> Conv1d (kernel 3, padding 1) -> ReLU ->
    MaxPool1d (kernel 2, stride 2) -> flatten -> dropout -> linear -> ReLU ->
    linear (class logits).

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Number of convolution output channels, also the width of
            the hidden linear layer.
        num_classes: Number of output logits.
        max_seq_len: Length of every input sequence. The default of 512
            reproduces the previously hard-coded ``hidden_dim * 256`` input
            size of ``fc1``, so existing callers and saved weights still work.

    Raises:
        ValueError: If ``max_seq_len`` is smaller than 2, because the pooling
            layer would then produce an empty sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, max_seq_len=512):
        super().__init__()
        if max_seq_len < 2:
            raise ValueError(f"max_seq_len must be at least 2, got {max_seq_len}")
        self.max_seq_len = max_seq_len

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # kernel_size=3 with padding=1 keeps the sequence length unchanged
        self.conv1 = nn.Conv1d(embedding_dim, hidden_dim, kernel_size=3, padding=1)
        # Halves the sequence length (rounding down)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        # Flattened size after pooling: hidden_dim channels x (max_seq_len // 2) positions
        pooled_len = max_seq_len // 2
        self.fc1 = nn.Linear(hidden_dim * pooled_len, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of shape ``(batch, max_seq_len)`` holding token ids.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalized logits.
        """
        embedded = self.embedding(x)  # (batch, seq, embedding_dim)
        embedded = embedded.permute(0, 2, 1)  # Conv1d expects (batch, channels, seq)
        conv_out = self.conv1(embedded)
        conv_out = self.relu(conv_out)
        pooled = self.pool(conv_out)
        pooled = pooled.flatten(start_dim=1)  # (batch, hidden_dim * (seq // 2))
        pooled = self.dropout(pooled)
        output = self.fc1(pooled)
        output = self.relu(output)
        output = self.fc2(output)
        return output


In [6]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Hyperparameters
vocab_size = len(tokenizer)                # number of ids the tokenizer can emit
embedding_dim = 128                        # width of each token embedding
hidden_dim = 64                            # convolution output channels
num_classes = len(label_encoder.classes_)  # one logit per encoded label
learning_rate = 0.001
num_epochs = 10

# Build the classifier and move its parameters to the selected device
model = URLClassifier(vocab_size, embedding_dim, hidden_dim, num_classes)
model = model.to(device)

# Cross-entropy over the logits, optimized with Adam
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

print('Start training...')

for epoch in range(num_epochs):
    model.train()

    # Per-epoch accumulators
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous step, then forward/backward/update
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Training accuracy: the predicted class is the index of the largest logit
        predicted = outputs.argmax(dim=1)
        n_correct += (predicted == labels).sum().item()
        n_seen += labels.size(0)

    # Mean of the per-batch losses and fraction of correctly classified samples
    avg_loss = running_loss / len(train_loader)
    train_accuracy = n_correct / n_seen

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {train_accuracy:.4f}')

print('Training completed!')


Start training...
Epoch [1/10], Loss: 0.1403, Accuracy: 0.9545
Epoch [2/10], Loss: 0.0748, Accuracy: 0.9760
Epoch [3/10], Loss: 0.0578, Accuracy: 0.9809
Epoch [4/10], Loss: 0.0489, Accuracy: 0.9837
Epoch [5/10], Loss: 0.0423, Accuracy: 0.9860
Epoch [6/10], Loss: 0.0384, Accuracy: 0.9872
Epoch [7/10], Loss: 0.0344, Accuracy: 0.9884
Epoch [8/10], Loss: 0.0315, Accuracy: 0.9894
Epoch [9/10], Loss: 0.0288, Accuracy: 0.9902
Epoch [10/10], Loss: 0.0264, Accuracy: 0.9911
Training completed!


In [7]:
# Wrap the held-out tensors and serve them in order (no shuffling needed for scoring)
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Switch layers such as dropout to inference behaviour
model.eval()

# Counters for correctly classified samples and samples seen so far
n_correct = 0
n_samples = 0

print('Start testing...')

# Gradients are not needed for scoring, so disable tracking for the whole pass
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)

        # Predicted class = index of the largest logit in each row
        predicted = outputs.argmax(dim=1)

        n_correct += (predicted == labels).sum().item()
        n_samples += labels.size(0)

# Fraction of test samples whose predicted class equals the label
accuracy = n_correct / n_samples

print('Testing completed!')
print(f'Accuracy on the test set: {accuracy:.4f}')

Start testing...
Testing completed!
Accuracy on the test set: 0.9845


In [8]:
# Persist only the learned parameters (the state dict), not the model object itself
torch.save(obj=model.state_dict(), f='url_classifier.pth')