In [1]:
# Load CSV file containing the database into Python
import pandas as pd

# Location of the CSV database of labelled URLs
filename = 'malicious_phish.csv'

# Read the database into a DataFrame and preview its first five rows
dataset = pd.read_csv(filepath_or_buffer=filename)
print(dataset.head(5))

                                                 url        type
0                                   br-icloud.com.br    phishing
1                mp3raid.com/music/krizz_kaliko.html      benign
2                    bopsecrets.org/rexroth/cr/1.htm      benign
3  http://www.garage-pirenne.be/index.php?option=...  defacement
4  http://adventure-nicaragua.net/index.php?optio...  defacement


In [4]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

# Encode the string labels of the 'type' column as integer class ids.
# The encoded values live in their own Series instead of being written back
# into `dataset`: overwriting the column makes this cell non-idempotent,
# because a re-run would fit the encoder on the integers and
# `label_encoder.classes_` would no longer hold the original class names.
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(dataset['type']),
    index=dataset.index,
    name='type',
)

# Features are the raw URL strings
X = dataset['url']

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the URLs; each URL becomes a list of token ids, truncated to
# max_seq_length (special tokens included)
max_seq_length = 512  # Maximum sequence length supported by BERT
X_tokenized = [
    tokenizer.encode(url, add_special_tokens=True, max_length=max_seq_length, truncation=True)
    for url in X
]

# Right-pad every sequence with 0 up to max_seq_length so they can be stacked
# into one 2-D array (0 is presumably the tokenizer's padding id -- TODO confirm
# against tokenizer.pad_token_id)
X_padded = pad_sequences(X_tokenized, maxlen=max_seq_length, padding='post', truncating='post', dtype='long', value=0)

# Convert the padded sequences and labels into PyTorch tensors
X_tensor = torch.tensor(X_padded, dtype=torch.long)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)







X_train shape: torch.Size([520952, 512])
X_test shape: torch.Size([130239, 512])
y_train shape: torch.Size([520952])
y_test shape: torch.Size([130239])


In [6]:
from torch.utils.data import DataLoader, TensorDataset

# Requires the X_train and y_train tensors produced by the train/test split

# Number of samples per mini-batch; adjust as needed
batch_size = 64

# Pair every training sequence with its label
train_dataset = TensorDataset(X_train, y_train)

# Iterate over the pairs in shuffled mini-batches
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [5]:
# Show the label and feature tensors of the training and test splits
for _caption, _tensor in (
    ('y_train:', y_train),
    ('y_test:', y_test),
    ('x_train:', X_train),
    ('x_test:', X_test),
):
    print(_caption, _tensor)

y_train: tensor([0, 3, 0,  ..., 1, 0, 0])
y_test: tensor([2, 0, 3,  ..., 0, 0, 0])
x_train: tensor([[  101,  4372,  1012,  ...,     0,     0,     0],
        [  101,  5906,  1012,  ...,     0,     0,     0],
        [  101,  4372,  1012,  ...,     0,     0,     0],
        ...,
        [  101,  8299,  1024,  ...,     0,     0,     0],
        [  101, 16012,  7583,  ...,     0,     0,     0],
        [  101, 13683,  1012,  ...,     0,     0,     0]])
x_test: tensor([[  101,  8299,  1024,  ...,     0,     0,     0],
        [  101,  2966,  1011,  ...,     0,     0,     0],
        [  101,  7479,  1012,  ...,     0,     0,     0],
        ...,
        [  101,  2394, 29278,  ...,     0,     0,     0],
        [  101,  1062,  5302,  ...,     0,     0,     0],
        [  101,  1996,  3217,  ...,     0,     0,     0]])


In [9]:
import torch.nn as nn

class URLClassifier(nn.Module):
    """1-D convolutional classifier over fixed-length sequences of token ids.

    Pipeline: embedding -> Conv1d (kernel 3, padding 1) -> ReLU ->
    MaxPool1d (kernel 2, stride 2) -> flatten -> dropout (p=0.5) ->
    linear -> ReLU -> linear. The final layer returns raw, unnormalised
    class scores.

    Args:
        vocab_size: Number of rows in the embedding table; every token id
            fed to the model must be smaller than this.
        embedding_dim: Size of each token embedding.
        hidden_dim: Number of convolution output channels, also the width
            of the hidden linear layer.
        num_classes: Number of scores produced per input sequence.
        seq_length: Length of the input sequences. It fixes the input size
            of the first linear layer, so every batch must have exactly
            this many tokens per sequence. When ``None`` (the default) the
            module-level ``max_seq_length`` is used, which keeps existing
            four-argument calls working unchanged.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, seq_length=None):
        super(URLClassifier, self).__init__()
        if seq_length is None:
            # Backward-compatible fallback to the notebook-level constant
            seq_length = max_seq_length
        self.seq_length = seq_length

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # kernel_size=3 with padding=1 leaves the sequence length unchanged
        self.conv1 = nn.Conv1d(embedding_dim, hidden_dim, kernel_size=3, padding=1)
        # Halves the sequence length (rounding down)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        # After pooling there are hidden_dim channels of seq_length // 2 steps
        self.fc1 = nn.Linear(hidden_dim * (seq_length // 2), hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return class scores of shape (batch, num_classes).

        Args:
            x: Integer token ids of shape (batch, seq_length).
        """
        embedded = self.embedding(x)           # (batch, seq, embedding_dim)
        embedded = embedded.permute(0, 2, 1)   # Conv1d expects (batch, channels, seq)
        conv_out = self.relu(self.conv1(embedded))
        pooled = self.pool(conv_out)
        pooled = pooled.view(pooled.size(0), -1)  # flatten channels x steps
        pooled = self.dropout(pooled)
        output = self.relu(self.fc1(pooled))
        output = self.fc2(output)
        return output

In [10]:
# Train on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define hyperparameters
vocab_size = len(tokenizer)  # Size of the vocabulary
embedding_dim = 128  # Dimensionality of token embeddings
hidden_dim = 64  # Number of output channels for the convolutional layer
num_classes = len(label_encoder.classes_)  # Number of classes
learning_rate = 0.001
num_epochs = 10

# Create an instance of the URLClassifier model
model = URLClassifier(vocab_size, embedding_dim, hidden_dim, num_classes).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop: one full pass over train_loader per epoch
for epoch in range(num_epochs):
    model.train()  # enable dropout
    total_loss = 0.0  # running sum of per-sample losses
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Compute training accuracy
        batch_count = labels.size(0)
        _, predicted = torch.max(outputs, 1)
        total += batch_count
        correct += (predicted == labels).sum().item()

        # `loss` is the mean over this batch; weight it by the batch size so
        # that a smaller final batch does not skew the epoch average
        total_loss += loss.item() * batch_count

    # Print training statistics (both are averages over samples)
    avg_loss = total_loss / total
    train_accuracy = correct / total

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {train_accuracy:.4f}')

print('Training completed!')


Epoch [1/10], Loss: 0.1299, Accuracy: 0.9578
Epoch [2/10], Loss: 0.0700, Accuracy: 0.9776


KeyboardInterrupt: 

In [None]:
# Evaluate the trained model on the held-out test split.
# The model returns one score per class, so the predicted class is the argmax
# over the class dimension (a fixed 0.5 threshold only makes sense for a
# single binary output and would yield a (N, num_classes) matrix here).
from sklearn.metrics import confusion_matrix

eval_batch_size = 256  # run the test set in chunks to bound memory use

model.eval()  # disable dropout for inference
logit_batches = []
with torch.no_grad():  # no gradients are needed for evaluation
    for start in range(0, X_test.size(0), eval_batch_size):
        batch = X_test[start:start + eval_batch_size].to(device)
        logit_batches.append(model(batch).cpu())

test_outputs = torch.cat(logit_batches)      # (num_test_samples, num_classes)
predictions = test_outputs.argmax(dim=1)     # predicted class id per sample

# Print accuracy
accuracy = (predictions == y_test.cpu()).float().mean()
print(f'Accuracy: {accuracy.item():.4f}')

# Confusion matrix (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_test.cpu().numpy(), predictions.cpu().numpy())
print('Confusion Matrix:')
print(conf_matrix)