In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem; later cells read the CSV from
# and write the model to paths under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification

# ---------------------------------------------------------------------------
# Configuration: every value a reader is likely to tune lives here.
# ---------------------------------------------------------------------------
DATA_PATH = '/content/drive/MyDrive/Database_Project/malicious_phish.csv'
PRETRAINED_NAME = 'distilbert-base-uncased'
N_SAMPLES = 2000          # rows sampled from the CSV to keep fine-tuning short
SEED = 42
TEST_FRACTION = 0.2       # 20% of all sampled rows are held out for testing
VAL_FRACTION = 0.2        # 20% of the remaining training rows are used for validation
# Token budget per URL. The previous value of 2 left room for nothing beyond the
# special tokens the tokenizer adds around each sequence, so the model never
# saw any URL content.
MAX_LENGTH = 128
TRAIN_BATCH_SIZE = 4
EVAL_BATCH_SIZE = 8
EPOCHS = 3
LEARNING_RATE = 5e-5

# Column names in the CSV
url_column = 'url'
label_column = 'type'

# Seed TensorFlow so weight initialisation of the new classifier head and the
# shuffle order are repeatable between runs.
tf.random.set_seed(SEED)

url_label_categorical_to_numerical_dictionary = {'phishing': 0,
                                                 'defacement': 1,
                                                 'benign': 2,
                                                 'malware': 3}
# Reverse lookup used to turn a predicted class index back into its label.
index_to_label = {index: label
                  for label, index in url_label_categorical_to_numerical_dictionary.items()}

# ---------------------------------------------------------------------------
# Load and prepare the data
# ---------------------------------------------------------------------------
data = pd.read_csv(DATA_PATH)
# Cap the sample size at the number of available rows so a small CSV does not raise.
data = data.sample(n=min(N_SAMPLES, len(data)), random_state=SEED).reset_index(drop=True)

# `.map` turns any label missing from the dictionary into NaN, which would
# silently poison the loss; fail loudly instead.
mapped_labels = data[label_column].map(url_label_categorical_to_numerical_dictionary)
if mapped_labels.isna().any():
    unknown_labels = sorted(data.loc[mapped_labels.isna(), label_column].astype(str).unique())
    raise ValueError(f"Unmapped values in '{label_column}' column: {unknown_labels}")
data[label_column] = mapped_labels.astype('int64')

# Separate the features and labels
y = data[label_column]
X = data.drop(columns=[label_column])

# Split the data into 80% training and 20% testing sets.
# Stratifying keeps the class proportions equal across splits, which matters
# because the label distribution may be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SEED, stratify=y)

# Split the training set further into 80% training and 20% validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=VAL_FRACTION, random_state=SEED, stratify=y_train)

# ---------------------------------------------------------------------------
# Tokenisation and tf.data pipelines
# ---------------------------------------------------------------------------
tokenizer = DistilBertTokenizerFast.from_pretrained(PRETRAINED_NAME)


def _encode_urls(urls):
    """Tokenize a list of URL strings, truncating each to MAX_LENGTH tokens."""
    return tokenizer(urls, truncation=True, padding=True, max_length=MAX_LENGTH)


def _to_dataset(encodings, labels=None, batch_size=EVAL_BATCH_SIZE, shuffle=False):
    """Wrap tokenizer output (and optional labels) in a batched tf.data.Dataset.

    Args:
        encodings: tokenizer output for a list of texts.
        labels: optional pandas Series of integer class ids aligned with `encodings`.
        batch_size: number of examples per batch.
        shuffle: if True, reshuffle all examples on every pass over the dataset.
    """
    features = dict(encodings)
    tensors = features if labels is None else (features, labels.to_numpy())
    dataset = tf.data.Dataset.from_tensor_slices(tensors)
    if shuffle:
        # Buffer covers the full dataset so the shuffle is uniform.
        dataset = dataset.shuffle(buffer_size=len(features['input_ids']), seed=SEED)
    return dataset.batch(batch_size)


# Convert each entry in the column to a string and make it a list of strings
X_train_text = X_train[url_column].astype(str).tolist()
X_val_text = X_val[url_column].astype(str).tolist()
X_test_text = X_test[url_column].astype(str).tolist()

# Tokenize the text data
train_encodings = _encode_urls(X_train_text)
val_encodings = _encode_urls(X_val_text)
test_encodings = _encode_urls(X_test_text)

# Convert to TensorFlow datasets; only the training data is shuffled.
train_dataset = _to_dataset(train_encodings, y_train, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_dataset = _to_dataset(val_encodings, y_val, batch_size=EVAL_BATCH_SIZE)
test_dataset = _to_dataset(test_encodings, y_test, batch_size=EVAL_BATCH_SIZE)

# ---------------------------------------------------------------------------
# Model: fine-tune DistilBERT with a fresh classification head
# ---------------------------------------------------------------------------
model = TFDistilBertForSequenceClassification.from_pretrained(
    PRETRAINED_NAME,
    num_labels=len(url_label_categorical_to_numerical_dictionary),
)

# Compile the model (necessary for training). The model outputs raw logits,
# hence from_logits=True.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Train the model using `fit()`
model.fit(train_dataset, validation_data=val_dataset, epochs=EPOCHS)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(test_dataset)

print(f"Test accuracy: {test_accuracy:.4f}")

# ---------------------------------------------------------------------------
# Predict the label of a new URL
# ---------------------------------------------------------------------------
new_url = "http://example.com/suspicious-page"
# Use the same tokenisation settings as training so inputs are comparable.
new_url_encodings = _encode_urls([new_url])

# Convert the tokenized encodings to TensorFlow dataset format (no labels)
new_url_dataset = _to_dataset(new_url_encodings, batch_size=1)

# Make the prediction
new_url_predictions = model.predict(new_url_dataset)

# Get the predicted class index (which corresponds to a label)
predicted_class_index = int(tf.argmax(new_url_predictions.logits, axis=-1).numpy()[0])

# Map the predicted class index to the corresponding label
predicted_label = index_to_label[predicted_class_index]

print(f"Predicted label for the new URL: {predicted_label}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test accuracy: 0.7000
Predicted label for the new URL: benign


In [None]:
# Write the trained model to a directory on the mounted Google Drive.
# NOTE(review): this uses the Keras `save` API; if the checkpoint should later be
# reloaded with `from_pretrained`, `model.save_pretrained(...)` (plus saving the
# tokenizer) is presumably the better fit — confirm before changing the format.
MODEL_SAVE_PATH = '/content/drive/MyDrive/malicious_phish_model'
model.save(MODEL_SAVE_PATH)


