In [92]:
import pandas as pd
from urllib.parse import urlparse
import re
import ipaddress

# Load the dataset
df = pd.read_csv('urldata.csv')

# Inspect data
df.sample(100)


Unnamed: 0.1,Unnamed: 0,url,label,result
4630,29210,https://www.en.wikipedia.org/wiki/2008_Missour...,benign,0
356752,381332,http://voicebomber.com/wp-admin/css/colors/sun...,malicious,1
317916,342496,https://www.pcengines.ch/tp3.htm,benign,0
202892,227472,https://www.kissthisguy.com/7224misheard.htm,benign,0
64794,89374,https://www.politicalgraveyard.com/bio/minore-...,benign,0
...,...,...,...,...
192154,216734,https://www.imdb.com/name/nm0105442/,benign,0
375703,400283,http://windowmedics.com/9cbspg6,malicious,1
148464,173044,https://www.dictionary30.com/meaning/TRUDEL,benign,0
161864,186444,https://www.facebook.com/jures2,benign,0


In [93]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Concatenate, Input, Embedding, GRU, Dense
from tensorflow.keras.optimizers import Adam

# Step 1: Prepare the data (0 for benign, 1 for phishing)

# Tokenize the URLs (character-level tokenization)
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(df['url'])

# Convert the URLs into sequences of integers
url_sequences = tokenizer.texts_to_sequences(df['url'])

# Pad the sequences to make them of equal length
max_sequence_length = 100  # can be adjusted based on data
X = pad_sequences(url_sequences, maxlen=max_sequence_length)

# Target labels
y = df['result'].values

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Build the GRU model
vocab_size = len(tokenizer.word_index) + 1  # Total number of unique characters
embedding_dim = 50  # Size of the character embedding vectors

model = Sequential()

# Embedding layer (convert each character to a dense vector)
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))

# GRU layer
model.add(GRU(128, return_sequences=False))

# Output layer (binary classification)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Step 4: Train the model
model.fit(X_train, y_train, epochs=5, batch_size=128, validation_data=(X_test, y_test))

# Step 5: Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')


KeyboardInterrupt: 

In [None]:
# Save the trained GRU model
model.save('phishing_gru_model.keras')

# Save the tokenizer using pickle
import pickle

with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)


In [None]:
# Import required modules
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Load the saved GRU model in .keras format
model = load_model('phishing_gru_model.keras')

# Example usage: Tokenizing a new URL
MAX_SEQUENCE_LENGTH = 100  # Same value used during training

In [None]:
from sklearn.metrics import roc_curve

# Example: Get prediction probabilities for a test dataset
y_prob = model.predict(X_test).flatten()

# Compute ROC curve and find the optimal threshold
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
optimal_idx = (tpr - fpr).argmax()  # Maximize TPR - FPR
optimal_threshold = thresholds[optimal_idx]

print(f"Optimal Threshold: {optimal_threshold}")

# Use the new threshold in predictions
prediction = 'phishing' if y_prob[0] >= optimal_threshold else 'benign'
print(f"Prediction: {prediction} , {y_prob[0]}")


[1m2660/2660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 20ms/step
Optimal Threshold: 0.25421249866485596
Prediction: phishing , 0.9999070763587952


In [None]:
from imblearn.over_sampling import SMOTE

# Resample the training set using SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report

y_pred = (y_prob >= optimal_threshold).astype(int)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     64264
           1       0.99      0.99      0.99     20855

    accuracy                           1.00     85119
   macro avg       0.99      0.99      0.99     85119
weighted avg       1.00      1.00      1.00     85119



In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

# Example GRU model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=50, input_length=100))  # Embedding Layer
model.add(GRU(128, return_sequences=False))  # GRU Layer
model.add(Dense(1, activation='sigmoid'))  # Output Layer

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])




In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example URL
url = "http://gemmell.co.nz/view/"

# Tokenize the URL
sequence = tokenizer.texts_to_sequences([url])

# Pad the sequence to match the input length (100 in this case)
padded_sequence = pad_sequences(sequence, maxlen=100)

# Check the shape
print(padded_sequence.shape)  # Should be (1, 100)



(1, 100)


In [None]:
import numpy as np

# Expand the dimensions to make it (1, 100, 1)
input_data = np.expand_dims(padded_sequence, axis=-1)

# Check the shape
print(input_data.shape)  # Should be (1, 100, 1)


(1, 100, 1)


In [None]:
# Retrain the model with the corrected input
history = model.fit(X_train, y_train, epochs=2, batch_size=32, validation_data=(X_test, y_test))
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")

# Example prediction
prediction = model.predict(input_data)[0][0]
result = 'phishing' if prediction >= 0.5 else 'benign'
print(f"Prediction: {result}, Probability: {prediction}")


Epoch 1/2
[1m10640/10640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m667s[0m 62ms/step - accuracy: 0.9548 - loss: 0.1287 - val_accuracy: 0.9926 - val_loss: 0.0239
Epoch 2/2
[1m10640/10640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16974s[0m 2s/step - accuracy: 0.9943 - loss: 0.0207 - val_accuracy: 0.9957 - val_loss: 0.0160
[1m2660/2660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 21ms/step - accuracy: 0.9962 - loss: 0.0143
Test Accuracy: 0.9957118630409241
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 413ms/step
Prediction: phishing, Probability: 0.9998806715011597


In [None]:
model.save('phishing_gru_model.keras')
with open('tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

In [97]:
# Imports (Place all necessary imports at the top)
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Load the trained model
model = load_model('phishing_gru_model.keras')

# Load the tokenizer
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

test_url = "example.com"
tokenized = tokenizer.texts_to_sequences([test_url])
print(tokenized)

# Define a function to predict a single URL
def predict_single_url(url, model, tokenizer, max_length=100):
    """Predict if a URL is phishing or benign using the trained GRU model."""
    # Tokenize and pad the sequence
    sequence = tokenizer.texts_to_sequences([url])
    padded_sequence = pad_sequences(sequence, maxlen=max_length)
    print(padded_sequence.shape)
    print(f"Padded sequence in Journal: {padded_sequence}")

    # Predict using the model
    prediction = model.predict(padded_sequence)[0][0]
    print("Notebook Prediction (Raw):", prediction)
    confidence = 1 - prediction if prediction > 0.5 else prediction
    print("Notebook Confidence:", confidence)

    # Classify based on a threshold
    result = 'phishing' if prediction >= 0.5 else 'benign'

    # Display the result
    print(f"URL: {url} | Prediction: {result} | Probability: {prediction:.4f}")

# Example usage
print("Testing a single URL prediction:")
predict_single_url("http://gemmell.co.nz/view/", model, tokenizer)

# Optional: Test multiple URLs
def predict_multiple_urls(urls, model, tokenizer, max_length=100):
    """Predict phishing or benign for multiple URLs."""
    for url in urls:
        predict_single_url(url, model, tokenizer, max_length)

# Example with multiple URLs
urls_to_test = [
    "http://gemmell.co.nz/view/",
    "https://www.google.com",
    "http://fake-phish.com"
]
print("\nTesting multiple URLs:")
predict_multiple_urls(urls_to_test, model, tokenizer)



[[3, 38, 6, 15, 11, 16, 3, 8, 10, 4, 15]]
Testing a single URL prediction:
(1, 100)
Padded sequence in Journal: [[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0 14  2  2 11 20  1  1 21  3 15 15  3 16 16  8 10  4  8 13 41  1 29
   9  3  7  1]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 500ms/step
Notebook Prediction (Raw): 0.9998807
Notebook Confidence: 0.00011932849884033203
URL: http://gemmell.co.nz/view/ | Prediction: phishing | Probability: 0.9999

Testing multiple URLs:
(1, 100)
Padded sequence in Journal: [[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0 14  2  2 11 20  1  1 21  3 15 15  3 16 16  8 10  4  8 