In [7]:
import pandas as pd
from urllib.parse import urlparse
import re
import ipaddress

# Load the dataset
df = pd.read_csv('urldata.csv')

# Inspect data
df.sample(100)

Unnamed: 0.1,Unnamed: 0,url,label,result
386704,386704,http://sterlingdiamonds.co.uk/blog/wp-content/...,malicious,1
151042,151042,https://www.bizofbaseball.com/index.php?option...,benign,0
195326,195326,https://www.festival-automne.com/marcial-di-fo...,benign,0
334316,334316,https://www.youtube.com/watch?v=aiOwZ9NIcho,benign,0
429487,429487,http://www.cctrubiak.com/Document,malicious,1
...,...,...,...,...
71444,71444,https://www.mentalhealthupdate.blogspot.com/,benign,0
154805,154805,https://www.broussard-hart.com/,benign,0
381347,381347,http://webmail.mhcable.com/secure/www.paypal.c...,malicious,1
280381,280381,https://www.relationship-institute.com/staff.cfm,benign,0


In [8]:
# feature extraction preformed by URLFeatureExtraction.py
# python URLFeatureExtraction.py urldata.csv returns featuredata.csv
df = pd.read_csv('featuredata.csv', on_bad_lines='skip')

features = ['Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection', 
                        'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record',
                        'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards']


In [9]:
# Convert the features into separate columns
# df_features = pd.DataFrame(df['features'].tolist(), index=df.index, columns=features)

# Combine the extracted features with the original DataFrame
# df_combined = pd.concat([df, df_features], axis=1)
df = df.dropna(subset=['Label'])


In [10]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


# Define X (features) and y (target)
X = df[features]
y = df['Label']

# Step 1: Perform the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Use SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Step 3: Train the model on the resampled data
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

# Step 4: Make predictions on the test set
y_pred = rf_model.predict(X_test)



In [None]:
# Feature extracted URLs tested
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Step 4: Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=5, batch_size=128, validation_data=(X_test, y_test))

# Step 5: Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')



Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6376 - loss: 0.6347 - val_accuracy: 0.7426 - val_loss: 0.5205
Epoch 2/5
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 884us/step - accuracy: 0.7288 - loss: 0.5386 - val_accuracy: 0.7398 - val_loss: 0.5075
Epoch 3/5
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 894us/step - accuracy: 0.7409 - loss: 0.5254 - val_accuracy: 0.7408 - val_loss: 0.5065
Epoch 4/5
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 890us/step - accuracy: 0.7406 - loss: 0.5260 - val_accuracy: 0.7445 - val_loss: 0.5028
Epoch 5/5
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 963us/step - accuracy: 0.7424 - loss: 0.5186 - val_accuracy: 0.7475 - val_loss: 0.5026
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 483us/step - accuracy: 0.7509 - loss: 0.5016
Test Accuracy: 0.7475487589836121


In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Bidirectional, LSTM
from tensorflow.keras.optimizers import Adam

# Step 1: Prepare the data (0 for benign, 1 for phishing)

# Tokenize the URLs (character-level tokenization)
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(df['URL'])

# Convert the URLs into sequences of integers
url_sequences = tokenizer.texts_to_sequences(df['URL'])

# Pad the sequences to make them of equal length
max_sequence_length = 100  # can be adjusted based on data
X = pad_sequences(url_sequences, maxlen=max_sequence_length)

# Target labels
y = df['Label'].values

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Build the LSTM model
vocab_size = len(tokenizer.word_index) + 1  # Total number of unique characters
embedding_dim = 50  # Size of the character embedding vectors

model = Sequential()
# Embedding layer (convert each character to a dense vector)
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))
# LSTM layer
model.add(Bidirectional(LSTM(128, return_sequences=False)))
# Output layer (binary classification)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Step 4: Train the model
model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_test, y_test))

# Step 5: Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy Bi: {accuracy}')

Epoch 1/10




[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 112ms/step - accuracy: 0.8224 - loss: 0.3774 - val_accuracy: 0.9468 - val_loss: 0.1439
Epoch 2/10
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 110ms/step - accuracy: 0.9610 - loss: 0.1295 - val_accuracy: 0.9646 - val_loss: 0.1165
Epoch 3/10
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 110ms/step - accuracy: 0.9700 - loss: 0.1079 - val_accuracy: 0.9781 - val_loss: 0.0840
Epoch 4/10
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 110ms/step - accuracy: 0.9544 - loss: 0.1604 - val_accuracy: 0.9779 - val_loss: 0.0806
Epoch 5/10
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 110ms/step - accuracy: 0.9797 - loss: 0.0688 - val_accuracy: 0.9816 - val_loss: 0.0669
Epoch 6/10
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 110ms/step - accuracy: 0.9853 - loss: 0.0514 - val_accuracy: 0.9906 - val_loss: 0.0317
Epoch 7/10
[1m278/27