In [1]:
# Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Input, Dropout
from tensorflow.keras.utils import set_random_seed

First, we load the feature dataset and split it into the model inputs (the feature columns) and the output (the label).

In [3]:
# Read the feature table from disk and drop the `url` column in one chained
# expression, so `data` is only ever bound to the trimmed frame.
data = pd.read_csv('cat_feature_dataset.csv').drop(columns=['url'])
data

Unnamed: 0,label,url_length,has_ip,is_http,has_redirect,has_top_tld,shortening_service,slash_count,dash_count,dot_count,digit_count,keyword
0,0,80,0,0,0,1,0,2,0,2,5,0
1,1,90,0,1,0,0,0,4,0,4,15,1
2,1,60,0,1,0,1,0,6,2,2,15,1
3,0,80,0,1,0,1,0,4,4,0,0,0
4,1,90,0,1,0,1,0,4,0,2,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...
468335,1,60,0,1,0,0,0,4,0,4,5,1
468336,0,80,0,0,0,1,0,6,4,2,10,0
468337,1,20,0,1,0,1,1,2,0,0,0,0
468338,1,90,1,1,0,0,0,4,0,4,15,0


In [4]:
# Separate the model inputs (every column except the label) from the target
X = data.drop(columns=['label'])
y = data['label']

print(X.shape, y.shape, sep="\n")

# The 2D feature table (samples x features) is converted to a 3D NumPy array
# by appending a trailing axis of length 1: (samples, features, 1).
# This matches the per-sample Input shape used by the Conv1D model.
n_samples, n_features = X.shape
X = np.array(X.values.reshape(n_samples, n_features, 1))

# The label Series becomes a plain NumPy array as well
y = np.array(y)

print(X.shape, y.shape, sep="\n")

(468340, 11)
(468340,)
(468340, 11, 1)
(468340,)


In [5]:
# Two-stage split: first hold out 30% of the samples, then divide that
# holdout 2:1 into validation and test.
# Overall proportions: 70% training, 20% validation, 10% testing.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=1/3, random_state=42
)

# Report the shape of every partition
for split_name, part_X, part_y in (
    ("Train:", X_train, y_train),
    ("Valid:", X_val, y_val),
    ("Test:", X_test, y_test),
):
    print(split_name, part_X.shape, part_y.shape)

Train: (327838, 11, 1) (327838,)
Valid: (93668, 11, 1) (93668,)
Test: (46834, 11, 1) (46834,)


In [6]:
# Seed Python, NumPy and the Keras backend so weight initialisation and batch
# shuffling are repeatable between runs (same seed as the data split).
# NOTE: some GPU kernels may still introduce small run-to-run differences.
set_random_seed(42)

# Create model
# 1D CNN binary classifier: two Conv1D + max-pooling stages, dropout, then a
# small dense head ending in a single sigmoid unit.
model = Sequential([
    # Input layer
    # Per-sample shape is taken from the training data (features, 1) instead of
    # being hard-coded, so the model follows the dataset if feature columns are
    # added or removed. With the default 'valid' padding and pool size of 2,
    # the two conv/pool stages need at least 10 features to produce an output.
    Input(shape=X_train.shape[1:]),

    # Convolution layers
    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(),
    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(),
    Dropout(0.2),

    # Fully connected layers
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Compile model
# Binary cross-entropy pairs with the single sigmoid output; precision and
# recall are tracked alongside accuracy.
print("Compiling...")
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'precision', 'recall'])

# Show summary
model.summary()

Compiling...


In [7]:
# Train for 8 epochs in mini-batches of 128, monitoring the validation split
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=128,
    epochs=8,
)

# Score the held-out test split. The values are unpacked positionally:
# loss first, then the metrics in the order given to model.compile().
loss, accuracy, precision, recall = model.evaluate(X_test, y_test)

# Print results, rounded to 4 decimal places
for caption, score in (
    ("Loss:", loss),
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(caption, round(score, 4))

Epoch 1/8
[1m2562/2562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.8232 - loss: 0.3812 - precision: 0.7624 - recall: 0.8751 - val_accuracy: 0.8550 - val_loss: 0.3212 - val_precision: 0.8003 - val_recall: 0.8980
Epoch 2/8
[1m2562/2562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8522 - loss: 0.3309 - precision: 0.7988 - recall: 0.8936 - val_accuracy: 0.8583 - val_loss: 0.3216 - val_precision: 0.8195 - val_recall: 0.8740
Epoch 3/8
[1m2562/2562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8547 - loss: 0.3248 - precision: 0.8033 - recall: 0.8931 - val_accuracy: 0.8568 - val_loss: 0.3157 - val_precision: 0.7968 - val_recall: 0.9102
Epoch 4/8
[1m2562/2562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8551 - loss: 0.3237 - precision: 0.8051 - recall: 0.8895 - val_accuracy: 0.8579 - val_loss: 0.3153 - val_precision: 0.8011 - val_recall: 0.9053
Epoch 5/8
[1m2562/2