In [None]:
# Import required libraries
import json  # Serializes the training history to disk

import matplotlib.pyplot as plt  # Basic library for plotting graphs
import numpy as np  # Used for numerical operations like array manipulation
import pandas as pd  # Pandas library for data manipulation and analysis
import seaborn as sns  # Visualization library based on matplotlib
from keras.layers import BatchNormalization  # Normalizes the inputs to a layer
from keras.layers import Dropout  # Dropout layer to reduce overfitting
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix  # Metrics for model evaluation
from sklearn.model_selection import train_test_split  # Splits arrays into train/validation subsets
from sklearn.preprocessing import LabelEncoder  # Encodes labels to normalized format
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint  # Callbacks for controlling the training process
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense  # Various layers for model construction
from tensorflow.keras.models import Sequential  # Sequential model for linear stack of layers
from tensorflow.keras.optimizers import Adam  # Adam optimizer used to train the model
from tensorflow.keras.preprocessing.sequence import pad_sequences  # Pads sequences to the same length
from tensorflow.keras.preprocessing.text import Tokenizer  # Tokenizes texts into tokens

In [None]:
# Dataset loading helper
def load_dataset(file_path):
    """Read a header-less CSV file into a pandas DataFrame.

    The file is parsed with ``header=None``, so the first line is treated as
    data and the columns are labelled by integer position (0, 1, ...).
    Double quotes are used as the quoting character.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(file_path, quotechar='"', header=None)

# Read both splits from disk in one pass: the training file first, then the testing file
train_df, test_df = (
    load_dataset(csv_name)
    for csv_name in ('updated_train.csv', 'updated_test.csv')
)

In [None]:
# Split each frame into its two columns: column 0 holds the label, column 1 the URL
train_labels, train_urls = (train_df[col].tolist() for col in (0, 1))
test_labels, test_urls = (test_df[col].tolist() for col in (0, 1))

# Build a character-level vocabulary from the training URLs only
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(train_urls)

# Turn every URL into a sequence of integer character ids
train_sequences, test_sequences = map(
    tokenizer.texts_to_sequences, (train_urls, test_urls)
)

# The padded width is the longest sequence found in either split
# NOTE(review): this also inspects the test sequences, so the model's input
# width depends on test data — confirm that is intended.
longest_train = max(map(len, train_sequences))
longest_test = max(map(len, test_sequences))
max_length = max(longest_train, longest_test)

# Bring every sequence in both splits to exactly max_length entries
train_data, test_data = (
    pad_sequences(sequences, maxlen=max_length)
    for sequences in (train_sequences, test_sequences)
)

# Integer-encode the labels: the encoder is fitted on the training labels and
# then reused, unchanged, for the testing labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
test_labels = label_encoder.transform(test_labels)

In [None]:
# Model building
# Character-level 1D CNN for binary classification:
# embedding -> two convolutions -> global max pooling -> single sigmoid unit
model = Sequential()  # Create a sequential model
# Add embedding layer; the vocabulary size is the tokenizer's index size + 1
# (presumably to leave index 0 free for the padding value — confirm)
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=max_length))
# Add convolution layers
model.add(Conv1D(filters=128, kernel_size=6, activation='relu'))
model.add(Conv1D(filters=128, kernel_size=6, activation='relu'))
# Add global max pooling layer (collapses the sequence axis to one vector per URL)
model.add(GlobalMaxPooling1D())
# Add dense layer for output: one sigmoid unit, matching the binary cross-entropy loss
model.add(Dense(1, activation='sigmoid'))

# Compile the model
# `learning_rate` is the supported argument name; the legacy `lr` alias is
# deprecated and is not accepted by recent Keras releases. Adam comes from
# `tensorflow.keras` (imported with the other libraries at the top) so the
# optimizer and the model classes originate from the same Keras package.
optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])  # Compile the model
model.summary()  # Print the model summary

In [None]:
# Training callbacks. Both track the same quantity, so the shared settings
# are declared once: validation accuracy, where larger values are better,
# with progress messages enabled.
monitor_settings = {'monitor': 'val_accuracy', 'mode': 'max', 'verbose': 1}

# Stop once validation accuracy has not improved by at least min_delta for
# `patience` consecutive epochs, and roll back to the best weights seen.
early_stopping = EarlyStopping(
    patience=8,
    min_delta=0.00009,
    restore_best_weights=True,
    **monitor_settings,
)

# Write the model to disk only when validation accuracy improves.
model_checkpoint = ModelCheckpoint(
    'best_model.keras',
    save_best_only=True,
    **monitor_settings,
)

In [None]:
# Train the model
# The callbacks select the stopping epoch and the saved checkpoint from
# `val_accuracy`. Feeding the test set in as validation data would therefore
# tune the model on the very data it is scored on afterwards, inflating the
# reported metrics. A validation subset is carved out of the training data
# instead, so the test set stays unseen until the final evaluation.
VALIDATION_FRACTION = 0.1  # Share of the training data held out for validation
SPLIT_SEED = 42  # Fixed seed so the split is reproducible across runs

fit_data, val_data, fit_labels, val_labels = train_test_split(
    train_data,
    train_labels,
    test_size=VALIDATION_FRACTION,
    stratify=train_labels,  # Keep the class ratio the same in both subsets
    random_state=SPLIT_SEED,
)

history = model.fit(
    fit_data,
    fit_labels,
    epochs=100,
    validation_data=(val_data, val_labels),
    batch_size=32,
    callbacks=[early_stopping, model_checkpoint],
    verbose=1,
)  # Fit the model

In [None]:
# Save the training history
# Copy the per-epoch metrics, casting each value to a built-in float so that
# numpy scalar types (which the json module cannot serialize) never reach
# json.dump.
history_dict = {
    metric_name: [float(value) for value in values]
    for metric_name, values in history.history.items()
}

# The context manager guarantees the file is flushed and closed, even if
# serialization fails part-way through.
with open("history.json", 'w') as history_file:
    json.dump(history_dict, history_file)  # Save history data to a JSON file

In [None]:
# Plot training and validation curves, one figure per tracked metric.
# Each row: (training key, validation key, training legend label,
#            validation legend label, figure title, y-axis label)
curve_specs = (
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
     'Training and Validation Loss', 'Loss'),
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
     'Training and Validation Accuracy', 'Accuracy'),
)

for train_key, val_key, train_label, val_label, figure_title, y_label in curve_specs:
    plt.plot(history.history[train_key], label=train_label)  # Training curve
    plt.plot(history.history[val_key], label=val_label)  # Validation curve
    plt.title(figure_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend()
    plt.show()  # Render this figure before starting the next one

# Evaluate model performance on the test data
# The model outputs are thresholded at 0.5 to obtain hard 0/1 predictions
probabilities = model.predict(test_data)
predictions = (probabilities > 0.5).astype("int32")

# Score the predictions against the encoded test labels
precision, recall, f1 = (
    metric(test_labels, predictions)
    for metric in (precision_score, recall_score, f1_score)
)

# Report the evaluation metrics
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Confusion matrix: rows are the true labels, columns the predicted labels
cm = confusion_matrix(test_labels, predictions)

# seaborn draws on the current axes and hands them back, so the labels and
# title are set on that Axes object directly
cm_axes = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
cm_axes.set_xlabel('Predicted')
cm_axes.set_ylabel('True')
cm_axes.set_title('Confusion Matrix')
plt.show()  # Display the plot