In [None]:
import numpy as np
# Importing the numpy library, often used for numerical operations on arrays and matrices.

import tensorflow as tf
# Importing TensorFlow, a popular machine learning library.

import pandas as pd
# Importing the pandas library, used for data manipulation and analysis.

from keras.layers import BatchNormalization, Dropout
# Importing BatchNormalization and Dropout layers from Keras. These are used in neural network layers.

from tensorflow.keras.preprocessing.text import Tokenizer
# Importing the Tokenizer class from TensorFlow Keras, used for text tokenization.

from tensorflow.keras.preprocessing.sequence import pad_sequences
# Importing pad_sequences from TensorFlow Keras, used to ensure all sequences in a list have the same length.

from tensorflow.keras.models import Sequential
# Importing the Sequential model from TensorFlow Keras, a linear stack of neural network layers.

from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
# Importing various types of layers from TensorFlow Keras used in building neural networks.

from tensorflow.keras.utils import to_categorical
# Importing to_categorical from TensorFlow Keras, used for converting class vectors to binary class matrices.

from sklearn.preprocessing import LabelEncoder
# Importing LabelEncoder from Scikit-Learn, used for encoding labels with value between 0 and n_classes-1.

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
# Importing EarlyStopping and ModelCheckpoint callbacks from TensorFlow Keras, used to monitor training.

from itertools import product
# Importing product from itertools, used to create Cartesian products from multiple iterables.


In [None]:
# Function to load datasets and handle errors
def load_dataset_skip_errors(file_path, sample_fraction=0.1, random_state=1):
    """Read a header-less CSV file and return a random sample of its rows.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file. Fields may be quoted with double quotes.
    sample_fraction : float, default 0.1
        Fraction of rows to keep; forwarded to ``DataFrame.sample(frac=...)``.
    random_state : int, default 1
        Seed forwarded to ``DataFrame.sample`` so the sample is reproducible.

    Returns
    -------
    pandas.DataFrame or None
        The sampled rows, or ``None`` when the file is empty or cannot be
        parsed. Callers are expected to check for ``None``.

    Notes
    -----
    Other failures (for example a missing file) are not caught here and
    propagate to the caller.
    """
    try:
        df = pd.read_csv(file_path, header=None, quotechar='"')
    except pd.errors.ParserError as e:
        print(f'ParserError: {e}')
        return None
    except pd.errors.EmptyDataError as e:
        # An empty file raises EmptyDataError, which is not a ParserError;
        # report it through the same "return None" channel so the caller's
        # None check handles it instead of an uncaught exception.
        print(f'EmptyDataError: {e}')
        return None

    # Sampling happens outside the try block: it cannot raise a parse error.
    return df.sample(frac=sample_fraction, random_state=random_state)

# Read the sampled training and testing splits from disk, in that order.
train_df, test_df = (
    load_dataset_skip_errors(csv_name)
    for csv_name in ("updated_train.csv", "updated_test.csv")
)

# Stop immediately when either split is missing (None) rather than failing later on.
if any(frame is None for frame in (train_df, test_df)):
    raise ValueError("DataFrames could not be loaded. Check your CSV files.")


In [None]:
# Split each frame into its label column (0) and its URL column (1).
train_labels, train_urls = (train_df[column].tolist() for column in (0, 1))
test_labels, test_urls = (test_df[column].tolist() for column in (0, 1))

# Character-level tokenizer; its vocabulary is fitted on the training URLs only.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(train_urls)
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(urls) for urls in (train_urls, test_urls)
)

# Pad both splits to the length of the longest sequence found in either one.
max_length = max(
    max(map(len, train_sequences)),
    max(map(len, test_sequences)),
)
train_data, test_data = (
    pad_sequences(sequences, maxlen=max_length)
    for sequences in (train_sequences, test_sequences)
)

# Map raw labels to integer ids (encoder fitted on the training labels),
# then expand the ids into one-hot rows.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
test_labels = label_encoder.transform(test_labels)
num_classes = train_labels.max() + 1  # ids start at 0, so the largest id + 1 is the class count
train_labels, test_labels = (
    to_categorical(label_ids, num_classes)
    for label_ids in (train_labels, test_labels)
)

In [None]:
# Search space for the grid search: one list of candidate values per hyperparameter.
# Key order is significant because combinations are generated in this order.
param_grid = dict(
    optimizer=['adam', 'rmsprop'],  # optimizers handed to model.compile
    filters=[32, 64, 128, 256],     # number of filters in each Conv1D layer
    kernel_size=[3, 6],             # Conv1D kernel widths
    batch_size=[32],                # mini-batch sizes
    epochs=[20],                    # maximum number of training epochs
    num_layers=[1, 2, 3],           # how many Conv1D blocks to stack
)

In [None]:
# Grid search function for hyperparameter tuning
def _build_url_cnn(params, vocab_size, sequence_length, num_classes):
    """Build and compile one character-level CNN for a hyperparameter combination."""
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=50, input_length=sequence_length))

    # Stack the requested number of Conv1D -> BatchNorm -> Dropout blocks.
    for _ in range(params['num_layers']):
        model.add(Conv1D(filters=params['filters'], kernel_size=params['kernel_size'], activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(0.2))

    model.add(GlobalMaxPooling1D())
    # The labels are one-hot rows over mutually exclusive classes, so the
    # output is a softmax distribution trained with categorical cross-entropy.
    # (sigmoid + binary_crossentropy would score each class independently and
    # make the 'accuracy' metric element-wise rather than per sample.)
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer=params['optimizer'], loss='categorical_crossentropy', metrics=['accuracy'])
    return model


def grid_search(param_grid, train_data, train_labels, test_data, test_labels, vocab_size=None):
    """Train one CNN per hyperparameter combination and return the best one.

    Parameters
    ----------
    param_grid : dict
        Maps each hyperparameter name ('optimizer', 'filters', 'kernel_size',
        'batch_size', 'epochs', 'num_layers') to a list of candidate values.
    train_data, test_data : 2-D integer arrays
        Padded token-id sequences, one row per sample.
    train_labels, test_labels : 2-D arrays
        One-hot encoded labels, one column per class.
    vocab_size : int, optional
        Size of the embedding vocabulary. When omitted it is taken from the
        notebook-level ``tokenizer`` (``len(tokenizer.word_index) + 1``).

    Returns
    -------
    tuple
        ``(best_params, best_accuracy, results)`` where ``results`` is a list
        of ``{'params': ..., 'accuracy': ...}`` dicts, one per combination.

    Notes
    -----
    NOTE(review): the same (test_data, test_labels) split is used for early
    stopping and for choosing the best combination, so the reported accuracy
    is optimistic. Consider a separate validation split.
    """
    if vocab_size is None:
        # Backward-compatible default: use the tokenizer defined earlier in the notebook.
        vocab_size = len(tokenizer.word_index) + 1

    # Derive the model dimensions from the arguments instead of notebook globals.
    sequence_length = np.shape(train_data)[1]
    num_classes = np.shape(train_labels)[1]

    best_accuracy = 0.0
    best_params = {}
    results = []

    keys = list(param_grid.keys())
    for values in product(*param_grid.values()):
        params_dict = dict(zip(keys, values))

        # Release the graph/state of the previous model so memory does not
        # grow with every combination.
        tf.keras.backend.clear_session()

        model = _build_url_cnn(params_dict, vocab_size, sequence_length, num_classes)
        model.summary()

        # A fresh callback per run keeps early-stopping state from leaking between models.
        early_stopping = EarlyStopping(
            monitor='val_accuracy',
            mode='max',
            patience=4,
            min_delta=0.0010,
            restore_best_weights=True,
            verbose=1
        )

        model.fit(train_data, train_labels, epochs=params_dict['epochs'], validation_data=(test_data, test_labels),
                  batch_size=params_dict['batch_size'], verbose=1, callbacks=[early_stopping])

        _, accuracy = model.evaluate(test_data, test_labels, verbose=1)

        results.append({'params': params_dict, 'accuracy': accuracy})
        print(f"Parameters: {params_dict}, Accuracy: {accuracy}")

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params_dict

    return best_params, best_accuracy, results

In [None]:
# Launch the hyperparameter sweep over every combination in param_grid.
best_params, best_accuracy, all_results = grid_search(
    param_grid=param_grid,
    train_data=train_data,
    train_labels=train_labels,
    test_data=test_data,
    test_labels=test_labels,
)

# Report the winning configuration and the accuracy it reached.
print("Best parameters:", best_params)
print("Best accuracy:", best_accuracy)