#1 convolution layer

In [None]:
# Hybrid CNN/LSTM model with one convolution layer. Run this cell first: it defines the training function; the hyperparameters and file paths are supplied by the cells that call it below.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dropout, LSTM, Dense, TimeDistributed, Input
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences

def train_and_evaluate_hybrid_model_1layer(
    filter1,
    kernelsize,
    Dropout1,
    filter2,
    Dropout2,
    Lstm_unit,
    Dropout3,
    TimeDist1,
    TimeDist2,
    train_file_path,
    test_file_path,
    report_file_path,
    predictions_file_path
):
    """Train and evaluate the one-convolution CNN/LSTM hybrid.

    Both CSV files are read with pandas and must provide a ``seq`` column
    (amino-acid strings) and an ``sst3`` column (strings over ``H``/``E``/``C``).
    Sequences are one-hot encoded over the 20 letters in ``amino_acids`` and
    post-padded to the longest sequence found in either file.

    Padded positions are excluded from the training loss (through per-timestep
    sample weights) and from the reported accuracy, the classification report
    and the predictions CSV.

    Args:
        filter1: Number of filters of the Conv1D layer.
        kernelsize: Kernel size of the Conv1D layer.
        Dropout1: Dropout rate applied after the Conv1D layer.
        filter2: Unused; kept so the call signature matches the two-layer
            variant.
        Dropout2: Unused; kept for the same reason as ``filter2``.
        Lstm_unit: Number of LSTM units.
        Dropout3: Dropout rate applied after the LSTM layer.
        TimeDist1: Units of the per-position hidden Dense layer.
        TimeDist2: Units of the per-position softmax layer; must equal the
            number of classes (3).
        train_file_path: Path of the training CSV.
        test_file_path: Path of the test CSV.
        report_file_path: Text file that receives the test accuracy and the
            classification report.
        predictions_file_path: CSV file that receives the true and predicted
            label of every non-padded test position.

    Raises:
        ValueError: If ``TimeDist2`` differs from the number of classes.
        KeyError: If an ``sst3`` string contains a character other than
            ``H``, ``E`` or ``C``.
    """
    # Sentinel that pad_sequences writes into padded label positions.
    pad_label = -1

    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    aa_index = {aa: i for i, aa in enumerate(amino_acids)}
    sst3_mapping = {'H': 0, 'E': 1, 'C': 2}
    num_classes = len(sst3_mapping)

    if TimeDist2 != num_classes:
        raise ValueError(
            f"TimeDist2 must equal the number of classes ({num_classes}), got {TimeDist2}"
        )

    def one_hot_encode(seq):
        """Return a (len(seq), 20) matrix; unknown residues stay all-zero."""
        encoding = np.zeros((len(seq), len(amino_acids)), dtype=int)
        for pos, aa in enumerate(seq):
            column = aa_index.get(aa)
            if column is not None:
                encoding[pos, column] = 1
        return encoding

    def encode_split(data, max_len):
        """Return (inputs, one-hot labels, boolean mask of real positions)."""
        sequences = pad_sequences([one_hot_encode(seq) for seq in data['seq']],
                                  maxlen=max_len, padding='post', dtype='float32')
        labels = pad_sequences([[sst3_mapping[ss] for ss in sst] for sst in data['sst3']],
                               maxlen=max_len, padding='post', value=pad_label)
        mask = labels != pad_label
        # to_categorical would index with -1 and silently mark padding as the
        # last class ('C'); map padding to class 0 and rely on the mask instead.
        labels_categorical = to_categorical(np.where(mask, labels, 0),
                                            num_classes=num_classes)
        return sequences, labels_categorical, mask

    # Load the datasets
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)

    # Pad everything to the longest sequence across both files
    max_seq_len = max(max(len(seq) for seq in train_data['seq']),
                      max(len(seq) for seq in test_data['seq']))

    train_sequences, train_labels_categorical, train_mask = encode_split(train_data, max_seq_len)
    test_sequences, test_labels_categorical, test_mask = encode_split(test_data, max_seq_len)

    # Define the hybrid CNN/RNN architecture
    model = Sequential([
        Input(shape=(train_sequences.shape[1], train_sequences.shape[2])),  # (max_seq_len, 20)
        Conv1D(filters=filter1, kernel_size=kernelsize, activation='relu', padding='same'),
        Dropout(Dropout1),
        LSTM(Lstm_unit, return_sequences=True),  # one output vector per position
        Dropout(Dropout3),
        TimeDistributed(Dense(TimeDist1, activation='relu')),  # per-position hidden layer
        TimeDistributed(Dense(TimeDist2, activation='softmax')),  # per-position class probabilities
    ])

    # weighted_metrics makes the logged accuracy honour the padding mask as well.
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  weighted_metrics=['accuracy'])

    model.summary()

    # Train the model; a weight of 0 removes padded positions from the loss.
    model.fit(
        train_sequences,
        train_labels_categorical,
        sample_weight=train_mask.astype('float32'),
        epochs=10,
        batch_size=32,
        validation_split=0.1,  # Use 10% of the training data as validation
        verbose=1
    )

    # Predict once and derive every test metric from the same predictions.
    test_predictions = model.predict(test_sequences)
    # Boolean indexing keeps only real residues and flattens to 1-D.
    test_predictions_flat = np.argmax(test_predictions, axis=-1)[test_mask]
    test_true_labels_flat = np.argmax(test_labels_categorical, axis=-1)[test_mask]

    test_accuracy = float(np.mean(test_predictions_flat == test_true_labels_flat))
    print(f"Test Accuracy: {test_accuracy:.4f}")

    # labels= keeps the report well-formed even if a class is absent from the test set.
    report = classification_report(test_true_labels_flat, test_predictions_flat,
                                   labels=list(sst3_mapping.values()),
                                   target_names=list(sst3_mapping))

    # Save the classification report to a text file
    with open(report_file_path, 'w') as f:
        f.write(f"Hybrid Test Accuracy: {test_accuracy:.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(report)

    # Save the predictions along with the true labels to a CSV file
    results_df = pd.DataFrame({
        'True_Label': test_true_labels_flat,
        'Predicted_Label': test_predictions_flat
    })
    results_df.to_csv(predictions_file_path, index=False)

    print(f"Classification report saved to {report_file_path}")
    print(f"Predictions saved to {predictions_file_path}")

In [None]:
# Baseline run: one-convolution hybrid on dataset 1 (the initial dataset).
# train_and_evaluate_hybrid_model_1layer must already be defined in this session.
train_and_evaluate_hybrid_model_1layer(
    # convolution block
    filter1=32, kernelsize=3, Dropout1=0.2,
    # accepted by the signature but not used by the one-layer model
    filter2=64, Dropout2=0.3,
    # recurrent block and dense head
    Lstm_unit=64, Dropout3=0.5,
    TimeDist1=128, TimeDist2=3,
    # inputs
    train_file_path='/content/training_data_clean.csv',
    test_file_path='/content/test_data_clean.csv',
    # outputs: text report (accuracy + classification report) and label CSV
    report_file_path='Hybrid_in_1layer_report.txt',
    predictions_file_path='Hybrid_in_1layer_predictions.csv',
)

Epoch 1/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 66ms/step - accuracy: 0.6626 - loss: 0.8311 - val_accuracy: 0.7556 - val_loss: 0.6001
Epoch 2/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 69ms/step - accuracy: 0.7666 - loss: 0.5732 - val_accuracy: 0.7681 - val_loss: 0.5406
Epoch 3/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 71ms/step - accuracy: 0.7814 - loss: 0.5238 - val_accuracy: 0.7940 - val_loss: 0.4878
Epoch 4/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 56ms/step - accuracy: 0.7977 - loss: 0.4846 - val_accuracy: 0.8002 - val_loss: 0.4669
Epoch 5/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 67ms/step - accuracy: 0.8016 - loss: 0.4695 - val_accuracy: 0.8029 - val_loss: 0.4608
Epoch 6/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 72ms/step - accuracy: 0.8088 - loss: 0.4546 - val_accuracy: 0.8083 - val_loss: 0.4545
Epoch 7/10
[1m113

In [None]:
# Baseline run: one-convolution hybrid with the initial settings, on dataset 2.
# train_and_evaluate_hybrid_model_1layer must already be defined in this session.
train_and_evaluate_hybrid_model_1layer(
    # convolution block
    filter1=32, kernelsize=3, Dropout1=0.2,
    # accepted by the signature but not used by the one-layer model
    filter2=64, Dropout2=0.3,
    # recurrent block and dense head
    Lstm_unit=64, Dropout3=0.5,
    TimeDist1=128, TimeDist2=3,
    # inputs
    train_file_path='/content/training_data__part2_clean.csv',
    test_file_path='/content/test_data_part2_clean.csv',
    # outputs: text report (accuracy + classification report) and label CSV
    report_file_path='Hybrid_2_1layer_report.txt',
    predictions_file_path='Hybrid_2_1layer_predictions.csv',
)

Epoch 1/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 67ms/step - accuracy: 0.6848 - loss: 0.7570 - val_accuracy: 0.7987 - val_loss: 0.4979
Epoch 2/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 58ms/step - accuracy: 0.7877 - loss: 0.5112 - val_accuracy: 0.8216 - val_loss: 0.4457
Epoch 3/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 62ms/step - accuracy: 0.8017 - loss: 0.4722 - val_accuracy: 0.8296 - val_loss: 0.4232
Epoch 4/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 66ms/step - accuracy: 0.8111 - loss: 0.4492 - val_accuracy: 0.8331 - val_loss: 0.4096
Epoch 5/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 57ms/step - accuracy: 0.8158 - loss: 0.4406 - val_accuracy: 0.8375 - val_loss: 0.4014
Epoch 6/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 59ms/step - accuracy: 0.8229 - loss: 0.4221 - val_accuracy: 0.8456 - val_loss: 0.3844
Epoch 7/10
[1m2

#2 convolution layers

In [None]:
# Hybrid CNN/LSTM model with two convolution layers. Run this cell before the experiment cells below, which call the function it defines.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dropout, LSTM, Dense, TimeDistributed, Input
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences

def train_and_evaluate_hybrid_model(
    filter1,
    kernelsize1,
    Dropout1,
    filter2,
    kernelsize2,
    Dropout2,
    Lstm_unit,
    Dropout3,
    TimeDist1,
    TimeDist2,
    train_file_path,
    test_file_path,
    report_file_path,
    predictions_file_path
):
    """Train and evaluate the two-convolution CNN/LSTM hybrid.

    Both CSV files are read with pandas and must provide a ``seq`` column
    (amino-acid strings) and an ``sst3`` column (strings over ``H``/``E``/``C``).
    Sequences are one-hot encoded over the 20 letters in ``amino_acids`` and
    post-padded to the longest sequence found in either file.

    Padded positions are excluded from the training loss (through per-timestep
    sample weights) and from the reported accuracy, the classification report
    and the predictions CSV.

    Args:
        filter1: Number of filters of the first Conv1D layer.
        kernelsize1: Kernel size of the first Conv1D layer.
        Dropout1: Dropout rate applied after the first Conv1D layer.
        filter2: Number of filters of the second Conv1D layer.
        kernelsize2: Kernel size of the second Conv1D layer.
        Dropout2: Dropout rate applied after the second Conv1D layer.
        Lstm_unit: Number of LSTM units.
        Dropout3: Dropout rate applied after the LSTM layer.
        TimeDist1: Units of the per-position hidden Dense layer.
        TimeDist2: Units of the per-position softmax layer; must equal the
            number of classes (3).
        train_file_path: Path of the training CSV.
        test_file_path: Path of the test CSV.
        report_file_path: Text file that receives the test accuracy and the
            classification report.
        predictions_file_path: CSV file that receives the true and predicted
            label of every non-padded test position.

    Raises:
        ValueError: If ``TimeDist2`` differs from the number of classes.
        KeyError: If an ``sst3`` string contains a character other than
            ``H``, ``E`` or ``C``.
    """
    # Sentinel that pad_sequences writes into padded label positions.
    pad_label = -1

    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    aa_index = {aa: i for i, aa in enumerate(amino_acids)}
    sst3_mapping = {'H': 0, 'E': 1, 'C': 2}
    num_classes = len(sst3_mapping)

    if TimeDist2 != num_classes:
        raise ValueError(
            f"TimeDist2 must equal the number of classes ({num_classes}), got {TimeDist2}"
        )

    def one_hot_encode(seq):
        """Return a (len(seq), 20) matrix; unknown residues stay all-zero."""
        encoding = np.zeros((len(seq), len(amino_acids)), dtype=int)
        for pos, aa in enumerate(seq):
            column = aa_index.get(aa)
            if column is not None:
                encoding[pos, column] = 1
        return encoding

    def encode_split(data, max_len):
        """Return (inputs, one-hot labels, boolean mask of real positions)."""
        sequences = pad_sequences([one_hot_encode(seq) for seq in data['seq']],
                                  maxlen=max_len, padding='post', dtype='float32')
        labels = pad_sequences([[sst3_mapping[ss] for ss in sst] for sst in data['sst3']],
                               maxlen=max_len, padding='post', value=pad_label)
        mask = labels != pad_label
        # to_categorical would index with -1 and silently mark padding as the
        # last class ('C'); map padding to class 0 and rely on the mask instead.
        labels_categorical = to_categorical(np.where(mask, labels, 0),
                                            num_classes=num_classes)
        return sequences, labels_categorical, mask

    # Load the datasets
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)

    # Pad everything to the longest sequence across both files
    max_seq_len = max(max(len(seq) for seq in train_data['seq']),
                      max(len(seq) for seq in test_data['seq']))

    train_sequences, train_labels_categorical, train_mask = encode_split(train_data, max_seq_len)
    test_sequences, test_labels_categorical, test_mask = encode_split(test_data, max_seq_len)

    # Define the hybrid CNN/RNN architecture
    model = Sequential([
        Input(shape=(train_sequences.shape[1], train_sequences.shape[2])),  # (max_seq_len, 20)
        Conv1D(filters=filter1, kernel_size=kernelsize1, activation='relu', padding='same'),
        Dropout(Dropout1),
        Conv1D(filters=filter2, kernel_size=kernelsize2, activation='relu', padding='same'),
        Dropout(Dropout2),
        LSTM(Lstm_unit, return_sequences=True),  # one output vector per position
        Dropout(Dropout3),
        TimeDistributed(Dense(TimeDist1, activation='relu')),  # per-position hidden layer
        TimeDistributed(Dense(TimeDist2, activation='softmax')),  # per-position class probabilities
    ])

    # weighted_metrics makes the logged accuracy honour the padding mask as well.
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  weighted_metrics=['accuracy'])

    model.summary()

    # Train the model; a weight of 0 removes padded positions from the loss.
    model.fit(
        train_sequences,
        train_labels_categorical,
        sample_weight=train_mask.astype('float32'),
        epochs=10,
        batch_size=32,
        validation_split=0.1,  # Use 10% of the training data as validation
        verbose=1
    )

    # Predict once and derive every test metric from the same predictions.
    test_predictions = model.predict(test_sequences)
    # Boolean indexing keeps only real residues and flattens to 1-D.
    test_predictions_flat = np.argmax(test_predictions, axis=-1)[test_mask]
    test_true_labels_flat = np.argmax(test_labels_categorical, axis=-1)[test_mask]

    test_accuracy = float(np.mean(test_predictions_flat == test_true_labels_flat))
    print(f"Test Accuracy: {test_accuracy:.4f}")

    # labels= keeps the report well-formed even if a class is absent from the test set.
    report = classification_report(test_true_labels_flat, test_predictions_flat,
                                   labels=list(sst3_mapping.values()),
                                   target_names=list(sst3_mapping))

    # Save the classification report to a text file
    with open(report_file_path, 'w') as f:
        f.write(f"Hybrid Test Accuracy: {test_accuracy:.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(report)

    # Save the predictions along with the true labels to a CSV file
    results_df = pd.DataFrame({
        'True_Label': test_true_labels_flat,
        'Predicted_Label': test_predictions_flat
    })
    results_df.to_csv(predictions_file_path, index=False)

    print(f"Classification report saved to {report_file_path}")
    print(f"Predictions saved to {predictions_file_path}")


In [None]:
# Baseline run: two-convolution hybrid on dataset 1 with the initial parameter values.
# train_and_evaluate_hybrid_model must already be defined in this session.
train_and_evaluate_hybrid_model(
    # first convolution block
    filter1=32, kernelsize1=3, Dropout1=0.2,
    # second convolution block
    filter2=64, kernelsize2=3, Dropout2=0.3,
    # recurrent block and dense head
    Lstm_unit=64, Dropout3=0.5,
    TimeDist1=128, TimeDist2=3,
    # inputs
    train_file_path='/content/training_data_clean.csv',
    test_file_path='/content/test_data_clean.csv',
    # outputs: text report (accuracy + classification report) and label CSV
    report_file_path='Hybrid_initial_report.txt',
    predictions_file_path='Hybrid_initial_predictions.csv',
)

Epoch 1/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 141ms/step - accuracy: 0.6640 - loss: 0.8175 - val_accuracy: 0.7642 - val_loss: 0.5721
Epoch 2/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 137ms/step - accuracy: 0.7768 - loss: 0.5492 - val_accuracy: 0.7660 - val_loss: 0.5507
Epoch 3/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 103ms/step - accuracy: 0.7931 - loss: 0.5073 - val_accuracy: 0.7792 - val_loss: 0.5099
Epoch 4/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 85ms/step - accuracy: 0.8028 - loss: 0.4787 - val_accuracy: 0.8122 - val_loss: 0.4496
Epoch 5/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 71ms/step - accuracy: 0.8108 - loss: 0.4562 - val_accuracy: 0.8199 - val_loss: 0.4329
Epoch 6/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 65ms/step - accuracy: 0.8182 - loss: 0.4402 - val_accuracy: 0.8179 - val_loss: 0.4388
Epoch 7/10
[1

In [None]:
# Baseline run: two-convolution hybrid on dataset 2 with the initial parameter values.
# train_and_evaluate_hybrid_model must already be defined in this session.
train_and_evaluate_hybrid_model(
    # first convolution block
    filter1=32, kernelsize1=3, Dropout1=0.2,
    # second convolution block
    filter2=64, kernelsize2=3, Dropout2=0.3,
    # recurrent block and dense head
    Lstm_unit=64, Dropout3=0.5,
    TimeDist1=128, TimeDist2=3,
    # inputs
    train_file_path='/content/training_data__part2_clean.csv',
    test_file_path='/content/test_data_part2_clean.csv',
    # outputs: text report (accuracy + classification report) and label CSV
    report_file_path='Hybrid_2a_report.txt',
    predictions_file_path='Hybrid_2a_predictions.csv',
)


Epoch 1/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 72ms/step - accuracy: 0.6934 - loss: 0.7496 - val_accuracy: 0.8065 - val_loss: 0.4782
Epoch 2/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 71ms/step - accuracy: 0.7992 - loss: 0.4922 - val_accuracy: 0.8296 - val_loss: 0.4175
Epoch 3/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 68ms/step - accuracy: 0.8182 - loss: 0.4432 - val_accuracy: 0.8441 - val_loss: 0.3948
Epoch 4/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 70ms/step - accuracy: 0.8219 - loss: 0.4305 - val_accuracy: 0.8502 - val_loss: 0.3762
Epoch 5/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 71ms/step - accuracy: 0.8315 - loss: 0.4092 - val_accuracy: 0.8532 - val_loss: 0.3632
Epoch 6/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 67ms/step - accuracy: 0.8365 - loss: 0.3972 - val_accuracy: 0.8580 - val_loss: 0.3544
Epoch 7/10
[1m2

#Tuning

In [None]:
#Installs necessary for tuning
pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [None]:
# Defines the hyperparameter-tuning routine for the two-convolution hybrid. Run this cell before the cell that calls tune_hybrid_cnn_rnn.
from sklearn.metrics import accuracy_score, classification_report  # Ensure this line is included

def tune_hybrid_cnn_rnn(train_file_path, test_file_path, report_file_path, predictions_file_path):
    """Run a Hyperband search over the two-convolution CNN/LSTM hybrid.

    Both CSV files are read with pandas and must provide a ``seq`` column
    (amino-acid strings) and an ``sst3`` column (strings over ``H``/``E``/``C``).
    Sequences are one-hot encoded over the 20 letters in ``amino_acids`` and
    post-padded to the longest sequence found in either file.

    Padded positions are excluded from the training loss, from the tuner
    objective and from the reported test metrics.

    Args:
        train_file_path: Path of the training CSV.
        test_file_path: Path of the test CSV.
        report_file_path: Text file that receives the best hyperparameters,
            the test accuracy and the classification report.
        predictions_file_path: CSV file that receives the predicted label of
            every non-padded test position (column ``Predictions``).

    Returns:
        The best model found by the tuner.

    Raises:
        KeyError: If an ``sst3`` string contains a character other than
            ``H``, ``E`` or ``C``.
    """
    # Imported here because no visible cell imports these two names.
    import keras_tuner as kt
    from tensorflow.keras.callbacks import EarlyStopping

    # Sentinel that pad_sequences writes into padded label positions.
    pad_label = -1

    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    aa_index = {aa: i for i, aa in enumerate(amino_acids)}
    sst3_mapping = {'H': 0, 'E': 1, 'C': 2}
    num_classes = len(sst3_mapping)

    def one_hot_encode(seq):
        """Return a (len(seq), 20) matrix; unknown residues stay all-zero."""
        encoding = np.zeros((len(seq), len(amino_acids)), dtype=int)
        for pos, aa in enumerate(seq):
            column = aa_index.get(aa)
            if column is not None:
                encoding[pos, column] = 1
        return encoding

    def encode_split(data, max_len):
        """Return (inputs, one-hot labels, boolean mask of real positions)."""
        sequences = pad_sequences([one_hot_encode(seq) for seq in data['seq']],
                                  maxlen=max_len, padding='post', dtype='float32')
        labels = pad_sequences([[sst3_mapping[ss] for ss in sst] for sst in data['sst3']],
                               maxlen=max_len, padding='post', value=pad_label)
        mask = labels != pad_label
        # to_categorical would index with -1 and silently mark padding as the
        # last class ('C'); map padding to class 0 and rely on the mask instead.
        labels_categorical = to_categorical(np.where(mask, labels, 0),
                                            num_classes=num_classes)
        return sequences, labels_categorical, mask

    # Step 1: Load and preprocess data
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)

    max_seq_len = max(max(len(seq) for seq in train_data['seq']),
                      max(len(seq) for seq in test_data['seq']))
    train_encoded, train_labels_categorical, train_mask = encode_split(train_data, max_seq_len)
    test_encoded, test_labels_categorical, test_mask = encode_split(test_data, max_seq_len)

    # Step 2: Define Hybrid Model for Hyperparameter Tuning
    # NOTE(review): this search space has no intermediate TimeDistributed Dense
    # layer, unlike train_and_evaluate_hybrid_model - confirm that is intended.
    def build_hybrid_model(hp):
        model = Sequential([
            Input(shape=(train_encoded.shape[1], train_encoded.shape[2])),
            Conv1D(filters=hp.Choice('filters1', [32, 64, 128]),
                   kernel_size=hp.Choice('kernel_size1', [3, 5, 7, 15]),
                   activation='relu', padding='same'),
            Dropout(hp.Choice('dropout_rate1', [0.2, 0.3, 0.5])),
            Conv1D(filters=hp.Choice('filters2', [32, 64, 128]),
                   kernel_size=hp.Choice('kernel_size2', [3, 5, 7, 15]),
                   activation='relu', padding='same'),
            Dropout(hp.Choice('dropout_rate2', [0.2, 0.3, 0.5])),
            LSTM(units=hp.Choice('lstm_units', [32, 64, 128]), return_sequences=True),
            Dropout(hp.Choice('dropout_rate3', [0.2, 0.3, 0.5])),
            TimeDistributed(Dense(num_classes, activation='softmax')),
        ])
        # Listing 'accuracy' in both arguments makes Keras report the masked
        # variant under the distinct name 'weighted_accuracy'.
        model.compile(optimizer='adam', loss='categorical_crossentropy',
                      metrics=['accuracy'], weighted_metrics=['accuracy'])
        return model

    # Step 3: Set Up Tuner and Hyperparameter Search
    # The direction is given explicitly because the tuner cannot infer it for
    # a non-standard metric name.
    tuner = kt.Hyperband(build_hybrid_model,
                         objective=kt.Objective('val_weighted_accuracy', direction='max'),
                         max_epochs=10,
                         directory='hybrid_tuning', project_name='hybrid_cnn_rnn_tuning')
    # A weight of 0 removes padded positions from the loss and weighted metric.
    tuner.search(train_encoded, train_labels_categorical,
                 sample_weight=train_mask.astype('float32'),
                 validation_split=0.2, epochs=10,
                 callbacks=[EarlyStopping(monitor='val_loss', patience=3)])

    # Step 4: Get Best Model and Evaluate (real residues only)
    best_model = tuner.get_best_models(num_models=1)[0]
    # Boolean indexing keeps only real residues and flattens to 1-D.
    y_pred = np.argmax(best_model.predict(test_encoded), axis=-1)[test_mask]
    y_true = np.argmax(test_labels_categorical, axis=-1)[test_mask]
    accuracy = accuracy_score(y_true, y_pred)
    # labels= keeps the report well-formed even if a class is absent from the test set.
    classification_rep = classification_report(y_true, y_pred,
                                               labels=list(sst3_mapping.values()),
                                               target_names=list(sst3_mapping))

    # Step 5: Save Report and Predictions
    with open(report_file_path, 'w') as f:
        f.write("Hybrid CNN-RNN Best Hyperparameters:\n")
        f.write(str(tuner.get_best_hyperparameters()[0].values))
        f.write(f"\nTest Accuracy: {accuracy:.4f}\n")
        f.write("\nClassification Report:\n")
        f.write(classification_rep)

    pd.DataFrame(y_pred, columns=['Predictions']).to_csv(predictions_file_path, index=False)

    return best_model


In [None]:
# Runs the hyperparameter search on dataset 2.
import shutil

# Start from a clean slate: drop any results left by an earlier search
# (missing directory is not an error).
shutil.rmtree('hybrid_tuning/hybrid_cnn_rnn_tuning', ignore_errors=True)

# Launch the search; tune_hybrid_cnn_rnn must already be defined in this session.
tune_hybrid_cnn_rnn(
    # inputs
    train_file_path='/content/training_data__part2_clean.csv',
    test_file_path='/content/test_data_part2_clean.csv',
    # outputs: best hyperparameters + classification report, and predicted labels
    report_file_path='Hybrid_opt_report.txt',
    predictions_file_path='Hybrid_opt_predictions.csv',
)


Trial 30 Complete [00h 02m 56s]
val_accuracy: 0.9055500030517578

Best val_accuracy So Far: 0.9202873706817627
Total elapsed time: 00h 47m 44s
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step


<Sequential name=sequential, built=True>

In [None]:
# Run 2b: retrain on dataset 2 with the optimized hyperparameter values.
train_and_evaluate_hybrid_model(
    # first convolution block
    filter1=128, kernelsize1=15, Dropout1=0.2,
    # second convolution block
    filter2=128, kernelsize2=5, Dropout2=0.3,
    # recurrent block and dense head
    Lstm_unit=128, Dropout3=0.3,
    TimeDist1=128, TimeDist2=3,
    # inputs
    train_file_path='/content/training_data__part2_clean.csv',
    test_file_path='/content/test_data_part2_clean.csv',
    # outputs: text report (accuracy + classification report) and label CSV
    report_file_path='Hybrid_2b_report.txt',
    predictions_file_path='Hybrid_2b_predictions.csv',
)

Epoch 1/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 161ms/step - accuracy: 0.7587 - loss: 0.6036 - val_accuracy: 0.8797 - val_loss: 0.3158
Epoch 2/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 156ms/step - accuracy: 0.8766 - loss: 0.3073 - val_accuracy: 0.9000 - val_loss: 0.2552
Epoch 3/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 167ms/step - accuracy: 0.9008 - loss: 0.2541 - val_accuracy: 0.9063 - val_loss: 0.2414
Epoch 4/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 149ms/step - accuracy: 0.9139 - loss: 0.2202 - val_accuracy: 0.9138 - val_loss: 0.2181
Epoch 5/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 157ms/step - accuracy: 0.9226 - loss: 0.1992 - val_accuracy: 0.9194 - val_loss: 0.2045
Epoch 6/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 157ms/step - accuracy: 0.9245 - loss: 0.1936 - val_accuracy: 0.9196 - val_loss: 0.2038
Epoch 7/10

In [None]:
# Run 3a: train and test with the optimized hyperparameter values on the part-3 data files.
train_and_evaluate_hybrid_model(
    # first convolution block
    filter1=128, kernelsize1=15, Dropout1=0.2,
    # second convolution block
    filter2=128, kernelsize2=5, Dropout2=0.3,
    # recurrent block and dense head
    Lstm_unit=128, Dropout3=0.3,
    TimeDist1=128, TimeDist2=3,
    # inputs
    train_file_path='/content/training_data__part3_clean.csv',
    test_file_path='/content/test_data_part3_clean.csv',
    # outputs: text report (accuracy + classification report) and label CSV
    report_file_path='Hybrid_3_report.txt',
    predictions_file_path='Hybrid_3a_predictions.csv',
)

Epoch 1/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 242ms/step - accuracy: 0.6832 - loss: 0.8147 - val_accuracy: 0.8105 - val_loss: 0.4902
Epoch 2/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 166ms/step - accuracy: 0.8033 - loss: 0.4947 - val_accuracy: 0.8137 - val_loss: 0.4587
Epoch 3/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 206ms/step - accuracy: 0.8108 - loss: 0.4642 - val_accuracy: 0.8442 - val_loss: 0.3877
Epoch 4/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 193ms/step - accuracy: 0.8407 - loss: 0.3858 - val_accuracy: 0.8571 - val_loss: 0.3583
Epoch 5/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 164ms/step - accuracy: 0.8608 - loss: 0.3395 - val_accuracy: 0.8616 - val_loss: 0.3383
Epoch 6/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 174ms/step - accuracy: 0.8759 - loss: 0.3066 - val_accuracy: 0.8641 - val_loss: 0.3281
Epoch 7/10
[1m45/45[0m

In [None]:
# Run 4a: train and test with the optimized hyperparameter values on dataset 4 (the largest dataset).
train_and_evaluate_hybrid_model(
    # first convolution block
    filter1=128, kernelsize1=15, Dropout1=0.2,
    # second convolution block
    filter2=128, kernelsize2=5, Dropout2=0.3,
    # recurrent block and dense head
    Lstm_unit=128, Dropout3=0.3,
    TimeDist1=128, TimeDist2=3,
    # inputs
    train_file_path='/content/training_data_part4.csv',
    test_file_path='/content/test_data_part4.csv',
    # outputs: text report (accuracy + classification report) and label CSV
    report_file_path='Hybrid_4a_report.txt',
    predictions_file_path='Hybrid_4a_predictions.csv',
)

Epoch 1/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 443ms/step - accuracy: 0.6971 - loss: 0.8515 - val_accuracy: 0.7549 - val_loss: 0.5593
Epoch 2/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 242ms/step - accuracy: 0.7840 - loss: 0.5184 - val_accuracy: 0.7754 - val_loss: 0.5076
Epoch 3/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 326ms/step - accuracy: 0.8022 - loss: 0.4691 - val_accuracy: 0.8152 - val_loss: 0.4393
Epoch 4/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 242ms/step - accuracy: 0.8242 - loss: 0.4162 - val_accuracy: 0.8353 - val_loss: 0.3862
Epoch 5/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 463ms/step - accuracy: 0.8534 - loss: 0.3599 - val_accuracy: 0.8490 - val_loss: 0.3620
Epoch 6/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 267ms/step - accuracy: 0.8537 - loss: 0.3555 - val_accuracy: 0.8551 - val_loss: 0.3515
Epoch 7/10
[1m45/45[